diff --git a/dev-share-api/Configuration/VectorDbSettings.cs b/dev-share-api/Configuration/VectorDbSettings.cs index 5729bec..025dc9d 100644 --- a/dev-share-api/Configuration/VectorDbSettings.cs +++ b/dev-share-api/Configuration/VectorDbSettings.cs @@ -5,8 +5,8 @@ public class VectorDbSettings public const string SectionName = "VectorDb"; // Collections - public string ResourceCollection { get; set; } = "DevShare_Resource"; - public string InsightCollection { get; set; } = "DevShare_Insight"; + public string ResourceCollection { get; set; } = "BlotzShare_Resource"; + public string InsightCollection { get; set; } = "BlotzShare_Insight"; // Vector dimensions public uint Dimensions { get; set; } = 384; // MiniLM-L6-v2 dimension diff --git a/dev-share-api/Controllers/ApiController.cs b/dev-share-api/Controllers/ApiController.cs index f234cd8..817bfdf 100644 --- a/dev-share-api/Controllers/ApiController.cs +++ b/dev-share-api/Controllers/ApiController.cs @@ -1,14 +1,9 @@ -using HtmlAgilityPack; -using Microsoft.Playwright; using Models; using Microsoft.AspNetCore.Mvc; using Services; using Qdrant.Client.Grpc; -using System.Text; using Executor; using System.Collections.Concurrent; -using System.Text.Json; -using Newtonsoft.Json.Linq; namespace Controllers; @@ -24,6 +19,7 @@ public class ExtractController : ControllerBase private readonly IOnlineResearchService _onlineResearchService; private readonly IServiceScopeFactory _scopeFactory; private static readonly ConcurrentDictionary TaskStore = new(); + private static readonly HttpClient _httpClient = new(); public ExtractController( IEmbeddingService embeddingService, @@ -57,6 +53,13 @@ public async Task Share([FromBody] UrlRequest request) Console.WriteLine($"Extracting: {url}"); + bool isVideo = await UrlTypeDetector.IsVideoUrlAsync(url, _httpClient); + + if (isVideo) + { + return BadRequest(new { Type = "Video", Message = "Video URL detected" }); + } + var taskId = Guid.NewGuid().ToString(); var task = new ShareTask { @@ -186,29 +189,82 @@ public async Task> Indexing([FromBody] string collect return Ok(await _vectorService.IndexingAsync(collectionName, field)); } - [HttpPost("insight/share")] - public async Task ShareInsight([FromBody] ShareInsightRequest request) + [HttpPost("collections/{collectionName}")] + public async Task CreateCollection(string collectionName) + { + try + { + await _vectorService.CreateCollectionAsync(collectionName); + return Ok(new { message = $"Collection '{collectionName}' created successfully" }); + } + catch (Exception ex) + { + return StatusCode(500, new { error = $"Failed to create collection: {ex.Message}" }); + } + } + + [HttpPost("vectors/resource")] + public async Task UpsertResource([FromBody] ShareVectorRequest request) { - var insightId = request.InsightId ?? Guid.NewGuid().ToString(); - var denseEmbedding = await _embeddingService.GetDenseEmbeddingAsync(request.Content); - var (indices, values) = await _embeddingService.GetSparseEmbeddingAsync(request.Content); + try + { + var resourceId = request.ResourceId ?? Guid.NewGuid().ToString(); + var denseEmbedding = await _embeddingService.GetDenseEmbeddingAsync(request.Content); + var (indices, values) = await _embeddingService.GetSparseEmbeddingAsync(request.Content); + + var denseVector = new DenseVector(); + denseVector.Data.AddRange(denseEmbedding); - var denseVector = new DenseVector(); - denseVector.Data.AddRange(denseEmbedding); + var sparseVector = new SparseVector(); + sparseVector.Indices.AddRange(indices); + sparseVector.Values.AddRange(values); - var sparseVector = new SparseVector(); - sparseVector.Indices.AddRange(indices); - sparseVector.Values.AddRange(values); + var vectors = new Dictionary + { + ["dense_vector"] = new() { Dense = denseVector }, + ["sparse_vector"] = new() { Sparse = sparseVector } + }; - var vectors = new Dictionary + request.Vectors = vectors; + await _vectorService.UpsertResourceAsync(resourceId, request.Url, request.Content, request.Vectors); + return Ok(new { message = "Resource vector upserted successfully" }); + } + catch (Exception ex) { - ["dense_vector"] = new() { Dense = denseVector }, - ["sparse_vector"] = new() { Sparse = sparseVector } - }; + return StatusCode(500, new { error = $"Failed to upsert resource vector: {ex.Message}" }); + } + } - request.Vectors = vectors; - await _vectorService.UpsertInsightAsync(insightId, request.Url, request.Content, request.ResourceId, request.Vectors); - return Ok(); + [HttpPost("vectors/insight")] + public async Task UpsertInsight([FromBody] ShareInsightRequest request) + { + try + { + var insightId = request.InsightId ?? Guid.NewGuid().ToString(); + var denseEmbedding = await _embeddingService.GetDenseEmbeddingAsync(request.Content); + var (indices, values) = await _embeddingService.GetSparseEmbeddingAsync(request.Content); + + var denseVector = new DenseVector(); + denseVector.Data.AddRange(denseEmbedding); + + var sparseVector = new SparseVector(); + sparseVector.Indices.AddRange(indices); + sparseVector.Values.AddRange(values); + + var vectors = new Dictionary + { + ["dense_vector"] = new() { Dense = denseVector }, + ["sparse_vector"] = new() { Sparse = sparseVector } + }; + + request.Vectors = vectors; + await _vectorService.UpsertInsightAsync(insightId, request.Url, request.Content, request.ResourceId, request.Vectors); + return Ok(new { message = "Insight vector upserted successfully" }); + } + catch (Exception ex) + { + return StatusCode(500, new { error = $"Failed to upsert resource vector: {ex.Message}" }); + } } //todo make sure the return data from service is List and List diff --git a/dev-share-api/Models/ShareVectorRequest.cs b/dev-share-api/Models/ShareVectorRequest.cs new file mode 100644 index 0000000..0ea219e --- /dev/null +++ b/dev-share-api/Models/ShareVectorRequest.cs @@ -0,0 +1,11 @@ +using Qdrant.Client.Grpc; + +namespace Models; + +public class ShareVectorRequest +{ + public string ResourceId { get; set; } + public string Url { get; set; } + public string Content { get; set; } + public Dictionary Vectors { get; set; } +} \ No newline at end of file diff --git a/dev-share-api/Services/OnlineResearchService.cs b/dev-share-api/Services/OnlineResearchService.cs index 310154d..40e90ef 100644 --- a/dev-share-api/Services/OnlineResearchService.cs +++ b/dev-share-api/Services/OnlineResearchService.cs @@ -13,7 +13,7 @@ public interface IOnlineResearchService public class OnlineResearchService : IOnlineResearchService { private readonly AzureOpenAIClient _client; - private readonly string _deploymentName = "gpt-4o-mini"; + private const string _deploymentName = "gpt-4o-mini"; private static readonly JsonSerializerOptions _jsonOptions = new() { PropertyNameCaseInsensitive = true, @@ -28,96 +28,111 @@ public OnlineResearchService(AzureOpenAIClient openAIClient) public async Task> PerformOnlineResearchAsync(string query, int topK = 3) { if (string.IsNullOrWhiteSpace(query)) - { throw new ArgumentException("Query cannot be empty", nameof(query)); - } - try - { - var response = await GetOpenAIResponseAsync(query, topK); - return await ParseResponseToVectorResourceDtos(response); - } - catch (Exception ex) + var messages = new List { - throw; - } + new SystemChatMessage($@" + You are an AI research assistant. Your task is to return an array of up to {topK} concise and factual resources relevant to the user's query. + + For each resource, provide: + - Title: A title for the answer of the summary in less than 15 words. + - Content: A concise factual answer or summary. + - Url: A direct, relevant web source. + + Always call the `generate_research_results` function with your result in JSON: + {{ + ""results"": [ + {{ + ""title"": string, + ""content"": string, + ""url"": string + }} + ] + }} + + Guidelines: + - No explanations or formatting. + - Never return plain text; always structured JSON using the function. + - Results must be unique and from reputable sources. + "), + new UserChatMessage(query) + }; + + var tool = CreateGenerateResearchResultsTool(topK); + + return await CallToolAndDeserializeAsync( + toolFunctionName: "generate_research_results", + messages: messages, + tool: tool + ).ContinueWith(t => t.Result?.Results ?? new List()); ; } - private async Task GetOpenAIResponseAsync(string query, int topK) + private ChatTool CreateGenerateResearchResultsTool(int topK) { - var prompt = GeneratePrompt(query, topK); - ChatCompletion response = await _client.GetChatClient(_deploymentName) - .CompleteChatAsync(prompt); - - return response.Content?.FirstOrDefault()?.Text ?? string.Empty; + return ChatTool.CreateFunctionTool( + functionName: "generate_research_results", + functionDescription: $"Returns up to {topK} concise and factual research results for the given query.", + functionParameters: BinaryData.FromObjectAsJson(new + { + type = "object", + properties = new + { + results = new + { + type = "array", + items = new + { + type = "object", + properties = new + { + title = new { type = "string", description = "Title" }, + content = new { type = "string", description = "Concise, factual answer or summary." }, + url = new { type = "string", description = "Direct relevant web source." } + }, + required = new[] { "content", "url" } + }, + minItems = 1, + maxItems = topK + } + }, + required = new[] { "results" } + }) + ); } - private static async Task> ParseResponseToVectorResourceDtos(string response) + public async Task CallToolAndDeserializeAsync( + string toolFunctionName, + List messages, + ChatTool tool) { - if (string.IsNullOrWhiteSpace(response)) + var client = _client.GetChatClient(deploymentName: _deploymentName); + ChatCompletionOptions options = new() { - return new[] { CreateFallbackDto(response) }; - } + Tools = { tool } + }; + ChatCompletion response = await client.CompleteChatAsync(messages, options); - try - { - // Clean the response by removing Markdown code block and escapes - var cleanedResponse = response + var toolCall = response.ToolCalls.FirstOrDefault(tc => tc.FunctionName == toolFunctionName); + if (toolCall == null) + throw new InvalidOperationException("No function call response found."); + + var jsonRes = toolCall.FunctionArguments.ToString(); + var cleanedResponse = jsonRes .Replace("```json", "") .Replace("```", "") .Replace("\\n", "") .Replace("\n", "") .Trim(); + var result = JsonSerializer.Deserialize(cleanedResponse, _jsonOptions); + if (result == null) + throw new InvalidOperationException("Deserialization failed."); - var results = await Task.Run(() => - JsonSerializer.Deserialize(cleanedResponse, _jsonOptions)); - - if (results?.Any() == true) - { - return results; - } - - // Try parsing as single object if array fails - var singleResult = await Task.Run(() => - JsonSerializer.Deserialize(cleanedResponse, _jsonOptions)); - - return singleResult != null - ? new[] { singleResult } - : new[] { CreateFallbackDto(response) }; - } - catch (JsonException ex) - { - return new[] { CreateFallbackDto(response) }; - } + return result; } - private static string GeneratePrompt(string query, int topK) + private class ResourceResultWrapper { - return @$" - You are an AI assistant. Given a user query, return an array of {topK} JSON objects with the following fields suitable for a vector database: - - [ - {{ - ""Content"": ""First concise, factual answer here."", - ""Url"": ""https://relevant-source-1.com"" - }}, - {{ - ""Content"": ""Second concise, factual answer here."", - ""Url"": ""https://relevant-source-2.com"" - }} - ] - - User query: {query} - - Return exactly {topK} JSON objects in an array. Ensure each answer is unique and relevant."; - } - - private static ResourceDto CreateFallbackDto(string fallBackContent) - { - return new() - { - Content = fallBackContent, - Url = string.Empty - }; + public List Results { get; set; } = new(); } } diff --git a/dev-share-api/Services/VectorService.cs b/dev-share-api/Services/VectorService.cs index bef6d11..04f8f5a 100644 --- a/dev-share-api/Services/VectorService.cs +++ b/dev-share-api/Services/VectorService.cs @@ -3,13 +3,13 @@ using Models; using Qdrant.Client; using Qdrant.Client.Grpc; -using System.Text.Json; namespace Services; public interface IVectorService { Task InitializeAsync(); + Task CreateCollectionAsync(string collectionName); Task IndexingAsync(string collectionName, string fieldName); Task UpdateCollectionAsync(string collectionName); @@ -135,7 +135,7 @@ public async Task> SearchInsightAsync(string query, int t return insightResults.Select(MapToInsightDto).ToList(); } - private async Task CreateCollectionAsync(string collectionName) + public async Task CreateCollectionAsync(string collectionName) { try { diff --git a/dev-share-api/utils/UrlTypeDetector.cs b/dev-share-api/utils/UrlTypeDetector.cs new file mode 100644 index 0000000..4c933d4 --- /dev/null +++ b/dev-share-api/utils/UrlTypeDetector.cs @@ -0,0 +1,49 @@ +public static class UrlTypeDetector +{ + private static readonly string[] VideoExtensions = { ".mp4", ".webm", ".m3u8", ".mov", ".avi" }; + private static readonly string[] VideoDomains = { "youtube.com", "youtu.be", "vimeo.com", "dailymotion.com" }; + + public static bool IsLikelyVideoFromPattern(string url) + { + if (!Uri.TryCreate(url, UriKind.Absolute, out var uri)) + return false; + + var host = uri.Host.ToLowerInvariant(); + if (VideoDomains.Any(domain => host.Contains(domain))) + return true; + + var path = uri.AbsolutePath.ToLowerInvariant(); + if (VideoExtensions.Any(ext => path.EndsWith(ext))) + return true; + + return false; + } + + public static async Task IsVideoByContentTypeAsync(string url, HttpClient? client = null) + { + client ??= new HttpClient(); + + try + { + using var request = new HttpRequestMessage(HttpMethod.Head, url); + request.Headers.TryAddWithoutValidation("User-Agent", "Mozilla/5.0"); + + using var response = await client.SendAsync(request); + + if (!response.IsSuccessStatusCode) + return false; + + var mediaType = response.Content.Headers.ContentType?.MediaType; + return mediaType?.StartsWith("video/", StringComparison.OrdinalIgnoreCase) == true; + } + catch + { + return false; + } + } + + public static async Task IsVideoUrlAsync(string url, HttpClient? client = null) + { + return IsLikelyVideoFromPattern(url) || await IsVideoByContentTypeAsync(url, client); + } +}