Merge pull request #23 from TheBluCoder/dev-branch

TheBluCoder · web-flow · commit 825ee7f3de55 · 2025-03-26T14:41:30.000-04:00
Dev branch
diff --git a/.gitignore b/.gitignore
@@ -59,3 +59,4 @@ unit_test.py
 testing_workflow.py
 *.yaml
 
+scripts/
diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile
@@ -7,6 +7,7 @@ WORKDIR /app
 # Removes the package lists downloaded during the update to reduce the image size.
 RUN apt-get update && apt-get install -y \
     build-essential \
+    cron \
     && rm -rf /var/lib/apt/lists/*
 
 # Set the PATH environment variable to include /app
@@ -33,6 +34,5 @@ RUN playwright install && playwright install-deps
 # Expose the port the app runs on
 EXPOSE 8000
 
-
-# Command to run the application
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] 
+# Start both cron and the FastAPI application
+CMD ["sh", "-c", "cron && uvicorn app:app --host 0.0.0.0 --port 8000"] 
diff --git a/backend/mainService/requirements.txt b/backend/mainService/requirements.txt
@@ -25,4 +25,6 @@ google-genai
 redis>=4.2.0
 uvicorn
 httpx>=0.28.1
+pypdf
+pypdf2
 
diff --git a/backend/mainService/scripts/delete_stale_data.py b/backend/mainService/scripts/delete_stale_data.py
diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py
@@ -32,7 +32,7 @@ class ScraperConfig:
     """
     This is the timeout duration for the requests made to the web scraper
     """
-    TIMEOUT_DURATION: int = 8000
+    TIMEOUT_DURATION: int = 10000
 
     def __post_init__(self):
         if self.MAX_FILE_SIZE <= 0:
diff --git a/backend/mainService/src/llm/Pinecone.py b/backend/mainService/src/llm/Pinecone.py
@@ -186,14 +186,14 @@ async def set_current_index(
 
         :param index_name: Name of the index to set as current
         """
+        if not await self._pc.has_index(index_name):
+                return False
         if not self._current_index_name == index_name and self._current_index:
             await self._current_index.close()
         elif self._current_index_name == index_name:
             return True
 
         if not index_host:
-            if not await self._pc.has_index(index_name):
-                return False
             index_model = await self._pc.describe_index(index_name)
             self._current_index_host = index_model.host
         else:
diff --git a/backend/mainService/src/llm/chat_llm/Gemini_llm.py b/backend/mainService/src/llm/chat_llm/Gemini_llm.py
@@ -11,7 +11,7 @@
 
 
 class Genai_cite:
-    model = "gemini-2.0-pro-exp-02-05"
+    model = "gemini-2.0-flash"
 
     def __init__(self, api_key: str = os.getenv("GOOGLE_API_KEY"),
                  llm_model: str = f'models/{model}'):
diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py
@@ -19,7 +19,7 @@ def __init__(self, api_key: str = os.getenv("GROQ_API_KEY"),
         self.client = Groq(api_key=self.api_key)
         self.llm_model = llm_model
 
-    def getKeywordSearchTerm(self, document: str) -> Optional[str]:
+    def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = None) -> str:
         """
         Generate a search term from the provided document using LLM.
 
@@ -46,12 +46,17 @@ def getKeywordSearchTerm(self, document: str) -> Optional[str]:
 
             # Make API call with error handling
 
+            if proposed_title:
+                document = f"Here is the proposed title: {proposed_title}\n\nHere is the content: {document}"
+            else:
+                document = f"Here is the content: {document}"
+
             completion = self.client.chat.completions.create(
                 model=self.llm_model,
                 messages=[
                     {
                         "role": "user",
-                        "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'. Here is the content: {document}"
+                        "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}"
                     },
                 ],
                 temperature=0.9,
diff --git a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py b/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py
@@ -38,7 +38,7 @@ async def _get_download_link(self, url: str) -> Optional[str]:
         try:
             page = await self.context.new_page()
             if not url.endswith("pdf"):
-                await page.goto(url, wait_until='networkidle')
+                await page.goto(url, wait_until='networkidle', timeout=self.element_timeout)
                 await self._interact_with_dropdown(page)
                 download_link = await self._extract_download_link(page)
             else:
diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py
@@ -130,8 +130,7 @@ async def process_citation(self,
         """
         try:
             # Step 0: Generate index name
-            title = (self.summarize_llm.getKeywordSearchTerm(content)
-                     if title.lower() == "untitled" else title)
+            title = self.summarize_llm.getKeywordSearchTerm(content, proposed_title=title)
             index_name = self._generate_index_name(title)
             logger.info(f"index_name = {index_name}")
             if await self.PC.set_current_index(index_name):
@@ -230,10 +229,11 @@ async def _process_documents(
 
         try:
             cleaned_result = search_results["cleaned_result"]
-            download_results = await self.scraper.get_pdfs(
-                target_urls=cleaned_result.get("links"),
-                storage_path=search_results["search_key"]
-            )
+            async with asyncio.timeout(15):  # 15 second timeout
+                download_results = await self.scraper.get_pdfs(
+                    target_urls=cleaned_result.get("links"),
+                    storage_path=search_results["search_key"]
+                )
 
             return await self._prepare_document_batches(
                 download_results,
diff --git a/backend/metricsService/requirements.txt b/backend/metricsService/requirements.txt
@@ -7,3 +7,4 @@ python-dotenv==1.0.1
 Requests==2.32.3
 scholarly==1.7.11
 uvicorn
+
diff --git a/backend/metricsService/src/services/author_reputation.py b/backend/metricsService/src/services/author_reputation.py
@@ -37,7 +37,8 @@
 from ..utils.api_config import (
     ORCID_API,
     SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API,
-    OPEN_ALEX_AUTHOR_API
+    OPEN_ALEX_AUTHOR_API,
+    DEFAULT_TIMEOUT
 )
 from ..utils.api_utils import rate_limit
 from ..utils.logging_config import get_logger
@@ -64,7 +65,7 @@ async def get_authorship_reputation(author_id: Optional[str] = None, author_name
             orcid_response = requests.get(
                 f"{ORCID_API}{author_id}/works",
                 headers={"Accept": "application/json"},
-                timeout=15
+                timeout=DEFAULT_TIMEOUT
             )
             if orcid_response.status_code == 200:
                 orcid_data = orcid_response.json()
@@ -119,7 +120,7 @@ async def get_openalex_author_reputation(author_name: str):
     """Fetch author reputation from OpenAlex using the authors endpoint."""
     await rate_limit()
     try:
-        response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=10)
+        response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=DEFAULT_TIMEOUT)
         if response.status_code == 200:
             data = response.json()
             if data.get("results"):
@@ -138,7 +139,7 @@ async def get_semantic_scholar_author_reputation(author_name: str):
     await rate_limit()
     try:
         params = {"query": author_name, "fields": "hIndex,paperCount", "limit": 1}
-        response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=10)
+        response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=DEFAULT_TIMEOUT)
         if response.status_code == 200:
             data = response.json()
             if data.get("data") and len(data["data"]) > 0:
diff --git a/backend/metricsService/src/utils/api_config.py b/backend/metricsService/src/utils/api_config.py
@@ -34,3 +34,4 @@
 OPEN_CITATIONS_API = "https://opencitations.net/index/api/v1/"
 MAX_CONCURRENT_WORKERS = 20
 DEFAULT_CONCURRENT_WORKERS = 10
+DEFAULT_TIMEOUT = 10

Original file line number	Diff line number	Diff line change
`@@ -59,3 +59,4 @@ unit_test.py`
`59`	`59`	`testing_workflow.py`
`60`	`60`	`*.yaml`
`61`	`61`
	`62`	`+scripts/`