Skip to content

Commit 825ee7f

Browse files
authored
Merge pull request #23 from TheBluCoder/dev-branch
Dev branch
2 parents 12291fd + 5653d8a commit 825ee7f

13 files changed

Lines changed: 31 additions & 145 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,4 @@ unit_test.py
5959
testing_workflow.py
6060
*.yaml
6161

62+
scripts/

backend/mainService/Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ WORKDIR /app
77
# Removes the package lists downloaded during the update to reduce the image size.
88
RUN apt-get update && apt-get install -y \
99
build-essential \
10+
cron \
1011
&& rm -rf /var/lib/apt/lists/*
1112

1213
# Set the PATH environment variable to include /app
@@ -33,6 +34,5 @@ RUN playwright install && playwright install-deps
3334
# Expose the port the app runs on
3435
EXPOSE 8000
3536

36-
37-
# Command to run the application
38-
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
37+
# Start both cron and the FastAPI application
38+
CMD ["sh", "-c", "cron && uvicorn app:app --host 0.0.0.0 --port 8000"]

backend/mainService/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,6 @@ google-genai
2525
redis>=4.2.0
2626
uvicorn
2727
httpx>=0.28.1
28+
pypdf
29+
pypdf2
2830

backend/mainService/scripts/delete_stale_data.py

Lines changed: 0 additions & 125 deletions
This file was deleted.

backend/mainService/src/config/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class ScraperConfig:
3232
"""
3333
This is the timeout duration for the requests made to the web scraper
3434
"""
35-
TIMEOUT_DURATION: int = 8000
35+
TIMEOUT_DURATION: int = 10000
3636

3737
def __post_init__(self):
3838
if self.MAX_FILE_SIZE <= 0:

backend/mainService/src/llm/Pinecone.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,14 +186,14 @@ async def set_current_index(
186186
187187
:param index_name: Name of the index to set as current
188188
"""
189+
if not await self._pc.has_index(index_name):
190+
return False
189191
if not self._current_index_name == index_name and self._current_index:
190192
await self._current_index.close()
191193
elif self._current_index_name == index_name:
192194
return True
193195

194196
if not index_host:
195-
if not await self._pc.has_index(index_name):
196-
return False
197197
index_model = await self._pc.describe_index(index_name)
198198
self._current_index_host = index_model.host
199199
else:

backend/mainService/src/llm/chat_llm/Gemini_llm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
class Genai_cite:
14-
model = "gemini-2.0-pro-exp-02-05"
14+
model = "gemini-2.0-flash"
1515

1616
def __init__(self, api_key: str = os.getenv("GOOGLE_API_KEY"),
1717
llm_model: str = f'models/{model}'):

backend/mainService/src/llm/chat_llm/Groq_llm.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def __init__(self, api_key: str = os.getenv("GROQ_API_KEY"),
1919
self.client = Groq(api_key=self.api_key)
2020
self.llm_model = llm_model
2121

22-
def getKeywordSearchTerm(self, document: str) -> Optional[str]:
22+
def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = None) -> str:
2323
"""
2424
Generate a search term from the provided document using an LLM.
2525
@@ -46,12 +46,17 @@ def getKeywordSearchTerm(self, document: str) -> Optional[str]:
4646

4747
# Make API call with error handling
4848

49+
if proposed_title:
50+
document = f"Here is the proposed title: {proposed_title}\n\nHere is the content: {document}"
51+
else:
52+
document = f"Here is the content: {document}"
53+
4954
completion = self.client.chat.completions.create(
5055
model=self.llm_model,
5156
messages=[
5257
{
5358
"role": "user",
54-
"content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'. Here is the content: {document}"
59+
"content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}"
5560
},
5661
],
5762
temperature=0.9,

backend/mainService/src/scraper/site_specific/async_frontier_scraper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ async def _get_download_link(self, url: str) -> Optional[str]:
3838
try:
3939
page = await self.context.new_page()
4040
if not url.endswith("pdf"):
41-
await page.goto(url, wait_until='networkidle')
41+
await page.goto(url, wait_until='networkidle', timeout=self.element_timeout)
4242
await self._interact_with_dropdown(page)
4343
download_link = await self._extract_download_link(page)
4444
else:

backend/mainService/src/services/citation_service.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,7 @@ async def process_citation(self,
130130
"""
131131
try:
132132
# Step 0: Generate index name
133-
title = (self.summarize_llm.getKeywordSearchTerm(content)
134-
if title.lower() == "untitled" else title)
133+
title = self.summarize_llm.getKeywordSearchTerm(content, proposed_title=title)
135134
index_name = self._generate_index_name(title)
136135
logger.info(f"index_name = {index_name}")
137136
if await self.PC.set_current_index(index_name):
@@ -230,10 +229,11 @@ async def _process_documents(
230229

231230
try:
232231
cleaned_result = search_results["cleaned_result"]
233-
download_results = await self.scraper.get_pdfs(
234-
target_urls=cleaned_result.get("links"),
235-
storage_path=search_results["search_key"]
236-
)
232+
async with asyncio.timeout(15): # 15 second timeout
233+
download_results = await self.scraper.get_pdfs(
234+
target_urls=cleaned_result.get("links"),
235+
storage_path=search_results["search_key"]
236+
)
237237

238238
return await self._prepare_document_batches(
239239
download_results,

0 commit comments

Comments
 (0)