Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
fd39071
adding logs to debug manifest
femalves Sep 23, 2024
e369d3b
adding logs to debug manifest
femalves Sep 23, 2024
510268a
removing problematic log
femalves Sep 23, 2024
e2a4be5
adding logs
femalves Sep 23, 2024
1f18a0e
Encoding URL
femalves Sep 24, 2024
29a3989
Encoding URL
femalves Sep 24, 2024
d4dc69e
Encoding id
femalves Sep 30, 2024
c65b686
adding more logs
femalves Oct 2, 2024
c202ab0
solving color_type:Color bug
femalves Oct 22, 2024
111162c
debugging page color error
femalves Oct 23, 2024
5cd9379
Merge branch 'master' into debug
femalves Oct 23, 2024
da2e65c
fixing color download
femalves Nov 7, 2024
ccce3dd
fixing thumbnail bug
femalves Nov 12, 2024
8d2b1f5
introducing ampersand searches
femalves Nov 12, 2024
3964042
introducing ampersand searches
femalves Nov 12, 2024
7e2ddff
adding logs to debug book download
femalves Nov 19, 2024
575b952
adding profiling to debug book download
femalves Dec 3, 2024
2e31b98
adding logs
femalves Dec 30, 2024
fb757f3
adding change files and allowing lowercase search
femalves Dec 30, 2024
6bdc09a
adding memory limit logs
femalves Dec 30, 2024
14bd39d
fixing bug in pdf generation
femalves Jan 21, 2025
de1c531
adding lowercase queries
femalves Jan 21, 2025
259fe52
removing lowercase queries
femalves Jan 21, 2025
8eb7d8c
changing search
femalves Jan 21, 2025
79b72a2
saving keyword
femalves Jan 21, 2025
5753bcf
modifying search
femalves Jan 22, 2025
83abdb7
removing lowercase
femalves Jan 22, 2025
092ec07
removing search changes
femalves Jan 22, 2025
dd42e73
adding match
femalves Jan 22, 2025
3e96cfd
removing search changes
femalves Jan 22, 2025
ca3d16d
adding logs to debug metadata
femalves Feb 11, 2025
50a5e72
adding new return message
femalves Feb 12, 2025
a67d967
adding more logs
femalves Mar 4, 2025
1d9d51f
fixing logs
femalves Mar 4, 2025
31c1e32
removed unnecessary logs
femalves Mar 27, 2025
818e157
removed stale imports
femalves Mar 27, 2025
795ecdd
removing unnecessary spaces from manifest_factory
femalves Apr 21, 2025
2045753
removing unnecessary spaces from manifest_factory
femalves Apr 21, 2025
ae85895
last fixes
femalves Apr 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion scan_explorer_service/manifest_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def get_or_create_canvas(self, page: Page):
annotation.on = canvas.id
canvas.add_annotation(annotation)
canvas_dict[page.id] = canvas

return canvas

def create_image_annotation(self, page: Page):
Expand Down
11 changes: 7 additions & 4 deletions scan_explorer_service/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,22 +156,25 @@ def image_url(self):
@property
def image_path(self):
separator = current_app.config.get('IMAGE_API_SLASH_SUB', '%2F')
image_path = separator.join(self.image_path_basic)
image_path = separator.join(self.image_path_basic[0])
if self.color_type != PageColor.BW:
image_path += '.tif'
return image_path

@property
def image_path_basic(self):
image_format = ''
image_path = [self.collection.type, self.collection.journal, self.collection.volume]
image_path = [item.replace('.', '_') for item in image_path]
image_path = ['bitmaps'] + image_path + ['600', self.name]

return image_path
if self.color_type != PageColor.BW:
image_format = '.tif'
return image_path, image_format

@property
def thumbnail_url(self):
return f'{self.image_url}/square/480,480/0/{self.image_color_quality}.jpg'
url = f'{self.image_url}/square/480,480/0/{self.image_color_quality}.jpg'
return url

@property
def image_color_quality(self):
Expand Down
2 changes: 2 additions & 0 deletions scan_explorer_service/open_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def set_page_search_fields(query: dict) -> dict:
return query

def page_os_search(qs: str, page, limit, sort):
qs = qs.replace("&", "+")
query = create_query_string_query(qs)
query = set_page_search_fields(query)
from_number = (page - 1) * limit
Expand Down Expand Up @@ -143,6 +144,7 @@ def page_ocr_os_search(collection_id: str, page_number:int):
return es_result

def aggregate_search(qs: str, aggregate_field, page, limit, sort):
qs = qs.replace("&", "+")
query = create_query_string_query(qs)
query = append_aggregate(query, aggregate_field, page, limit, sort)
es_result = es_search(query)
Expand Down
1 change: 0 additions & 1 deletion scan_explorer_service/tests/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,6 @@ def test_fetch_object(self, mock_read_object_s3):

@patch('scan_explorer_service.views.image_proxy.fetch_object')
def test_pdf_save_success_article(self, mock_fetch_object):
# mock_read_object_s3.return_value = b'my_image_name'
mock_fetch_object.return_value = b'my_image_name'

data = {
Expand Down
6 changes: 6 additions & 0 deletions scan_explorer_service/tests/test_search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ def test_parse_query(self):
final_query, _ = parse_query_string('PageColor:grAYsCaLe')
self.assertEqual(final_query, 'page_color:Grayscale')

final_query, _ = parse_query_string('PageColor:BW')
self.assertEqual(final_query, 'page_color:BW')

final_query, _ = parse_query_string('PageColor:cOlor')
Copy link

@tjacovich tjacovich Apr 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be PageColor:color?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is okay the way it is for now. I was testing weird letter casing.

But, I still haven't integrated JC's changes in the search to dev. I will need to test this again once I do that.

self.assertEqual(final_query, 'page_color:Color')


if __name__ == '__main__':
unittest.main()
8 changes: 4 additions & 4 deletions scan_explorer_service/utils/s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def write_object_s3(self, file_bytes, object_name):
try:
response = self.bucket.put_object(Body=file_bytes, Key=object_name)
except (ClientError, ParamValidationError) as e:
current_app.logger.info.exception(e)
current_app.logger.exception(f"Error writing object {object_name}: {str(e)}")
raise e
return response.e_tag

Expand All @@ -35,8 +35,8 @@ def read_object_s3(self, object_name):
s3_obj.seek(0)
s3_file = s3_obj.read()
return s3_file
except (ClientError, ParamValidationError) as e:
current_app.logger.exception(e)
raise e
except Exception as e:
current_app.logger.exception(f"Unexpected error reading object {object_name}: {str(e)}")
raise


10 changes: 6 additions & 4 deletions scan_explorer_service/utils/search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,11 @@ class OrderOptions(str, enum.Enum):

def parse_query_args(args):
qs = re.sub(':\s*', ':', args.get('q', '', str))

qs, qs_dict = parse_query_string(qs)



page = args.get('page', 1, int)
limit = args.get('limit', 10, int)
sort_raw = args.get('sort')
Expand All @@ -73,7 +76,7 @@ def parse_query_string(qs):
qs_only_free = qs_only_free.replace(kv, "")
if len(kv_arr) == 2:
qs_dict[kv_arr[0].lower()] = kv_arr[1].strip()
#If the option have qutoes we remove them from the free. Previous removal would than have failed
#If the option have quotes we remove them from the free. Previous removal would than have failed
alt_kv = kv_arr[0] + ':"' + kv_arr[1] + '"'
qs_only_free = qs_only_free.replace(alt_kv, '')

Expand All @@ -87,10 +90,9 @@ def parse_query_string(qs):
#Translate input on the keys to the dedicated OS columns
insensitive_replace = re.compile(re.escape(key), re.IGNORECASE)
qs = insensitive_replace.sub(query_translations[key.lower()], qs)

insensitive_replace = re.compile(re.escape(qs_dict[key]), re.IGNORECASE)
# To ensure only the strings after the colon are replaced and no partial replacements are made
insensitive_replace = re.compile(r'(?<=:)\b' + re.escape(qs_dict[key]) + r'\b', re.IGNORECASE)
qs = insensitive_replace.sub(qs_dict[key], qs)

return qs, qs_dict

def parse_sorting_option(sort_input: str):
Expand Down
83 changes: 49 additions & 34 deletions scan_explorer_service/views/image_proxy.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Union
from flask import Blueprint, Response, current_app, request, stream_with_context, jsonify
from flask import Blueprint, Response, current_app, request, stream_with_context, jsonify, send_file
from flask_discoverer import advertise
from urllib import parse as urlparse
import img2pdf
Expand All @@ -9,6 +9,9 @@
from scan_explorer_service.utils.db_utils import item_thumbnail
from scan_explorer_service.utils.s3_utils import S3Provider
from scan_explorer_service.utils.utils import url_for_proxy
import re
import io
import sys

bp_proxy = Blueprint('proxy', __name__, url_prefix='/image')

Expand All @@ -23,9 +26,9 @@ def image_proxy(path):
req_headers['X-Forwarded-Host'] = current_app.config.get('PROXY_SERVER')
req_headers['X-Forwarded-Path'] = current_app.config.get('PROXY_PREFIX').rstrip('/') + '/image'

r = requests.request(request.method, req_url, params=request.args, stream=True,
headers=req_headers, allow_redirects=False, data=request.form)

encoded_url = re.sub(r"[+&]", "%2B", req_url)
r = requests.request(request.method, encoded_url, params=request.args, stream=True,
headers=req_headers, allow_redirects=False, data=request.form)
excluded_headers = ['content-encoding','content-length', 'transfer-encoding', 'connection']
headers = [(name, value) for (name, value) in r.headers.items() if name.lower() not in excluded_headers]

Expand All @@ -41,13 +44,14 @@ def generate():
def image_proxy_thumbnail():
"""Helper to generate the correct url for a thumbnail given an ID and type"""
try:
id = request.args.get('id')
id = request.args.get('id').replace(" ", "+")
type = request.args.get('type')
with current_app.session_scope() as session:
thumbnail_path = item_thumbnail(session, id, type)
path = urlparse.urlparse(thumbnail_path).path

remove = urlparse.urlparse(url_for_proxy('proxy.image_proxy', path='')).path

path = path.replace(remove, '')

return image_proxy(path)
Expand Down Expand Up @@ -75,61 +79,72 @@ def get_pages(item, session, page_start, page_end, page_limit):
query = session.query(Page).filter(Page.collection_id == item.id,
Page.volume_running_page_num >= page_start,
Page.volume_running_page_num <= page_end).order_by(Page.volume_running_page_num).limit(page_limit)
current_app.logger.info(f"Got pages {page_start}-{page_end}: {query}")
return query


@stream_with_context
def fetch_images(session, item, page_start, page_end, page_limit, memory_limit):
n_pages = 0
memory_sum = 0
query = get_pages(item, session, page_start, page_end, page_limit)
for page in query.all():

n_pages += 1

current_app.logger.debug(f"Generating image for page: {n_pages}")
current_app.logger.debug(f'Id: {page.id}, Volume_page: {page.volume_running_page_num}, memory: {memory_sum}')
if n_pages > page_limit:
break
if memory_sum > memory_limit:
current_app.logger.error(f"Memory limit reached: {memory_sum} > {memory_limit}")
break

object_name = '/'.join(page.image_path_basic)
current_app.logger.debug(f"Image path: {object_name}")
im_data = fetch_object(object_name, 'AWS_BUCKET_NAME_IMAGE')
n_pages = 0
memory_sum = 0
query = get_pages(item, session, page_start, page_end, page_limit)
for page in query.all():

n_pages += 1

current_app.logger.debug(f"Generating image for page: {n_pages}")
if n_pages > page_limit:
break
if memory_sum > memory_limit:
current_app.logger.error(f"Memory limit reached: {memory_sum} > {memory_limit}")
break
image_path, format = page.image_path_basic
object_name = '/'.join(image_path)
object_name += format

im_data = fetch_object(object_name, 'AWS_BUCKET_NAME_IMAGE')
memory_sum += sys.getsizeof(im_data)

yield im_data
yield im_data


def fetch_object(object_name, bucket_name):
file_content = S3Provider(current_app.config, bucket_name).read_object_s3(object_name)
current_app.logger.info(f"Successfully fetched object from S3 bucket: {object_name}")
if not file_content:
current_app.logger.error(f"Failed to fetch content for {object_name}. File might be empty.")
raise ValueError(f"File content is empty for {object_name}")
return file_content


def fetch_article(item):
def fetch_article(item, memory_limit):
try:
current_app.logger.info(f"Item is an article: {item.id}")
object_name = f'{item.id}.pdf'.lower()
full_path = f'pdfs/{object_name}'
file_content = fetch_object(full_path, 'AWS_BUCKET_NAME_PDF')
response = Response(file_content, mimetype='application/pdf')
response.headers['Content-Disposition'] = f'attachment; filename="{object_name}"'
return response

if len(file_content) > memory_limit:
current_app.logger.error(f"Memory limit reached: {len(file_content)} > {memory_limit}")

file_stream = io.BytesIO(file_content)
file_stream.seek(0)
return send_file(
file_stream,
as_attachment=True,
attachment_filename=object_name,
mimetype='application/pdf'
)
except Exception as e:
current_app.logger.exception(f"Failed to get PDF using fallback method for {object_name}: {str(e)}")


def generate_pdf(item, session, page_start, page_end, page_limit, memory_limit):
if isinstance(item, Article):
response = fetch_article(item)
response = fetch_article(item, memory_limit)
if response:
return response
else:
current_app.logger.debug(f"Response fetch article was empty")
page_end = page_limit

current_app.logger.debug(f"Article is not an article or fetch article failed.")
return Response(img2pdf.convert([im for im in fetch_images(session, item, page_start, page_end, page_limit, memory_limit)]), mimetype='application/pdf')


Expand All @@ -147,7 +162,7 @@ def pdf_save():
with current_app.session_scope() as session:

item = get_item(session, id)
current_app.logger.info(f"Item retrieved successfully: {item.id}")
current_app.logger.debug(f"Item retrieved successfully: {item.id}")

response = generate_pdf(item, session, page_start, page_end, page_limit, memory_limit)
return response
Expand Down
2 changes: 1 addition & 1 deletion scan_explorer_service/views/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def get_manifest(id: str):
manifest = manifest_factory.create_manifest(item)
search_url = url_for_proxy('manifest.search', id=id)
manifest_factory.add_search_service(manifest, search_url)

return manifest.toJSON(top=True)
else:
return jsonify(exception='Article not found'), 404
Expand Down
11 changes: 5 additions & 6 deletions scan_explorer_service/views/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,16 @@ def article_extra(bibcode: str):

if auth_token and ads_search_service:
try:
params = {'q': f'bibcode:{bibcode}', 'fl':'title,author'}
params = {'q': f'bibcode:{bibcode}', 'fl':'title,author'}
headers = {'Authorization': f'Bearer {auth_token}'}
response = requests.get(ads_search_service, params, headers=headers).json()
docs = response.get('response').get('docs')

if docs:
return docs[0]
except:
else:
return jsonify(message='No article found'), 404
except Exception as e:
return jsonify(message='Failed to retrieve external ADS article metadata'), 500


return {}

@advertise(scopes=['api'], rate_limit=[300, 3600*24])
Expand Down Expand Up @@ -141,7 +140,7 @@ def article_search():
page_count = page_os_search(qs, page, limit, sort)['hits']['total']['value']
return jsonify(serialize_os_article_result(result, page, limit, text_query, collection_count, page_count))
except Exception as e:
current_app.logger.error(f"{e}")
current_app.logger.exception(f"An exception has occurred: {e}")
return jsonify(message=str(e), type=ApiErrors.SearchError.value), 400


Expand Down