diff --git a/scan_explorer_service/manifest_factory.py b/scan_explorer_service/manifest_factory.py index 2f76eb1..6fa313e 100644 --- a/scan_explorer_service/manifest_factory.py +++ b/scan_explorer_service/manifest_factory.py @@ -58,7 +58,6 @@ def get_or_create_canvas(self, page: Page): annotation.on = canvas.id canvas.add_annotation(annotation) canvas_dict[page.id] = canvas - return canvas def create_image_annotation(self, page: Page): diff --git a/scan_explorer_service/models.py b/scan_explorer_service/models.py index 72a16d9..e0487bf 100644 --- a/scan_explorer_service/models.py +++ b/scan_explorer_service/models.py @@ -156,22 +156,25 @@ def image_url(self): @property def image_path(self): separator = current_app.config.get('IMAGE_API_SLASH_SUB', '%2F') - image_path = separator.join(self.image_path_basic) + image_path = separator.join(self.image_path_basic[0]) if self.color_type != PageColor.BW: image_path += '.tif' return image_path @property def image_path_basic(self): + image_format = '' image_path = [self.collection.type, self.collection.journal, self.collection.volume] image_path = [item.replace('.', '_') for item in image_path] image_path = ['bitmaps'] + image_path + ['600', self.name] - - return image_path + if self.color_type != PageColor.BW: + image_format = '.tif' + return image_path, image_format @property def thumbnail_url(self): - return f'{self.image_url}/square/480,480/0/{self.image_color_quality}.jpg' + url = f'{self.image_url}/square/480,480/0/{self.image_color_quality}.jpg' + return url @property def image_color_quality(self): diff --git a/scan_explorer_service/open_search.py b/scan_explorer_service/open_search.py index 85e7220..2fad6b3 100644 --- a/scan_explorer_service/open_search.py +++ b/scan_explorer_service/open_search.py @@ -112,6 +112,7 @@ def set_page_search_fields(query: dict) -> dict: return query def page_os_search(qs: str, page, limit, sort): + qs = qs.replace("&", "+") query = create_query_string_query(qs) query = 
set_page_search_fields(query) from_number = (page - 1) * limit @@ -143,6 +144,7 @@ def page_ocr_os_search(collection_id: str, page_number:int): return es_result def aggregate_search(qs: str, aggregate_field, page, limit, sort): + qs = qs.replace("&", "+") query = create_query_string_query(qs) query = append_aggregate(query, aggregate_field, page, limit, sort) es_result = es_search(query) diff --git a/scan_explorer_service/tests/test_proxy.py b/scan_explorer_service/tests/test_proxy.py index fcc869a..052c15a 100644 --- a/scan_explorer_service/tests/test_proxy.py +++ b/scan_explorer_service/tests/test_proxy.py @@ -167,7 +167,6 @@ def test_fetch_object(self, mock_read_object_s3): @patch('scan_explorer_service.views.image_proxy.fetch_object') def test_pdf_save_success_article(self, mock_fetch_object): - # mock_read_object_s3.return_value = b'my_image_name' mock_fetch_object.return_value = b'my_image_name' data = { diff --git a/scan_explorer_service/tests/test_search_utils.py b/scan_explorer_service/tests/test_search_utils.py index 6265192..adf77da 100644 --- a/scan_explorer_service/tests/test_search_utils.py +++ b/scan_explorer_service/tests/test_search_utils.py @@ -37,6 +37,12 @@ def test_parse_query(self): final_query, _ = parse_query_string('PageColor:grAYsCaLe') self.assertEqual(final_query, 'page_color:Grayscale') + final_query, _ = parse_query_string('PageColor:BW') + self.assertEqual(final_query, 'page_color:BW') + + final_query, _ = parse_query_string('PageColor:cOlor') + self.assertEqual(final_query, 'page_color:Color') + if __name__ == '__main__': unittest.main() diff --git a/scan_explorer_service/utils/s3_utils.py b/scan_explorer_service/utils/s3_utils.py index 2cffd65..c32f257 100644 --- a/scan_explorer_service/utils/s3_utils.py +++ b/scan_explorer_service/utils/s3_utils.py @@ -24,7 +24,7 @@ def write_object_s3(self, file_bytes, object_name): try: response = self.bucket.put_object(Body=file_bytes, Key=object_name) except (ClientError, ParamValidationError) 
as e: - current_app.logger.info.exception(e) + current_app.logger.exception(f"Error writing object {object_name}: {str(e)}") raise e return response.e_tag @@ -35,8 +35,8 @@ def read_object_s3(self, object_name): s3_obj.seek(0) s3_file = s3_obj.read() return s3_file - except (ClientError, ParamValidationError) as e: - current_app.logger.exception(e) - raise e + except Exception as e: + current_app.logger.exception(f"Unexpected error reading object {object_name}: {str(e)}") + raise diff --git a/scan_explorer_service/utils/search_utils.py b/scan_explorer_service/utils/search_utils.py index 439ffc9..3049657 100644 --- a/scan_explorer_service/utils/search_utils.py +++ b/scan_explorer_service/utils/search_utils.py @@ -53,8 +53,11 @@ class OrderOptions(str, enum.Enum): def parse_query_args(args): qs = re.sub(':\s*', ':', args.get('q', '', str)) + qs, qs_dict = parse_query_string(qs) + + page = args.get('page', 1, int) limit = args.get('limit', 10, int) sort_raw = args.get('sort') @@ -73,7 +76,7 @@ def parse_query_string(qs): qs_only_free = qs_only_free.replace(kv, "") if len(kv_arr) == 2: qs_dict[kv_arr[0].lower()] = kv_arr[1].strip() - #If the option have qutoes we remove them from the free. Previous removal would than have failed + #If the option have quotes we remove them from the free. 
Previous removal would then have failed alt_kv = kv_arr[0] + ':"' + kv_arr[1] + '"' qs_only_free = qs_only_free.replace(alt_kv, '') @@ -87,10 +90,9 @@ def parse_query_string(qs): #Translate input on the keys to the dedicated OS columns insensitive_replace = re.compile(re.escape(key), re.IGNORECASE) qs = insensitive_replace.sub(query_translations[key.lower()], qs) - - insensitive_replace = re.compile(re.escape(qs_dict[key]), re.IGNORECASE) + # To ensure only the strings after the colon are replaced and no partial replacements are made + insensitive_replace = re.compile(r'(?<=:)\b' + re.escape(qs_dict[key]) + r'\b', re.IGNORECASE) qs = insensitive_replace.sub(qs_dict[key], qs) - return qs, qs_dict def parse_sorting_option(sort_input: str): diff --git a/scan_explorer_service/views/image_proxy.py b/scan_explorer_service/views/image_proxy.py index e15b310..3beb58f 100644 --- a/scan_explorer_service/views/image_proxy.py +++ b/scan_explorer_service/views/image_proxy.py @@ -1,5 +1,5 @@ from typing import Union -from flask import Blueprint, Response, current_app, request, stream_with_context, jsonify +from flask import Blueprint, Response, current_app, request, stream_with_context, jsonify, send_file from flask_discoverer import advertise from urllib import parse as urlparse import img2pdf @@ -9,6 +9,9 @@ from scan_explorer_service.utils.db_utils import item_thumbnail from scan_explorer_service.utils.s3_utils import S3Provider from scan_explorer_service.utils.utils import url_for_proxy +import re +import io +import sys bp_proxy = Blueprint('proxy', __name__, url_prefix='/image') @@ -23,9 +26,9 @@ def image_proxy(path): req_headers['X-Forwarded-Host'] = current_app.config.get('PROXY_SERVER') req_headers['X-Forwarded-Path'] = current_app.config.get('PROXY_PREFIX').rstrip('/') + '/image' - r = requests.request(request.method, req_url, params=request.args, stream=True, - headers=req_headers, allow_redirects=False, data=request.form) - + encoded_url = re.sub(r"[+&]", "%2B", 
req_url) + r = requests.request(request.method, encoded_url, params=request.args, stream=True, + headers=req_headers, allow_redirects=False, data=request.form) excluded_headers = ['content-encoding','content-length', 'transfer-encoding', 'connection'] headers = [(name, value) for (name, value) in r.headers.items() if name.lower() not in excluded_headers] @@ -41,13 +44,14 @@ def generate(): def image_proxy_thumbnail(): """Helper to generate the correct url for a thumbnail given an ID and type""" try: - id = request.args.get('id') + id = request.args.get('id').replace(" ", "+") type = request.args.get('type') with current_app.session_scope() as session: thumbnail_path = item_thumbnail(session, id, type) path = urlparse.urlparse(thumbnail_path).path remove = urlparse.urlparse(url_for_proxy('proxy.image_proxy', path='')).path + path = path.replace(remove, '') return image_proxy(path) @@ -75,61 +79,72 @@ def get_pages(item, session, page_start, page_end, page_limit): query = session.query(Page).filter(Page.collection_id == item.id, Page.volume_running_page_num >= page_start, Page.volume_running_page_num <= page_end).order_by(Page.volume_running_page_num).limit(page_limit) - current_app.logger.info(f"Got pages {page_start}-{page_end}: {query}") return query @stream_with_context def fetch_images(session, item, page_start, page_end, page_limit, memory_limit): - n_pages = 0 - memory_sum = 0 - query = get_pages(item, session, page_start, page_end, page_limit) - for page in query.all(): - - n_pages += 1 - - current_app.logger.debug(f"Generating image for page: {n_pages}") - current_app.logger.debug(f'Id: {page.id}, Volume_page: {page.volume_running_page_num}, memory: {memory_sum}') - if n_pages > page_limit: - break - if memory_sum > memory_limit: - current_app.logger.error(f"Memory limit reached: {memory_sum} > {memory_limit}") - break - - object_name = '/'.join(page.image_path_basic) - current_app.logger.debug(f"Image path: {object_name}") - im_data = 
fetch_object(object_name, 'AWS_BUCKET_NAME_IMAGE') + n_pages = 0 + memory_sum = 0 + query = get_pages(item, session, page_start, page_end, page_limit) + for page in query.all(): + + n_pages += 1 + + current_app.logger.debug(f"Generating image for page: {n_pages}") + if n_pages > page_limit: + break + if memory_sum > memory_limit: + current_app.logger.error(f"Memory limit reached: {memory_sum} > {memory_limit}") + break + image_path, format = page.image_path_basic + object_name = '/'.join(image_path) + object_name += format + + im_data = fetch_object(object_name, 'AWS_BUCKET_NAME_IMAGE') + memory_sum += sys.getsizeof(im_data) - yield im_data + yield im_data def fetch_object(object_name, bucket_name): file_content = S3Provider(current_app.config, bucket_name).read_object_s3(object_name) - current_app.logger.info(f"Successfully fetched object from S3 bucket: {object_name}") + if not file_content: + current_app.logger.error(f"Failed to fetch content for {object_name}. File might be empty.") + raise ValueError(f"File content is empty for {object_name}") return file_content -def fetch_article(item): +def fetch_article(item, memory_limit): try: - current_app.logger.info(f"Item is an article: {item.id}") object_name = f'{item.id}.pdf'.lower() full_path = f'pdfs/{object_name}' file_content = fetch_object(full_path, 'AWS_BUCKET_NAME_PDF') - response = Response(file_content, mimetype='application/pdf') - response.headers['Content-Disposition'] = f'attachment; filename="{object_name}"' - return response + + if len(file_content) > memory_limit: + current_app.logger.error(f"Memory limit reached: {len(file_content)} > {memory_limit}") + + file_stream = io.BytesIO(file_content) + file_stream.seek(0) + return send_file( + file_stream, + as_attachment=True, + attachment_filename=object_name, + mimetype='application/pdf' + ) except Exception as e: current_app.logger.exception(f"Failed to get PDF using fallback method for {object_name}: {str(e)}") def generate_pdf(item, session, 
page_start, page_end, page_limit, memory_limit): if isinstance(item, Article): - response = fetch_article(item) + response = fetch_article(item, memory_limit) if response: return response else: + current_app.logger.debug(f"Response fetch article was empty") page_end = page_limit - + current_app.logger.debug(f"Article is not an article or fetch article failed.") return Response(img2pdf.convert([im for im in fetch_images(session, item, page_start, page_end, page_limit, memory_limit)]), mimetype='application/pdf') @@ -147,7 +162,7 @@ def pdf_save(): with current_app.session_scope() as session: item = get_item(session, id) - current_app.logger.info(f"Item retrieved successfully: {item.id}") + current_app.logger.debug(f"Item retrieved successfully: {item.id}") response = generate_pdf(item, session, page_start, page_end, page_limit, memory_limit) return response diff --git a/scan_explorer_service/views/manifest.py b/scan_explorer_service/views/manifest.py index adfe0fe..b1dd1eb 100644 --- a/scan_explorer_service/views/manifest.py +++ b/scan_explorer_service/views/manifest.py @@ -35,7 +35,7 @@ def get_manifest(id: str): manifest = manifest_factory.create_manifest(item) search_url = url_for_proxy('manifest.search', id=id) manifest_factory.add_search_service(manifest, search_url) - + return manifest.toJSON(top=True) else: return jsonify(exception='Article not found'), 404 diff --git a/scan_explorer_service/views/metadata.py b/scan_explorer_service/views/metadata.py index 5721be8..8d610ab 100644 --- a/scan_explorer_service/views/metadata.py +++ b/scan_explorer_service/views/metadata.py @@ -22,17 +22,16 @@ def article_extra(bibcode: str): if auth_token and ads_search_service: try: - params = {'q': f'bibcode:{bibcode}', 'fl':'title,author'} + params = {'q': f'bibcode:{bibcode}', 'fl':'title,author'} headers = {'Authorization': f'Bearer {auth_token}'} response = requests.get(ads_search_service, params, headers=headers).json() docs = response.get('response').get('docs') - if 
docs: return docs[0] - except: + else: + return jsonify(message='No article found'), 404 + except Exception as e: return jsonify(message='Failed to retrieve external ADS article metadata'), 500 - - return {} @advertise(scopes=['api'], rate_limit=[300, 3600*24]) @@ -141,7 +140,7 @@ def article_search(): page_count = page_os_search(qs, page, limit, sort)['hits']['total']['value'] return jsonify(serialize_os_article_result(result, page, limit, text_query, collection_count, page_count)) except Exception as e: - current_app.logger.error(f"{e}") + current_app.logger.exception(f"An exception has occurred: {e}") return jsonify(message=str(e), type=ApiErrors.SearchError.value), 400