From 0c9d473c37a555c3077d0d5d5d2eabe7715b844a Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 24 Feb 2026 17:35:06 -0500 Subject: [PATCH 1/2] Use bulk inserts in put_collection to avoid request timeouts --- scan_explorer_service/tests/test_metadata.py | 76 ++++++++++++++++++++ scan_explorer_service/views/metadata.py | 54 +++++++++++--- 2 files changed, 121 insertions(+), 9 deletions(-) diff --git a/scan_explorer_service/tests/test_metadata.py b/scan_explorer_service/tests/test_metadata.py index 9275298..7f3ca8e 100644 --- a/scan_explorer_service/tests/test_metadata.py +++ b/scan_explorer_service/tests/test_metadata.py @@ -205,6 +205,82 @@ def test_article_collection(self): self.assertStatus(r, 200) self.assertEqual(data, {'id': 'journalvolume', 'selected_page': 100}) + def test_put_collection_with_articles(self): + """put_collection bulk-inserts articles and links them to pages.""" + collection_json = { + 'type': 'type', + 'journal': self.collection.journal, + 'volume': self.collection.volume, + 'pages': [{ + 'name': 'pageA', + 'color_type': 'BW', + 'page_type': 'Normal', + 'label': '1', + 'width': 100, + 'height': 100, + 'volume_running_page_num': 1, + 'articles': [{'bibcode': '2000ApJ...001..001A'}], + }] + } + url = url_for("metadata.put_collection") + r = self.client.put(url, json=collection_json) + self.assertStatus(r, 200) + + collection_id = r.get_json()['id'] + articles = self.app.db.session.query(Article).filter(Article.collection_id == collection_id).all() + self.assertEqual(len(articles), 1) + self.assertEqual(articles[0].bibcode, '2000ApJ...001..001A') + + pages = self.app.db.session.query(Page).filter(Page.collection_id == collection_id).all() + self.assertEqual(len(pages), 1) + + def test_put_collection_deduplicates_articles(self): + """An article appearing in multiple pages is inserted only once.""" + collection_json = { + 'type': 'type', + 'journal': self.collection.journal, + 'volume': self.collection.volume, + 'pages': [ + { + 'name': 'pageA', + 'color_type': 'BW', + 'page_type': 'Normal', + 'label': '1', + 'width': 100, + 'height': 100, + 'volume_running_page_num': 1, + 'articles': [{'bibcode': '2000ApJ...001..001A'}], + }, + { + 'name': 'pageB', + 'color_type': 'BW', + 'page_type': 'Normal', + 'label': '2', + 'width': 100, + 'height': 100, + 'volume_running_page_num': 2, + 'articles': [{'bibcode': '2000ApJ...001..001A'}], + }, + ] + } + url = url_for("metadata.put_collection") + r = self.client.put(url, json=collection_json) + self.assertStatus(r, 200) + + collection_id = r.get_json()['id'] + articles = self.app.db.session.query(Article).filter(Article.collection_id == collection_id).all() + self.assertEqual(len(articles), 1) + + pages = self.app.db.session.query(Page).filter(Page.collection_id == collection_id).all() + self.assertEqual(len(pages), 2) + + from scan_explorer_service.models import page_article_association_table as pat + page_ids = [p.id for p in pages] + links = self.app.db.session.execute( + pat.select().where(pat.c.page_id.in_(page_ids)) + ).fetchall() + self.assertEqual(len(links), 2) + if __name__ == '__main__': unittest.main() diff --git a/scan_explorer_service/views/metadata.py b/scan_explorer_service/views/metadata.py index 8d610ab..6881e34 100644 --- a/scan_explorer_service/views/metadata.py +++ b/scan_explorer_service/views/metadata.py @@ -1,7 +1,8 @@ +from datetime import datetime, timezone from typing import Union from flask import Blueprint, current_app, jsonify, request from scan_explorer_service.utils.db_utils import article_get_or_create, article_overwrite, collection_overwrite, page_get_or_create, page_overwrite -from scan_explorer_service.models import Article, Collection, Page +from scan_explorer_service.models import Article, Collection, Page, page_article_association_table from flask_discoverer import advertise from scan_explorer_service.utils.search_utils import * from scan_explorer_service.views.view_utils import ApiErrors @@ -76,16 +77,51 @@ def put_collection(): try: collection = Collection(**json) collection_overwrite(session, collection) - - for page_json in json.get('pages', []): - page_json['collection_id'] = collection.id - page = page_get_or_create(session, **page_json) - for article_json in page_json.get('articles', []): - article_json['collection_id'] = collection.id - page.articles.append(article_get_or_create(session, **article_json)) + now = datetime.now(timezone.utc).replace(tzinfo=None) + pages_data = [] + articles_data = {} + page_article_data = [] - session.add(page) + for page_json in json.get('pages', []): + page_json['collection_id'] = collection.id + articles = page_json.pop('articles', []) + page = Page(**page_json) + pages_data.append({ + 'id': page.id, + 'name': page.name, + 'label': page.label, + 'format': page.format, + 'color_type': page.color_type, + 'page_type': page.page_type, + 'width': page.width, + 'height': page.height, + 'collection_id': page.collection_id, + 'volume_running_page_num': page.volume_running_page_num, + 'created': now, + 'updated': now, + }) + for article_json in articles: + bibcode = article_json['bibcode'] + if bibcode not in articles_data: + articles_data[bibcode] = { + 'id': bibcode, + 'bibcode': bibcode, + 'collection_id': collection.id, + 'created': now, + 'updated': now, + } + page_article_data.append({ + 'page_id': page.id, + 'article_id': bibcode, + }) + + if pages_data: + session.bulk_insert_mappings(Page, pages_data) + if articles_data: + session.bulk_insert_mappings(Article, list(articles_data.values())) + if page_article_data: + session.execute(page_article_association_table.insert(), page_article_data) session.commit() return jsonify({'id': collection.id}), 200 From 7f9335a29cb049e1629feb4ef95567e4a70ae448 Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 24 Feb 2026 17:40:47 -0500 Subject: [PATCH 2/2] Handle duplicate article bibcodes across collections with ON CONFLICT DO NOTHING --- scan_explorer_service/views/metadata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scan_explorer_service/views/metadata.py b/scan_explorer_service/views/metadata.py index 6881e34..f1a7a84 100644 --- a/scan_explorer_service/views/metadata.py +++ b/scan_explorer_service/views/metadata.py @@ -3,6 +3,7 @@ from flask import Blueprint, current_app, jsonify, request from scan_explorer_service.utils.db_utils import article_get_or_create, article_overwrite, collection_overwrite, page_get_or_create, page_overwrite from scan_explorer_service.models import Article, Collection, Page, page_article_association_table +from sqlalchemy.dialects.postgresql import insert as pg_insert from flask_discoverer import advertise from scan_explorer_service.utils.search_utils import * from scan_explorer_service.views.view_utils import ApiErrors @@ -119,7 +120,9 @@ def put_collection(): if pages_data: session.bulk_insert_mappings(Page, pages_data) if articles_data: - session.bulk_insert_mappings(Article, list(articles_data.values())) + session.execute( + pg_insert(Article.__table__).values(list(articles_data.values())).on_conflict_do_nothing() + ) if page_article_data: session.execute(page_article_association_table.insert(), page_article_data) session.commit()