diff --git a/cli.py b/cli.py
index 56460f6..7496bd8 100644
--- a/cli.py
+++ b/cli.py
@@ -1,3 +1,5 @@
+from csv import DictReader
+import io
 import json
 import pprint
 import re
@@ -85,10 +87,7 @@ def search(query, indexes, term):
     else:
         body = query
 
-    config = {
-        "index": indexes if indexes else app.all_indexes,
-        "body": body
-    }
+    config = {"index": indexes if indexes else app.all_indexes, "body": body}
 
     result = app.elasticsearch.search(**config)
     print("\n", "=" * 12, " RESULT ", "=" * 12)
@@ -129,53 +128,58 @@ def index(years):
     """
 
     # BUILD THE METADATA DICT FROM THE GITHUB TSV FILE
-    response = requests.get(app.config['METADATA_FILE_URL'])
+    response = requests.get(app.config["METADATA_FILE_URL"])
     metadata = {}
 
-    lines = response.text.splitlines()
-    header = lines.pop(0).split('\t')
-    for line in lines:
-        _d = {}
-        # replace empty strings with null values
-        _values = [v if v != "" else None for v in line.split('\t')]
-        for i, k in enumerate(header):
-            # filter indexable columns
-            if k in app.config['METADATA_FILE_INDEXABLE_COLUMNS']:
-                # brutally try to cast values as integer
-                try:
-                    _d[k] = int(_values[i])
-                except (TypeError, ValueError):
-                    _d[k] = _values[i]
-
-        metadata[_d['id']] = _d
-        # remove id from nested metadata object
-        metadata[_d['id']].pop("id")
-
-    _DTS_URL = app.config['DTS_URL']
+    reader = DictReader(io.StringIO(response.text), delimiter="\t")
+    for row in reader:
+        try:
+            metadata[row["id"]] = {
+                "author_name": row["author_name"],
+                "author_firstname": row["author_firstname"],
+                "title_rich": row["title_rich"],
+                "promotion_year": int(row["promotion_year"]) if row["promotion_year"] else None,
+                "topic_notBefore": int(row["topic_notBefore"]) if row["topic_notBefore"] else None,
+                "topic_notAfter": int(row["topic_notAfter"]) if row["topic_notAfter"] else None,
+                "author_gender": int(row["author_gender"]) if row["author_gender"] else None,
+                # 1/2, verify that there is no other value
+                "author_is_enc_teacher": 1 if row["author_is_enc_teacher"]=="1" else None,
+            }
+        except Exception as exc:
+            print(f"ERROR while indexing {row['id']}, {exc}")
+
+    _DTS_URL = app.config["DTS_URL"]
 
     # INDEXATION DES DOCUMENTS
+    all_docs = []
     try:
-        _index_name = app.config['DOCUMENT_INDEX']
+        _index_name = app.config["DOCUMENT_INDEX"]
 
         if years == "all":
-            years = app.config['ALL_YEARS']
-        start_year, end_year = (int(y) for y in years.split('-'))
+            years = app.config["ALL_YEARS"]
+        start_year, end_year = (int(y) for y in years.split("-"))
 
         for year in range(start_year, end_year + 1):
-            _ids = [d for d in metadata.keys() if str(year) in d and "_PREV" not in d and "_NEXT" not in d]
+            _ids = [
+                d
+                for d in metadata.keys()
+                if str(year) in d and "_PREV" not in d and "_NEXT" not in d
+            ]
 
             for encpos_id in _ids:
-                response = requests.get(f'{_DTS_URL}/document?id={encpos_id}')
+                response = requests.get(f"{_DTS_URL}/document?id={encpos_id}")
                 print(encpos_id, response.status_code)
 
                 content = extract_body(response.text)
                 content = remove_html_tags(content)
+                all_docs.append("\n".join([
+                    json.dumps(
+                        {"index": {"_index": _index_name, "_id": encpos_id}}
+                    ),
+                    json.dumps(
+                        {"content": content, "metadata": metadata[encpos_id]}
+                    )
+                ]))
 
-                app.elasticsearch.index(
-                    index=_index_name,
-                    id=encpos_id,
-                    body={
-                        "content": content,
-                        "metadata": metadata[encpos_id]
-                    })
+        app.elasticsearch.bulk(body=all_docs, request_timeout=60*10)
 
     except Exception as e:
         print('Indexation error: ', str(e))
@@ -192,4 +196,3 @@ def index(years):
     cli.add_command(index)
     cli.add_command(search)
     return cli
-
diff --git a/elasticsearch/_global.conf.json b/elasticsearch/_global.conf.json
index 7ee9b63..f4fc7dc 100644
--- a/elasticsearch/_global.conf.json
+++ b/elasticsearch/_global.conf.json
@@ -4,6 +4,10 @@
     },
     "analysis": {
       "filter": {
+        "my_stop_french": {
+          "type": "stop",
+          "stopwords": "_french_"
+        },
         "french_elision": {
           "type": "elision",
           "articles_case": true,
@@ -29,25 +33,24 @@
           "type": "html_strip"
         }
       },
+      "normalizer": {
+        "keyword": {
+          "filter": [
+            "icu_folding"
+          ]
+        }
+      },
       "analyzer": {
         "folding": {
           "tokenizer": "standard",
-          "stopwords": "_french_",
           "filter": [
             "french_elision",
-            "icu_folding"
+            "icu_folding",
+            "my_stop_french"
           ],
           "char_filter": [
             "html_stripper"
           ]
-        },
-        "keyword": {
-          "tokenizer": "keyword",
-          "stopwords": "_french_",
-          "filter": [
-            "french_elision",
-            "icu_folding"
-          ]
         }
       }
     }
diff --git a/elasticsearch/encpos_document.conf.json b/elasticsearch/encpos_document.conf.json
index 8c2d476..592a2d2 100644
--- a/elasticsearch/encpos_document.conf.json
+++ b/elasticsearch/encpos_document.conf.json
@@ -6,27 +6,51 @@
       "analyzer": "folding",
       "term_vector": "with_positions_offsets"
     },
+    "metadata_all": {
+      "type": "text"
+    },
     "metadata": {
       "properties": {
         "author_firstname": {
           "type": "text",
-          "fielddata": true,
+          "copy_to": "metadata_all",
           "fields": {
             "keyword": {
-              "type": "text",
-              "analyzer": "keyword",
-              "fielddata": "true"
+              "type": "keyword",
+              "normalizer": "keyword"
             }
           }
         },
         "author_name": {
           "type": "text",
-          "fielddata": true,
+          "copy_to": "metadata_all",
+          "fields": {
+            "keyword": {
+              "type": "keyword",
+              "normalizer": "keyword"
+            }
+          }
+        },
+        "promotion_year": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
+        "topic_notAfter": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
+        "topic_notBefore": {
+          "type": "short",
+          "copy_to": "metadata_all"
+        },
+        "title_rich": {
+          "type": "text",
+          "copy_to": "metadata_all",
           "fields": {
             "keyword": {
-              "type": "text",
-              "analyzer": "keyword",
-              "fielddata": "true"
+              "type": "keyword",
+              "normalizer": "keyword",
+              "ignore_above": 256
             }
           }
         }
diff --git a/requirements.txt b/requirements.txt
index 4f8dc0a..cf65303 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ elasticsearch==8.12.1
 Flask==1.1.2
 itsdangerous==1.1.0
 Jinja2==2.11.3
-lxml==4.6.3
+lxml==4.9.4
 MarkupSafe==1.1.1
 python-dotenv==0.17.0
 requests
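
Note on the bulk call introduced in cli.py: the Elasticsearch _bulk endpoint expects a newline-delimited payload in which each "index" action line is immediately followed by the JSON source of the document, and the whole payload must end with a newline. Below is a minimal sketch of that payload format, not the CLI itself; the index name "encpos_document" and the document IDs are illustrative placeholders, not values taken from this diff.

import json

# Illustrative documents only; in the real CLI the sources come from the DTS API
# and the metadata dict built from the TSV file.
docs = {
    "ENCPOS_1900_01": {"content": "...", "metadata": {"author_name": "..."}},
    "ENCPOS_1900_02": {"content": "...", "metadata": {"author_name": "..."}},
}

bulk_lines = []
for doc_id, source in docs.items():
    # one action line, then the matching source line
    bulk_lines.append(json.dumps({"index": {"_index": "encpos_document", "_id": doc_id}}))
    bulk_lines.append(json.dumps(source))

# The raw _bulk endpoint requires the payload to end with a newline;
# the Python client adds it when the body is passed as a list of strings.
payload = "\n".join(bulk_lines) + "\n"
print(payload)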