Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 41 additions & 38 deletions cli.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from csv import DictReader
import io
import json
import pprint
import re
Expand Down Expand Up @@ -85,10 +87,7 @@ def search(query, indexes, term):
else:
body = query

config = {
"index": indexes if indexes else app.all_indexes,
"body": body
}
config = {"index": indexes if indexes else app.all_indexes, "body": body}

result = app.elasticsearch.search(**config)
print("\n", "=" * 12, " RESULT ", "=" * 12)
Expand Down Expand Up @@ -129,53 +128,58 @@ def index(years):
"""

# BUILD THE METADATA DICT FROM THE GITHUB TSV FILE
response = requests.get(app.config['METADATA_FILE_URL'])
response = requests.get(app.config["METADATA_FILE_URL"])
metadata = {}
lines = response.text.splitlines()
header = lines.pop(0).split('\t')
for line in lines:
_d = {}
# replace empty strings with null values
_values = [v if v != "" else None for v in line.split('\t')]
for i, k in enumerate(header):
# filter indexable columns
if k in app.config['METADATA_FILE_INDEXABLE_COLUMNS']:
# brutally try to cast values as integer
try:
_d[k] = int(_values[i])
except (TypeError, ValueError):
_d[k] = _values[i]

metadata[_d['id']] = _d
# remove id from nested metadata object
metadata[_d['id']].pop("id")

_DTS_URL = app.config['DTS_URL']
reader = DictReader(io.StringIO(response.text), delimiter="\t")
for row in reader:
try:
metadata[row["id"]] = {
"author_name": row["author_name"],
"author_firstname": row["author_firstname"],
"title_rich": row["title_rich"],
"promotion_year": int(row["promotion_year"]) if row["promotion_year"] else None,
"topic_notBefore": int(row["topic_notBefore"]) if row["topic_notBefore"] else None,
"topic_notAfter": int(row["topic_notAfter"]) if row["topic_notAfter"] else None,
"author_gender": int(row["author_gender"]) if row["author_gender"] else None,
# 1/2, verify that there is no other value
"author_is_enc_teacher": 1 if row["author_is_enc_teacher"]=="1" else None,
}
except Exception as exc:
print(f"ERROR while indexing {row['id']}, {exc}")

_DTS_URL = app.config["DTS_URL"]

# INDEXATION DES DOCUMENTS
all_docs = []
try:
_index_name = app.config['DOCUMENT_INDEX']
_index_name = app.config["DOCUMENT_INDEX"]
if years == "all":
years = app.config['ALL_YEARS']
start_year, end_year = (int(y) for y in years.split('-'))
years = app.config["ALL_YEARS"]
start_year, end_year = (int(y) for y in years.split("-"))
for year in range(start_year, end_year + 1):

_ids = [d for d in metadata.keys() if str(year) in d and "_PREV" not in d and "_NEXT" not in d]
_ids = [
d
for d in metadata.keys()
if str(year) in d and "_PREV" not in d and "_NEXT" not in d
]

for encpos_id in _ids:
response = requests.get(f'{_DTS_URL}/document?id={encpos_id}')
response = requests.get(f"{_DTS_URL}/document?id={encpos_id}")
print(encpos_id, response.status_code)

content = extract_body(response.text)
content = remove_html_tags(content)
all_docs.append("\n".join([
json.dumps(
{"index": {"_index": _index_name, "_id": encpos_id}}
),
json.dumps(
{"content": content, "metadata": metadata[encpos_id]}
)
]))

app.elasticsearch.index(
index=_index_name,
id=encpos_id,
body={
"content": content,
"metadata": metadata[encpos_id]
})
app.elasticsearch.bulk(body=all_docs, request_timeout=60*10)

except Exception as e:
print('Indexation error: ', str(e))
Expand All @@ -192,4 +196,3 @@ def index(years):
cli.add_command(index)
cli.add_command(search)
return cli

23 changes: 13 additions & 10 deletions elasticsearch/_global.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
},
"analysis": {
"filter": {
"my_stop_french":{
"type": "stop",
"stopwords": "_french_"
},
"french_elision": {
"type": "elision",
"articles_case": true,
Expand All @@ -29,25 +33,24 @@
"type": "html_strip"
}
},
"normalizer":{
"keyword": {
"filter": [
"icu_folding"
]
}
},
"analyzer": {
"folding": {
"tokenizer": "standard",
"stopwords": "_french_",
"filter": [
"french_elision",
"icu_folding"
"icu_folding",
"my_stop_french"
],
"char_filter": [
"html_stripper"
]
},
"keyword": {
"tokenizer": "keyword",
"stopwords": "_french_",
"filter": [
"french_elision",
"icu_folding"
]
}
}
}
Expand Down
40 changes: 32 additions & 8 deletions elasticsearch/encpos_document.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,51 @@
"analyzer": "folding",
"term_vector": "with_positions_offsets"
},
"metadata_all": {
"type": "text"
},
"metadata": {
"properties": {
"author_firstname": {
"type": "text",
"fielddata": true,
"copy_to": "metadata_all",
"fields": {
"keyword": {
"type": "text",
"analyzer": "keyword",
"fielddata": "true"
"type": "keyword",
"normalizer": "keyword"
}
}
},
"author_name": {
"type": "text",
"fielddata": true,
"copy_to": "metadata_all",
"fields": {
"keyword": {
"type": "keyword",
"normalizer": "keyword"
}
}
},
"promotion_year": {
"type": "short",
"copy_to": "metadata_all"
},
"topic_notAfter": {
"type": "short",
"copy_to": "metadata_all"
},
"topic_notBefore": {
"type": "short",
"copy_to": "metadata_all"
},
"title_rich": {
"type": "text",
"copy_to": "metadata_all",
"fields": {
"keyword": {
"type": "text",
"analyzer": "keyword",
"fielddata": "true"
"type": "keyword",
"normalizer": "keyword",
"ignore_above": 256
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ elasticsearch==8.12.1
Flask==1.1.2
itsdangerous==1.1.0
Jinja2==2.11.3
lxml==4.6.3
lxml==4.9.4
MarkupSafe==1.1.1
python-dotenv==0.17.0
requests
Expand Down