Skip to content
Merged
13 changes: 11 additions & 2 deletions ftm_analyze/analysis/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import textwrap
from typing import Generator

import juditha
Expand Down Expand Up @@ -178,6 +179,10 @@ def __init__(
):
self.entity = model.make_entity(entity.schema)
self.entity.id = entity.id
if entity.get("language", quiet=True):
self.entity.set("language", entity.get("language"))
if entity.get("detectedLanguage", quiet=True):
self.entity.set("detectedLanguage", entity.get("detectedLanguage"))
self.aggregator_entities = TagAggregatorFasttext()
self.aggregator_patterns = TagAggregator()
self.validate_names = validate_names
Expand All @@ -195,12 +200,16 @@ def __init__(
else:
self.ner_extract = extract_spacy

def feed(self, entity):
def feed(self, entity, overwrite_lang=False):
if not entity.schema.is_a(ANALYZABLE):
return
texts = entity.get_type_values(registry.text)
# When overwrite_lang is set, drop any existing detectedLanguage values so that only freshly detected ones remain.
if overwrite_lang and entity.has("detectedLanguage", quiet=True):
self.entity.pop("detectedLanguage")
for text in text_chunks(texts):
detect_languages(self.entity, text)
for subsection in textwrap.wrap(text, settings.translation_chunk_size):
detect_languages(self.entity, subsection)
for prop, tag in self.ner_extract(self.entity, text):
self.aggregator_entities.add(prop, tag)
for prop, tag in extract_patterns(self.entity, text):
Expand Down
4 changes: 0 additions & 4 deletions ftm_analyze/analysis/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@ def get_lid_model():

def detect_languages(entity, text, k=1):
"""Given a list of lines, return a list of (line, lang)"""
if entity.has("language", quiet=True) or entity.has("detectedLanguage"):
# Don't detect if a language is hard-coded.
return
entity.pop("detectedLanguage")
langs = get_lid_model().predict(text, k=k)
for lang, score in zip(*langs):
if score <= THRESHOLD:
Expand Down
4 changes: 4 additions & 0 deletions ftm_analyze/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ def cli_analyze(
refine_locations: Annotated[
bool, typer.Option(help="Refine location mentions via geonames")
] = settings.refine_locations,
overwrite_lang: Annotated[
bool, typer.Option(help="Ignore the language property, overwrite the detectedLanguage property")
] = settings.overwrite_lang,
):
"""
Analyze a stream of entities.
Expand All @@ -90,6 +93,7 @@ def cli_analyze(
validate_names,
refine_mentions,
refine_locations,
overwrite_lang,
)
smart_write_proxies(out_uri, results)

Expand Down
5 changes: 4 additions & 1 deletion ftm_analyze/logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def analyze_entity(
validate_names: bool | None = settings.validate_names,
refine_mentions: bool | None = settings.refine_mentions,
refine_locations: bool | None = settings.refine_locations,
overwrite_lang : bool = False
) -> Generator[EntityProxy, None, None]:
"""
Analyze an Entity.
Expand All @@ -41,7 +42,7 @@ def analyze_entity(
refine_mentions,
refine_locations,
)
analyzer.feed(entity)
analyzer.feed(entity, overwrite_lang=overwrite_lang)
yield from analyzer.flush()


Expand All @@ -52,6 +53,7 @@ def analyze_entities(
validate_names: bool | None = settings.validate_names,
refine_mentions: bool | None = settings.refine_mentions,
refine_locations: bool | None = settings.refine_locations,
overwrite_lang : bool = False
) -> Generator[EntityProxy, None, None]:
for e in logged_items(entities, "Analyze", 1000, item_name="Entity", logger=log):
yield from analyze_entity(
Expand All @@ -61,4 +63,5 @@ def analyze_entities(
validate_names,
refine_mentions,
refine_locations,
overwrite_lang,
)
9 changes: 8 additions & 1 deletion ftm_analyze/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,11 @@ class Settings(BaseSettings):
validate_names: bool = False
"""Validate NER results against known name tokens via `juditha`"""

target_lang = str(environ.get("FTM_TRANSLATE_TARGET_LANGUAGE", None))
overwrite_lang: bool = False
"""Ignore the language property, overwrite the detectedLanguage property"""

translation_chunk_size: int = 512
"""A text is chunked into substrings of this size, for language detection"""


target_lang = str(environ.get("FTM_TRANSLATE_TARGET_LANGUAGE", None))
4 changes: 3 additions & 1 deletion ftm_analyze/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ def analyze(job: DatasetJob) -> None:
to_translate: list[EntityProxy] = []
to_geocode: list[EntityProxy] = []
to_index: list[EntityProxy] = []

overwrite_lang = job.context.get("overwrite_lang", False)
with job.get_writer() as bulk:
for entity in job.load_entities():
for result in analyze_entity(entity):
for result in analyze_entity(entity, overwrite_lang=overwrite_lang):
bulk.put(result, origin=ORIGIN, fragment=entity.id)
to_index.append(make_stub_entity(result))
if should_geocode(result):
Expand Down
22 changes: 22 additions & 0 deletions tests/test_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,25 @@ def test_analyze_extract_location():
entity = _analyze_entity(entity)
assert entity.first("locationMentioned", "New York City")
assert "lives in [New York City](LOC)"

def test_analyze_language_preservation():
    """Check how `overwrite_lang` treats an existing `detectedLanguage` value."""

    def last_result(proxy, overwrite):
        # Drain the generator and keep only the final entity it yields.
        return list(logic.analyze_entity(proxy, overwrite_lang=overwrite))[-1]

    french = "C'est le caniche d'Emmanuel Macron. " * 2
    doc = model.make_entity("PlainText")
    doc.id = "test2"
    doc.add("bodyText", french)

    # Nothing is set beforehand, so the language is expected to be detected.
    doc = last_result(doc, False)
    assert doc.get("detectedLanguage") == ["fra"]

    # With overwriting disabled, a pre-set value is expected to be kept
    # alongside the detected one.
    doc.set("detectedLanguage", "ron")
    doc = last_result(doc, False)
    assert doc.get("detectedLanguage") == ["ron", "fra"]

    # With overwriting enabled, the pre-set value is expected to be replaced
    # by the detected language.
    doc.set("detectedLanguage", "ron")
    doc = last_result(doc, True)
    assert doc.get("detectedLanguage") == ["fra"]