diff --git a/.gitignore b/.gitignore
index b1974a108..94146e076 100644
--- a/.gitignore
+++ b/.gitignore
@@ -175,6 +175,8 @@ dump.rdb
.worktrees/
.claude/
+.grunt-build-stamp
+
# Local TODO / audit notes (not committed)
TODO-*.txt
TODO-*.md
diff --git a/CLAUDE.md b/CLAUDE.md
index 8adf68597..516dac5e2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -25,6 +25,22 @@ management system built with Django. Python >=3.10,<3.15.
- Public frontend (Foundation CSS): monochrome Foundation-Icons
(`<i class="fi-icon-name"></i>`)
- Django admin (`templates/admin/`): use emoji (no Foundation Icons)
+- **Django template comments `{# ... #}` są jednoliniowe — KAŻDA LINIA
+  MUSI mieć własne otwarcie `{#` i zamknięcie `#}` na tej samej linii.**
+  Po `\n` w środku komentarza parser przestaje go widzieć i tekst wycieka
+  do wyrenderowanego HTML-u. Powtarzający się błąd. Reguła:
+ - ❌ ZABRONIONE wieloliniowe komentarze typu:
+ ```django
+ {# linia 1
+ linia 2 #}
+ ```
+ - ✅ ZAWSZE każda linia z osobnym `{# ... #}`:
+ ```django
+ {# linia 1 #}
+ {# linia 2 #}
+ ```
+ - Alternatywa dla bloków: `{% comment %}...{% endcomment %}` (też OK,
+ ale per-line `{# #}` jest preferowane przez użytkownika).
## Python and Django Execution
diff --git a/Makefile b/Makefile
index 001e82d0f..fac15cb9e 100644
--- a/Makefile
+++ b/Makefile
@@ -119,13 +119,13 @@ clean-pycache: ## Usuń __pycache__, *.pyc oraz .eggs/.cache
rm -rf .eggs .cache
clean: clean-pycache ## Szersze czyszczenie: egg-info, logi, build, dist, staticroot/CACHE, .tox
+ rm -f .grunt-build-stamp
find . -type d -name \*egg-info -print0 | xargs -0 rm -rf
find . -name \*~ -print0 | xargs -0 rm -f
find . -name \*.prof -print0 | xargs -0 rm -f
rm -rf prof/
find . -name \*\\.log -print0 | xargs -0 rm -f
- find . -name \*\\.log -print0 | xargs -0 rm -f
- find . -name \#\* -print0 | xargs -0 rm -f
+ find . -name \#\* -not -path './node_modules/*' -print0 | xargs -0 rm -rf
rm -rf build dist/*django_bpp*whl dist/*bpp_iplweb*whl *.log dist
rm -rf src/django_bpp/staticroot/CACHE
rm -rf .tox
@@ -149,11 +149,13 @@ distclean: clean ## Pełne czyszczenie: + node_modules, staticroot, media, dist,
grunt-build: ## Uruchom `grunt build` (SCSS → CSS, bundling JS)
grunt build
-# CSS output files (targets)
-CSS_TARGETS := src/bpp/static/scss/app-blue.css src/bpp/static/scss/app-green.css src/bpp/static/scss/app-orange.css
+# grunt build kompiluje WSZYSTKIE SCSS → CSS za jednym odpaleniem.
+# Reguła wielotargetowa $(CSS_TARGETS): $(SCSS_SOURCES) odpalałaby grunt N razy
+# (raz na każdy nieaktualny target). Zamiast tego: jeden plik-stamp zależy od
+# wszystkich SCSS + node_modules; recepta dotyka stampu po udanym grunt build.
-# SCSS source files
-SCSS_SOURCES := $(wildcard src/bpp/static/scss/*.scss)
+SCSS_SOURCES := $(wildcard src/bpp/static/scss/*.scss) \
+ $(wildcard src/*/static/*/scss/*.scss)
# Node modules dependency
NODE_MODULES := node_modules/.installed
@@ -166,14 +168,16 @@ $(NODE_MODULES): package.json yarn.lock
export PUPPETEER_SKIP_CHROME_DOWNLOAD=true PUPPETEER_SKIP_CHROME_HEADLESS_SHELL_DOWNLOAD=true && $(YARN_CMD) install --no-progress --emoji false -s
touch $(NODE_MODULES)
-$(CSS_TARGETS): $(SCSS_SOURCES) $(NODE_MODULES)
+# Stamp file that stands in for all CSS outputs of one `grunt build` run.
+CSS_STAMP := .grunt-build-stamp
+
+# Rebuild when any SCSS source or the node_modules marker is newer than the stamp.
+$(CSS_STAMP): $(SCSS_SOURCES) $(NODE_MODULES)
 	grunt build
+	@touch $@
$(MO_FILES): $(PO_FILES)
- # cd src && django-admin compilemessages
uv run python src/manage.py compilemessages --locale=pl --ignore=site-packages
-assets: $(CSS_TARGETS) $(MO_FILES) ## Zbuduj frontend (CSS + .mo); uruchamia `yarn install` jeśli trzeba
+assets: $(CSS_STAMP) $(MO_FILES) ## Zbuduj frontend (CSS + .mo); uruchamia `yarn install` jeśli trzeba
yarn: $(NODE_MODULES) ## Zainstaluj zależności Node.js (yarn install)
diff --git a/src/bpp/newsfragments/+deduplikator-autorow-general.feature.rst b/src/bpp/newsfragments/+deduplikator-autorow-general.feature.rst
new file mode 100644
index 000000000..2da6d38b7
--- /dev/null
+++ b/src/bpp/newsfragments/+deduplikator-autorow-general.feature.rst
@@ -0,0 +1,6 @@
+Deduplikator autorów: nowy tryb "ogólny" znajdujący duplikaty wśród
+autorów spoza listy pracowników instytucji w PBN. Jeden przycisk
+"Skanuj duplikaty" uruchamia obie fazy (PBN + ogólna) sekwencyjnie.
+Widok pozwala filtrować wyniki radio-button-em (PBN/Ogólny/Oba),
+eksport XLSX zawiera kolumnę "Tryb". Anulowanie fazy ogólnej skutkuje
+statusem "Częściowo zakończone" — wyniki PBN pozostają dostępne.
diff --git a/src/bpp/newsfragments/+deduplikator-autorow-ui-overhaul.feature.rst b/src/bpp/newsfragments/+deduplikator-autorow-ui-overhaul.feature.rst
new file mode 100644
index 000000000..cc19ee26b
--- /dev/null
+++ b/src/bpp/newsfragments/+deduplikator-autorow-ui-overhaul.feature.rst
@@ -0,0 +1,28 @@
+Deduplikator autorów: gruntowna przebudowa UI. Tytuł i pozycje
+menu uproszczone z "Deduplikator autorów PBN" na "Deduplikator
+autorów" (bez znacznika BETA), wpis dodany dodatkowo do podmenu
+"Operacje". Tryb skanowania (PBN/ogólny) prezentowany jest jako
+kolorowy badge przy "Główny rekord autora", filtr "Pokaż wyniki"
+zmieniony z radio-buttonów na poziomy button-group.
+
+Przyciski na karcie każdego potencjalnego duplikatu pogrupowane
+w trzy logiczne sekcje: Podgląd (otwórz wyd. ciągłe/zwarte,
+redagowanie, stronę główną, PBN), Decyzja ("Nie jest duplikatem
+głównego autora", usuń autora bez publikacji), Scalanie (cztery
+warianty scalania). Przyciski "Scal + ustaw dyscyplinę" oraz
+"Scal + ustaw subdyscyplinę" są ukryte, gdy główny autor nie ma
+żadnej dyscypliny.
+
+Powody podobieństwa renderowane są jako kolorowe chipy z ikonami
+Foundation, z tonami match/info/weak/warn dobranymi do siły
+przesłanki. Procent pewności jest ograniczany do zakresu 0–100%
+(wcześniej widoczne były wartości typu 140% wynikające z surowego
+wyniku, tzw. score).
+
+Naprawione: oznaczenie autora jako nie-duplikat (przycisk
+"Nie jest duplikatem głównego autora") wykonuje się teraz przez
+AJAX z fadeOut karty, zamiast przeładowywać widok i przeskakiwać
+do kolejnego głównego autora. Naprawiono też "Scal wszystkie",
+który dla kandydatów z trybu ogólnego zwracał błąd 400 (JS
+wysyłał ``main_scientist_id`` zamiast ``main_autor_id``); brakujące
+parametry trafiają teraz dodatkowo do Rollbara.
diff --git a/src/bpp/system.py b/src/bpp/system.py
index 17be57ff6..210421b15 100644
--- a/src/bpp/system.py
+++ b/src/bpp/system.py
@@ -78,7 +78,7 @@
from bpp.models.struktura import Jednostka_Wydzial
from bpp.models.system import Charakter_PBN
from bpp.models.wydawca import Poziom_Wydawcy, Wydawca
-from deduplikator_autorow.models import IgnoredAuthor, LogScalania, NotADuplicate
+from deduplikator_autorow.models import IgnoredScientist, LogScalania, NotADuplicate
from dynamic_columns.models import ModelAdmin, ModelAdminColumn
from ewaluacja_common.models import Rodzaj_Autora
from ewaluacja_liczba_n.models import IloscUdzialowDlaAutoraZaRok, LiczbaNDlaUczelni
@@ -189,7 +189,7 @@
RozbieznosciZrodelView,
NotADuplicate,
LogScalania,
- IgnoredAuthor,
+ IgnoredScientist,
],
"indeks autorów": [Autor, Autor_Jednostka],
"administracja": [
diff --git a/src/bpp/tests/test_autocomplete/test_autocomplete_authors.py b/src/bpp/tests/test_autocomplete/test_autocomplete_authors.py
index b2c5cc003..8deefa593 100644
--- a/src/bpp/tests/test_autocomplete/test_autocomplete_authors.py
+++ b/src/bpp/tests/test_autocomplete/test_autocomplete_authors.py
@@ -20,7 +20,6 @@
)
-
def test_dyscyplina_naukowa_przypisanie_autocomplete(
app, autor_jan_kowalski, dyscyplina1, dyscyplina2, rok
):
@@ -75,7 +74,6 @@ def test_dyscyplina_naukowa_przypisanie_autocomplete(
assert res.json["results"][0]["text"] == "memetyka stosowana"
-
def test_dyscyplina_naukowa_przypisanie_autocomplete_brak_autora(
app,
):
@@ -90,7 +88,6 @@ def test_dyscyplina_naukowa_przypisanie_autocomplete_brak_autora(
assert res.json["results"][0]["text"] == "Podaj autora"
-
def test_dyscyplina_naukowa_przypisanie_autocomplete_brak_drugiej(
app, autor_jan_kowalski, dyscyplina1, dyscyplina2, rok
):
@@ -133,6 +130,30 @@ def autocomplete(s):
assert Autor.objects.first().imiona == "Baz Quux"
+@pytest.mark.django_db
+def test_AutorAutocomplete_create_object_creates_log_entry(rf, admin_user, db):
+ from django.contrib.admin.models import ADDITION, LogEntry
+ from django.contrib.contenttypes.models import ContentType
+
+ autor_count_before = Autor.objects.count()
+
+ ac = AutorAutocomplete()
+ ac.request = rf.post("/", data={"text": "Kowalski Jan"})
+ ac.request.user = admin_user
+
+ obj = ac.create_object("Kowalski Jan")
+
+ assert obj.pk != -1
+ assert Autor.objects.count() == autor_count_before + 1
+
+ ct = ContentType.objects.get_for_model(Autor)
+ log = LogEntry.objects.get(
+ content_type=ct, object_id=str(obj.pk), action_flag=ADDITION
+ )
+ assert log.user == admin_user
+ assert "autocomplete" in log.change_message
+
+
@pytest.mark.django_db
def test_Status_KorektyAutocomplete(statusy_korekt):
"""Test status korekty autocomplete filtering."""
diff --git a/src/bpp/views/autocomplete/authors.py b/src/bpp/views/autocomplete/authors.py
index 1cea8c001..0373b49a6 100644
--- a/src/bpp/views/autocomplete/authors.py
+++ b/src/bpp/views/autocomplete/authors.py
@@ -94,10 +94,27 @@ class AutorAutocomplete(GroupRequiredMixin, AutorAutocompleteBase):
def create_object(self, text):
try:
- return Autor.objects.create_from_string(text)
+ obj = Autor.objects.create_from_string(text)
except ValueError:
return self.err
+ from django.contrib.admin.models import ADDITION, LogEntry
+ from django.contrib.contenttypes.models import ContentType
+
+ try:
+ LogEntry.objects.create(
+ user_id=self.request.user.pk,
+ content_type_id=ContentType.objects.get_for_model(Autor).pk,
+ object_id=str(obj.pk),
+ object_repr=str(obj)[:200],
+ action_flag=ADDITION,
+ change_message="Utworzono z formularza autocomplete",
+ )
+ except (AttributeError, TypeError):
+ pass
+
+ return obj
+
class PublicAutorAutocomplete(AutorAutocompleteBase):
"""Public autocomplete for authors (no create, no PBN/MNISW markers)."""
diff --git a/src/deduplikator_autorow/admin.py b/src/deduplikator_autorow/admin.py
index 8f4fe5e8d..407f2e8a2 100644
--- a/src/deduplikator_autorow/admin.py
+++ b/src/deduplikator_autorow/admin.py
@@ -10,6 +10,7 @@
DuplicateCandidate,
DuplicateScanRun,
IgnoredAuthor,
+ IgnoredScientist,
LogScalania,
NotADuplicate,
)
@@ -76,8 +77,8 @@ def get_author_last_name(self, obj):
get_author_last_name.admin_order_field = "scientist_pk"
-@admin.register(IgnoredAuthor)
-class IgnoredAuthorAdmin(DynamicAdminFilterMixin, admin.ModelAdmin):
+@admin.register(IgnoredScientist)
+class IgnoredScientistAdmin(DynamicAdminFilterMixin, admin.ModelAdmin):
list_display = [
"get_scientist_display",
"get_autor_display",
@@ -133,6 +134,42 @@ def save_model(self, request, obj, form, change):
super().save_model(request, obj, form, change)
+@admin.register(IgnoredAuthor)
+class IgnoredAuthorAdmin(DynamicAdminFilterMixin, admin.ModelAdmin):
+ list_display = [
+ "get_autor_display",
+ "reason",
+ "created_by",
+ "created_on",
+ ]
+
+ list_filter = ["created_on", "created_by"]
+
+ search_fields = [
+ "autor__nazwisko",
+ "autor__imiona",
+ "reason",
+ "created_by__username",
+ ]
+
+ readonly_fields = ["created_on"]
+ date_hierarchy = "created_on"
+ ordering = ["-created_on"]
+
+ def get_autor_display(self, obj):
+ if obj.autor:
+ url = reverse("admin:bpp_autor_change", args=[obj.autor.pk])
+ return mark_safe(f'{obj.autor} ')
+ return "-"
+
+ get_autor_display.short_description = "Autor (BPP)"
+
+ def save_model(self, request, obj, form, change):
+ if not change:
+ obj.created_by = request.user
+ super().save_model(request, obj, form, change)
+
+
@admin.register(LogScalania)
class LogScalaniaAdmin(DynamicAdminFilterMixin, admin.ModelAdmin):
list_display = [
diff --git a/src/deduplikator_autorow/migrations/0009_rename_ignoredauthor_ignoredscientist.py b/src/deduplikator_autorow/migrations/0009_rename_ignoredauthor_ignoredscientist.py
new file mode 100644
index 000000000..cff55c56e
--- /dev/null
+++ b/src/deduplikator_autorow/migrations/0009_rename_ignoredauthor_ignoredscientist.py
@@ -0,0 +1,23 @@
+from django.db import migrations
+
+
+class Migration(migrations.Migration):  # Renames model IgnoredAuthor to IgnoredScientist and sets its Meta options.
+
+ dependencies = [
+ ("deduplikator_autorow", "0008_add_priority_field"),  # previous migration of this app
+ ]
+
+ operations = [
+ migrations.RenameModel(  # the IgnoredAuthor name is reused by a new model in 0010_add_ignored_author
+ old_name="IgnoredAuthor",
+ new_name="IgnoredScientist",
+ ),
+ migrations.AlterModelOptions(  # options match IgnoredScientist.Meta in models.py
+ name="ignoredscientist",
+ options={
+ "ordering": ["-created_on"],
+ "verbose_name": "Ignorowany Scientist (PBN)",
+ "verbose_name_plural": "Ignorowani Scientist (PBN)",
+ },
+ ),
+ ]
diff --git a/src/deduplikator_autorow/migrations/0010_add_ignored_author.py b/src/deduplikator_autorow/migrations/0010_add_ignored_author.py
new file mode 100644
index 000000000..b9f55e6dc
--- /dev/null
+++ b/src/deduplikator_autorow/migrations/0010_add_ignored_author.py
@@ -0,0 +1,142 @@
+import django.db.models.deletion
+import django.utils.timezone
+from django.conf import settings
+from django.db import migrations, models
+
+
+def rename_leftover_ignoredauthor_indexes(apps, schema_editor):
+ """Rename indexes that PostgreSQL kept after RenameModel in 0009.
+
+ Migration 0009 renamed the IgnoredAuthor model to IgnoredScientist, which
+ in PostgreSQL renames the table but keeps existing index names. Those
+ `deduplikator_autorow_ignoredauthor_*` index names would collide with
+ auto-generated names for the new IgnoredAuthor model created here.
+
+ We rename them to match the new (IgnoredScientist) table to avoid the
+ collision and keep names consistent with the actual table. SQL is
+ idempotent (uses IF EXISTS) so it works against fresh DBs too.
+ """
+ renames = [
+ (
+ "deduplikator_autorow_ignoredauthor_autor_id_5e237500",
+ "deduplikator_autorow_ignoredsci_autor_id_5e237500",
+ ),
+ (
+ "deduplikator_autorow_ignoredauthor_created_by_id_3d0a197e",
+ "deduplikator_autorow_ignoredsci_created_by_id_3d0a197e",
+ ),
+ (
+ "deduplikator_autorow_ignoredauthor_scientist_id_ae6083d3_like",
+ "deduplikator_autorow_ignoredsci_scientist_id_ae6083d3_like",
+ ),
+ (
+ "deduplikator_autorow_ignoredauthor_pkey",
+ "deduplikator_autorow_ignoredscientist_pkey",
+ ),
+ (
+ "deduplikator_autorow_ignoredauthor_scientist_id_key",
+ "deduplikator_autorow_ignoredscientist_scientist_id_key",
+ ),
+ ]
+ with schema_editor.connection.cursor() as cursor:
+ for old_name, new_name in renames:
+ cursor.execute(
+ f'ALTER INDEX IF EXISTS "{old_name}" RENAME TO "{new_name}"'
+ )
+
+
+def reverse_rename_leftover_ignoredauthor_indexes(apps, schema_editor):
+ renames = [
+ (
+ "deduplikator_autorow_ignoredsci_autor_id_5e237500",
+ "deduplikator_autorow_ignoredauthor_autor_id_5e237500",
+ ),
+ (
+ "deduplikator_autorow_ignoredsci_created_by_id_3d0a197e",
+ "deduplikator_autorow_ignoredauthor_created_by_id_3d0a197e",
+ ),
+ (
+ "deduplikator_autorow_ignoredsci_scientist_id_ae6083d3_like",
+ "deduplikator_autorow_ignoredauthor_scientist_id_ae6083d3_like",
+ ),
+ (
+ "deduplikator_autorow_ignoredscientist_pkey",
+ "deduplikator_autorow_ignoredauthor_pkey",
+ ),
+ (
+ "deduplikator_autorow_ignoredscientist_scientist_id_key",
+ "deduplikator_autorow_ignoredauthor_scientist_id_key",
+ ),
+ ]
+ with schema_editor.connection.cursor() as cursor:
+ for old_name, new_name in renames:
+ cursor.execute(
+ f'ALTER INDEX IF EXISTS "{old_name}" RENAME TO "{new_name}"'
+ )
+
+
+class Migration(migrations.Migration):  # Renames leftover indexes, then creates the new IgnoredAuthor model keyed by bpp.Autor.
+
+ dependencies = [
+ ("bpp", "0413_bppuser_autor_onetoone"),  # the new model points at bpp.autor
+ ("deduplikator_autorow", "0009_rename_ignoredauthor_ignoredscientist"),  # the old model must already be renamed
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),  # target of the created_by FK
+ ]
+
+ operations = [
+ migrations.RunPython(  # runs before CreateModel so the old ignoredauthor_* index names are out of the way
+ rename_leftover_ignoredauthor_indexes,
+ reverse_rename_leftover_ignoredauthor_indexes,  # reverse callable, so the migration can be unapplied
+ ),
+ migrations.CreateModel(
+ name="IgnoredAuthor",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "reason",
+ models.CharField(  # optional free-text reason, up to 500 characters
+ blank=True,
+ max_length=500,
+ verbose_name="Powód ignorowania",
+ ),
+ ),
+ (
+ "created_on",
+ models.DateTimeField(
+ default=django.utils.timezone.now,
+ verbose_name="Data utworzenia",
+ ),
+ ),
+ (
+ "autor",
+ models.OneToOneField(  # one-to-one with bpp.autor; the row is deleted with its author (CASCADE)
+ help_text="Autor BPP do ignorowania w deduplikacji ogólnej",
+ on_delete=django.db.models.deletion.CASCADE,
+ to="bpp.autor",
+ verbose_name="Autor (BPP)",
+ ),
+ ),
+ (
+ "created_by",
+ models.ForeignKey(  # the row is deleted with its creating user (CASCADE)
+ on_delete=django.db.models.deletion.CASCADE,
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Utworzył",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Ignorowany autor (BPP)",
+ "verbose_name_plural": "Ignorowani autorzy (BPP)",
+ "ordering": ["-created_on"],
+ },
+ ),
+ ]
diff --git a/src/deduplikator_autorow/migrations/0011_scan_mode_phase_partial.py b/src/deduplikator_autorow/migrations/0011_scan_mode_phase_partial.py
new file mode 100644
index 000000000..9e56f826e
--- /dev/null
+++ b/src/deduplikator_autorow/migrations/0011_scan_mode_phase_partial.py
@@ -0,0 +1,71 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # Adds the partial_completed status, DuplicateScanRun.phase and DuplicateCandidate.scan_mode.
+
+ dependencies = [
+ ("deduplikator_autorow", "0010_add_ignored_author"),
+ ]
+
+ operations = [
+ migrations.AlterField(  # status: the choices list now includes "partial_completed"
+ model_name="duplicatescanrun",
+ name="status",
+ field=models.CharField(
+ choices=[
+ ("pending", "Oczekuje"),
+ ("running", "W trakcie"),
+ ("completed", "Zakończone"),
+ (
+ "partial_completed",
+ "Częściowo zakończone (faza PBN OK, general anulowana)",
+ ),
+ ("cancelled", "Anulowane"),
+ ("failed", "Błąd"),
+ ],
+ db_index=True,
+ default="pending",
+ max_length=20,
+ verbose_name="Status",
+ ),
+ ),
+ migrations.AddField(  # new column: current phase of a scan run; blank is allowed
+ model_name="duplicatescanrun",
+ name="phase",
+ field=models.CharField(
+ blank=True,
+ choices=[("pbn", "Faza PBN"), ("general", "Faza ogólna")],
+ max_length=20,
+ verbose_name="Aktualna faza",
+ ),
+ ),
+ migrations.AddField(  # new column: scan mode of a candidate; the field default is "pbn"
+ model_name="duplicatecandidate",
+ name="scan_mode",
+ field=models.CharField(
+ choices=[("pbn", "PBN"), ("general", "Ogólny")],
+ db_index=True,
+ default="pbn",
+ max_length=20,
+ verbose_name="Tryb skanowania",
+ ),
+ ),
+ migrations.RemoveConstraint(  # replaced by the AddConstraint below, which also covers scan_mode
+ model_name="duplicatecandidate",
+ name="unique_scan_main_duplicate",
+ ),
+ migrations.AddIndex(  # composite index on (scan_run, scan_mode, status)
+ model_name="duplicatecandidate",
+ index=models.Index(
+ fields=["scan_run", "scan_mode", "status"],
+ name="deduplikato_scan_ru_78ad22_idx",
+ ),
+ ),
+ migrations.AddConstraint(  # uniqueness is now per (scan_run, scan_mode, main_autor, duplicate_autor)
+ model_name="duplicatecandidate",
+ constraint=models.UniqueConstraint(
+ fields=("scan_run", "scan_mode", "main_autor", "duplicate_autor"),
+ name="unique_scan_mode_main_duplicate",
+ ),
+ ),
+ ]
diff --git a/src/deduplikator_autorow/models.py b/src/deduplikator_autorow/models.py
index 9def5b9ea..33c299126 100644
--- a/src/deduplikator_autorow/models.py
+++ b/src/deduplikator_autorow/models.py
@@ -30,8 +30,8 @@ def __str__(self):
return f"Autor {self.autor} (not duplicate) - {self.created_by}"
-class IgnoredAuthor(models.Model):
- """Authors that should be completely ignored in the deduplication process"""
+class IgnoredScientist(models.Model):
+ """Scientists from PBN that should be completely ignored in deduplication"""
scientist = models.OneToOneField(
"pbn_api.Scientist",
@@ -66,8 +66,8 @@ class IgnoredAuthor(models.Model):
)
class Meta:
- verbose_name = "Ignorowany autor"
- verbose_name_plural = "Ignorowani autorzy"
+ verbose_name = "Ignorowany Scientist (PBN)"
+ verbose_name_plural = "Ignorowani Scientist (PBN)"
ordering = ["-created_on"]
def __str__(self):
@@ -76,6 +76,39 @@ def __str__(self):
return f"Ignorowany: Scientist #{self.scientist.pk}"
+class IgnoredAuthor(models.Model):
+ """BPP authors (without PBN-Scientist link) that should be ignored in deduplication."""
+
+ autor = models.OneToOneField(
+ "bpp.Autor",
+ on_delete=models.CASCADE,
+ db_index=True,
+ verbose_name="Autor (BPP)",
+ help_text="Autor BPP do ignorowania w deduplikacji ogólnej",
+ )
+
+ reason = models.CharField(
+ max_length=500,
+ blank=True,
+ verbose_name="Powód ignorowania",
+ )
+
+ created_on = models.DateTimeField("Data utworzenia", default=timezone.now)
+ created_by = models.ForeignKey(
+ BppUser,
+ on_delete=models.CASCADE,
+ verbose_name="Utworzył",
+ )
+
+ class Meta:
+ verbose_name = "Ignorowany autor (BPP)"
+ verbose_name_plural = "Ignorowani autorzy (BPP)"
+ ordering = ["-created_on"]
+
+ def __str__(self):
+ return f"Ignorowany autor: {self.autor}"
+
+
class LogScalania(models.Model):
"""Log of author merge operations with detailed tracking"""
@@ -226,6 +259,10 @@ class Status(models.TextChoices):
PENDING = "pending", "Oczekuje"
RUNNING = "running", "W trakcie"
COMPLETED = "completed", "Zakończone"
+ PARTIAL_COMPLETED = (
+ "partial_completed",
+ "Częściowo zakończone (faza PBN OK, general anulowana)",
+ )
CANCELLED = "cancelled", "Anulowane"
FAILED = "failed", "Błąd"
@@ -274,6 +311,13 @@ class Status(models.TextChoices):
blank=True,
)
+ phase = models.CharField(
+ "Aktualna faza",
+ max_length=20,
+ blank=True,
+ choices=[("pbn", "Faza PBN"), ("general", "Faza ogólna")],
+ )
+
class Meta:
verbose_name = "Skanowanie duplikatów"
verbose_name_plural = "Skanowania duplikatów"
@@ -352,6 +396,14 @@ class Status(models.TextChoices):
help_text="Priorytet wyświetlania: 100=prace 2022-2025 z dyscyplinami, 50=prace 2022-2025, 0=inne",
)
+ scan_mode = models.CharField(
+ "Tryb skanowania",
+ max_length=20,
+ choices=[("pbn", "PBN"), ("general", "Ogólny")],
+ default="pbn",
+ db_index=True,
+ )
+
# Status tracking
status = models.CharField(
"Status",
@@ -402,11 +454,12 @@ class Meta:
models.Index(fields=["scan_run", "status"]),
models.Index(fields=["main_autor", "status"]),
models.Index(fields=["priority", "confidence_score"]),
+ models.Index(fields=["scan_run", "scan_mode", "status"]),
]
constraints = [
models.UniqueConstraint(
- fields=["scan_run", "main_autor", "duplicate_autor"],
- name="unique_scan_main_duplicate",
+ fields=["scan_run", "scan_mode", "main_autor", "duplicate_autor"],
+ name="unique_scan_mode_main_duplicate",
),
]
diff --git a/src/deduplikator_autorow/static/deduplikator_autorow/scss/deduplikator_autorow.scss b/src/deduplikator_autorow/static/deduplikator_autorow/scss/deduplikator_autorow.scss
index 45f7d7f94..4d3ca06e1 100644
--- a/src/deduplikator_autorow/static/deduplikator_autorow/scss/deduplikator_autorow.scss
+++ b/src/deduplikator_autorow/static/deduplikator_autorow/scss/deduplikator_autorow.scss
@@ -1,6 +1,17 @@
// Deduplikator Autorow - Styles
// BEM convention: .deduplikator-autorow__element--modifier
+// Foundation .label ma domyślnie kwadratowe rogi - w obrębie deduplikatora
+// chcemy jednolitą "pigułkową" estetykę (zgodną z chipami powodów
+// podobieństwa). Wrapper .deduplikator-autorow-page ogranicza override
+// tylko do tej strony, żeby nie wpływać globalnie na inne widoki BPP.
+.deduplikator-autorow-page .label {
+ border-radius: 999px;
+ padding: 3px 12px;
+ font-weight: 600;
+ letter-spacing: 0.02em;
+}
+
// =============================================================================
// SIDEBAR ACCORDION
// =============================================================================
@@ -252,6 +263,16 @@
color: #666;
}
+.deduplikator-autorow__search-btn-flat-right {
+ border-top-right-radius: 0;
+ border-bottom-right-radius: 0;
+}
+
+.deduplikator-autorow__search-btn-flat-right + .input-group-button .button {
+ border-top-left-radius: 0;
+ border-bottom-left-radius: 0;
+}
+
// =============================================================================
// DISCIPLINE TABLE
// =============================================================================
@@ -306,25 +327,21 @@
.deduplikator-autorow__publication-list {
max-height: 300px;
overflow-y: auto;
- border: 1px solid #e1e1e1;
- padding: 10px;
+ padding: 0;
}
.deduplikator-autorow__publication-list--short {
max-height: 250px;
- background-color: #f9f9f9;
}
.deduplikator-autorow__publication-item {
- margin-bottom: 8px;
- padding: 5px;
- border-left: 3px solid #1779ba;
+ margin-bottom: 4px;
+ padding: 0;
}
.deduplikator-autorow__publication-item--duplicate {
- margin-bottom: 6px;
- padding: 3px;
- border-left: 2px solid #8a8a8a;
+ margin-bottom: 2px;
+ padding: 0;
}
.deduplikator-autorow__publication-link {
@@ -352,6 +369,28 @@
.deduplikator-autorow__duplicate-card {
margin-bottom: 20px;
+ // Niektóre nadrzędne layouty BPP wymuszają text-align: center w obrębie
+ // .callout (zaobserwowane w warningowych callout-ach panelu duplikatów —
+ // imiona/nazwiska autorów wyświetlały się wycentrowane). Wymuszamy
+ // domyślne wyrównanie do lewej dla całej karty.
+ text-align: left;
+}
+
+// Stan "wyłączone" dla przycisków "Scal wszystkie" gdy w grupie jest kandydat
+// poniżej progu pewności. Trzymamy je klikalne (do wyświetlenia komunikatu)
+// dlatego nie używamy [disabled] - tylko aria-disabled + klasa wizualna.
+.deduplikator-autorow__merge-all-btn--disabled,
+.button.deduplikator-autorow__merge-all-btn--disabled {
+ opacity: 0.55;
+ cursor: not-allowed;
+ background-color: #b5b5b5 !important;
+ color: #fff !important;
+
+ &:hover,
+ &:focus {
+ background-color: #b5b5b5 !important;
+ box-shadow: none;
+ }
}
.deduplikator-autorow__duplicate-header {
@@ -406,6 +445,10 @@
.deduplikator-autorow__duplicates-header {
margin-bottom: 15px;
+
+ .grid-x + .grid-x {
+ margin-top: 0.5em;
+ }
}
.deduplikator-autorow__duplicates-title {
@@ -481,3 +524,235 @@
.deduplikator-autorow__confidence-low {
color: red;
}
+
+// Mode badges, partial-completed banner, mode filter, scan phase
+.deduplikator-autorow {
+ &__main-record-title {
+ display: flex;
+ align-items: center;
+ gap: 0.6em;
+ flex-wrap: wrap;
+ }
+
+ &__badge {
+ display: inline-flex;
+ align-items: center;
+ gap: 0.35em;
+ padding: 4px 10px 4px 9px;
+ border-radius: 999px;
+ font-size: 0.7em;
+ font-weight: 700;
+ letter-spacing: 0.04em;
+ text-transform: uppercase;
+ color: #fff;
+ line-height: 1;
+ box-shadow: 0 1px 2px rgba(0, 0, 0, 0.15);
+ vertical-align: middle;
+
+ .fi-link,
+ .fi-magnifying-glass {
+ font-size: 1em;
+ line-height: 1;
+ }
+
+ &--pbn {
+ background: linear-gradient(180deg, #42a5f5 0%, #1976d2 100%);
+ border: 1px solid #1565c0;
+ }
+
+ &--general {
+ background: linear-gradient(180deg, #ffb74d 0%, #f57c00 100%);
+ border: 1px solid #ef6c00;
+ }
+ }
+
+ &__partial-banner {
+ margin: 1em 0;
+ }
+
+ // Top bar — przyciski trybu po lewej, wyszukiwarka po prawej.
+ &__top-bar {
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+ gap: 1em;
+ margin: 1em 0;
+ flex-wrap: nowrap;
+ }
+
+ &__top-search {
+ flex: 0 1 auto;
+ min-width: 200px;
+ margin: 0;
+ }
+
+ &__top-search-group {
+ margin: 0;
+ }
+
+ &__top-search-info {
+ display: block;
+ margin-top: 0.4em;
+ color: #555;
+ }
+
+ &__confidence-filter {
+ flex: 0 0 auto;
+ margin: 0;
+ display: flex;
+ align-items: center;
+ }
+
+ &__confidence-buttons {
+ margin: 0;
+
+ .button {
+ margin: 0;
+ display: inline-flex;
+ align-items: center;
+ gap: 0.4em;
+ }
+ }
+
+ // Mode filter (Pokaż wyniki: PBN/Ogólny/Oba) — Foundation button-group based
+ &__mode-filter {
+ flex: 0 0 auto;
+ margin: 0;
+ display: flex;
+ align-items: center;
+ gap: 0.75em;
+ flex-wrap: nowrap;
+ }
+
+ &__mode-filter-label {
+ font-weight: 600;
+ color: #4a4a4a;
+ }
+
+ &__mode-buttons {
+ margin: 0;
+
+ .button {
+ margin: 0;
+ display: inline-flex;
+ align-items: center;
+ gap: 0.4em;
+ }
+ }
+
+ &__mode-count {
+ display: inline-block;
+ margin-left: 0.3em;
+ padding: 1px 7px;
+ border-radius: 999px;
+ background: rgba(0, 0, 0, 0.18);
+ color: inherit;
+ font-size: 0.8em;
+ font-weight: 700;
+ line-height: 1.4;
+
+ .hollow & {
+ background: rgba(0, 0, 0, 0.08);
+ }
+ }
+
+ &__scan-phase {
+ margin-top: 0.5em;
+ font-style: italic;
+ }
+
+ // Action group sections within each duplicate card (Podgląd / Decyzja / Scalanie)
+ &__actions {
+ display: flex;
+ flex-direction: column;
+ gap: 14px;
+ }
+
+ &__action-group {
+ background: rgba(0, 0, 0, 0.03);
+ border: 1px solid rgba(0, 0, 0, 0.08);
+ border-radius: 6px;
+ padding: 10px 12px;
+ }
+
+ &__action-group-title {
+ font-size: 0.72rem;
+ font-weight: 700;
+ text-transform: uppercase;
+ letter-spacing: 0.06em;
+ color: #555;
+ margin: 0 0 8px 0;
+ display: flex;
+ align-items: center;
+ gap: 0.4em;
+
+ .fi-eye,
+ .fi-checkbox,
+ .fi-arrows-compress {
+ color: #888;
+ }
+ }
+
+  // Opisy bibliograficzne renderują `<b>`/`<strong>` wokół tytułów — to jest OK,
+  // tytuł ma być boldem. Problem: Foundation daje .callout a:not(.close-button)
+  // { font-weight: bolder }, więc cały tekst w `<a>` wewnątrz .callout jest
+  // bold. Resetujemy font-weight na `<a>` w obrębie itemów publikacji z
+  // wyższą specyficznością niż .callout a:not(.close-button), żeby wygrać
+  // kaskadę. Zostawiamy `<b>`/`<strong>` z ich domyślnym bold, żeby tytuł nadal
+  // był wytłuszczony.
+ .callout &__publication-item a,
+ .callout &__publication-item--duplicate a {
+ font-weight: normal;
+ }
+
+ // Reason chips — small pills with icon + text
+ &__reasons-chips {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 6px;
+ padding: 0;
+ margin: 0;
+ }
+
+ &__reason-chip {
+ display: inline-flex;
+ align-items: center;
+ gap: 0.35em;
+ padding: 3px 9px;
+ border-radius: 999px;
+ font-size: 0.78rem;
+ line-height: 1.5;
+ border: 1px solid transparent;
+ white-space: nowrap;
+ max-width: 100%;
+
+ .deduplikator-autorow__reason-chip-text {
+ overflow: hidden;
+ text-overflow: ellipsis;
+ }
+
+ &--match {
+ background: #e6f4ea;
+ border-color: #b6dec0;
+ color: #1b5e20;
+ }
+
+ &--info {
+ background: #e7f0fc;
+ border-color: #b9d4f0;
+ color: #0d3a73;
+ }
+
+ &--weak {
+ background: #f0f0f0;
+ border-color: #d8d8d8;
+ color: #555;
+ }
+
+ &--warn {
+ background: #fff4e0;
+ border-color: #f5d49a;
+ color: #8a4b00;
+ }
+ }
+}
diff --git a/src/deduplikator_autorow/tasks.py b/src/deduplikator_autorow/tasks.py
index 101027480..3027491d5 100644
--- a/src/deduplikator_autorow/tasks.py
+++ b/src/deduplikator_autorow/tasks.py
@@ -56,6 +56,33 @@ def _get_user_by_id(user_id):
return None
+def _calculate_priority_from_meta(meta_entry: dict) -> int:
+ """Computes priority from meta dict (no SQL).
+
+ Mirrors :func:`calculate_author_priority` but uses cached fields
+ from the meta dict produced by ``build_autor_meta``. Avoids
+ per-candidate SQL on the hot path of ``_run_general_phase``.
+
+ Priority values:
+ 100 - has 2022-2025 publications WITH disciplines
+ 50 - has 2022-2025 publications (any)
+ 0 - no recent publications
+
+ TODO: ``calculate_author_priority`` checks disciplines specifically
+ in 2022-2025 (``Autor_Dyscyplina.objects.filter(rok__gte=2022,
+ rok__lte=2025)``). The meta-cache only stores ``ma_dyscypline``
+ (any year), so this is an approximation. Acceptable for v1 since
+ priority is a sort hint, not a correctness invariant. To achieve
+ exact parity, store year-filtered discipline data in meta.
+ """
+ recent_lata = {rok for rok in meta_entry["lata_publikacji"] if 2022 <= rok <= 2025}
+ if not recent_lata:
+ return 0
+ if meta_entry["ma_dyscypline"]:
+ return 100
+ return 50
+
+
def calculate_author_priority(autor):
"""
Calculate priority based on publication dates and disciplines.
@@ -217,124 +244,266 @@ def _process_author_duplicates(osoba_z_instytucji, scan_run, min_confidence):
return candidates
-@shared_task(bind=True, name="deduplikator_autorow.scan_for_duplicates")
-def scan_for_duplicates(self, user_id=None, min_confidence=MIN_CONFIDENCE_TO_STORE):
- """
- Background task to scan all authors for potential duplicates.
-
- This task:
- 1. Creates a DuplicateScanRun record
- 2. Deletes all existing DuplicateCandidate records (replace mode)
- 3. Iterates through all OsobaZInstytucji
- 4. For each, calls szukaj_kopii() to find candidates
- 5. For each candidate, calls analiza_duplikatow() and stores in DuplicateCandidate
- 6. Updates progress periodically
- 7. Marks run as completed
-
- Args:
- user_id: Optional ID of the user who triggered the scan
- min_confidence: Minimum confidence score to store a candidate (default: 50)
+def _run_general_phase(scan_run, min_confidence=MIN_CONFIDENCE_TO_STORE):
+ """Faza 2 skanu — duplikaty general (no SQL on hot path).
- Returns:
- dict: Result with status, scan_run_id, and statistics
+    Algorithm:
+    1. build_autor_meta + build_buckets — pre-load all authors.
+    2. Read IgnoredAuthor / NotADuplicate exclusions.
+    3. generate_pairs — pairs with score >= min_confidence.
+    4. find_clusters — connected components.
+    5. Skip the cluster if any member has an OsobaZInstytucji.
+    6. Pick main via hierarchy B; emit (main, dup) pairs as
+       DuplicateCandidate(scan_mode='general').
+    7. After each flushed batch, return early if scan_run is CANCELLED.
"""
- from pbn_api.models import OsobaZInstytucji
-
- from .models import DuplicateCandidate, DuplicateScanRun, IgnoredAuthor
-
- logger.info("Starting duplicate scan task...")
-
- user = _get_user_by_id(user_id)
+ from .models import (
+ DuplicateCandidate,
+ DuplicateScanRun,
+ IgnoredAuthor,
+ NotADuplicate,
+ )
+ from .utils.analysis_meta import analiza_pary_meta
+ from .utils.cluster import find_clusters
+ from .utils.main_selection import pick_main_pk
+ from .utils.meta import build_autor_meta, build_buckets
+ from .utils.search_general import generate_pairs
+
+ logger.info("General phase: building meta cache...")
+ meta = build_autor_meta()
+ buckets = build_buckets(meta)
+ logger.info("General phase: %d autorów, %d bucketów", len(meta), len(buckets))
+
+ ignored_pks = set(IgnoredAuthor.objects.values_list("autor_id", flat=True))
+ notadup_pks = set(NotADuplicate.objects.values_list("autor_id", flat=True))
+
+ pairs_data: dict[tuple[int, int], tuple[int, list[str]]] = {}
+ for pk_a, pk_b, score, reasons in generate_pairs(
+ buckets, meta, ignored_pks, notadup_pks, min_confidence
+ ):
+ pairs_data[(pk_a, pk_b)] = (score, reasons)
+ logger.info("General phase: znaleziono %d par", len(pairs_data))
+
+ clusters = find_clusters(list(pairs_data.keys()))
+ logger.info("General phase: %d klastrów wstępnych", len(clusters))
+
+ skipped_count = 0
+ candidates_to_create: list[DuplicateCandidate] = []
+ for cluster in clusters:
+ if any(meta[pk]["ma_osoba_z_instytucji"] for pk in cluster):
+ skipped_count += 1
+ continue
+ main_pk = pick_main_pk(cluster, meta)
+ for dup_pk in cluster - {main_pk}:
+ key = (min(main_pk, dup_pk), max(main_pk, dup_pk))
+ if key in pairs_data:
+ score, reasons = pairs_data[key]
+ else:
+ score, reasons = analiza_pary_meta(meta[main_pk], meta[dup_pk])
+ main_obj = meta[main_pk]["obj"]
+ dup_obj = meta[dup_pk]["obj"]
+ candidates_to_create.append(
+ DuplicateCandidate(
+ scan_run=scan_run,
+ main_autor=main_obj,
+ duplicate_autor=dup_obj,
+ confidence_score=score,
+ confidence_percent=normalize_confidence(score),
+ reasons=reasons,
+ priority=_calculate_priority_from_meta(meta[dup_pk]),
+ main_autor_name=str(main_obj),
+ duplicate_autor_name=str(dup_obj),
+ main_publications_count=meta[main_pk]["publikacje_count"],
+ duplicate_publications_count=meta[dup_pk]["publikacje_count"],
+ scan_mode="general",
+ )
+ )
+ if len(candidates_to_create) >= 1000:
+ with transaction.atomic():
+ DuplicateCandidate.objects.bulk_create(
+ candidates_to_create, ignore_conflicts=True
+ )
+ candidates_to_create = []
+ scan_run.refresh_from_db()
+ if scan_run.status == DuplicateScanRun.Status.CANCELLED:
+ logger.info("General phase cancelled mid-batch")
+ return
+
+ if candidates_to_create:
+ with transaction.atomic():
+ DuplicateCandidate.objects.bulk_create(
+ candidates_to_create, ignore_conflicts=True
+ )
- scan_run = DuplicateScanRun.objects.create(
- status=DuplicateScanRun.Status.RUNNING,
- created_by=user,
- celery_task_id=self.request.id or "",
+ logger.info(
+ "General phase: %d klastrów pominiętych (z OsobaZInstytucji)",
+ skipped_count,
)
- try:
- deleted_count = DuplicateCandidate.objects.all().delete()[0]
- logger.info(f"Deleted {deleted_count} existing candidates")
- ignored_scientist_ids = set(
- IgnoredAuthor.objects.values_list("scientist_id", flat=True)
- )
+def _run_pbn_phase(scan_run, min_confidence=MIN_CONFIDENCE_TO_STORE):
+ """Faza 1 skanu — duplikaty PBN (OsobaZInstytucji).
- osoby_query = OsobaZInstytucji.objects.select_related("personId").all()
- if ignored_scientist_ids:
- osoby_query = osoby_query.exclude(personId__pk__in=ignored_scientist_ids)
+    Iterates over every OsobaZInstytucji (except IgnoredScientist entries),
+    looks for copies of each (`szukaj_kopii`), analyses them
+    (`analiza_duplikatow`) and creates DuplicateCandidate rows. Polls
+    `scan_run.status` before each author — if an external `cancel_scan` set
+    CANCELLED, it returns early (status stays CANCELLED; the caller finalizes).
- total_count = osoby_query.count()
- scan_run.total_authors_to_scan = total_count
- scan_run.save(update_fields=["total_authors_to_scan"])
+    Updates the `total_authors_to_scan`, `authors_scanned` and
+    `duplicates_found` fields on `scan_run` as it runs.
+ """
+ from pbn_api.models import OsobaZInstytucji
- logger.info(f"Scanning {total_count} authors for duplicates...")
+ from .models import DuplicateCandidate, DuplicateScanRun, IgnoredScientist
- authors_scanned = 0
- duplicates_found = 0
- candidates_to_create = []
+ ignored_scientist_ids = set(
+ IgnoredScientist.objects.values_list("scientist_id", flat=True)
+ )
- for osoba_z_instytucji in osoby_query.iterator():
- scan_run.refresh_from_db()
- if scan_run.status == DuplicateScanRun.Status.CANCELLED:
- logger.info("Scan cancelled by user")
- return {
- "status": "cancelled",
- "scan_run_id": scan_run.pk,
- "authors_scanned": authors_scanned,
- "duplicates_found": duplicates_found,
- }
+ osoby_query = OsobaZInstytucji.objects.select_related("personId").all()
+ if ignored_scientist_ids:
+ osoby_query = osoby_query.exclude(personId__pk__in=ignored_scientist_ids)
- authors_scanned += 1
+ total_count = osoby_query.count()
+ scan_run.total_authors_to_scan = total_count
+ scan_run.save(update_fields=["total_authors_to_scan"])
- new_candidates = _process_author_duplicates(
- osoba_z_instytucji, scan_run, min_confidence
- )
- candidates_to_create.extend(new_candidates)
- duplicates_found += len(new_candidates)
+ logger.info(f"PBN phase: scanning {total_count} authors...")
- if len(candidates_to_create) >= 1000:
+ authors_scanned = 0
+ duplicates_found = 0
+ candidates_to_create = []
+
+ for osoba_z_instytucji in osoby_query.iterator():
+ scan_run.refresh_from_db()
+ if scan_run.status == DuplicateScanRun.Status.CANCELLED:
+ logger.info("PBN phase cancelled by user")
+ if candidates_to_create:
with transaction.atomic():
DuplicateCandidate.objects.bulk_create(
candidates_to_create, ignore_conflicts=True
)
- candidates_to_create = []
+ scan_run.authors_scanned = authors_scanned
+ scan_run.duplicates_found = duplicates_found
+ scan_run.save(update_fields=["authors_scanned", "duplicates_found"])
+ return
- if authors_scanned % PROGRESS_UPDATE_INTERVAL == 0:
- scan_run.authors_scanned = authors_scanned
- scan_run.duplicates_found = duplicates_found
- scan_run.save(update_fields=["authors_scanned", "duplicates_found"])
- logger.info(
- f"Progress: {authors_scanned}/{total_count} authors, "
- f"{duplicates_found} duplicates found"
- )
+ authors_scanned += 1
+
+ new_candidates = _process_author_duplicates(
+ osoba_z_instytucji, scan_run, min_confidence
+ )
+ candidates_to_create.extend(new_candidates)
+ duplicates_found += len(new_candidates)
- if candidates_to_create:
+ if len(candidates_to_create) >= 1000:
with transaction.atomic():
DuplicateCandidate.objects.bulk_create(
candidates_to_create, ignore_conflicts=True
)
+ candidates_to_create = []
+
+ if authors_scanned % PROGRESS_UPDATE_INTERVAL == 0:
+ scan_run.authors_scanned = authors_scanned
+ scan_run.duplicates_found = duplicates_found
+ scan_run.save(update_fields=["authors_scanned", "duplicates_found"])
+ logger.info(
+ f"PBN progress: {authors_scanned}/{total_count} authors, "
+ f"{duplicates_found} duplicates found"
+ )
+
+ if candidates_to_create:
+ with transaction.atomic():
+ DuplicateCandidate.objects.bulk_create(
+ candidates_to_create, ignore_conflicts=True
+ )
+
+ scan_run.authors_scanned = authors_scanned
+ scan_run.duplicates_found = duplicates_found
+ scan_run.save(update_fields=["authors_scanned", "duplicates_found"])
+
+ logger.info(
+ f"PBN phase done: {authors_scanned} authors scanned, "
+ f"{duplicates_found} duplicates found"
+ )
+
+
+@shared_task(bind=True, name="deduplikator_autorow.scan_for_duplicates")
+def scan_for_duplicates(self, user_id=None, min_confidence=MIN_CONFIDENCE_TO_STORE):
+ """Combined task: faza PBN + faza general w jednym przebiegu.
+
+    Final statuses:
+    - COMPLETED: both phases finished.
+    - PARTIAL_COMPLETED: PBN phase OK, general phase cancelled → PBN
+      results are available.
+    - CANCELLED: PBN phase cancelled → results incomplete (not cleared).
+    - FAILED: unhandled exception.
+ """
+ from .models import DuplicateCandidate, DuplicateScanRun
+
+ logger.info("Starting duplicate scan task (combined PBN + general)...")
+
+ user = _get_user_by_id(user_id)
+ scan_run = DuplicateScanRun.objects.create(
+ status=DuplicateScanRun.Status.RUNNING,
+ created_by=user,
+ celery_task_id=self.request.id or "",
+ )
+
+ try:
+ # Replace mode: clear all previous candidates
+ deleted_count = DuplicateCandidate.objects.all().delete()[0]
+ logger.info(f"Deleted {deleted_count} existing candidates")
+
+ # FAZA 1: PBN
+ scan_run.phase = "pbn"
+ scan_run.save(update_fields=["phase"])
+ _run_pbn_phase(scan_run, min_confidence)
+ scan_run.refresh_from_db()
+ if scan_run.status == DuplicateScanRun.Status.CANCELLED:
+ scan_run.finished_at = timezone.now()
+ scan_run.save(update_fields=["finished_at"])
+ logger.info("Scan cancelled in PBN phase")
+ return {
+ "status": "cancelled",
+ "scan_run_id": scan_run.pk,
+ }
+
+ # FAZA 2: general
+ scan_run.phase = "general"
+ scan_run.save(update_fields=["phase"])
+ _run_general_phase(scan_run, min_confidence)
+ scan_run.refresh_from_db()
+ if scan_run.status == DuplicateScanRun.Status.CANCELLED:
+ scan_run.status = DuplicateScanRun.Status.PARTIAL_COMPLETED
+ scan_run.finished_at = timezone.now()
+ scan_run.save(update_fields=["status", "finished_at"])
+ logger.info("Scan cancelled in general phase → PARTIAL_COMPLETED")
+ return {
+ "status": "partial_completed",
+ "scan_run_id": scan_run.pk,
+ }
+ total_cands = DuplicateCandidate.objects.filter(scan_run=scan_run).count()
scan_run.status = DuplicateScanRun.Status.COMPLETED
scan_run.finished_at = timezone.now()
- scan_run.authors_scanned = authors_scanned
- scan_run.duplicates_found = duplicates_found
+ scan_run.duplicates_found = total_cands
scan_run.save()
logger.info(
- f"Scan completed: {authors_scanned} authors scanned, "
- f"{duplicates_found} duplicates found"
+ f"Scan completed: {scan_run.authors_scanned} authors scanned, "
+ f"{total_cands} duplicates found"
)
return {
"status": "success",
"scan_run_id": scan_run.pk,
- "authors_scanned": authors_scanned,
- "duplicates_found": duplicates_found,
+ "duplicates_found": total_cands,
}
except Exception as e:
- logger.error(f"Error during duplicate scan: {str(e)}", exc_info=True)
+ logger.exception("Error during duplicate scan")
scan_run.status = DuplicateScanRun.Status.FAILED
scan_run.finished_at = timezone.now()
scan_run.error_message = str(e)
diff --git a/src/deduplikator_autorow/templates/deduplikator_autorow/duplicate_authors.html b/src/deduplikator_autorow/templates/deduplikator_autorow/duplicate_authors.html
index 0f9e904e5..1ac032b10 100644
--- a/src/deduplikator_autorow/templates/deduplikator_autorow/duplicate_authors.html
+++ b/src/deduplikator_autorow/templates/deduplikator_autorow/duplicate_authors.html
@@ -1,20 +1,21 @@
{% extends "base.html" %}
{% load static %}
-{% block extra_css %}
+{% block extrahead %}
+{{ block.super }}
{% endblock %}
-{% block title %}Deduplikator Autorów PBN{% endblock %}
+{% block title %}Deduplikator autorów{% endblock %}
{% block breadcrumbs %}
Strona główna
- Deduplikator autorów PBN
+ Deduplikator autorów
{% endblock %}
{% block content %}
-