Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 50 additions & 10 deletions formfyxer/lit_explorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
unlock_pdf_in_place,
is_tagged,
get_original_text_with_fields,
_get_named_parent,
_unnest_pdf_fields,
)

import math
Expand Down Expand Up @@ -118,6 +120,53 @@ def _truncate_to_token_limit(
return encoding.decode(tokens[:max_tokens])


def _rewrite_pdf_fields_in_place(
    in_file: str, field_names: List[str], new_names: List[str]
) -> None:
    """Rename the AcroForm fields of ``in_file`` and save the PDF over itself.

    Fields are visited in traversal order (each top-level AcroForm field
    expanded through ``_unnest_pdf_fields``) and paired positionally with
    ``field_names`` / ``new_names``, so repeated source names are each renamed
    to their own target.

    A single leading ``*`` is stripped from every new name. When the old name
    contains a dot, only the last dotted segment of the new name is written,
    so the existing parent hierarchy is kept. Otherwise the full new name is
    written, dots included (e.g. ``users.name.first``).

    Args:
        in_file: Path of the PDF to open, modify and overwrite.
        field_names: Current field names, in traversal order.
        new_names: Replacement names, parallel to ``field_names``.

    Raises:
        ValueError: If ``field_names`` and ``new_names`` differ in length, or
            if the number of traversed PDF fields differs from
            ``len(field_names)``.
    """
    if len(new_names) != len(field_names):
        raise ValueError("field_names and new_names must have the same length")

    pdf = pikepdf.Pdf.open(in_file, allow_overwriting_input=True)
    try:
        root = pdf.Root
        has_form_fields = hasattr(root, "AcroForm") and hasattr(
            root.AcroForm, "Fields"
        )
        if not has_form_fields:
            # Nothing to rename; the file is left untouched (no save).
            return

        leaves = []
        for top_level_field in root.AcroForm.Fields:
            leaves.extend(_unnest_pdf_fields(top_level_field))

        # NOTE(review): a reviewer flagged that parse_form() derives
        # field_names from get_existing_pdf_fields(), which reportedly skips
        # template widgets and fields whose page cannot be resolved, while
        # this traversal applies no such filter. If so, PDFs containing such
        # widgets abort the whole rename here -- TODO confirm and reuse the
        # same filtered field set.
        if len(leaves) != len(field_names):
            raise ValueError(
                "PDF field traversal count did not match parsed field-name count"
            )

        for leaf, current_name, requested_name in zip(leaves, field_names, new_names):
            final_name = re.sub(r"^\*", "", requested_name)
            if final_name == current_name:
                continue

            named_node = _get_named_parent(leaf["all"])
            if not named_node:
                continue

            if "." in current_name:
                # Nested field: keep the parent hierarchy, rewrite only the leaf.
                named_node.T = final_name.rsplit(".", 1)[-1]
            else:
                # Flat field: the full (possibly dotted) name is written as-is.
                named_node.T = final_name

        pdf.save(in_file)
    finally:
        pdf.close()


def _normalize_openai_base_url(openai_base_url: Optional[str]) -> Optional[str]:
"""Normalize OpenAI-compatible base URLs, including Azure resource URLs."""
base_url = str(openai_base_url or "").strip()
Expand Down Expand Up @@ -2081,16 +2130,7 @@ def _looks_reasonable(candidate: str) -> bool:
stats["debug fields"] = debug_fields
if rewrite:
try:
my_pdf = pikepdf.Pdf.open(in_file, allow_overwriting_input=True)
fields_too = (
my_pdf.Root.AcroForm.Fields
) # [0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"]
# print(repr(fields_too))
for k, field_name in enumerate(new_names):
# print(k,field)
fields_too[k].T = re.sub(r"^\*", "", field_name)
my_pdf.save(in_file)
my_pdf.close()
_rewrite_pdf_fields_in_place(in_file, field_names, new_names)
except Exception as ex:
stats["error"] = f"could not change form fields: {ex}"
return stats
Expand Down
133 changes: 133 additions & 0 deletions formfyxer/tests/test_lit_explorer_pdf_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from unittest.mock import Mock, patch

from formfyxer.lit_explorer import (
_rewrite_pdf_fields_in_place,
parse_form,
rename_pdf_fields_with_context,
text_complete,
Expand Down Expand Up @@ -73,6 +74,138 @@ def test_parse_form_accepts_model_and_openai_base_url(self):
self.assertIn("model", signature.parameters)
self.assertIn("openai_base_url", signature.parameters)

# Every collaborator parse_form() reaches for is patched out so the test only
# observes how the rewrite step is invoked. Stacked @patch decorators are
# applied bottom-up, so the bottom-most decorator supplies the first mock
# argument after ``self``.
@patch("formfyxer.lit_explorer.needs_calculations", return_value=False)
@patch("formfyxer.lit_explorer.is_tagged", return_value=False)
@patch("formfyxer.lit_explorer.get_sensitive_data_types", return_value=[])
@patch("formfyxer.lit_explorer.field_types_and_sizes", return_value=[])
@patch("formfyxer.lit_explorer.all_caps_words", return_value=0)
@patch("formfyxer.lit_explorer.transformed_sentences", return_value=[])
@patch("formfyxer.lit_explorer.get_citations", return_value=[])
@patch("formfyxer.lit_explorer.get_passive_sentences", return_value=[])
@patch("formfyxer.lit_explorer.split_sentences", return_value=["Sentence one."])
@patch("formfyxer.lit_explorer.spot", return_value=[])
@patch("formfyxer.lit_explorer.guess_form_name", return_value="")
@patch("formfyxer.lit_explorer.describe_form", return_value="")
@patch("formfyxer.lit_explorer.cleanup_text", side_effect=lambda text: text)
@patch("formfyxer.lit_explorer.extract_text", return_value="Sample form text")
@patch(
    "formfyxer.lit_explorer.get_existing_pdf_fields",
    # One inner list holding two fields named field_a and field_b.
    return_value=[
        [
            SimpleNamespace(name="field_a"),
            SimpleNamespace(name="field_b"),
        ]
    ],
)
@patch("formfyxer.lit_explorer.unlock_pdf_in_place")
@patch("formfyxer.lit_explorer.pikepdf.open")
@patch("formfyxer.lit_explorer.get_openai_api_key_from_sources", return_value="sk-test")
@patch(
    "formfyxer.lit_explorer.rename_pdf_fields_with_context",
    # Mapping of old field name -> new name; one new name carries a "*" prefix.
    return_value={
        "field_a": "*users.name.first",
        "field_b": "users.name.last",
    },
)
@patch("formfyxer.lit_explorer._rewrite_pdf_fields_in_place")
def test_parse_form_rewrite_uses_rename_helper_for_nested_fields(
    self,
    mock_rewrite_pdf_fields,
    _mock_rename_with_context,
    _mock_get_api_key,
    mock_pikepdf_open,
    _mock_unlock,
    _mock_get_existing_fields,
    _mock_extract_text,
    _mock_cleanup_text,
    _mock_describe_form,
    _mock_guess_form_name,
    _mock_spot,
    _mock_split_sentences,
    _mock_get_passive_sentences,
    _mock_get_citations,
    _mock_transformed_sentences,
    _mock_all_caps_words,
    _mock_field_types_and_sizes,
    _mock_get_sensitive_data_types,
    _mock_is_tagged,
    _mock_needs_calculations,
):
    """parse_form(rewrite=True) delegates renaming to the rewrite helper.

    Checks that ``_rewrite_pdf_fields_in_place`` is called exactly once with
    the input path, the original field names and the new names (the ``*``
    prefix is passed through unchanged), and that the returned stats expose
    the same lists under ``fields_old`` and ``fields``.
    """
    # Minimal stand-in for the object returned by the patched pikepdf.open.
    fake_pdf = SimpleNamespace(pages=[object()], docinfo=SimpleNamespace(Title="Form"))
    mock_pikepdf_open.return_value = fake_pdf

    # The remaining readability/timing helpers are patched only for the call.
    with patch(
        "formfyxer.lit_explorer.textstat.text_standard", return_value=6.0
    ), patch(
        "formfyxer.lit_explorer.textstat.difficult_words_list", return_value=[]
    ), patch(
        "formfyxer.lit_explorer.time_to_answer_form", return_value=[-1, -1]
    ):
        result = parse_form("fake.pdf", rewrite=True)

    mock_rewrite_pdf_fields.assert_called_once_with(
        "fake.pdf",
        ["field_a", "field_b"],
        ["*users.name.first", "users.name.last"],
    )
    self.assertEqual(result["fields_old"], ["field_a", "field_b"])
    self.assertEqual(result["fields"], ["*users.name.first", "users.name.last"])

@patch("formfyxer.lit_explorer._get_named_parent")
@patch("formfyxer.lit_explorer._unnest_pdf_fields")
@patch("formfyxer.lit_explorer.pikepdf.Pdf.open")
def test_rewrite_pdf_fields_in_place_preserves_order_and_full_flat_names(
    self,
    mock_pdf_open,
    mock_unnest_pdf_fields,
    mock_get_named_parent,
):
    """Renames follow traversal order; nested fields only get a new leaf.

    Two fields share the source name ``repeat`` and must receive distinct
    targets; the nested ``group.old_leaf`` field keeps only the last segment
    of its new name, while the flat ``plain_old`` field receives the full
    dotted name.
    """
    old_names = ["repeat", "repeat", "group.old_leaf", "plain_old"]
    requested_names = [
        "docket_number",
        "docket_number__2",
        "users.name.first",
        "users.name.last",
    ]
    expected_written_names = [
        "docket_number",
        "docket_number__2",
        "first",
        "users.name.last",
    ]

    # Objects whose ``T`` attribute the helper is expected to overwrite.
    named_targets = [
        SimpleNamespace(T=partial_name)
        for partial_name in ("repeat", "repeat", "old_leaf", "plain_old")
    ]
    # One traversal entry per top-level AcroForm field.
    traversal_entries = [
        {"var_name": var_name, "all": object()} for var_name in old_names
    ]

    acro_form = SimpleNamespace(Fields=[object() for _ in old_names])
    fake_pdf = SimpleNamespace(
        Root=SimpleNamespace(AcroForm=acro_form),
        save=Mock(),
        close=Mock(),
    )
    mock_pdf_open.return_value = fake_pdf
    mock_unnest_pdf_fields.side_effect = [[entry] for entry in traversal_entries]
    mock_get_named_parent.side_effect = named_targets

    _rewrite_pdf_fields_in_place("fake.pdf", old_names, requested_names)

    self.assertEqual(
        [target.T for target in named_targets],
        expected_written_names,
    )
    fake_pdf.save.assert_called_once_with("fake.pdf")
    fake_pdf.close.assert_called_once()

def test_get_original_text_with_fields_handles_pdfminer_state(self):
fixture = Path(__file__).parent / "affidavit_supplement.pdf"
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as temp_file:
Expand Down
Loading