Skip to content

Commit 5b70d17

Browse files
committed
Fix PDF relabel rewrite for nested fields
1 parent 4780274 commit 5b70d17

2 files changed

Lines changed: 88 additions & 10 deletions

File tree

formfyxer/lit_explorer.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
unlock_pdf_in_place,
2727
is_tagged,
2828
get_original_text_with_fields,
29+
rename_pdf_fields,
2930
)
3031

3132
import math
@@ -2081,16 +2082,13 @@ def _looks_reasonable(candidate: str) -> bool:
20812082
stats["debug fields"] = debug_fields
20822083
if rewrite:
20832084
try:
2084-
my_pdf = pikepdf.Pdf.open(in_file, allow_overwriting_input=True)
2085-
fields_too = (
2086-
my_pdf.Root.AcroForm.Fields
2087-
) # [0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"]
2088-
# print(repr(fields_too))
2089-
for k, field_name in enumerate(new_names):
2090-
# print(k,field)
2091-
fields_too[k].T = re.sub(r"^\*", "", field_name)
2092-
my_pdf.save(in_file)
2093-
my_pdf.close()
2085+
rename_mapping = {
2086+
old_name: re.sub(r"^\*", "", new_name)
2087+
for old_name, new_name in zip(field_names, new_names)
2088+
if old_name != new_name
2089+
}
2090+
if rename_mapping:
2091+
rename_pdf_fields(in_file, in_file, rename_mapping)
20942092
except Exception as ex:
20952093
stats["error"] = f"could not change form fields: {ex}"
20962094
return stats

formfyxer/tests/test_lit_explorer_pdf_labeling.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,86 @@ def test_parse_form_accepts_model_and_openai_base_url(self):
7373
self.assertIn("model", signature.parameters)
7474
self.assertIn("openai_base_url", signature.parameters)
7575

76+
@patch("formfyxer.lit_explorer.rename_pdf_fields")
77+
@patch("formfyxer.lit_explorer.needs_calculations", return_value=False)
78+
@patch("formfyxer.lit_explorer.is_tagged", return_value=False)
79+
@patch("formfyxer.lit_explorer.get_sensitive_data_types", return_value=[])
80+
@patch("formfyxer.lit_explorer.field_types_and_sizes", return_value=[])
81+
@patch("formfyxer.lit_explorer.all_caps_words", return_value=0)
82+
@patch("formfyxer.lit_explorer.transformed_sentences", return_value=[])
83+
@patch("formfyxer.lit_explorer.get_citations", return_value=[])
84+
@patch("formfyxer.lit_explorer.get_passive_sentences", return_value=[])
85+
@patch("formfyxer.lit_explorer.split_sentences", return_value=["Sentence one."])
86+
@patch("formfyxer.lit_explorer.spot", return_value=[])
87+
@patch("formfyxer.lit_explorer.guess_form_name", return_value="")
88+
@patch("formfyxer.lit_explorer.describe_form", return_value="")
89+
@patch("formfyxer.lit_explorer.cleanup_text", side_effect=lambda text: text)
90+
@patch("formfyxer.lit_explorer.extract_text", return_value="Sample form text")
91+
@patch(
92+
"formfyxer.lit_explorer.get_existing_pdf_fields",
93+
return_value=[
94+
[
95+
SimpleNamespace(name="field_a"),
96+
SimpleNamespace(name="field_b"),
97+
]
98+
],
99+
)
100+
@patch("formfyxer.lit_explorer.unlock_pdf_in_place")
101+
@patch("formfyxer.lit_explorer.pikepdf.open")
102+
@patch("formfyxer.lit_explorer.get_openai_api_key_from_sources", return_value="sk-test")
103+
@patch(
104+
"formfyxer.lit_explorer.rename_pdf_fields_with_context",
105+
return_value={
106+
"field_a": "*users.name.first",
107+
"field_b": "users.name.last",
108+
},
109+
)
110+
def test_parse_form_rewrite_uses_rename_helper_for_nested_fields(
111+
self,
112+
_mock_rename_with_context,
113+
_mock_get_api_key,
114+
mock_pikepdf_open,
115+
_mock_unlock,
116+
_mock_get_existing_fields,
117+
_mock_extract_text,
118+
_mock_cleanup_text,
119+
_mock_describe_form,
120+
_mock_guess_form_name,
121+
_mock_spot,
122+
_mock_split_sentences,
123+
_mock_get_passive_sentences,
124+
_mock_get_citations,
125+
_mock_transformed_sentences,
126+
_mock_all_caps_words,
127+
_mock_field_types_and_sizes,
128+
_mock_get_sensitive_data_types,
129+
_mock_is_tagged,
130+
_mock_needs_calculations,
131+
mock_rename_pdf_fields,
132+
):
133+
fake_pdf = SimpleNamespace(pages=[object()], docinfo=SimpleNamespace(Title="Form"))
134+
mock_pikepdf_open.return_value = fake_pdf
135+
136+
with patch(
137+
"formfyxer.lit_explorer.textstat.text_standard", return_value=6.0
138+
), patch(
139+
"formfyxer.lit_explorer.textstat.difficult_words_list", return_value=[]
140+
), patch(
141+
"formfyxer.lit_explorer.time_to_answer_form", return_value=[-1, -1]
142+
):
143+
result = parse_form("fake.pdf", rewrite=True)
144+
145+
mock_rename_pdf_fields.assert_called_once_with(
146+
"fake.pdf",
147+
"fake.pdf",
148+
{
149+
"field_a": "users.name.first",
150+
"field_b": "users.name.last",
151+
},
152+
)
153+
self.assertEqual(result["fields_old"], ["field_a", "field_b"])
154+
self.assertEqual(result["fields"], ["*users.name.first", "users.name.last"])
155+
76156
def test_get_original_text_with_fields_handles_pdfminer_state(self):
77157
fixture = Path(__file__).parent / "affidavit_supplement.pdf"
78158
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as temp_file:

0 commit comments

Comments
 (0)