diff --git a/formfyxer/lit_explorer.py b/formfyxer/lit_explorer.py index 94a885e..ca14692 100644 --- a/formfyxer/lit_explorer.py +++ b/formfyxer/lit_explorer.py @@ -26,6 +26,8 @@ unlock_pdf_in_place, is_tagged, get_original_text_with_fields, + _get_named_parent, + _unnest_pdf_fields, ) import math @@ -118,6 +120,53 @@ def _truncate_to_token_limit( return encoding.decode(tokens[:max_tokens]) +def _rewrite_pdf_fields_in_place( + in_file: str, field_names: List[str], new_names: List[str] +) -> None: + """Rewrite PDF field names in traversal order, preserving repeated sources. + + For nested field trees we keep the existing parent hierarchy and rewrite only the + leaf segment. For flat fields we write the full target name, including dotted + names like ``users.name.first``. + """ + if len(field_names) != len(new_names): + raise ValueError("field_names and new_names must have the same length") + + my_pdf = pikepdf.Pdf.open(in_file, allow_overwriting_input=True) + try: + if not hasattr(my_pdf.Root, "AcroForm") or not hasattr( + my_pdf.Root.AcroForm, "Fields" + ): + return + + flattened_fields = [ + child_field + for parent_field in iter(my_pdf.Root.AcroForm.Fields) + for child_field in _unnest_pdf_fields(parent_field) + ] + if len(flattened_fields) != len(field_names): + raise ValueError( + "PDF field traversal count did not match parsed field-name count" + ) + + for field_data, old_name, new_name in zip(flattened_fields, field_names, new_names): + cleaned_name = re.sub(r"^\*", "", new_name) + if old_name == cleaned_name: + continue + + target = _get_named_parent(field_data["all"]) + if not target: + continue + + # Nested fields keep their parent hierarchy; flat fields accept full dotted names. + target_name = cleaned_name.split(".")[-1] if "." 
in old_name else cleaned_name + target.T = target_name + + my_pdf.save(in_file) + finally: + my_pdf.close() + + def _normalize_openai_base_url(openai_base_url: Optional[str]) -> Optional[str]: """Normalize OpenAI-compatible base URLs, including Azure resource URLs.""" base_url = str(openai_base_url or "").strip() @@ -2081,16 +2130,7 @@ def _looks_reasonable(candidate: str) -> bool: stats["debug fields"] = debug_fields if rewrite: try: - my_pdf = pikepdf.Pdf.open(in_file, allow_overwriting_input=True) - fields_too = ( - my_pdf.Root.AcroForm.Fields - ) # [0]["/Kids"][0]["/Kids"][0]["/Kids"][0]["/Kids"] - # print(repr(fields_too)) - for k, field_name in enumerate(new_names): - # print(k,field) - fields_too[k].T = re.sub(r"^\*", "", field_name) - my_pdf.save(in_file) - my_pdf.close() + _rewrite_pdf_fields_in_place(in_file, field_names, new_names) except Exception as ex: stats["error"] = f"could not change form fields: {ex}" return stats diff --git a/formfyxer/tests/test_lit_explorer_pdf_labeling.py b/formfyxer/tests/test_lit_explorer_pdf_labeling.py index 6b6238c..f9e0fb4 100644 --- a/formfyxer/tests/test_lit_explorer_pdf_labeling.py +++ b/formfyxer/tests/test_lit_explorer_pdf_labeling.py @@ -6,6 +6,7 @@ from unittest.mock import Mock, patch from formfyxer.lit_explorer import ( + _rewrite_pdf_fields_in_place, parse_form, rename_pdf_fields_with_context, text_complete, @@ -73,6 +74,138 @@ def test_parse_form_accepts_model_and_openai_base_url(self): self.assertIn("model", signature.parameters) self.assertIn("openai_base_url", signature.parameters) + @patch("formfyxer.lit_explorer.needs_calculations", return_value=False) + @patch("formfyxer.lit_explorer.is_tagged", return_value=False) + @patch("formfyxer.lit_explorer.get_sensitive_data_types", return_value=[]) + @patch("formfyxer.lit_explorer.field_types_and_sizes", return_value=[]) + @patch("formfyxer.lit_explorer.all_caps_words", return_value=0) + @patch("formfyxer.lit_explorer.transformed_sentences", return_value=[]) + 
# NOTE: patch decorators are applied bottom-up, so the mock arguments arrive in
# the reverse order of the decorator stack (the lowest decorator is the first
# argument after ``self``).
@patch("formfyxer.lit_explorer.needs_calculations", return_value=False)
@patch("formfyxer.lit_explorer.is_tagged", return_value=False)
@patch("formfyxer.lit_explorer.get_sensitive_data_types", return_value=[])
@patch("formfyxer.lit_explorer.field_types_and_sizes", return_value=[])
@patch("formfyxer.lit_explorer.all_caps_words", return_value=0)
@patch("formfyxer.lit_explorer.transformed_sentences", return_value=[])
@patch("formfyxer.lit_explorer.get_citations", return_value=[])
@patch("formfyxer.lit_explorer.get_passive_sentences", return_value=[])
@patch("formfyxer.lit_explorer.split_sentences", return_value=["Sentence one."])
@patch("formfyxer.lit_explorer.spot", return_value=[])
@patch("formfyxer.lit_explorer.guess_form_name", return_value="")
@patch("formfyxer.lit_explorer.describe_form", return_value="")
@patch("formfyxer.lit_explorer.cleanup_text", side_effect=lambda text: text)
@patch("formfyxer.lit_explorer.extract_text", return_value="Sample form text")
@patch(
    "formfyxer.lit_explorer.get_existing_pdf_fields",
    return_value=[
        [
            SimpleNamespace(name="field_a"),
            SimpleNamespace(name="field_b"),
        ]
    ],
)
@patch("formfyxer.lit_explorer.unlock_pdf_in_place")
@patch("formfyxer.lit_explorer.pikepdf.open")
@patch("formfyxer.lit_explorer.get_openai_api_key_from_sources", return_value="sk-test")
@patch(
    "formfyxer.lit_explorer.rename_pdf_fields_with_context",
    return_value={
        "field_a": "*users.name.first",
        "field_b": "users.name.last",
    },
)
@patch("formfyxer.lit_explorer._rewrite_pdf_fields_in_place")
def test_parse_form_rewrite_uses_rename_helper_for_nested_fields(
    self,
    mock_rewrite_pdf_fields,
    _mock_rename_with_context,
    _mock_get_api_key,
    mock_pikepdf_open,
    _mock_unlock,
    _mock_get_existing_fields,
    _mock_extract_text,
    _mock_cleanup_text,
    _mock_describe_form,
    _mock_guess_form_name,
    _mock_spot,
    _mock_split_sentences,
    _mock_get_passive_sentences,
    _mock_get_citations,
    _mock_transformed_sentences,
    _mock_all_caps_words,
    _mock_field_types_and_sizes,
    _mock_get_sensitive_data_types,
    _mock_is_tagged,
    _mock_needs_calculations,
):
    """parse_form(rewrite=True) passes the old and new name lists to the rewrite helper."""
    expected_old_names = ["field_a", "field_b"]
    expected_new_names = ["*users.name.first", "users.name.last"]

    # Minimal stand-in for the opened PDF: one page and a document title.
    mock_pikepdf_open.return_value = SimpleNamespace(
        pages=[object()], docinfo=SimpleNamespace(Title="Form")
    )

    with patch("formfyxer.lit_explorer.textstat.text_standard", return_value=6.0):
        with patch(
            "formfyxer.lit_explorer.textstat.difficult_words_list", return_value=[]
        ):
            with patch(
                "formfyxer.lit_explorer.time_to_answer_form", return_value=[-1, -1]
            ):
                result = parse_form("fake.pdf", rewrite=True)

    # The helper receives the names positionally, with the "*" marker still attached.
    mock_rewrite_pdf_fields.assert_called_once_with(
        "fake.pdf", expected_old_names, expected_new_names
    )
    self.assertEqual(result["fields_old"], expected_old_names)
    self.assertEqual(result["fields"], expected_new_names)


@patch("formfyxer.lit_explorer._get_named_parent")
@patch("formfyxer.lit_explorer._unnest_pdf_fields")
@patch("formfyxer.lit_explorer.pikepdf.Pdf.open")
def test_rewrite_pdf_fields_in_place_preserves_order_and_full_flat_names(
    self,
    mock_pdf_open,
    mock_unnest_pdf_fields,
    mock_get_named_parent,
):
    """Names are applied by position; nested fields get the leaf, flat fields the full name."""
    old_names = ["repeat", "repeat", "group.old_leaf", "plain_old"]
    partial_names = ["repeat", "repeat", "old_leaf", "plain_old"]
    replacement_names = [
        "docket_number",
        "docket_number__2",
        "users.name.first",
        "users.name.last",
    ]
    # Only the third field is nested ("group.old_leaf"), so only it is cut to its leaf.
    expected_partials = [
        "docket_number",
        "docket_number__2",
        "first",
        "users.name.last",
    ]

    named_targets = [SimpleNamespace(T=partial) for partial in partial_names]
    flattened_fields = [{"var_name": name, "all": object()} for name in old_names]

    fake_pdf = SimpleNamespace(
        Root=SimpleNamespace(
            AcroForm=SimpleNamespace(Fields=[object() for _ in old_names])
        ),
        save=Mock(),
        close=Mock(),
    )
    mock_pdf_open.return_value = fake_pdf
    # One top-level field unnests to exactly one flattened entry, in order.
    mock_unnest_pdf_fields.side_effect = [[entry] for entry in flattened_fields]
    mock_get_named_parent.side_effect = named_targets

    _rewrite_pdf_fields_in_place("fake.pdf", old_names, replacement_names)

    for renamed_target, expected in zip(named_targets, expected_partials):
        self.assertEqual(renamed_target.T, expected)
    fake_pdf.save.assert_called_once_with("fake.pdf")
    fake_pdf.close.assert_called_once()