Skip to content

Commit 67755f9

Browse files
committed
Introduce header mode
1 parent afc66cc commit 67755f9

2 files changed

Lines changed: 121 additions & 9 deletions

File tree

src/paper2table/readers/pdfplumber.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import logging
2-
from typing import Optional
2+
from typing import Literal, Optional
33

44
import pandas as pd
55
import pdfplumber
@@ -25,6 +25,9 @@ class TableSchema(BaseModel):
2525
"""
2626

2727
title: str
28+
29+
header_mode: Literal["all_pages", "first_page_only", "none"]
30+
2831
first_page: int
2932
"""
3033
1-based first page number where the table is located
@@ -106,7 +109,8 @@ def read_schema_tables(pdf_path: str, schema: TablesSchema, pdf: pdfplumber.PDF)
106109
try:
107110
dataframe = read_table(
108111
table_fragment if table_fragment else [],
109-
column_mappings=table_schema.column_mappings,
112+
table_schema=table_schema,
113+
page=page,
110114
)
111115
tables.append(
112116
DataFrameTableReader(
@@ -140,14 +144,22 @@ def to_dataframe(rows: TableFragment, column_names_hints: list[str]):
140144
def read_table(
141145
table_fragment: TableFragment,
142146
column_names_hints: list[str] = [],
143-
column_mappings: Optional[ColumnMappings] = None,
147+
table_schema: Optional[TableSchema] = None,
148+
page: Optional[int] = None,
144149
) -> pd.DataFrame:
145150
dataframe = to_dataframe(table_fragment, column_names_hints)
146151

147-
if column_mappings is not None:
148-
dataframe = dataframe[column_mappings.keys()].rename(column_mappings)
149-
150-
dataframe = dataframe.rename(columns=lambda column: normalize_name(str(column)))
152+
if table_schema is not None:
153+
selected_column_names = list(table_schema.column_mappings.keys())
154+
renamer = {(key): value for key, value in table_schema.column_mappings.items()}
155+
dataframe = dataframe[selected_column_names].rename(columns=renamer)
156+
if table_schema.header_mode == "all_pages" or (
157+
table_schema.header_mode == "first_page_only"
158+
and page == table_schema.first_page
159+
):
160+
dataframe.drop([0], inplace=True)
161+
162+
dataframe.rename(columns=lambda column: normalize_name(str(column)), inplace=True)
151163
dataframe = dataframe.apply(
152164
lambda row: list(
153165
map(lambda v: v.replace("\n", " ") if type(v) == str else v, row)

tests/test_pdfplumber.py

Lines changed: 102 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,12 @@ def test_read_table_with_schema_that_matches_page():
155155
tables=[
156156
TableSchema(
157157
title="Plants",
158+
header_mode="all_pages",
158159
first_page=1,
159160
last_page=1,
160161
column_mappings={
161-
"0": "vernacular_name",
162-
"1": "scientific_name",
162+
0: "vernacular_name",
163+
1: "scientific_name",
163164
},
164165
)
165166
],
@@ -216,3 +217,102 @@ def test_read_table_with_schema_that_matches_page():
216217
result_dict = result.to_dict()
217218
assert result_dict["metadata"] == {"filename": "demo_table.pdf"}
218219
assert len(result_dict["tables"][0]["table_fragments"]) == 1
220+
221+
222+
def test_read_table_with_schema_without_headers():
223+
result = read_tables(
224+
"./tests/data/demo_table.pdf",
225+
schema=TablesSchema(
226+
tables=[
227+
TableSchema(
228+
title="Plants",
229+
header_mode="none",
230+
first_page=1,
231+
last_page=1,
232+
column_mappings={
233+
0: "vernacular_name",
234+
1: "scientific_name",
235+
},
236+
)
237+
],
238+
citation="A citation",
239+
),
240+
)
241+
242+
assert result.citation == "A citation"
243+
assert len(result.tables) == 1
244+
assert result.tables[0].title == "Plants"
245+
assert result.tables[0].page == 1
246+
assert result.tables[0].rows == [
247+
{
248+
"scientific_name": "scienti\x00c_name",
249+
"vernacular_name": "common_name",
250+
},
251+
{
252+
"vernacular_name": "Sun\x00ower",
253+
"scientific_name": "Helianthus annuus",
254+
},
255+
{
256+
"vernacular_name": "Rose",
257+
"scientific_name": "Rosa gallica",
258+
},
259+
{
260+
"vernacular_name": "Tulip",
261+
"scientific_name": "Tulipa gesneriana",
262+
},
263+
{
264+
"vernacular_name": "Lavender",
265+
"scientific_name": "Lavandula angustifolia",
266+
},
267+
{
268+
"vernacular_name": "Oak",
269+
"scientific_name": "Quercus robur",
270+
},
271+
{
272+
"vernacular_name": "Maple",
273+
"scientific_name": "Acer saccharum",
274+
},
275+
{
276+
"vernacular_name": "Dandelion",
277+
"scientific_name": "Taraxacum o\x00cinale",
278+
},
279+
{
280+
"vernacular_name": "Bamboo",
281+
"scientific_name": "Bambusa vulgaris",
282+
},
283+
{
284+
"vernacular_name": "Cactus (Prickly Pear)",
285+
"scientific_name": "Opuntia \x00cus-indica",
286+
},
287+
{
288+
"vernacular_name": "Coffee",
289+
"scientific_name": "Coffea arabica",
290+
},
291+
]
292+
result_dict = result.to_dict()
293+
assert result_dict["metadata"] == {"filename": "demo_table.pdf"}
294+
assert len(result_dict["tables"][0]["table_fragments"]) == 1
295+
296+
297+
def test_read_table_with_schema_that_doesnt_matches_page():
298+
result = read_tables(
299+
"./tests/data/demo_table.pdf",
300+
schema=TablesSchema(
301+
tables=[
302+
TableSchema(
303+
title="Plants",
304+
header_mode="all_pages",
305+
first_page=2,
306+
last_page=2,
307+
column_mappings={
308+
0: "vernacular_name",
309+
1: "scientific_name",
310+
},
311+
)
312+
],
313+
citation="A citation",
314+
),
315+
)
316+
317+
assert result.citation == "A citation"
318+
assert len(result.tables) == 0

0 commit comments

Comments
 (0)