Skip to content

Commit 6fdc6d0

Browse files
authored
Merge pull request #22 from SFBioinformaticsGroup/hybrid-mode
Hybrid mode
2 parents f4056d3 + ec5514e commit 6fdc6d0

19 files changed

Lines changed: 1021 additions & 222 deletions

setup.cfg

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ install_requires =
5454
tqdm >= 4.67.1
5555
camelot-py[cv] == 1.0.9
5656
pdfplumber >= 0.11
57+
img2table >= 1.4.2
58+
opencv-contrib-python >= 4.12.0
59+
PyMuPDF >= 1.26
60+
pymupdf-layout >= 1.26
5761
pandas == 2.3.2
5862

5963
[options.packages.find]

src/paper2table/__main__.py

Lines changed: 71 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
import sys
44
import time
55
from pathlib import Path
6+
from typing import Optional
67

78
from tqdm import tqdm
89

910
from paper2table import __version__
10-
from paper2table.readers import agent, camelot, pdfplumber
11+
from paper2table.mapping import TablesMapping
12+
from paper2table.readers import agent, camelot, pdfplumber, hybrid
1113
from paper2table.tables_reader import TablesReader
1214
from paper2table.writers import file, stdout, tablemerge
1315
from paper2table.writers.tablemerge import TablemergeMetadata
@@ -42,6 +44,14 @@ def parse_args():
4244
help="How tables are going to be extracted",
4345
default="pdfplumber",
4446
)
47+
parser.add_argument(
48+
"-H",
49+
"--hybrid",
50+
dest="hybrid",
51+
help="Enable hybrid mode",
52+
action="store_const",
53+
const=True,
54+
)
4555
parser.add_argument(
4656
"-m",
4757
"--model",
@@ -61,19 +71,25 @@ def parse_args():
6171
"-s",
6272
"--schema",
6373
type=str,
64-
help="set table schema in the form column:type. Only used by agent reader",
74+
help="set table schema in the form column:type. Only used by agent or hybrid reader",
6575
)
6676
parser.add_argument(
6777
"-p",
6878
"--schema-path",
6979
type=str,
7080
help="set table schema path. Only used by agent reader",
7181
)
82+
parser.add_argument(
83+
"-M",
84+
"--mappings-path",
85+
type=str,
86+
help="set tables mapping path. Only used by hybrid reader",
87+
)
7288
parser.add_argument(
7389
"-c",
7490
"--column-names-hints-path",
7591
type=str,
76-
help="set table schema path. Only used by agent reader",
92+
help="set table schema path. Only used by pdfplumber reader",
7793
)
7894
parser.add_argument(
7995
"-o",
@@ -112,44 +128,79 @@ def setup_logging(loglevel):
112128
"""
113129
logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
114130
logging.basicConfig(
115-
level=loglevel, stream=sys.stdout, format=logformat, datefmt="%Y-%m-%d %H:%M:%S"
131+
stream=sys.stdout, format=logformat, datefmt="%Y-%m-%d %H:%M:%S"
116132
)
133+
logging.getLogger().setLevel(logging.WARN)
134+
if loglevel:
135+
_logger.setLevel(loglevel)
117136

118137

119138
def get_tables_reader(args):
    """
    Build the ``read_tables`` callable selected by the command line options.

    The base reader is picked from ``args.reader``:

    * ``"agent"``: requires a schema, taken from the file at
      ``args.schema_path`` or from ``args.schema``.
    * ``"pdfplumber"``: optionally loads column names hints from
      ``args.column_names_hints_path``.
    * anything else: falls back to the camelot reader.

    When ``args.hybrid`` is set, the base reader is wrapped by the hybrid
    reader, which also requires a schema and a mappings directory
    (``args.mappings_path``, defaulting to ``./mappings``).

    :param args: parsed command line arguments (see ``parse_args``)
    :return: a function taking a paper path and an optional tables mapping
    :raises SystemExit: with status 1 when a required schema is missing
    """
    if args.reader == "agent":
        schema = Path(args.schema_path).read_text() if args.schema_path else args.schema
        if not schema:
            print(
                "Missing schema. Need to either pass --schema-path or --schema when using agent reader"
            )
            sys.exit(1)

        def read_tables(paper_path: str, _mapping: Optional[TablesMapping] = None):
            # Throttle requests to the model between papers
            time.sleep(args.model_sleep)
            _logger.debug(f"Processing paper {paper_path} with model {args.model}")
            return agent.read_tables(paper_path, model=args.model, schema=schema)

    elif args.reader == "pdfplumber":
        # Hints are read once here instead of once per processed paper
        column_names_hints = (
            Path(args.column_names_hints_path).read_text()
            if args.column_names_hints_path
            else ""
        )

        _logger.debug(
            f"Using pdfplumber reader with column names hints {column_names_hints}"
        )

        # NOTE(review): this is the only reader that names its second
        # parameter `mapping` (the others use `_mapping`); confirm the hybrid
        # reader passes it in a way that works for every base reader.
        def read_tables(paper_path: str, mapping: Optional[TablesMapping] = None):
            _logger.debug(f"Processing paper {paper_path}...")
            return pdfplumber.read_tables(
                paper_path, column_names_hints, mapping=mapping
            )

    else:
        _logger.debug(f"Using camelot reader {args.reader}-{args.model}")

        def read_tables(paper_path: str, _mapping: Optional[TablesMapping] = None):
            _logger.debug(f"Processing paper {paper_path}...")
            return camelot.read_tables(paper_path)

    if args.hybrid:
        # The mappings location depends on --mappings-path only. It used to be
        # gated on --schema-path, which ignored --mappings-path when the schema
        # was passed inline and crashed on Path(None) in the opposite case.
        mappings_path = (
            Path(args.mappings_path) if args.mappings_path else Path("./mappings")
        )
        schema = Path(args.schema_path).read_text() if args.schema_path else args.schema
        if not schema:
            print(
                "Missing schema. Need to either pass --schema-path or --schema when using hybrid mode"
            )
            sys.exit(1)

        _logger.debug(f"Schema is {schema}")
        _logger.debug(f"Applying {args.reader}-{args.model} hybrid reader")

        # Keep a reference to the reader chosen above before the name
        # `read_tables` is rebound to the hybrid wrapper below
        base_reader = read_tables

        def read_tables(paper_path: str, _mapping: Optional[TablesMapping] = None):
            time.sleep(args.model_sleep)
            _logger.debug(f"Processing paper {paper_path}")
            return hybrid.read_tables(
                paper_path,
                model=args.model,
                mappings_path=mappings_path,
                schema=schema,
                reader=base_reader,
            )

    return read_tables
154205

155206

src/paper2table/hints.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
2+
from utils.normalize_name import normalize_name
3+
from utils.columns_schema import tokenize_schema
4+
5+
def parse_column_names_hints(hints: str) -> list[str]:
    """
    Turn a raw column names hints string into a list of normalized names.

    The string is split with ``tokenize_schema`` and every resulting token
    is passed through ``normalize_name``, keeping the original order.
    """
    normalized_hints: list[str] = []
    for token in tokenize_schema(hints):
        normalized_hints.append(normalize_name(token))
    return normalized_hints

src/paper2table/mapping.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
2+
3+
from typing import Literal
4+
from pydantic import BaseModel
5+
6+
7+
# Pairs a column of the original table, identified by its number, with the
# name that column should receive.
# NOTE(review): whether column numbers are 0-based or 1-based is not stated
# here; confirm against the code that applies the mapping.
class ColumnMapping(BaseModel):
    from_column_number: int
    """
    The original column number
    """

    to_column_name: str
    """
    The desired column name
    """
17+
18+
19+
class TableMapping(BaseModel):
    """
    Instructions for read_table
    about how to read a table.
    """

    # Title of the table
    title: str

    # One of the three literal modes below.
    # NOTE(review): presumably tells whether the header row is repeated on
    # every page, appears only on the first page, or is absent; confirm
    # against the reader that consumes it.
    header_mode: Literal["all_pages", "first_page_only", "none"]

    first_page: int
    """
    1-based first page number where table is allocated
    """

    last_page: int
    """
    1-based last page number where table is allocated
    """

    column_mappings: list[ColumnMapping]
    """
    Mappings that go from original column number
    to desired column name
    """
44+
45+
46+
# Top-level container grouping every table mapping together with a citation.
# NOTE(review): the citation presumably identifies the paper the tables come
# from; confirm against the code that produces it.
class TablesMapping(BaseModel):
    # One entry per mapped table
    tables: list[TableMapping]
    # Citation text stored alongside the table mappings
    citation: str

src/paper2table/readers/agent.py

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,13 @@
11
from pathlib import Path
2-
from typing import Any
32

43
from pydantic import create_model
54
from pydantic_ai import Agent, BinaryContent
65

7-
from utils.tokenize_schema import tokenize_schema
6+
from utils.columns_schema import parse_schema
87

98
from ..tables_reader import TablesReader
109
from ..tables_reader.pydantic import TablesModelWrapper
1110

12-
types_map: dict[str, Any] = {
13-
"str": str,
14-
"int": int,
15-
"float": float,
16-
"bool": bool,
17-
}
18-
19-
20-
def parse_schema(schema_str: str) -> dict[str, tuple[Any, ...]]:
21-
parts = tokenize_schema(schema_str)
22-
23-
fields: dict[str, tuple[Any, ...]] = {}
24-
for part in parts:
25-
if ":" not in part:
26-
raise ValueError(f"Invalid field specifier: {part}")
27-
name, type_str = part.split(":", 1)
28-
if type_str not in types_map:
29-
raise ValueError(f"Unsupported type: {type_str}")
30-
fields[name] = (types_map[type_str], ...)
31-
32-
return fields
33-
34-
3511
def build_table_model(schema: str):
3612
"""
3713
Build and return a TableModel from a schema string.

0 commit comments

Comments
 (0)