33import sys
44import time
55from pathlib import Path
6+ from typing import Optional
67
78from tqdm import tqdm
89
910from paper2table import __version__
10- from paper2table .readers import agent , camelot , pdfplumber
11+ from paper2table .mapping import TablesMapping
12+ from paper2table .readers import agent , camelot , pdfplumber , hybrid
1113from paper2table .tables_reader import TablesReader
1214from paper2table .writers import file , stdout , tablemerge
1315from paper2table .writers .tablemerge import TablemergeMetadata
@@ -42,6 +44,14 @@ def parse_args():
4244 help = "How tables are going to be extracted" ,
4345 default = "pdfplumber" ,
4446 )
47+ parser .add_argument (
48+ "-H" ,
49+ "--hybrid" ,
50+ dest = "hybrid" ,
51+ help = "Enable hybrid mode" ,
52+ action = "store_const" ,
53+ const = True ,
54+ )
4555 parser .add_argument (
4656 "-m" ,
4757 "--model" ,
@@ -61,19 +71,25 @@ def parse_args():
6171 "-s" ,
6272 "--schema" ,
6373 type = str ,
64- help = "set table schema in the form column:type. Only used by agent reader" ,
74+ help = "set table schema in the form column:type. Only used by agent or hybrid reader" ,
6575 )
6676 parser .add_argument (
6777 "-p" ,
6878 "--schema-path" ,
6979 type = str ,
7080 help = "set table schema path. Only used by agent reader" ,
7181 )
82+ parser .add_argument (
83+ "-M" ,
84+ "--mappings-path" ,
85+ type = str ,
86+ help = "set tables mapping path. Only used by hybrid reader" ,
87+ )
7288 parser .add_argument (
7389 "-c" ,
7490 "--column-names-hints-path" ,
7591 type = str ,
76- help = "set table schema path. Only used by agent reader" ,
92+ help = "set table schema path. Only used by pdfplumber reader" ,
7793 )
7894 parser .add_argument (
7995 "-o" ,
@@ -112,44 +128,79 @@ def setup_logging(loglevel):
112128 """
113129 logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
114130 logging .basicConfig (
115- level = loglevel , stream = sys .stdout , format = logformat , datefmt = "%Y-%m-%d %H:%M:%S"
131+ stream = sys .stdout , format = logformat , datefmt = "%Y-%m-%d %H:%M:%S"
116132 )
133+ logging .getLogger ().setLevel (logging .WARN )
134+ if loglevel :
135+ _logger .setLevel (loglevel )
117136
118137
def get_tables_reader(args):
    """Build the ``read_tables`` callable selected by the CLI arguments.

    The base reader is chosen from ``args.reader``: ``"agent"``,
    ``"pdfplumber"``, or anything else, which falls back to camelot.
    When ``args.hybrid`` is set, the base reader is wrapped by the hybrid
    reader, which receives it through its ``reader`` argument.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``reader``, ``hybrid``, ``model``, ``model_sleep``, ``schema``,
            ``schema_path``, ``mappings_path`` and
            ``column_names_hints_path``.

    Returns:
        A function ``read_tables(paper_path, mapping=None)`` that returns
        whatever the selected reader module returns for that paper.

    Exits:
        Calls ``sys.exit(1)`` when the agent reader or hybrid mode is
        requested without ``--schema`` or ``--schema-path``.
    """
    # NOTE(review): the mapping parameter is named `mapping` for pdfplumber
    # and `_mapping` elsewhere; this is only safe if the hybrid reader passes
    # it positionally -- confirm against hybrid.read_tables.
    if args.reader == "agent":
        schema = Path(args.schema_path).read_text() if args.schema_path else args.schema
        if not schema:
            print(
                "Missing schema. Need to either pass --schema-path or --schema when using agent reader"
            )
            sys.exit(1)

        def read_tables(paper_path: str, _mapping: Optional[TablesMapping] = None):
            # Pause before every model call, for the configured number of seconds.
            time.sleep(args.model_sleep)
            _logger.debug(f"Processing paper {paper_path} with model {args.model}")
            return agent.read_tables(paper_path, model=args.model, schema=schema)

    elif args.reader == "pdfplumber":
        # Read the hints once here instead of once per paper.
        column_names_hints = (
            Path(args.column_names_hints_path).read_text()
            if args.column_names_hints_path
            else ""
        )

        _logger.debug(
            f"Using pdfplumber reader with column names hints {column_names_hints}"
        )

        def read_tables(paper_path: str, mapping: Optional[TablesMapping] = None):
            _logger.debug(f"Processing paper {paper_path}...")
            return pdfplumber.read_tables(
                paper_path, column_names_hints, mapping=mapping
            )

    else:
        _logger.debug(f"Using camelot reader {args.reader}-{args.model}")

        def read_tables(paper_path: str, _mapping: Optional[TablesMapping] = None):
            _logger.debug(f"Processing paper {paper_path}...")
            return camelot.read_tables(paper_path)

    if args.hybrid:
        # Decide on --mappings-path itself. Testing --schema-path here (as the
        # code previously did) crashed on Path(None) when only --schema-path
        # was given, and ignored --mappings-path when only --schema was given.
        mappings_path = (
            Path(args.mappings_path) if args.mappings_path else Path("./mappings")
        )
        schema = Path(args.schema_path).read_text() if args.schema_path else args.schema
        if not schema:
            print(
                "Missing schema. Need to either pass --schema-path or --schema when using hybrid mode"
            )
            sys.exit(1)

        _logger.debug(f"Schema is {schema}")
        _logger.debug(f"Applying {args.reader}-{args.model} hybrid reader")

        # Capture the base reader before the name is rebound to the wrapper.
        base_reader = read_tables

        def read_tables(paper_path: str, _mapping: Optional[TablesMapping] = None):
            time.sleep(args.model_sleep)
            _logger.debug(f"Processing paper {paper_path}")
            return hybrid.read_tables(
                paper_path,
                model=args.model,
                mappings_path=mappings_path,
                schema=schema,
                reader=base_reader,
            )

    return read_tables
154205
155206
0 commit comments