diff --git a/bgp/modules/terms.py b/bgp/modules/terms.py
index 1309d60..60dff33 100644
--- a/bgp/modules/terms.py
+++ b/bgp/modules/terms.py
@@ -353,6 +353,54 @@ def __init__(self):
     def __init__(self):
         super().__init__(['copyright', '©'], extractor=self.extractor, match_limit=1)
 
+class TocPageDetectorModule(KeywordPageDetectorModule):
+    """Detect the page whose leading lines hold a table-of-contents heading.
+
+    A page matches when one of its first ``HEADING_SCAN_LINES`` lines,
+    lowercased and stripped, is exactly equal to one of the keywords.
+    """
+
+    # How many lines from the top of a page are checked for the heading.
+    HEADING_SCAN_LINES = 5
+
+    def __init__(self):
+        super().__init__(["table of contents"], match_limit=1)
+
+    def detectTocHeading(self, page):
+        """Return True if a leading line of ``page`` equals a keyword.
+
+        NOTE(review): assumes ``page`` is an ElementTree-style element whose
+        LINE descendants contain WORD elements carrying the word text --
+        confirm against the djvu XML being processed.
+        """
+        for i, line in enumerate(page.iter('LINE')):
+            if i >= self.HEADING_SCAN_LINES:
+                # Headings sit at the top of the page; stop scanning.
+                break
+            # iter() yields elements, not strings, so join their text and
+            # skip elements that have none.
+            words = " ".join(
+                word.text for word in line.iter('WORD') if word.text
+            ).lower().strip()
+            if any(kws == words for kws in self.keywords):
+                return True
+        return False
+
+    def run(self, page, node):
+        """Record ``page`` in ``matched_pages`` if it has a ToC heading.
+
+        Nothing is recorded once ``match_limit`` matches exist. ``node`` is
+        not used by this detector.
+        """
+        if self.match_limit and len(self.matched_pages) >= self.match_limit:
+            return
+        if not self.detectTocHeading(page):
+            return
+        # The page id is the last four characters that precede '.djvu' in
+        # the first child's 'value' attribute.
+        param = page[0].attrib['value'].split('.djvu')[0]
+        self.matched_pages.append({'page': param[-4:]})
+
 class BackpageIsbnExtractorModule():
 
     def __init__(self):
diff --git a/mysequencer.py b/mysequencer.py
new file mode 100644
index 0000000..ed0e3d0
--- /dev/null
+++ b/mysequencer.py
@@ -0,0 +1,20 @@
+"""Run the ToC page detector over one item and print the results."""
+
+from bgp import ia
+from bgp import Sequencer
+from bgp.modules.terms import TocPageDetectorModule, PageTypeProcessor, CopyrightPageDetectorModule
+
+
+# Sequencer with a single page-type processor that runs only the ToC detector.
+PageTypeDetectionSequencer = Sequencer({
+    "pagetypes": PageTypeProcessor(modules={
+        "toc_page": TocPageDetectorModule()
+    })
+})
+
+
+book = ia.get_item("9780262517638OpenAccess")
+
+results = PageTypeDetectionSequencer.sequence(book).results
+
+print(results)