From 2b4aeca38312f735d781f31bff5a652402108dcc Mon Sep 17 00:00:00 2001 From: Hitansh Shah Date: Wed, 2 Feb 2022 13:44:47 +0530 Subject: [PATCH 1/5] TOC detection initial commit --- bgp/modules/terms.py | 6 ++++++ mysequencer.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 mysequencer.py diff --git a/bgp/modules/terms.py b/bgp/modules/terms.py index 1309d60..317cacd 100644 --- a/bgp/modules/terms.py +++ b/bgp/modules/terms.py @@ -353,6 +353,12 @@ def __init__(self): super().__init__(['copyright', '©'], extractor=self.extractor, match_limit=1) +class TocPageDetectorModule(KeywordPageDetectorModule): + + def __init__(self): + super().__init__(['contents', 'table of contents'], match_limit=1) + + class BackpageIsbnExtractorModule(): def __init__(self): diff --git a/mysequencer.py b/mysequencer.py new file mode 100644 index 0000000..ed0e3d0 --- /dev/null +++ b/mysequencer.py @@ -0,0 +1,17 @@ +from bgp import ia +from bgp import Sequencer +from bgp.modules.terms import TocPageDetectorModule, PageTypeProcessor, CopyrightPageDetectorModule + + +PageTypeDetectionSequencer = Sequencer({ + "pagetypes": PageTypeProcessor(modules={ + "toc_page": TocPageDetectorModule() + }) +}) + + +book = ia.get_item("9780262517638OpenAccess") + +results = PageTypeDetectionSequencer.sequence(book).results + +print(results) \ No newline at end of file From 0149045b3e626e8c7f4aab56e64a2d85a9391fe2 Mon Sep 17 00:00:00 2001 From: Hitansh Shah Date: Mon, 14 Feb 2022 16:53:04 +0530 Subject: [PATCH 2/5] configured TocPageDetectionModule to detect the keyword only as header --- bgp/modules/terms.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/bgp/modules/terms.py b/bgp/modules/terms.py index 317cacd..e346666 100644 --- a/bgp/modules/terms.py +++ b/bgp/modules/terms.py @@ -356,8 +356,28 @@ def __init__(self): class TocPageDetectorModule(KeywordPageDetectorModule): def __init__(self): - super().__init__(['contents', 'table of contents'], match_limit=1) + super().__init__(['contents', 'table of contents', "table of content"], match_limit=1) + def detectTocHeading(self, page): + for line in page.iter('LINE'): + heading = "" + for word in line.iter('WORD'): + heading = heading + " " + word.text + if heading.lower().strip() in self.keywords: + return True + + return False + + + def run(self, page, node): + if not self.match_limit or len(self.matched_pages) < self.match_limit: + if self.detectTocHeading(page): + param = page[0].attrib['value'].split('.djvu')[0] + current_page = param[-4:] + match = { + 'page': current_page, + } + self.matched_pages.append(match) class BackpageIsbnExtractorModule(): From 96e071d18ab61e935ed3130b3088ccd0197a75f7 Mon Sep 17 00:00:00 2001 From: Mek Date: Wed, 23 Feb 2022 01:11:26 -0500 Subject: [PATCH 3/5] Update bgp/modules/terms.py --- bgp/modules/terms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bgp/modules/terms.py b/bgp/modules/terms.py index e346666..27c27a7 100644 --- a/bgp/modules/terms.py +++ b/bgp/modules/terms.py @@ -356,7 +356,7 @@ def __init__(self): class TocPageDetectorModule(KeywordPageDetectorModule): def __init__(self): - super().__init__(['contents', 'table of contents', "table of content"], match_limit=1) + super().__init__(["table of content"], match_limit=1) def detectTocHeading(self, page): for line in page.iter('LINE'): From 17bcb8c79370684867f57a059e36c8429c17a660 Mon Sep 17 00:00:00 2001 From: Mek Date: Wed, 23 Feb 2022 01:24:39 -0500 Subject: [PATCH 4/5] Update bgp/modules/terms.py --- bgp/modules/terms.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bgp/modules/terms.py b/bgp/modules/terms.py index 27c27a7..9484417 100644 --- a/bgp/modules/terms.py +++ b/bgp/modules/terms.py @@ -359,13 +359,11 @@ def __init__(self): super().__init__(["table of content"], match_limit=1) def detectTocHeading(self, page): - for line in page.iter('LINE'): - heading = "" - for word in line.iter('WORD'): - heading = heading + " " + word.text - if heading.lower().strip() in self.keywords: + for i, line in enumerate(page.iter('LINE')): + if i < 5: # if we're in the first few lines + words = " ".join(line.iter('WORD')).lower().strip() + if any(kws == words for kws in self.keywords): return True - return False From efff1a2ac5fcd3af3912bbd991aade93e4a1d6fe Mon Sep 17 00:00:00 2001 From: Mek Date: Wed, 23 Feb 2022 01:25:32 -0500 Subject: [PATCH 5/5] Update bgp/modules/terms.py --- bgp/modules/terms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bgp/modules/terms.py b/bgp/modules/terms.py index 9484417..60dff33 100644 --- a/bgp/modules/terms.py +++ b/bgp/modules/terms.py @@ -356,7 +356,7 @@ def __init__(self): class TocPageDetectorModule(KeywordPageDetectorModule): def __init__(self): - super().__init__(["table of content"], match_limit=1) + super().__init__(["table of contents"], match_limit=1) def detectTocHeading(self, page): for i, line in enumerate(page.iter('LINE')):