From 611fff18bf859ad079019f968d8ac57f0746f80f Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 13 Jun 2025 21:46:36 +0800 Subject: [PATCH 1/3] add html extract --- realcrawl/.realcrawl.jsonc | 30 +++++ realcrawl/cfg.py | 15 +++ .../config/extract_tpl/extractor_pipe.jsonc | 30 +++++ realcrawl/extract/html_extract.py | 43 ++++++ realcrawl/libs/path_lib.py | 17 +++ requirements/runtime.txt | 4 + tests/realcrawl/assets/1.html | 122 ++++++++++++++++++ tests/realcrawl/test_html_extract.py | 14 ++ 8 files changed, 275 insertions(+) create mode 100644 realcrawl/.realcrawl.jsonc create mode 100644 realcrawl/config/extract_tpl/extractor_pipe.jsonc create mode 100644 realcrawl/extract/html_extract.py create mode 100644 realcrawl/libs/path_lib.py create mode 100644 tests/realcrawl/assets/1.html create mode 100644 tests/realcrawl/test_html_extract.py diff --git a/realcrawl/.realcrawl.jsonc b/realcrawl/.realcrawl.jsonc new file mode 100644 index 0000000..1aee06d --- /dev/null +++ b/realcrawl/.realcrawl.jsonc @@ -0,0 +1,30 @@ +{ + "extractor_pipe": { + "enable": true, + "validate_input_format": false, + "pre_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + }, + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "class_init_kwargs": {}, + } + ], + "extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor", + "class_init_kwargs": {} + } + ], + "post_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + } + ] + } + } \ No newline at end of file diff --git a/realcrawl/cfg.py b/realcrawl/cfg.py index a15e8d8..fe66408 100644 --- a/realcrawl/cfg.py +++ b/realcrawl/cfg.py @@ -10,6 +10,7 @@ from loguru import logger from realcrawl.exception.base import ConfigFileNotFoundException +from realcrawl.libs.path_lib import get_py_pkg_root_dir def load_config(suppress_error: bool = False) -> dict: @@ -60,3 +61,17 @@ def load_config(suppress_error: bool = False) -> dict: config = json.load(f) return config + + +def load_pipe_tpl(pipe_name: str) -> dict: + """Load the pipe template for the web kit. + + Args: + pipe_name(str): The name of the pipe to load + + Returns: pipe_tpl(dict): The pipe template dictionary + """ + pipe_tpl_path = os.path.join(get_py_pkg_root_dir(), 'config', 'extract_tpl', f'{pipe_name}.jsonc') + with open(pipe_tpl_path, 'r', encoding='utf-8') as f: + pipe_tpl = json.load(f) + return pipe_tpl diff --git a/realcrawl/config/extract_tpl/extractor_pipe.jsonc b/realcrawl/config/extract_tpl/extractor_pipe.jsonc new file mode 100644 index 0000000..1aee06d --- /dev/null +++ b/realcrawl/config/extract_tpl/extractor_pipe.jsonc @@ -0,0 +1,30 @@ +{ + "extractor_pipe": { + "enable": true, + "validate_input_format": false, + "pre_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + }, + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "class_init_kwargs": {}, + } + ], + "extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor", + "class_init_kwargs": {} + } + ], + "post_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + } + ] + } + } \ No newline at end of file diff --git a/realcrawl/extract/html_extract.py b/realcrawl/extract/html_extract.py new file mode 100644 index 0000000..2143be0 --- /dev/null +++ b/realcrawl/extract/html_extract.py @@ -0,0 +1,43 @@ + +from llm_web_kit.input.datajson import DataJson +from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory +from func_timeout import FunctionTimedOut, func_timeout +from llm_web_kit.exception.exception import * +from realcrawl.cfg import load_pipe_tpl + + +class HtmlExtract: + def __init__(self, html_file_path: str, output_format: str = "md"): + self.config = load_pipe_tpl("extractor_pipe") + self.extractor_chain = ExtractSimpleFactory.create(self.config) + self.d = { + "track_id": "1", + "html": open(html_file_path, "r").read(), + "url": "https://www.google.com", + "domain": "google.com", + "dataset_name":"cc", + "data_source_category":"HTML", + "file_bytes": 4096, + "page_layout_type": "article", + "meta_info": {"input_datetime": "2020-01-01 00:00:00"} + } + self.output_format = output_format + + def get_html_content(self): + print("self.d: ", self.d) + input_data = DataJson(self.d) + data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,)) + print("data_e: ", data_e.get_content_list().to_json()) + if self.output_format == "md": + md_content = data_e.get_content_list().to_mm_md() + elif self.output_format == "json": + md_content = data_e.get_content_list().to_mm_json() + else: + raise ValueError(f"Invalid output format: {self.output_format}") + return md_content + + def get_main_html(self): + input_data = DataJson(self.d) + data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,)) + main_html = data_e.get_main_html() + return main_html \ No newline at end of file diff --git a/realcrawl/libs/path_lib.py b/realcrawl/libs/path_lib.py new file mode 100644 index 0000000..4033ce5 --- /dev/null +++ b/realcrawl/libs/path_lib.py @@ -0,0 +1,17 @@ +import os + + +def get_proj_root_dir(): + """获取项目的根目录.也就是含有.github, docs, llm_web_kit目录的那个目录.""" + return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +def get_py_pkg_root_dir(): + """获取python包的根目录.也就是含有__init__.py的那个目录. + + Args: + None + Returns: + str: 项目的根目录 + """ + return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) \ No newline at end of file diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 8c197fd..18ac1f6 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1 +1,5 @@ commentjson==0.9.0 +commentjson +func-timeout +git+https://github.com/ccprocessor/llm-webkit-mirror.git@dev +loguru diff --git a/tests/realcrawl/assets/1.html b/tests/realcrawl/assets/1.html new file mode 100644 index 0000000..55e83d5 --- /dev/null +++ b/tests/realcrawl/assets/1.html @@ -0,0 +1,122 @@ + + + + + Title + + + + + +

Heading 1

+

Paragraph 1

+
+ image-alt +

Paragraph 2

+
+ + + + + + + + + + + +
12
34
+ +
+ + + + + + + + + + + + + + + + +
123
4
567
+
+
+ + + + + + + + + + x + = + + + + + b + ± + + + b + 2 + + + 4 + a + c + + + + 2 + a + + + + . + + + +
const Prism = require('prismjs');
+
+    // The code snippet you want to highlight, as a string
+    const code = `var data = 1;`;
+
+    // Returns a highlighted HTML string
+    const html = Prism.highlight(code, Prism.languages.javascript, 'javascript');
+ + +
    +
  1. 100
  2. +
  3. 200
  4. +
+ + +

reference: #include<xxxx.hpp>

+ + + \ No newline at end of file diff --git a/tests/realcrawl/test_html_extract.py b/tests/realcrawl/test_html_extract.py new file mode 100644 index 0000000..6f643ed --- /dev/null +++ b/tests/realcrawl/test_html_extract.py @@ -0,0 +1,14 @@ +from realcrawl.extract.html_extract import HtmlExtract +import unittest +import os +class TestHtmlExtract(unittest.TestCase): + def setUp(self): + self.base_path = os.path.dirname(os.path.abspath(__file__)) + + def test_html_extract(self): + html_extract = HtmlExtract(os.path.join(self.base_path, "assets/1.html")) + html_content = html_extract.get_html_content() + assert len(html_content) > 0 + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From fcce50e9953a47d1d1cc2e67d6062a03a233d83c Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 13 Jun 2025 21:47:32 +0800 Subject: [PATCH 2/3] add html extract --- realcrawl/.realcrawl.jsonc | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 realcrawl/.realcrawl.jsonc diff --git a/realcrawl/.realcrawl.jsonc b/realcrawl/.realcrawl.jsonc deleted file mode 100644 index 1aee06d..0000000 --- a/realcrawl/.realcrawl.jsonc +++ /dev/null @@ -1,30 +0,0 @@ -{ - "extractor_pipe": { - "enable": true, - "validate_input_format": false, - "pre_extractor": [ - { - "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" - }, - { - "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", - "class_init_kwargs": {}, - } - ], - "extractor": [ - { - "enable": true, - "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor", - "class_init_kwargs": {} - } - ], - "post_extractor": [ - { - "enable": true, - "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" - } - ] - } - } \ No newline at end of file From faa1b52ba8fbaa96e2a9b272e82728fa9a8e5994 Mon Sep 17 00:00:00 2001 From: dt-yy Date: Fri, 13 Jun 2025 21:51:49 +0800 Subject: [PATCH 3/3] fix pylint --- .../config/extract_tpl/extractor_pipe.jsonc | 6 +-- realcrawl/extract/html_extract.py | 44 +++++++++---------- realcrawl/libs/path_lib.py | 2 +- tests/realcrawl/assets/1.html | 2 +- tests/realcrawl/test_html_extract.py | 14 +++--- 5 files changed, 36 insertions(+), 32 deletions(-) diff --git a/realcrawl/config/extract_tpl/extractor_pipe.jsonc b/realcrawl/config/extract_tpl/extractor_pipe.jsonc index 1aee06d..9469fe8 100644 --- a/realcrawl/config/extract_tpl/extractor_pipe.jsonc +++ b/realcrawl/config/extract_tpl/extractor_pipe.jsonc @@ -1,7 +1,7 @@ { - "extractor_pipe": { + "extractor_pipe": { "enable": true, - "validate_input_format": false, + "validate_input_format": false, "pre_extractor": [ { "enable": true, @@ -27,4 +27,4 @@ } ] } - } \ No newline at end of file + } diff --git a/realcrawl/extract/html_extract.py b/realcrawl/extract/html_extract.py index 2143be0..5fd76b2 100644 --- a/realcrawl/extract/html_extract.py +++ b/realcrawl/extract/html_extract.py @@ -1,43 +1,43 @@ -from llm_web_kit.input.datajson import DataJson +from func_timeout import func_timeout from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory -from func_timeout import FunctionTimedOut, func_timeout -from llm_web_kit.exception.exception import * +from llm_web_kit.input.datajson import DataJson + from realcrawl.cfg import load_pipe_tpl class HtmlExtract: - def __init__(self, html_file_path: str, output_format: str = "md"): - self.config = load_pipe_tpl("extractor_pipe") - self.extractor_chain = ExtractSimpleFactory.create(self.config) + def __init__(self, html_file_path: str, output_format: str = 'md'): + self.config = load_pipe_tpl('extractor_pipe') + self.extractor_chain = ExtractSimpleFactory.create(self.config) self.d = { - "track_id": "1", - "html": open(html_file_path, "r").read(), - "url": "https://www.google.com", - "domain": "google.com", - "dataset_name":"cc", - "data_source_category":"HTML", - "file_bytes": 4096, - "page_layout_type": "article", - "meta_info": {"input_datetime": "2020-01-01 00:00:00"} - } + 'track_id': '1', + 'html': open(html_file_path, 'r').read(), + 'url': 'https://www.google.com', + 'domain': 'google.com', + 'dataset_name':'cc', + 'data_source_category':'HTML', + 'file_bytes': 4096, + 'page_layout_type': 'article', + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'} + } self.output_format = output_format def get_html_content(self): - print("self.d: ", self.d) + print('self.d: ', self.d) input_data = DataJson(self.d) data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,)) - print("data_e: ", data_e.get_content_list().to_json()) - if self.output_format == "md": + print('data_e: ', data_e.get_content_list().to_json()) + if self.output_format == 'md': md_content = data_e.get_content_list().to_mm_md() - elif self.output_format == "json": + elif self.output_format == 'json': md_content = data_e.get_content_list().to_mm_json() else: - raise ValueError(f"Invalid output format: {self.output_format}") + raise ValueError(f'Invalid output format: {self.output_format}') return md_content def get_main_html(self): input_data = DataJson(self.d) data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,)) main_html = data_e.get_main_html() - return main_html \ No newline at end of file + return main_html diff --git a/realcrawl/libs/path_lib.py b/realcrawl/libs/path_lib.py index 4033ce5..4ef6abc 100644 --- a/realcrawl/libs/path_lib.py +++ b/realcrawl/libs/path_lib.py @@ -14,4 +14,4 @@ def get_py_pkg_root_dir(): Returns: str: 项目的根目录 """ - return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) \ No newline at end of file + return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) diff --git a/tests/realcrawl/assets/1.html b/tests/realcrawl/assets/1.html index 55e83d5..6317832 100644 --- a/tests/realcrawl/assets/1.html +++ b/tests/realcrawl/assets/1.html @@ -119,4 +119,4 @@

Heading 1

reference: #include<xxxx.hpp>

- \ No newline at end of file + diff --git a/tests/realcrawl/test_html_extract.py b/tests/realcrawl/test_html_extract.py index 6f643ed..2fe3aff 100644 --- a/tests/realcrawl/test_html_extract.py +++ b/tests/realcrawl/test_html_extract.py @@ -1,14 +1,18 @@ -from realcrawl.extract.html_extract import HtmlExtract -import unittest import os +import unittest + +from realcrawl.extract.html_extract import HtmlExtract + + class TestHtmlExtract(unittest.TestCase): def setUp(self): self.base_path = os.path.dirname(os.path.abspath(__file__)) def test_html_extract(self): - html_extract = HtmlExtract(os.path.join(self.base_path, "assets/1.html")) + html_extract = HtmlExtract(os.path.join(self.base_path, 'assets/1.html')) html_content = html_extract.get_html_content() assert len(html_content) > 0 -if __name__ == "__main__": - unittest.main() \ No newline at end of file + +if __name__ == '__main__': + unittest.main()