diff --git a/realcrawl/cfg.py b/realcrawl/cfg.py
index a15e8d8..fe66408 100644
--- a/realcrawl/cfg.py
+++ b/realcrawl/cfg.py
@@ -10,6 +10,7 @@
 from loguru import logger
 
 from realcrawl.exception.base import ConfigFileNotFoundException
+from realcrawl.libs.path_lib import get_py_pkg_root_dir
 
 
 def load_config(suppress_error: bool = False) -> dict:
@@ -60,3 +61,24 @@ def load_config(suppress_error: bool = False) -> dict:
         config = json.load(f)
 
     return config
+
+
+def load_pipe_tpl(pipe_name: str) -> dict:
+    """Load an extraction pipeline template shipped with the package.
+
+    The template is read from ``config/extract_tpl/<pipe_name>.jsonc`` under the
+    directory returned by ``get_py_pkg_root_dir()``.
+
+    Args:
+        pipe_name (str): Template file name without the ``.jsonc`` extension.
+
+    Returns:
+        dict: The parsed template.
+
+    Raises:
+        OSError: If the template file cannot be opened.
+    """
+    pipe_tpl_path = os.path.join(get_py_pkg_root_dir(), 'config', 'extract_tpl', f'{pipe_name}.jsonc')
+    with open(pipe_tpl_path, 'r', encoding='utf-8') as f:
+        pipe_tpl = json.load(f)
+    return pipe_tpl
diff --git a/realcrawl/config/extract_tpl/extractor_pipe.jsonc b/realcrawl/config/extract_tpl/extractor_pipe.jsonc
new file mode 100644
index 0000000..9469fe8
--- /dev/null
+++ b/realcrawl/config/extract_tpl/extractor_pipe.jsonc
@@ -0,0 +1,30 @@
+{
+    "extractor_pipe": {
+        "enable": true,
+        "validate_input_format": false,
+        "pre_extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+            },
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
+                "class_init_kwargs": {}
+            }
+        ],
+        "post_extractor": [
+            {
+                "enable": true,
+                "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+            }
+        ]
+    }
+}
diff --git a/realcrawl/extract/html_extract.py b/realcrawl/extract/html_extract.py
new file mode 100644
index 0000000..5fd76b2
--- /dev/null
+++ b/realcrawl/extract/html_extract.py
@@ -0,0 +1,74 @@
+
+from func_timeout import func_timeout
+from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
+from llm_web_kit.input.datajson import DataJson
+from loguru import logger
+
+from realcrawl.cfg import load_pipe_tpl
+
+
+class HtmlExtract:
+    """Extract content from one local HTML file with the llm_web_kit extractor chain.
+
+    The chain is created from the ``extractor_pipe`` template. Apart from the
+    HTML text, the fields of the input record (URL, domain, dataset name, ...)
+    are fixed placeholder values.
+    """
+
+    # Output formats understood by get_html_content().
+    SUPPORTED_FORMATS = ('md', 'json')
+
+    def __init__(self, html_file_path: str, output_format: str = 'md', timeout: float = 10):
+        """Read the HTML file and build the extractor chain.
+
+        Args:
+            html_file_path (str): Path of the HTML file to extract.
+            output_format (str): 'md' or 'json'; only checked by get_html_content().
+            timeout (float): Seconds allowed for one run of the extractor chain.
+        """
+        self.config = load_pipe_tpl('extractor_pipe')
+        self.extractor_chain = ExtractSimpleFactory.create(self.config)
+        # Use a context manager so the handle is closed, and decode as UTF-8
+        # explicitly rather than relying on the platform's default encoding.
+        with open(html_file_path, 'r', encoding='utf-8') as f:
+            html = f.read()
+        self.d = {
+            'track_id': '1',
+            'html': html,
+            'url': 'https://www.google.com',
+            'domain': 'google.com',
+            'dataset_name': 'cc',
+            'data_source_category': 'HTML',
+            'file_bytes': 4096,
+            'page_layout_type': 'article',
+            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
+        }
+        self.output_format = output_format
+        self.timeout = timeout
+
+    def _extract(self) -> DataJson:
+        """Run the extractor chain on the input record within ``self.timeout`` seconds."""
+        input_data = DataJson(self.d)
+        return func_timeout(self.timeout, self.extractor_chain.extract, args=(input_data,))
+
+    def get_html_content(self):
+        """Return the extracted content rendered in ``self.output_format``.
+
+        Returns:
+            The result of ``to_mm_md()`` for 'md' or ``to_mm_json()`` for 'json'.
+
+        Raises:
+            ValueError: If ``self.output_format`` is neither 'md' nor 'json'.
+        """
+        # Reject an unsupported format before paying for the extraction.
+        if self.output_format not in self.SUPPORTED_FORMATS:
+            raise ValueError(f'Invalid output format: {self.output_format}')
+        content_list = self._extract().get_content_list()
+        logger.debug('extracted content list: {}', content_list.to_json())
+        if self.output_format == 'md':
+            return content_list.to_mm_md()
+        return content_list.to_mm_json()
+
+    def get_main_html(self):
+        """Return the main HTML obtained from the extractor chain result."""
+        return self._extract().get_main_html()
diff --git a/realcrawl/libs/path_lib.py b/realcrawl/libs/path_lib.py
new file mode 100644
index 0000000..4ef6abc
--- /dev/null
+++ b/realcrawl/libs/path_lib.py
@@ -0,0 +1,22 @@
+import os
+
+
+def get_proj_root_dir() -> str:
+    """Return the project root directory.
+
+    This is the directory three levels above this file, i.e. the one that
+    contains the ``realcrawl`` package directory.
+    """
+    return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def get_py_pkg_root_dir() -> str:
+    """Return the root directory of the Python package.
+
+    This is the directory two levels above this file, i.e. the ``realcrawl``
+    directory that holds the package's ``__init__.py``.
+
+    Returns:
+        str: Absolute path of the package root directory.
+    """
+    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index 8c197fd..18ac1f6 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -1 +1,4 @@
 commentjson==0.9.0
+func-timeout
+git+https://github.com/ccprocessor/llm-webkit-mirror.git@dev
+loguru
diff --git a/tests/realcrawl/assets/1.html b/tests/realcrawl/assets/1.html
new file mode 100644
index 0000000..6317832
--- /dev/null
+++ b/tests/realcrawl/assets/1.html
@@ -0,0 +1,122 @@
+
+
+
+ +Paragraph 1
+
+ Paragraph 2
+| 1 | +2 | +
| 3 | +4 | +
| 1 | +2 | +3 | +
| 4 | +||
| 5 | +6 | +7 | +
const Prism = require('prismjs');
+
+ // The code snippet you want to highlight, as a string
+ const code = `var data = 1;`;
+
+ // Returns a highlighted HTML string
+ const html = Prism.highlight(code, Prism.languages.javascript, 'javascript');
+
+
+reference: #include<xxxx.hpp>