From 611fff18bf859ad079019f968d8ac57f0746f80f Mon Sep 17 00:00:00 2001
From: dt-yy <qywan918@163.com>
Date: Fri, 13 Jun 2025 21:46:36 +0800
Subject: [PATCH 1/3] add html extract

---
 realcrawl/.realcrawl.jsonc                    |  30 +++++
 realcrawl/cfg.py                              |  15 +++
 .../config/extract_tpl/extractor_pipe.jsonc   |  30 +++++
 realcrawl/extract/html_extract.py             |  43 ++++++
 realcrawl/libs/path_lib.py                    |  17 +++
 requirements/runtime.txt                      |   4 +
 tests/realcrawl/assets/1.html                 | 122 ++++++++++++++++++
 tests/realcrawl/test_html_extract.py          |  14 ++
 8 files changed, 275 insertions(+)
 create mode 100644 realcrawl/.realcrawl.jsonc
 create mode 100644 realcrawl/config/extract_tpl/extractor_pipe.jsonc
 create mode 100644 realcrawl/extract/html_extract.py
 create mode 100644 realcrawl/libs/path_lib.py
 create mode 100644 tests/realcrawl/assets/1.html
 create mode 100644 tests/realcrawl/test_html_extract.py

diff --git a/realcrawl/.realcrawl.jsonc b/realcrawl/.realcrawl.jsonc
new file mode 100644
index 0000000..1aee06d
--- /dev/null
+++ b/realcrawl/.realcrawl.jsonc
@@ -0,0 +1,30 @@
+{
+    "extractor_pipe": { 
+            "enable": true,
+            "validate_input_format": false, 
+            "pre_extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+                },
+                {
+                                    "enable": true,
+                                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                                    "class_init_kwargs": {},
+                }
+            ],
+            "extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
+                    "class_init_kwargs": {}
+                }
+            ],
+            "post_extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+                }
+            ]
+        }
+    }
\ No newline at end of file
diff --git a/realcrawl/cfg.py b/realcrawl/cfg.py
index a15e8d8..fe66408 100644
--- a/realcrawl/cfg.py
+++ b/realcrawl/cfg.py
@@ -10,6 +10,7 @@
 from loguru import logger
 
 from realcrawl.exception.base import ConfigFileNotFoundException
+from realcrawl.libs.path_lib import get_py_pkg_root_dir
 
 
 def load_config(suppress_error: bool = False) -> dict:
@@ -60,3 +61,17 @@ def load_config(suppress_error: bool = False) -> dict:
         config = json.load(f)
 
     return config
+
+
+def load_pipe_tpl(pipe_name: str) -> dict:
+    """Read an extractor pipeline template stored under the package root.
+
+    Args:
+        pipe_name(str): Template file name without the ``.jsonc`` suffix
+
+    Returns: dict parsed from ``config/extract_tpl/<pipe_name>.jsonc``
+    """
+    tpl_dir = os.path.join(get_py_pkg_root_dir(), 'config', 'extract_tpl')
+    tpl_path = os.path.join(tpl_dir, f'{pipe_name}.jsonc')
+    with open(tpl_path, encoding='utf-8') as tpl_file:
+        return json.load(tpl_file)
diff --git a/realcrawl/config/extract_tpl/extractor_pipe.jsonc b/realcrawl/config/extract_tpl/extractor_pipe.jsonc
new file mode 100644
index 0000000..1aee06d
--- /dev/null
+++ b/realcrawl/config/extract_tpl/extractor_pipe.jsonc
@@ -0,0 +1,30 @@
+{
+    "extractor_pipe": { 
+            "enable": true,
+            "validate_input_format": false, 
+            "pre_extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+                },
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                    "class_init_kwargs": {}
+                }
+            ],
+            "extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
+                    "class_init_kwargs": {}
+                }
+            ],
+            "post_extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+                }
+            ]
+        }
+    }
\ No newline at end of file
diff --git a/realcrawl/extract/html_extract.py b/realcrawl/extract/html_extract.py
new file mode 100644
index 0000000..2143be0
--- /dev/null
+++ b/realcrawl/extract/html_extract.py
@@ -0,0 +1,43 @@
+
+from llm_web_kit.input.datajson import DataJson
+from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
+from func_timeout import FunctionTimedOut, func_timeout
+from llm_web_kit.exception.exception import *
+from realcrawl.cfg import load_pipe_tpl
+
+
+class HtmlExtract:
+    def __init__(self, html_file_path: str, output_format: str = "md"):
+        self.config = load_pipe_tpl("extractor_pipe")
+        self.extractor_chain =  ExtractSimpleFactory.create(self.config)
+        self.d = {
+            "track_id": "1",
+            "html": open(html_file_path, "r").read(),
+            "url": "https://www.google.com",
+            "domain": "google.com",
+            "dataset_name":"cc",
+            "data_source_category":"HTML",
+            "file_bytes": 4096,
+            "page_layout_type": "article",
+            "meta_info": {"input_datetime": "2020-01-01 00:00:00"}
+        }        
+        self.output_format = output_format
+
+    def get_html_content(self):
+        print("self.d: ", self.d)
+        input_data = DataJson(self.d)
+        data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,))
+        print("data_e: ", data_e.get_content_list().to_json())
+        if self.output_format == "md":
+            md_content = data_e.get_content_list().to_mm_md()
+        elif self.output_format == "json":
+            md_content = data_e.get_content_list().to_mm_json()
+        else:
+            raise ValueError(f"Invalid output format: {self.output_format}")
+        return md_content
+
+    def get_main_html(self):
+        input_data = DataJson(self.d)
+        data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,))
+        main_html = data_e.get_main_html()
+        return main_html
\ No newline at end of file
diff --git a/realcrawl/libs/path_lib.py b/realcrawl/libs/path_lib.py
new file mode 100644
index 0000000..4033ce5
--- /dev/null
+++ b/realcrawl/libs/path_lib.py
@@ -0,0 +1,17 @@
+import os
+
+
+def get_proj_root_dir():
+    """Return the project root directory, i.e. the parent of the python package root."""
+    return os.path.dirname(get_py_pkg_root_dir())
+
+
+def get_py_pkg_root_dir():
+    """获取python包的根目录.也就是含有__init__.py的那个目录.
+
+    Args:
+        None
+    Returns:
+        str: 项目的根目录
+    """
+    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
\ No newline at end of file
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index 8c197fd..18ac1f6 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -1 +1,4 @@
 commentjson==0.9.0
+func-timeout
+git+https://github.com/ccprocessor/llm-webkit-mirror.git@dev
+loguru
diff --git a/tests/realcrawl/assets/1.html b/tests/realcrawl/assets/1.html
new file mode 100644
index 0000000..55e83d5
--- /dev/null
+++ b/tests/realcrawl/assets/1.html
@@ -0,0 +1,122 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Title</title>
+</head>
+<body>
+
+<!-- Path: 2.html -->
+
+<h1>Heading 1</h1>
+<p>Paragraph 1</p>
+<div>
+    <img alt="image-alt" title="image-title" src="test.png" />
+    <p>Paragraph 2</p>
+</div>
+
+<!-- 简单table -->
+<table>
+    <tr>
+        <td>1</td>
+        <td>2</td>
+    </tr>
+    <tr>
+        <td>3</td>
+        <td>4</td>
+    </tr>
+</table>
+
+<div>
+    <span>
+        <!-- 复杂table -->
+    <table>
+        <tr>
+            <td rowspan="2">1</td>
+            <td>2</td>
+            <td>3</td>
+        </tr>
+        <tr>
+            <td colspan="2">4</td>
+        </tr>
+        <tr>
+            <td>5</td>
+            <td>6</td>
+            <td>7</td>
+        </tr>
+    </table>
+    </span>
+</div>
+
+<!-- 简单list -->
+<ul>
+    <li>1</li>
+    <li>2</li>
+</ul>
+
+<!-- 列表项里有子列表 -->
+<ul>
+    <li>1
+        <ul>
+            <li>1.1</li>
+            <li>1.2</li>
+        </ul>
+    </li>
+    <li>2
+        <ul>
+            <li>2.1</li>
+            <li>2.2</li>
+        </ul>
+    </li>
+</ul>
+
+<!-- 数学公式 -->
+<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
+    <mi>x</mi>
+    <mo>=</mo>
+    <mrow>
+      <mfrac>
+        <mrow>
+          <mo>&#x2212;</mo>
+          <mi>b</mi>
+          <mo>&#x00B1;</mo>
+          <msqrt>
+            <msup>
+              <mi>b</mi>
+              <mn>2</mn>
+            </msup>
+            <mo>&#x2212;</mo>
+            <mn>4</mn>
+            <mi>a</mi>
+            <mi>c</mi>
+          </msqrt>
+        </mrow>
+        <mrow>
+          <mn>2</mn>
+          <mi>a</mi>
+        </mrow>
+      </mfrac>
+    </mrow>
+    <mtext>.</mtext>
+  </math>
+
+<!-- 代码 -->
+<pre><code class="language-js">const Prism = require('prismjs');
+
+    // The code snippet you want to highlight, as a string
+    const code = `var data = 1;`;
+
+    // Returns a highlighted HTML string
+    const html = Prism.highlight(code, Prism.languages.javascript, 'javascript');</code></pre>
+
+<!-- 有序列表 -->
+<ol>
+    <li>100</li>
+    <li>200</li>
+</ol>
+
+<!-- 带链接的 inline code -->
+<p>reference: <code>#include&lt;<a href="xxxx.xxxx.com">xxxx.hpp</a>&gt;</code></p>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/tests/realcrawl/test_html_extract.py b/tests/realcrawl/test_html_extract.py
new file mode 100644
index 0000000..6f643ed
--- /dev/null
+++ b/tests/realcrawl/test_html_extract.py
@@ -0,0 +1,14 @@
+from realcrawl.extract.html_extract import HtmlExtract
+import unittest
+import os
+class TestHtmlExtract(unittest.TestCase):
+    def setUp(self):
+        self.base_path = os.path.dirname(os.path.abspath(__file__))
+
+    def test_html_extract(self):
+        html_extract = HtmlExtract(os.path.join(self.base_path, "assets/1.html"))
+        html_content = html_extract.get_html_content()
+        assert len(html_content) > 0
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file

From fcce50e9953a47d1d1cc2e67d6062a03a233d83c Mon Sep 17 00:00:00 2001
From: dt-yy <qywan918@163.com>
Date: Fri, 13 Jun 2025 21:47:32 +0800
Subject: [PATCH 2/3] remove duplicate realcrawl/.realcrawl.jsonc

---
 realcrawl/.realcrawl.jsonc | 30 ------------------------------
 1 file changed, 30 deletions(-)
 delete mode 100644 realcrawl/.realcrawl.jsonc

diff --git a/realcrawl/.realcrawl.jsonc b/realcrawl/.realcrawl.jsonc
deleted file mode 100644
index 1aee06d..0000000
--- a/realcrawl/.realcrawl.jsonc
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-    "extractor_pipe": { 
-            "enable": true,
-            "validate_input_format": false, 
-            "pre_extractor": [
-                {
-                    "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
-                },
-                {
-                                    "enable": true,
-                                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
-                                    "class_init_kwargs": {},
-                }
-            ],
-            "extractor": [
-                {
-                    "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
-                    "class_init_kwargs": {}
-                }
-            ],
-            "post_extractor": [
-                {
-                    "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
-                }
-            ]
-        }
-    }
\ No newline at end of file

From faa1b52ba8fbaa96e2a9b272e82728fa9a8e5994 Mon Sep 17 00:00:00 2001
From: dt-yy <qywan918@163.com>
Date: Fri, 13 Jun 2025 21:51:49 +0800
Subject: [PATCH 3/3] fix pylint

---
 .../config/extract_tpl/extractor_pipe.jsonc   |  6 +--
 realcrawl/extract/html_extract.py             | 44 +++++++++----------
 realcrawl/libs/path_lib.py                    |  2 +-
 tests/realcrawl/assets/1.html                 |  2 +-
 tests/realcrawl/test_html_extract.py          | 14 +++---
 5 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/realcrawl/config/extract_tpl/extractor_pipe.jsonc b/realcrawl/config/extract_tpl/extractor_pipe.jsonc
index 1aee06d..9469fe8 100644
--- a/realcrawl/config/extract_tpl/extractor_pipe.jsonc
+++ b/realcrawl/config/extract_tpl/extractor_pipe.jsonc
@@ -1,7 +1,7 @@
 {
-    "extractor_pipe": { 
+    "extractor_pipe": {
             "enable": true,
-            "validate_input_format": false, 
+            "validate_input_format": false,
             "pre_extractor": [
                 {
                     "enable": true,
@@ -27,4 +27,4 @@
                 }
             ]
         }
-    }
\ No newline at end of file
+    }
diff --git a/realcrawl/extract/html_extract.py b/realcrawl/extract/html_extract.py
index 2143be0..5fd76b2 100644
--- a/realcrawl/extract/html_extract.py
+++ b/realcrawl/extract/html_extract.py
@@ -1,43 +1,70 @@
 
-from llm_web_kit.input.datajson import DataJson
+from func_timeout import func_timeout
 from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
-from func_timeout import FunctionTimedOut, func_timeout
-from llm_web_kit.exception.exception import *
+from llm_web_kit.input.datajson import DataJson
+
 from realcrawl.cfg import load_pipe_tpl
 
 
 class HtmlExtract:
-    def __init__(self, html_file_path: str, output_format: str = "md"):
-        self.config = load_pipe_tpl("extractor_pipe")
-        self.extractor_chain =  ExtractSimpleFactory.create(self.config)
-        self.d = {
-            "track_id": "1",
-            "html": open(html_file_path, "r").read(),
-            "url": "https://www.google.com",
-            "domain": "google.com",
-            "dataset_name":"cc",
-            "data_source_category":"HTML",
-            "file_bytes": 4096,
-            "page_layout_type": "article",
-            "meta_info": {"input_datetime": "2020-01-01 00:00:00"}
-        }        
-        self.output_format = output_format
-
-    def get_html_content(self):
-        print("self.d: ", self.d)
-        input_data = DataJson(self.d)
-        data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,))
-        print("data_e: ", data_e.get_content_list().to_json())
-        if self.output_format == "md":
-            md_content = data_e.get_content_list().to_mm_md()
-        elif self.output_format == "json":
-            md_content = data_e.get_content_list().to_mm_json()
-        else:
-            raise ValueError(f"Invalid output format: {self.output_format}")
-        return md_content
-
-    def get_main_html(self):
-        input_data = DataJson(self.d)
-        data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,))
-        main_html = data_e.get_main_html()
-        return main_html
\ No newline at end of file
+    """Extract content from a local HTML file through an llm_web_kit chain.
+
+    The chain is built from the ``extractor_pipe`` template. ``url``,
+    ``domain``, ``file_bytes`` and the other metadata handed to the chain are
+    hard-coded values, not properties of the given file.
+
+    Args:
+        html_file_path: path of the HTML file to read (decoded as UTF-8).
+        output_format: ``'md'`` or ``'json'``; validated by
+            ``get_html_content``.
+    """
+
+    # Seconds a single extraction may run before func_timeout aborts it.
+    EXTRACT_TIMEOUT = 10
+
+    def __init__(self, html_file_path: str, output_format: str = 'md'):
+        self.config = load_pipe_tpl('extractor_pipe')
+        self.extractor_chain = ExtractSimpleFactory.create(self.config)
+        # Decode explicitly as UTF-8 (the platform default may differ) and
+        # close the file handle as soon as the content is read.
+        with open(html_file_path, 'r', encoding='utf-8') as html_file:
+            html = html_file.read()
+        self.d = {
+            'track_id': '1',
+            'html': html,
+            'url': 'https://www.google.com',
+            'domain': 'google.com',
+            'dataset_name': 'cc',
+            'data_source_category': 'HTML',
+            'file_bytes': 4096,
+            'page_layout_type': 'article',
+            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
+        }
+        self.output_format = output_format
+
+    def _extract(self):
+        """Run the extractor chain on the input record, bounded by a timeout."""
+        input_data = DataJson(self.d)
+        return func_timeout(self.EXTRACT_TIMEOUT, self.extractor_chain.extract, args=(input_data,))
+
+    def get_html_content(self):
+        """Return the extracted content in the configured output format.
+
+        Returns:
+            The result of ``to_mm_md()`` for ``'md'`` or ``to_mm_json()`` for
+            ``'json'``.
+
+        Raises:
+            ValueError: if ``output_format`` is neither ``'md'`` nor ``'json'``.
+        """
+        # Reject an unsupported format before paying for the extraction.
+        if self.output_format not in ('md', 'json'):
+            raise ValueError(f'Invalid output format: {self.output_format}')
+        content_list = self._extract().get_content_list()
+        if self.output_format == 'md':
+            return content_list.to_mm_md()
+        return content_list.to_mm_json()
+
+    def get_main_html(self):
+        """Return the main HTML reported by the extractor chain."""
+        return self._extract().get_main_html()
diff --git a/realcrawl/libs/path_lib.py b/realcrawl/libs/path_lib.py
index 4033ce5..4ef6abc 100644
--- a/realcrawl/libs/path_lib.py
+++ b/realcrawl/libs/path_lib.py
@@ -9,9 +9,10 @@
 def get_py_pkg_root_dir():
-    """获取python包的根目录.也就是含有__init__.py的那个目录.
-
-    Args:
-        None
+    """Return the root directory of the python package.
+
+    The package root is the parent of the directory holding this module.
+
     Returns:
-        str: 项目的根目录
+        str: absolute path of the package root directory
     """
-    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
\ No newline at end of file
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    return os.path.dirname(module_dir)
diff --git a/tests/realcrawl/assets/1.html b/tests/realcrawl/assets/1.html
index 55e83d5..6317832 100644
--- a/tests/realcrawl/assets/1.html
+++ b/tests/realcrawl/assets/1.html
@@ -119,4 +119,4 @@ <h1>Heading 1</h1>
 <p>reference: <code>#include&lt;<a href="xxxx.xxxx.com">xxxx.hpp</a>&gt;</code></p>
 
 </body>
-</html>
\ No newline at end of file
+</html>
diff --git a/tests/realcrawl/test_html_extract.py b/tests/realcrawl/test_html_extract.py
index 6f643ed..2fe3aff 100644
--- a/tests/realcrawl/test_html_extract.py
+++ b/tests/realcrawl/test_html_extract.py
@@ -1,14 +1,23 @@
-from realcrawl.extract.html_extract import HtmlExtract
-import unittest
 import os
+import unittest
+
+from realcrawl.extract.html_extract import HtmlExtract
+
+
 class TestHtmlExtract(unittest.TestCase):
+    """Smoke test for HtmlExtract against the HTML asset next to this file."""
+
     def setUp(self):
         self.base_path = os.path.dirname(os.path.abspath(__file__))
 
     def test_html_extract(self):
-        html_extract = HtmlExtract(os.path.join(self.base_path, "assets/1.html"))
-        html_content = html_extract.get_html_content()
-        assert len(html_content) > 0
+        """The default ('md') output for assets/1.html is non-empty."""
+        html_extract = HtmlExtract(os.path.join(self.base_path, 'assets', '1.html'))
+        html_content = html_extract.get_html_content()
+        # unittest assertion instead of a bare assert: it is not stripped by
+        # ``python -O`` and reports both operands on failure.
+        self.assertGreater(len(html_content), 0)
 
-if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+
+if __name__ == '__main__':
+    unittest.main()