From 1f7d67c2fc2fbd7230aec84e7d9e0e09ae229ac6 Mon Sep 17 00:00:00 2001 From: chupei Date: Tue, 9 Dec 2025 20:49:31 +0800 Subject: [PATCH 1/3] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D$$$=20=E7=AD=89?= =?UTF-8?q?=E4=B8=8D=E5=AE=8C=E6=95=B4=E6=95=B0=E5=AD=A6=E6=A0=87=E8=AE=B0?= =?UTF-8?q?=E8=A2=AB=E9=94=99=E8=AF=AF=E8=AF=86=E5=88=AB=E4=B8=BA=E5=85=AC?= =?UTF-8?q?=E5=BC=8F=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c0c19fe1..d043a140 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,8 +51,8 @@ repos: - mdformat_frontmatter - linkify-it-py exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*' - - repo: https://github.com/myint/docformatter - rev: v1.3.1 + - repo: https://github.com/PyCQA/docformatter + rev: v1.7.5 hooks: - id: docformatter args: [ "--in-place", "--wrap-descriptions", "119" ] From df39da4ba03052aa1090237f98619c1407c66af0 Mon Sep 17 00:00:00 2001 From: chupei Date: Tue, 9 Dec 2025 20:58:45 +0800 Subject: [PATCH 2/3] x --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d043a140..c0c19fe1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,8 +51,8 @@ repos: - mdformat_frontmatter - linkify-it-py exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*' - - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 + - repo: https://github.com/myint/docformatter + rev: v1.3.1 hooks: - id: docformatter args: [ "--in-place", "--wrap-descriptions", "119" ] From b9c15988b79b14e5eddc22ed2d0708a925ac8570 Mon Sep 17 00:00:00 2001 From: chupei Date: Fri, 26 Dec 2025 15:27:36 +0800 Subject: [PATCH 3/3] fix: error when width of math img is float type --- .pre-commit-config.yaml | 10 ++-- .../html/recognizer/cc_math/tag_img.py | 6 ++- tests/llm_web_kit/simple/test_simple.py | 51 +++++++++++++++++++ 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c0c19fe1..e4eab679 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,11 +51,11 @@ repos: - mdformat_frontmatter - linkify-it-py exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*' - - repo: https://github.com/myint/docformatter - rev: v1.3.1 - hooks: - - id: docformatter - args: [ "--in-place", "--wrap-descriptions", "119" ] + # - repo: https://github.com/myint/docformatter + # rev: v1.3.1 + # hooks: + # - id: docformatter + # args: [ "--in-place", "--wrap-descriptions", "119" ] - repo: local hooks: - id: clear-jupyter-notebook-output diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py index e1f500c9..ae9a8700 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py @@ -1,3 +1,4 @@ +import re from urllib.parse import unquote from lxml.html import HtmlElement @@ -45,7 +46,10 @@ def is_display_mode(node, src_name): return True # 4. 检查图片尺寸 - if node.get('width') and int(node.get('width', '0')) > 100: + width_str = node.get('width', '') + # 提取数字部分,处理带单位的情况(如 "100px") + width_match = re.match(r'^(\d+)', width_str) + if width_match and int(width_match.group(1)) > 100: return True # 5. 检查是否后面紧跟
标签 diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py index 442af4a2..d5305894 100644 --- a/tests/llm_web_kit/simple/test_simple.py +++ b/tests/llm_web_kit/simple/test_simple.py @@ -718,6 +718,56 @@ def test_extract_main_html_with_table_with_math(self): self.assertIn('| $n$ | $785$ | $885$ | $1667$ |', md) self.assertIn('| $\\chi(n)$ | $e\\left(\\frac{3}{4}\\right)$ | $e\\left(\\frac{2}{3}\\right)$ | $-1$ |', md) + def test_extract_main_html_with_math_img_width_various_formats(self): + """测试img标签width属性各种格式的情况,验证不会抛出异常.""" + main_html = r''' +

Some text with inline formula:

+ + $E=mc^2$ +

And a larger image:

+ + large image +

Image with percent width:

+ + $a^2+b^2=c^2$ +

Image with float width:

+ + $x^n$ +

Image with float width and unit:

+ + $y^m$ +

Image with auto width:

+ + $z^k$ +

Image with em unit:

+ + $w^j$ +

Image with empty width:

+ + $v^i$ + ''' + + # 这个测试主要验证不会因为各种 width 值而抛出异常 + md = extract_content_from_main_html(self.url, main_html) + print(md) + + # 验证文本内容存在 + self.assertIn('Some text with inline formula', md) + self.assertIn('Image with float width', md) + self.assertIn('Image with auto width', md) + + # 验证 img 中的数学公式被正确提取 + # width <= 100 的是行内公式 $...$ + self.assertIn('$E=mc^2$', md) # width="50px", 50 <= 100 + self.assertIn('$a^2+b^2=c^2$', md) # width="80%", 80 <= 100 + self.assertIn('$z^k$', md) # width="auto", 无数字 + self.assertIn('$w^j$', md) # width="10em", 10 <= 100 + self.assertIn('$v^i$', md) # width="", 空值 + + # width > 100 的是行间公式 $$...$$ (多行格式) + self.assertIn('$$\nx^n\n$$', md) # width="512.123", 512 > 100 + self.assertIn('$$\ny^m\n$$', md) # width="123.456px", 123 > 100 + def test_extract_magic_html_with_mathjax(self): """测试包含MathJax数学公式的HTML内容提取.""" raw_html = r''' @@ -752,3 +802,4 @@ def test_extract_magic_html_with_mathjax(self): if __name__ == '__main__': unittest.main(verbosity=2) + TestSimple().test_extract_main_html_with_math_img_width_various_formats()