From ecb885edeccfb0290f10e19aeecdf0163b9cbe3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Thu, 5 Mar 2026 08:00:11 +0200 Subject: [PATCH 1/3] Do not linktrail if following text is not [a-z]? See wiktextract issue #1604 https://github.com/tatuylonen/wiktextract/pull/1604 https://en.wikipedia.org/wiki/Help:Wikitext#Blend_link This should not be merged as is, because it will create problems in other extractors that might rely on different behavior. In the best-case scenario, there might be two different camps: 1) Languages that use spaces that want to do linktrailing 2) Languages without spaces that can't do linktrailing If this is the case, we might be able to get away with a kludge that checks whether the script of the last character in the link matches the script of the first character after the link. --- src/wikitextprocessor/parser.py | 2 +- src/wikitextprocessor/parserfns.py | 12 ++++++------ tests/test_parser.py | 11 ++++++++++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py index 69361533..217557b4 100644 --- a/src/wikitextprocessor/parser.py +++ b/src/wikitextprocessor/parser.py @@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None: and not node.children[-1].children and not ctx.suppress_special ): - m = re.match(r"(?s)(\w+)(.*)", token) + m = re.match(r"(?s)([a-z]+)(.*)", token) if m: node.children[-1].children.append(m.group(1)) token = m.group(2) diff --git a/src/wikitextprocessor/parserfns.py b/src/wikitextprocessor/parserfns.py index eb759c38..114f1b2a 100644 --- a/src/wikitextprocessor/parserfns.py +++ b/src/wikitextprocessor/parserfns.py @@ -9,7 +9,7 @@ from collections.abc import Callable, Sequence from datetime import datetime, timezone from pathlib import Path -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import dateparser @@ -1185,7 +1185,7 @@ def parse_timestamp( 
if not dt: dt = "now" - settings: dateparser._Settings = {"RETURN_AS_TIMEZONE_AWARE": True} + settings: dict[str, Any] = {"RETURN_AS_TIMEZONE_AWARE": True} if loc in ("", "0"): dt += " UTC" @@ -1206,15 +1206,15 @@ def parse_timestamp( # php's strtotime() (which is the original function used) # but we can handle special cases here and hope # people on wiktionary don't go crazy with weird formatting - t = dateparser.parse(dt, settings=settings) + t = dateparser.parse(dt, settings=settings) # type: ignore if t is None: m = re.match( r"([^+]*)\s*(\+\s*\d+\s*(day|year|month)s?)\s*$", orig_dt ) if m: - main_date = dateparser.parse(m.group(1), settings=settings) - add_time = dateparser.parse(m.group(2), settings=settings) - now = dateparser.parse("now", settings=settings) + main_date = dateparser.parse(m.group(1), settings=settings) # type: ignore + add_time = dateparser.parse(m.group(2), settings=settings) # type: ignore + now = dateparser.parse("now", settings=settings) # type: ignore if main_date and add_time is not None and now is not None: # this is just a kludge: dateparser parses "+2 days" as # "2 days AGO". 
The now-datetime object is used to check diff --git a/tests/test_parser.py b/tests/test_parser.py index 49489953..59d3b730 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1101,7 +1101,7 @@ def test_link12(self): self.assertEqual(link.kind, NodeKind.LINK) self.assertEqual(link.largs, [["foo"], ["\n[bar"]]) - def test_link_trailing(self): + def test_link_trailing_1(self): tree = self.parse("test", "[[Help]]ing heal") self.assertEqual(len(tree.children), 2) a, b = tree.children @@ -1110,6 +1110,15 @@ def test_link_trailing(self): self.assertEqual(a.children, ["ing"]) self.assertEqual(b, " heal") + def test_link_trailing_not_latin(self): + tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo") + self.assertEqual(len(tree.children), 2) + a, b = tree.children + self.assertEqual(a.kind, NodeKind.LINK) + self.assertEqual(a.largs, [["appellāre"]]) + self.assertEqual(a.children, []) + self.assertEqual(b, "の直説法所相現在第 foo") + def test_url1(self): tree = self.parse("test", "this https://wikipedia.com link") self.assertEqual(len(tree.children), 3) From db2859ecdca5d334c3d366d754e0133fba75c351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Mon, 9 Mar 2026 11:40:02 +0200 Subject: [PATCH 2/3] Give Wtp `linktrailing_re` attribute See wiktextract issue #1604 tatuylonen/wiktextract#1604 https://en.wikipedia.org/wiki/Help:Wikitext#Blend_link This adds a new attribute to Wtp that contains a `re.Pattern` object used for pattern-matching these kinds of suffixed links. Modify `Wtp.linktrailing_re` to change the behavior based on how the parsed Wikimedia project handles linktrailing. English uses `[a-z]+`. Our default implementation uses `\w+`, which should be fine most of the time. Languages without spaces seem to use the English `[a-z]+`, which seems to make sense. `[[englishword]]KANJI` wouldn't have the kanji characters be consumed, but `\w+` breaks this. 
--- src/wikitextprocessor/core.py | 9 +++++++++ src/wikitextprocessor/parser.py | 2 +- tests/test_parser.py | 6 ++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/wikitextprocessor/core.py b/src/wikitextprocessor/core.py index 9a31ce10..413dcb36 100644 --- a/src/wikitextprocessor/core.py +++ b/src/wikitextprocessor/core.py @@ -282,6 +282,7 @@ class Wtp: "notes", # NOTE error messages "wiki_notices", # WIKI error messages "wikidata_session", + "linktrailing_re", ) def __init__( @@ -355,6 +356,14 @@ def __init__( if not quiet: logger.setLevel(logging.DEBUG) self.wikidata_session: Session | None = None + # Default regex pattern, will sometimes cause trouble. + # Linktrailing is when you have [[a li]]nk that consumes the + # trailing suffix so that the whole word is blue. Languages + # without spaces, like Japanese, should use the English + # [a-z] pattern, other languages their own if `\w+` actually + # causes problems in them. + # Will be modified later in wiktextract wxr through WiktionaryConfig. + self.linktrailing_re = re.compile(r"(?s)(\w+)(.*)") def create_db(self) -> None: from .wikidata import init_wikidata_cache diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py index 217557b4..59b84dfd 100644 --- a/src/wikitextprocessor/parser.py +++ b/src/wikitextprocessor/parser.py @@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None: and not node.children[-1].children and not ctx.suppress_special ): - m = re.match(r"(?s)([a-z]+)(.*)", token) + m = ctx.linktrailing_re.match(token) if m: node.children[-1].children.append(m.group(1)) token = m.group(2) diff --git a/tests/test_parser.py b/tests/test_parser.py index 59d3b730..bf9d71dd 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2,6 +2,7 @@ # # Copyright (c) 2020-2022 Tatu Ylonen. 
See file LICENSE and https://ylonen.org +import re import unittest from wikitextprocessor import Wtp @@ -1111,7 +1112,12 @@ def test_link_trailing_1(self): self.assertEqual(b, " heal") def test_link_trailing_not_latin(self): + _linktrailing_re = self.ctx.linktrailing_re + # Normally this alternative pattern would be provided by Wiktextract's + # WiktionaryConfig or something similar. + self.ctx.linktrailing_re = re.compile(r"(?s)([a-z]+)(.*)") tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo") + self.ctx.linktrailing_re = _linktrailing_re self.assertEqual(len(tree.children), 2) a, b = tree.children self.assertEqual(a.kind, NodeKind.LINK) From 980bb47499d1df6c3c3726e4f2a40a0aaca1b05d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Mon, 9 Mar 2026 11:54:40 +0200 Subject: [PATCH 3/3] Remove `- uses: crate-ci/typos@v1` because of false positives We have a `NAMESPACEE` field in `parserfns` (`{{{NAMESPACEE}}}`, it's unimplemented) which pisses off the linter for some reason. --- .github/workflows/lint.yml | 1 - src/wikitextprocessor/parserfns.py | 12 +++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 00eef278..40542709 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -24,4 +24,3 @@ jobs: - run: python -m mypy -p wikitextprocessor - run: python -m ruff check . - run: python -m ruff format --diff . 
- - uses: crate-ci/typos@v1 diff --git a/src/wikitextprocessor/parserfns.py b/src/wikitextprocessor/parserfns.py index 114f1b2a..8fadf541 100644 --- a/src/wikitextprocessor/parserfns.py +++ b/src/wikitextprocessor/parserfns.py @@ -1108,9 +1108,11 @@ def month_num_days(ctx: "Wtp", t: datetime) -> int: ] = { "Y": "%Y", "y": "%y", - "L": lambda ctx, t: 1 - if (t.year % 4 == 0 and (t.year % 100 != 0 or t.year % 400 == 0)) - else 0, + "L": lambda ctx, t: ( + 1 + if (t.year % 4 == 0 and (t.year % 100 != 0 or t.year % 400 == 0)) + else 0 + ), "o": "%G", "n": lambda ctx, t: t.month, "m": "%m", @@ -1120,8 +1122,8 @@ def month_num_days(ctx: "Wtp", t: datetime) -> int: "j": lambda ctx, t: t.day, "d": "%d", "z": lambda ctx, t: ( - t - datetime(year=t.year, month=1, day=1, tzinfo=t.tzinfo) - ).days, + (t - datetime(year=t.year, month=1, day=1, tzinfo=t.tzinfo)).days + ), "W": "%V", "N": "%u", "w": "%w",