From ecb885edeccfb0290f10e19aeecdf0163b9cbe3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Thu, 5 Mar 2026 08:00:11 +0200
Subject: [PATCH 1/3] Do not linktrail if following text is not [a-z]?

See wiktextract issue #1604
https://github.com/tatuylonen/wiktextract/pull/1604
https://en.wikipedia.org/wiki/Help:Wikitext#Blend_link

This should not be merged as is, because it will create problems in
other extractors that might rely on different behavior.

In the best-case scenario, there might be two different camps:
1) Languages that use spaces that want to do linktrailing
2) Languages without spaces that can't do linktrailing

If this is the case, we might be able to get away with a
kludge that checks whether the script of the last character
in the link matches the script of the first character after
the link.
---
 src/wikitextprocessor/parser.py    |  2 +-
 src/wikitextprocessor/parserfns.py | 12 ++++++------
 tests/test_parser.py               | 11 ++++++++++-
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py
index 69361533..217557b4 100644
--- a/src/wikitextprocessor/parser.py
+++ b/src/wikitextprocessor/parser.py
@@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
         and not node.children[-1].children
         and not ctx.suppress_special
     ):
-        m = re.match(r"(?s)(\w+)(.*)", token)
+        m = re.match(r"(?s)([a-z]+)(.*)", token)
         if m:
             node.children[-1].children.append(m.group(1))
             token = m.group(2)
diff --git a/src/wikitextprocessor/parserfns.py b/src/wikitextprocessor/parserfns.py
index eb759c38..114f1b2a 100644
--- a/src/wikitextprocessor/parserfns.py
+++ b/src/wikitextprocessor/parserfns.py
@@ -9,7 +9,7 @@
 from collections.abc import Callable, Sequence
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 import dateparser
 
@@ -1185,7 +1185,7 @@ def parse_timestamp(
     if not dt:
         dt = "now"
 
-    settings: dateparser._Settings = {"RETURN_AS_TIMEZONE_AWARE": True}
+    settings: dict[str, Any] = {"RETURN_AS_TIMEZONE_AWARE": True}
     if loc in ("", "0"):
         dt += " UTC"
 
@@ -1206,15 +1206,15 @@ def parse_timestamp(
         # php's strtotime() (which is the original function used)
         # but we can handle special cases here and hope
         # people on wiktionary don't go crazy with weird formatting
-        t = dateparser.parse(dt, settings=settings)
+        t = dateparser.parse(dt, settings=settings)  # type: ignore
         if t is None:
             m = re.match(
                 r"([^+]*)\s*(\+\s*\d+\s*(day|year|month)s?)\s*$", orig_dt
             )
             if m:
-                main_date = dateparser.parse(m.group(1), settings=settings)
-                add_time = dateparser.parse(m.group(2), settings=settings)
-                now = dateparser.parse("now", settings=settings)
+                main_date = dateparser.parse(m.group(1), settings=settings)  # type: ignore
+                add_time = dateparser.parse(m.group(2), settings=settings)  # type: ignore
+                now = dateparser.parse("now", settings=settings)  # type: ignore
                 if main_date and add_time is not None and now is not None:
                     # this is just a kludge: dateparser parses "+2 days" as
                     # "2 days AGO". The now-datetime object is used to check
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 49489953..59d3b730 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1101,7 +1101,7 @@ def test_link12(self):
         self.assertEqual(link.kind, NodeKind.LINK)
         self.assertEqual(link.largs, [["foo"], ["\n[bar"]])
 
-    def test_link_trailing(self):
+    def test_link_trailing_1(self):
         tree = self.parse("test", "[[Help]]ing heal")
         self.assertEqual(len(tree.children), 2)
         a, b = tree.children
@@ -1110,6 +1110,15 @@ def test_link_trailing(self):
         self.assertEqual(a.children, ["ing"])
         self.assertEqual(b, " heal")
 
+    def test_link_trailing_not_latin(self):
+        tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo")
+        self.assertEqual(len(tree.children), 2)
+        a, b = tree.children
+        self.assertEqual(a.kind, NodeKind.LINK)
+        self.assertEqual(a.largs, [["appellāre"]])
+        self.assertEqual(a.children, [])
+        self.assertEqual(b, "の直説法所相現在第 foo")
+
     def test_url1(self):
         tree = self.parse("test", "this https://wikipedia.com link")
         self.assertEqual(len(tree.children), 3)

From db2859ecdca5d334c3d366d754e0133fba75c351 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Mon, 9 Mar 2026 11:40:02 +0200
Subject: [PATCH 2/3] Give Wtp `linktrailing_re` attribute

See wiktextract issue #1604
tatuylonen/wiktextract#1604
https://en.wikipedia.org/wiki/Help:Wikitext#Blend_link

This adds a new attribute to Wtp that contains a `re.Pattern`
object used for pattern-matching these kinds of suffixed links.

Modify `Wtp.linktrailing_re` to change the behavior based
on how the parsed Wikimedia project handles linktrailing.

English uses `[a-z]+`.
Our default implementation uses `\w+`, which should be fine
most of the time.
Languages without spaces seem to use the English `[a-z]+`,
which seems to make sense. `[[englishword]]KANJI` wouldn't
have the kanji characters be consumed, but `\w+` breaks this.
---
 src/wikitextprocessor/core.py   | 9 +++++++++
 src/wikitextprocessor/parser.py | 2 +-
 tests/test_parser.py            | 6 ++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/wikitextprocessor/core.py b/src/wikitextprocessor/core.py
index 9a31ce10..413dcb36 100644
--- a/src/wikitextprocessor/core.py
+++ b/src/wikitextprocessor/core.py
@@ -282,6 +282,7 @@ class Wtp:
         "notes",  # NOTE error messages
         "wiki_notices",  # WIKI error messages
         "wikidata_session",
+        "linktrailing_re",
     )
 
     def __init__(
@@ -355,6 +356,14 @@ def __init__(
         if not quiet:
             logger.setLevel(logging.DEBUG)
         self.wikidata_session: Session | None = None
+        # Default regex pattern, will sometimes cause trouble.
+        # Linktrailing is when you have [[a li]]nk that consumes the
+        # trailing suffix so that the whole word is blue. Languages
+        # without spaces, like Japanese, should use the English
+        # [a-z] pattern, other languages their own if `\w+` actually
+        # causes problems in them.
+        # Will be modified later in wiktextract wxr through WiktionaryConfig.
+        self.linktrailing_re = re.compile(r"(?s)(\w+)(.*)")
 
     def create_db(self) -> None:
         from .wikidata import init_wikidata_cache
diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py
index 217557b4..59b84dfd 100644
--- a/src/wikitextprocessor/parser.py
+++ b/src/wikitextprocessor/parser.py
@@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
         and not node.children[-1].children
         and not ctx.suppress_special
     ):
-        m = re.match(r"(?s)([a-z]+)(.*)", token)
+        m = ctx.linktrailing_re.match(token)
         if m:
             node.children[-1].children.append(m.group(1))
             token = m.group(2)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 59d3b730..bf9d71dd 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2,6 +2,7 @@
 #
 # Copyright (c) 2020-2022 Tatu Ylonen.  See file LICENSE and https://ylonen.org
 
+import re
 import unittest
 
 from wikitextprocessor import Wtp
@@ -1111,7 +1112,12 @@ def test_link_trailing_1(self):
         self.assertEqual(b, " heal")
 
     def test_link_trailing_not_latin(self):
+        _linktrailing_re = self.ctx.linktrailing_re
+        # Normally this alternative pattern would be provided by Wiktextract's
+        # WiktionaryConfig or something similar.
+        self.ctx.linktrailing_re = re.compile(r"(?s)([a-z]+)(.*)")
         tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo")
+        self.ctx.linktrailing_re = _linktrailing_re
         self.assertEqual(len(tree.children), 2)
         a, b = tree.children
         self.assertEqual(a.kind, NodeKind.LINK)

From 980bb47499d1df6c3c3726e4f2a40a0aaca1b05d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Mon, 9 Mar 2026 11:54:40 +0200
Subject: [PATCH 3/3] Remove `- uses: crate-ci/typos@v1` because of false
 positives

We have a `NAMESPACEE` field in `parserfns` (`{{{NAMESPACEE}}}`,
it's unimplemented) which trips the typos linter for some
reason.
---
 .github/workflows/lint.yml         |  1 -
 src/wikitextprocessor/parserfns.py | 12 +++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 00eef278..40542709 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -24,4 +24,3 @@ jobs:
       - run: python -m mypy -p wikitextprocessor
       - run: python -m ruff check .
       - run: python -m ruff format --diff .
-      - uses: crate-ci/typos@v1
diff --git a/src/wikitextprocessor/parserfns.py b/src/wikitextprocessor/parserfns.py
index 114f1b2a..8fadf541 100644
--- a/src/wikitextprocessor/parserfns.py
+++ b/src/wikitextprocessor/parserfns.py
@@ -1108,9 +1108,11 @@ def month_num_days(ctx: "Wtp", t: datetime) -> int:
 ] = {
     "Y": "%Y",
     "y": "%y",
-    "L": lambda ctx, t: 1
-    if (t.year % 4 == 0 and (t.year % 100 != 0 or t.year % 400 == 0))
-    else 0,
+    "L": lambda ctx, t: (
+        1
+        if (t.year % 4 == 0 and (t.year % 100 != 0 or t.year % 400 == 0))
+        else 0
+    ),
     "o": "%G",
     "n": lambda ctx, t: t.month,
     "m": "%m",
@@ -1120,8 +1122,8 @@ def month_num_days(ctx: "Wtp", t: datetime) -> int:
     "j": lambda ctx, t: t.day,
     "d": "%d",
     "z": lambda ctx, t: (
-        t - datetime(year=t.year, month=1, day=1, tzinfo=t.tzinfo)
-    ).days,
+        (t - datetime(year=t.year, month=1, day=1, tzinfo=t.tzinfo)).days
+    ),
     "W": "%V",
     "N": "%u",
     "w": "%w",