Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,3 @@ jobs:
- run: python -m mypy -p wikitextprocessor
- run: python -m ruff check .
- run: python -m ruff format --diff .
- uses: crate-ci/typos@v1
9 changes: 9 additions & 0 deletions src/wikitextprocessor/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ class Wtp:
"notes", # NOTE error messages
"wiki_notices", # WIKI error messages
"wikidata_session",
"linktrailing_re",
)

def __init__(
Expand Down Expand Up @@ -355,6 +356,14 @@ def __init__(
if not quiet:
logger.setLevel(logging.DEBUG)
self.wikidata_session: Session | None = None
# Default link-trail regex pattern; `\w+` will sometimes cause trouble.
# Link trailing is when you have [[a li]]nk whose trailing suffix is
# absorbed into the link so that the whole word is blue. Languages
# written without spaces, like Japanese, should use the English
# `[a-z]+` pattern; other languages should supply their own pattern
# if `\w+` actually causes problems in them.
# Will be modified later in wiktextract (wxr) through WiktionaryConfig.
self.linktrailing_re = re.compile(r"(?s)(\w+)(.*)")

def create_db(self) -> None:
from .wikidata import init_wikidata_cache
Expand Down
2 changes: 1 addition & 1 deletion src/wikitextprocessor/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
and not node.children[-1].children
and not ctx.suppress_special
):
m = re.match(r"(?s)(\w+)(.*)", token)
m = ctx.linktrailing_re.match(token)
if m:
node.children[-1].children.append(m.group(1))
token = m.group(2)
Expand Down
24 changes: 13 additions & 11 deletions src/wikitextprocessor/parserfns.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from collections.abc import Callable, Sequence
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING, Any, Optional, Union

import dateparser

Expand Down Expand Up @@ -1108,9 +1108,11 @@ def month_num_days(ctx: "Wtp", t: datetime) -> int:
] = {
"Y": "%Y",
"y": "%y",
"L": lambda ctx, t: 1
if (t.year % 4 == 0 and (t.year % 100 != 0 or t.year % 400 == 0))
else 0,
"L": lambda ctx, t: (
1
if (t.year % 4 == 0 and (t.year % 100 != 0 or t.year % 400 == 0))
else 0
),
"o": "%G",
"n": lambda ctx, t: t.month,
"m": "%m",
Expand All @@ -1120,8 +1122,8 @@ def month_num_days(ctx: "Wtp", t: datetime) -> int:
"j": lambda ctx, t: t.day,
"d": "%d",
"z": lambda ctx, t: (
t - datetime(year=t.year, month=1, day=1, tzinfo=t.tzinfo)
).days,
(t - datetime(year=t.year, month=1, day=1, tzinfo=t.tzinfo)).days
),
"W": "%V",
"N": "%u",
"w": "%w",
Expand Down Expand Up @@ -1185,7 +1187,7 @@ def parse_timestamp(
if not dt:
dt = "now"

settings: dateparser._Settings = {"RETURN_AS_TIMEZONE_AWARE": True}
settings: dict[str, Any] = {"RETURN_AS_TIMEZONE_AWARE": True}
if loc in ("", "0"):
dt += " UTC"

Expand All @@ -1206,15 +1208,15 @@ def parse_timestamp(
# php's strtotime() (which is the original function used)
# but we can handle special cases here and hope
# people on wiktionary don't go crazy with weird formatting
t = dateparser.parse(dt, settings=settings)
t = dateparser.parse(dt, settings=settings) # type: ignore
if t is None:
m = re.match(
r"([^+]*)\s*(\+\s*\d+\s*(day|year|month)s?)\s*$", orig_dt
)
if m:
main_date = dateparser.parse(m.group(1), settings=settings)
add_time = dateparser.parse(m.group(2), settings=settings)
now = dateparser.parse("now", settings=settings)
main_date = dateparser.parse(m.group(1), settings=settings) # type: ignore
add_time = dateparser.parse(m.group(2), settings=settings) # type: ignore
now = dateparser.parse("now", settings=settings) # type: ignore
if main_date and add_time is not None and now is not None:
# this is just a kludge: dateparser parses "+2 days" as
# "2 days AGO". The now-datetime object is used to check
Expand Down
17 changes: 16 additions & 1 deletion tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org

import re
import unittest

from wikitextprocessor import Wtp
Expand Down Expand Up @@ -1101,7 +1102,7 @@ def test_link12(self):
self.assertEqual(link.kind, NodeKind.LINK)
self.assertEqual(link.largs, [["foo"], ["\n[bar"]])

def test_link_trailing(self):
def test_link_trailing_1(self):
tree = self.parse("test", "[[Help]]ing heal")
self.assertEqual(len(tree.children), 2)
a, b = tree.children
Expand All @@ -1110,6 +1111,20 @@ def test_link_trailing(self):
self.assertEqual(a.children, ["ing"])
self.assertEqual(b, " heal")

def test_link_trailing_not_latin(self):
    """Check that an ASCII-only link-trail pattern leaves other text alone.

    With ``linktrailing_re`` restricted to ``[a-z]+``, the Japanese text
    that directly follows ``[[appellāre]]`` must stay outside the link
    node instead of being absorbed as a link trail.
    """
    saved_re = self.ctx.linktrailing_re
    # Normally this alternative pattern would be provided by Wiktextract's
    # configuration rather than being assigned by hand.
    self.ctx.linktrailing_re = re.compile(r"(?s)([a-z]+)(.*)")
    try:
        tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo")
    finally:
        # Restore the previous pattern even if parsing raises, so the
        # override never outlives this parse call.
        self.ctx.linktrailing_re = saved_re
    self.assertEqual(len(tree.children), 2)
    a, b = tree.children
    self.assertEqual(a.kind, NodeKind.LINK)
    self.assertEqual(a.largs, [["appellāre"]])
    # No trailing text was pulled into the link.
    self.assertEqual(a.children, [])
    self.assertEqual(b, "の直説法所相現在第 foo")

def test_url1(self):
tree = self.parse("test", "this https://wikipedia.com link")
self.assertEqual(len(tree.children), 3)
Expand Down