swerik-project · BobBorges · Dec 19, 2025
diff --git a/src/cur-mot/check-nrs.py b/src/cur-mot/check-nrs.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""
+Check that the number (filename) is actually in the document.
+"""
+from common.args import (
+    alto_args,
+    list_years,
+    verify_alto_args,
+)
+from common.xml_utils import (
+    parse_xml,
+    write_xml,
+)
+from glob import glob
+from lxml import etree
+from tqdm import tqdm
+import argparse, os
+import pandas as pd
+import regex as re
+
+
+def _flatten_text(text):
+    """
+    Collapse *text* onto a single line.
+
+    Every line is stripped, empty lines are dropped and what remains is
+    joined with single spaces. ``None`` (an element without text content)
+    yields the empty string, so callers can match against the result
+    without checking for missing text first.
+    """
+    if text is None:
+        return ""
+    return " ".join([_.strip() for _ in text.splitlines() if _.strip() != ''])
+
+
+def main(args):
+    """
+    Report motions whose number (taken from the filename) is not found in the text.
+
+    For each year returned by ``list_years(args)`` and when ``args.list`` is
+    set, every ``*.xml`` file under ``{args.motionspath}/{year}/`` is parsed
+    and its ``<p>`` elements are checked in up to three passes:
+
+    1. ``pat``   -- "Nr <number from filename>" at the start of a paragraph,
+       with the fuzzy-matching limits given in the pattern.
+    2. ``pat_2`` -- "Nr <anything>" at the start of a paragraph, same fuzzy
+       limits. Hits are recorded but do not stop the third pass.
+    3. ``pat_3`` -- exact "Nr <anything>" anywhere in a paragraph.
+
+    Passes 2 and 3 only run when pass 1 found nothing. Recorded rows are
+    de-duplicated and written to ``{args.io_path}/_{year}-nomatch-nr.tsv``;
+    the counters are printed per year.
+
+    Raises ``ValueError`` if the last dash-separated part of a filename is
+    not an integer, and ``KeyError`` if a recorded paragraph has no xml id.
+    """
+    # One motion is traced verbosely; this is a leftover debugging aid that
+    # is kept so the script prints exactly what it printed before.
+    debug_motion = "riksdagen-motions/data/1962/mot-1962--ak--00262.xml"
+    cols = ["mot", "elem_id", "nr", "text"]
+    # These two patterns do not depend on the motion: compile them once.
+    pat_2 = re.compile(r'((N|n)r[\.\,]?\s\S+){i<=1,d<=1,s<=1,e<=1}')
+    pat_3 = re.compile(r'(N|n)r[\.\,]?\s\S+')
+
+    years = list_years(args)
+    for year in years:
+        print(year)
+        if args.list:
+            p_count = 0
+            matches = 0
+            mismatch = 0
+            mismatch_2 = 0
+            no_match = 0
+            motions = sorted(glob(f"{args.motionspath}/{year}/*.xml"))
+            rows = []
+
+            for mot in tqdm(motions):
+                debug = mot == debug_motion
+                root, ns = parse_xml(mot, get_ns=True)
+
+                # The motion number is the last dash-separated filename part.
+                nr = int(mot.split('-')[-1].replace(".xml", ""))
+                pat = re.compile(fr'((N|n)r[\.\,]?\s{nr}[\.\,]?){{i<=1,d<=1,s<=1,e<=1}}')
+                ps = root.findall(f".//{ns['tei_ns']}p")
+                # Flatten each paragraph once; all three passes reuse the result.
+                # Paragraphs without text become "" instead of raising.
+                texts = [_flatten_text(p.text) for p in ps]
+
+                # Pass 1: the expected number at the start of a paragraph.
+                M = False
+                for _text in texts:
+                    p_count += 1
+                    if debug:
+                        print("0", _text)
+                    if pat.match(_text) is not None:
+                        matches += 1
+                        M = True
+                    elif debug:
+                        print("1", _text)
+                if M:
+                    continue
+
+                # Pass 2: some other number at the start of a paragraph.
+                # Deliberately does not set M, so pass 3 always follows.
+                for p, _text in zip(ps, texts):
+                    m = pat_2.match(_text)
+                    if m is not None:
+                        mismatch += 1
+                        rows.append([mot, p.attrib[f"{ns['xml_ns']}id"], nr, m.group(0)])
+                    elif debug:
+                        print("2", _text)
+
+                # Pass 3: an exact "Nr ..." anywhere in a paragraph.
+                for p, _text in zip(ps, texts):
+                    m = pat_3.search(_text)
+                    if debug:
+                        print(m)
+                    if m is not None:
+                        mismatch_2 += 1
+                        M = True
+                        rows.append([mot, p.attrib[f"{ns['xml_ns']}id"], nr, _text])
+                    elif debug:
+                        print("3", _text)
+
+                if not M:
+                    # Nothing resembling a number was found in pass 1 or 3.
+                    no_match += 1
+                    rows.append([mot, None, nr, None])
+
+            df = pd.DataFrame(rows, columns=cols)
+            df.drop_duplicates(inplace=True)
+            df.to_csv(f"{args.io_path}/_{year}-nomatch-nr.tsv", sep="\t", index=False)
+
+            print("  p", p_count)
+            print("  ma", matches)
+            print("  mi", mismatch)
+            print("  mi2", mismatch_2)
+            print("  0", no_match)
+
+        if args.fix_listed:
+            # Not implemented yet.
+            pass
+
+
+
+
+if __name__ == '__main__':
+    # Start from the shared ALTO argument parser and add this script's options.
+    cli = alto_args(__file__)
+    cli.add_argument("-o", "--io-path", default="input/mot-unmatched-nr")
+    for switch in ("--list", "--fix-listed"):
+        cli.add_argument(switch, action='store_true')
+    main(verify_alto_args(cli.parse_args()))
diff --git a/src/cur-mot/committee-discrepancy.py b/src/cur-mot/committee-discrepancy.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+from pyriksdagen.args import (
+    fetch_parser,
+    impute_args
+)
+
+def abb(filename, list_):
+    """
+    Record the abbreviation found in *filename* in *list_*.
+
+    The abbreviation is the third dash-separated field of the basename
+    (the part after the last ``/``). It is appended to *list_* in place
+    unless it is empty or already present. The filename is printed on
+    every call.
+
+    Returns the same *list_* object that was passed in. Raises
+    ``IndexError`` when the basename has fewer than three dash-separated
+    fields.
+    """
+    print(filename)
+    basename = filename.split('/')[-1]
+    abbr = basename.split('-')[2]
+    if abbr == '' or abbr in list_:
+        return list_
+    list_.append(abbr)
+    return list_
+
+def main(args):
+    """
+    Print the abbreviations seen in two sets of motion filenames.
+
+    The first set is ``args.motions``; the second is read from the file at
+    ``args.tmp``, one path per line (blank lines are ignored). Each set's
+    distinct abbreviations, as extracted by ``abb``, are printed as a
+    sorted list.
+    """
+    from_args = []
+    from_listing = []
+    for motion in args.motions:
+        abb(motion, from_args)
+    with open(args.tmp, 'r') as listing:
+        paths = [line.strip() for line in listing.readlines()]
+        for path in paths:
+            if path == '':
+                continue
+            abb(path, from_listing)
+
+    print(sorted(from_args))
+    print(sorted(from_listing))
+
+
+
+if __name__ == '__main__':
+    # Shared "motions" parser plus the path of the file list to compare against.
+    cli = fetch_parser("motions")
+    cli.add_argument("--tmp", default="riksdagen-motions/_tmp.txt")
+    main(impute_args(cli.parse_args()))
diff --git a/src/cur-mot/denest-pages-pdf.py b/src/cur-mot/denest-pages-pdf.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""
+Denest pages/ directory in the pdf repository.
+"""
+import argparse
+import os
+import shutil
+from glob import glob
+
+def main(args):
+    """
+    Move the contents of each package's ``pages/`` directory up one level.
+
+    For every package directory under
+    ``riksdagen-motions-pdf/data/{args.parliament_year}/``:
+
+    - ``.pdf`` files in ``pages/`` are moved into the package directory;
+    - ``.png`` files are deleted when ``args.remove_png`` is true, otherwise
+      moved like the PDFs;
+    - every other file is deleted;
+    - the emptied ``pages/`` directory is removed.
+
+    Packages without a ``pages/`` directory are skipped, so the script can
+    be re-run after a partial or completed run. Each action is printed.
+    """
+    pdf_packages = sorted(glob(f"riksdagen-motions-pdf/data/{args.parliament_year}/*/"))
+    for p in pdf_packages:
+        pages_dir = f"{p}pages/"
+        if not os.path.isdir(pages_dir):
+            # Already denested (or never nested): nothing to do here.
+            continue
+        for file_ in glob(f"{p}pages/*"):
+            print(" --", file_)
+            is_pdf = file_.endswith(".pdf")
+            is_kept_png = file_.endswith(".png") and not args.remove_png
+            if is_pdf or is_kept_png:
+                shutil.move(file_, p)
+                print("mv:", file_, p)
+            else:
+                os.remove(file_)
+                print("rm:", file_)
+        # glob's "*" skips dotfiles, so rmdir raises OSError if any remain.
+        os.rmdir(pages_dir)
+
+
+if __name__ == '__main__':
+    def _str2bool(value):
+        """Convert a command-line word such as ``true`` or ``no`` to a bool."""
+        word = value.strip().lower()
+        if word in ("1", "true", "t", "yes", "y"):
+            return True
+        if word in ("", "0", "false", "f", "no", "n"):
+            return False
+        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("-y", "--parliament-year", type = str, required=True)
+    # type=bool calls bool() on the raw string, so "-P False" evaluated to
+    # True and PNG removal could never be turned off. Parse the word instead.
+    parser.add_argument("-P", "--remove-png", type=_str2bool, default=True)
+    args = parser.parse_args()
+    main(args)
diff --git a/src/cur-mot/detect-title.py b/src/cur-mot/detect-title.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""
+annotate titles
+"""
+from pyriksdagen.args import (
+    fetch_parser,
+    impute_args,
+)
+from pyriksdagen.io import (
+    parse_tei,
+    write_tei,
+)
+from tqdm import tqdm
+import regex as re
+
+
+#                                (?P<fullname>\b[A-ZÅÄÖ][a-zåäö]+(?:\s+[A-ZÅÄÖ][a-zåäö]+)+\b)
+
+def main(args):
+    """
+    Tag paragraphs that look like motion titles.
+
+    A single pattern is built from three pieces: an optional number prefix,
+    an optional "Av"/"Af", and then either a salutation or a name followed
+    by a subject keyword (see the named groups in the pattern).
+
+    With ``args.test`` set, the pattern and the match result for a list of
+    sample strings are printed and nothing else happens. Otherwise every
+    motion in ``args.motions`` whose third path component is not "fört" or
+    "reg" is parsed; each ``<p>`` whose flattened text matches gets
+    ``type="titleString"``, and the file is written back only when at
+    least one paragraph was tagged. ``args.motions`` is replaced by the
+    filtered list.
+    """
+    salutation = r"""
+                        \b(((g|G)re(f)?ve){e<=1}|
+                        (((F|f)ri)?(H|h)err(e|ar)?){e<=1}|
+                        (H|h)r|
+                        (?i:fru){e<=1}|
+                        (?i:fröken){e<=1})\b
+                """
+    subjkw = r"""
+                        \b((O|o)m|
+                        (A|a)ngående|
+                        i(:)?\sanledning)\b
+            """
+
+    pat = re.compile(rf"""
+                    ^(\S\s)?
+                    (?P<number>
+                        (
+                            (N(:)?(r|o)(\.)?){{e<=1}}\s
+                            .{{1,3}}(\.)?\s
+                        )?
+                    )
+                    (?:(?P<av>A(v|f)){{e<=1}}\s)?
+                    (?(av)
+                        (?:
+                            (?P<salutation>{salutation}).*
+                            |
+                            (
+                                (?P<fullname>
+                                    \b(?:
+                                           [A-ZÅÄÖ]{{1,2}}\.?|
+                                           [A-ZÅÄÖ][a-zåäö]+
+                                    )
+                                    (?:\s+
+                                        (?:
+                                             [A-ZÅÄÖ]{{1,2}}\.?|
+                                             [A-ZÅÄÖ][a-zåäö]+
+                                        )
+                                    )+\b
+                                )
+                                .*
+                                (?P<subjkw>{subjkw})
+                                .*
+                            )
+                        )
+                        |
+                        (?P<salutation>{salutation}).*(?P<subjkw>{subjkw}).*
+                    )
+                    """, re.VERBOSE)
+
+    if args.test:
+        samples = [
+                # Should match
+                "4f Ola Lasson: Om fattigunderstöd af allmänna medel för den som,",
+                "Af RK. P. Arnoldson: Om ändring af 28 $ 1 mom. i Regerings- formen.",
+                "N:o 27. Af herr J. Anderson i Tenhult, om höjd tull å sulläder m. m.",
+                "Av Fru Mróz Om saken",
+                "Af Fru Markowska Angående saken",
+                "Av Hr Kozławski komer en bóbr, krwa.",
+                "Fru Om saken",
+                "Af Margaret Thatcher om",
+                # Shouldn't match
+                "-----------------",
+                "Om saken direkt",
+                "av Idi Amin",
+                "af de tafel valde het boek om",
+                "herr Zdzisław Kosłąwski såg en bóbr",
+            ]
+        print(pat.pattern)
+        for sample in samples:
+            if pat.match(sample):
+                print("matched: ", sample)
+            else:
+                print("didn't match:", sample)
+        return
+
+    # Leave out motions whose third path component is "fört" or "reg".
+    args.motions = [path for path in args.motions if path.split('/')[2] not in ("fört", "reg")]
+    for motion in tqdm(args.motions):
+        root, ns = parse_tei(motion)
+        tagged = 0
+        for par in root.findall(f".//{ns['tei_ns']}p"):
+            if par.text is None:
+                continue
+            flat = ' '.join([line.strip() for line in par.text.splitlines() if line.strip() != ''])
+            if pat.search(flat):
+                par.attrib["type"] = "titleString"
+                tagged += 1
+
+        # Only rewrite files that actually changed.
+        if tagged:
+            write_tei(root, motion)
+
+
+
+
+if __name__ == '__main__':
+    # Shared "motions" parser; --test runs the built-in pattern samples instead.
+    cli = fetch_parser("motions", docstring=__doc__)
+    cli.add_argument("--test", action='store_true')
+    main(impute_args(cli.parse_args()))
diff --git a/src/cur-mot/doc-formatting-ck.py b/src/cur-mot/doc-formatting-ck.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+from glob import glob
+from tqdm import tqdm
+import argparse
+import pandas as pd
+
+
+def main(args):
+    """
+    Tally the ``height`` values of rows with ``conf == 100``.
+
+    Reads every ``*.tsv`` under
+    ``riksdagen-motions-pdf/data/{args.parliament_year}/*/``, prints the
+    distinct ``conf`` values of each file, and finally prints one
+    ``height : count`` line per distinct height, ordered by count and then
+    height, both descending.
+    """
+    D = {}
+    tsvs = sorted(glob(f"riksdagen-motions-pdf/data/{args.parliament_year}/*/*.tsv"))
+    for tsv in tqdm(tsvs):
+        df = pd.read_csv(tsv, sep='\t')
+        print(df["conf"].unique())
+
+        for _, row in df.iterrows():
+            if row["conf"] != 100:
+                continue
+            # Look the height up once and count it.
+            height = row["height"]
+            D[height] = D.get(height, 0) + 1
+
+    # A plain loop: the former set comprehension was built only to call print.
+    ranked = sorted(D.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
+    for height, count in ranked:
+        print(height, ":", count)
+
+
+if __name__ == '__main__':
+    # The only option is the parliament year whose TSV files are inspected.
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("-y", "--parliament-year", type = str, required=True)
+    main(cli.parse_args())