diff --git a/src/cur-mot/check-nrs.py b/src/cur-mot/check-nrs.py new file mode 100644 index 0000000..4b66086 --- /dev/null +++ b/src/cur-mot/check-nrs.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +""" +Check that the number (filename) is actually in the document. +""" +from common.args import ( + alto_args, + list_years, + verify_alto_args, +) +from common.xml_utils import ( + parse_xml, + write_xml, +) +from glob import glob +from lxml import etree +from tqdm import tqdm +import argparse, os +import pandas as pd +import regex as re + + +def main(args): + years = list_years(args) + for year in years: + print(year) + if args.list: + debug = False + p_count = 0 + matches = 0 + mismatch = 0 + mismatch_2 = 0 + no_match = 0 + motions = sorted(glob(f"{args.motionspath}/{year}/*.xml")) + rows = [] + cols = ["mot", "elem_id", "nr", "text"] + + for mot in tqdm(motions): + if mot == "riksdagen-motions/data/1962/mot-1962--ak--00262.xml": + debug = True + else: + debug = False + root, ns = parse_xml(mot, get_ns=True) + body = root.find(f".//{ns['tei_ns']}body") + + nr = int(mot.split('-')[-1].replace(".xml", "")) + pat = re.compile(fr'((N|n)r[\.\,]?\s{nr}[\.\,]?){{i<=1,d<=1,s<=1,e<=1}}') + pat_2 = re.compile(r'((N|n)r[\.\,]?\s\S+){i<=1,d<=1,s<=1,e<=1}') + pat_3 = re.compile(r'(N|n)r[\.\,]?\s\S+') + ps = root.findall(f".//{ns['tei_ns']}p") + M = False + for p in ps: + p_count += 1 + _text = " ".join([_.strip() for _ in p.text.splitlines() if _.strip() != '']) + if debug: print("0", _text) + m = None + m = pat.match(_text) + if m is not None: + matches += 1 + M = True + else: + if debug: print("1", _text) + if not M: + for p in ps: + _text = " ".join([_.strip() for _ in p.text.splitlines() if _.strip() != '']) + m = None + m = pat_2.match(_text) + if m is not None: + mismatch += 1 + #M = True + rows.append([mot, p.attrib[f"{ns['xml_ns']}id"], nr, m.group(0)]) + else: + if debug: print("2", _text) + if not M: + for p in ps: + _text = " ".join([_.strip() for _ in 
p.text.splitlines() if _.strip() != '']) + m = None + m = pat_3.search(_text) + if mot == "riksdagen-motions/data/1962/mot-1962--ak--00262.xml": + print(m) + if m is not None: + mismatch_2 += 1 + M = True + rows.append([mot, p.attrib[f"{ns['xml_ns']}id"], nr, _text]) + else: + if debug: print("3", _text) + if not M: + no_match += 1 + rows.append([mot, None, nr, None]) + df = pd.DataFrame(rows, columns=cols) + df.drop_duplicates(inplace=True) + df.to_csv(f"{args.io_path}/_{year}-nomatch-nr.tsv", sep="\t", index=False) + + + print(" p", p_count) + print(" ma", matches) + print(" mi", mismatch) + print(" mi2", mismatch_2) + print(" 0", no_match) + + if args.fix_listed: + pass + + + + +if __name__ == '__main__': + parser = alto_args(__file__) + parser.add_argument("-o", "--io-path", default="input/mot-unmatched-nr") + parser.add_argument("--list", action='store_true') + parser.add_argument("--fix-listed", action='store_true') + args = parser.parse_args() + main(verify_alto_args(args)) diff --git a/src/cur-mot/committee-discrepancy.py b/src/cur-mot/committee-discrepancy.py new file mode 100644 index 0000000..417f2c7 --- /dev/null +++ b/src/cur-mot/committee-discrepancy.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +from pyriksdagen.args import ( + fetch_parser, + impute_args +) + +def abb(filename, list_): + print(filename) + abbr = filename.split('/')[-1].split('-')[2] + if abbr is not None and abbr != '': + if abbr not in list_: + list_.append(abbr) + return list_ + +def main(args): + abb_a = [] + abb_b = [] + for m in args.motions: + abbr_a = abb(m, abb_a) + with open(args.tmp, 'r') as inf: + files = inf.readlines() + files = [_.strip() for _ in files if _.strip() != ''] + for f in files: + abb_b = abb(f, abb_b) + + print(sorted(abb_a)) + print(sorted(abb_b)) + + + +if __name__ == '__main__': + parser = fetch_parser("motions") + parser.add_argument("--tmp", default="riksdagen-motions/_tmp.txt") + args = parser.parse_args() + main(impute_args(args)) diff --git 
a/src/cur-mot/denest-pages-pdf.py b/src/cur-mot/denest-pages-pdf.py new file mode 100644 index 0000000..8092b02 --- /dev/null +++ b/src/cur-mot/denest-pages-pdf.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +""" +Denest pages/ directory in the pdf repository. +""" +import argparse +import os +import shutil +from glob import glob + +def main(args): + pdf_packages = sorted(glob(f"riksdagen-motions-pdf/data/{args.parliament_year}/*/")) + for p in pdf_packages: + p_content = glob(f"{p}pages/*") + for file_ in p_content: + print(" --", file_) + if file_.endswith(".png"): + if args.remove_png == True: + os.remove(file_) + print("rm:", file_) + else: + shutil.move(file_, p) + print("mv:", file_, p) + elif file_.endswith(".pdf"): + shutil.move(file_, p) + print("mv:", file_, p) + else: + os.remove(file_) + print("rm:", file_) + os.rmdir(f"{p}pages/") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-y", "--parliament-year", type = str, required=True) + parser.add_argument("-P", "--remove-png", type=bool, default=True) + args = parser.parse_args() + main(args) diff --git a/src/cur-mot/detect-title.py b/src/cur-mot/detect-title.py new file mode 100644 index 0000000..d2e451c --- /dev/null +++ b/src/cur-mot/detect-title.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +""" +annotate titles +""" +from pyriksdagen.args import ( + fetch_parser, + impute_args, +) +from pyriksdagen.io import ( + parse_tei, + write_tei, +) +from tqdm import tqdm +import regex as re + + +# (?P\b[A-ZÅÄÖ][a-zåäö]+(?:\s+[A-ZÅÄÖ][a-zåäö]+)+\b) + +def main(args): + salutation = r""" + \b(((g|G)re(f)?ve){e<=1}| + (((F|f)ri)?(H|h)err(e|ar)?){e<=1}| + (H|h)r| + (?i:fru){e<=1}| + (?i:fröken){e<=1})\b + """ + subjkw = r""" + \b((O|o)m| + (A|a)ngående| + i(:)?\sanledning)\b + """ + + pat = re.compile(rf""" + ^(\S\s)? + (?P + ( + (N(:)?(r|o)(\.)?){{e<=1}}\s + .{{1,3}}(\.)?\s + )? + ) + (?:(?PA(v|f)){{e<=1}}\s)? 
+ (?(av) + (?: + (?P{salutation}).* + | + ( + (?P + \b(?: + [A-ZÅÄÖ]{{1,2}}\.?| + [A-ZÅÄÖ][a-zåäö]+ + ) + (?:\s+ + (?: + [A-ZÅÄÖ]{{1,2}}\.?| + [A-ZÅÄÖ][a-zåäö]+ + ) + )+\b + ) + .* + (?P{subjkw}) + .* + ) + ) + | + (?P{salutation}).*(?P{subjkw}).* + ) + """, re.VERBOSE) + + + if args.test: + test_cases = [ + # Should match + "4f Ola Lasson: Om fattigunderstöd af allmänna medel för den som,", + "Af RK. P. Arnoldson: Om ändring af 28 $ 1 mom. i Regerings- formen.", + "N:o 27. Af herr J. Anderson i Tenhult, om höjd tull å sulläder m. m.", + "Av Fru Mróz Om saken", + "Af Fru Markowska Angående saken", + "Av Hr Kozławski komer en bóbr, krwa.", + "Fru Om saken", + "Af Margaret Thatcher om", + # Shouldn't match + "-----------------", + "Om saken direkt", + "av Idi Amin", + "af de tafel valde het boek om", + "herr Zdzisław Kosłąwski såg en bóbr", + ] + print(pat.pattern) + for tc in test_cases: + m = None + m = pat.match(tc) + if m: + print("matched: ", tc) + else: + print("didn't match:", tc) + + else: + args.motions = [_ for _ in args.motions if _.split('/')[2] not in ["fört", "reg"]] + for motion in tqdm(args.motions): + write = False + root, ns = parse_tei(motion) + Ps = root.findall(f".//{ns['tei_ns']}p") + for p in Ps: + if p.text is not None: + t = ' '.join([_.strip() for _ in p.text.splitlines() if _.strip() != '']) + m = pat.search(t) + if m: + p.attrib["type"] = "titleString" + write = True + + if write: + write_tei(root, motion) + + + + +if __name__ == '__main__': + parser = fetch_parser("motions", docstring=__doc__) + parser.add_argument("--test", action='store_true') + args = impute_args(parser.parse_args()) + main(args) diff --git a/src/cur-mot/doc-formatting-ck.py b/src/cur-mot/doc-formatting-ck.py new file mode 100644 index 0000000..666cea0 --- /dev/null +++ b/src/cur-mot/doc-formatting-ck.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +from glob import glob +from tqdm import tqdm +import argparse +import pandas as pd + + +def main(args): + D = {} + tsvs = 
# NOTE(review): reconstructed from the whitespace-mangled diff; three scripts.
#
# ---------------------------------------------------------------------------
# src/cur-mot/doc-formatting-ck.py -- count text heights at conf==100
# ---------------------------------------------------------------------------

def main(args):
    from tqdm import tqdm  # function-local: only needed at runtime

    D = {}  # height -> occurrence count (only rows with conf == 100)
    tsvs = sorted(glob(f"riksdagen-motions-pdf/data/{args.parliament_year}/*/*.tsv"))
    for tsv in tqdm(tsvs):
        df = pd.read_csv(tsv, sep='\t')
        print(df["conf"].unique())

        for i, r in df.iterrows():
            if r["conf"] == 100:
                if r["height"] not in D:
                    D[r["height"]] = 0
                D[r["height"]] += 1

    # BUG FIX (idiom): the original abused a set comprehension for print side
    # effects ({print(k, ":", v) for ...}) and wrapped the sorted items in a
    # redundant dict().  Same output, plain loop.
    for k, v in sorted(D.items(), key=lambda kv: (kv[1], kv[0]), reverse=True):
        print(k, ":", v)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-y", "--parliament-year", type=str, required=True)
    args = parser.parse_args()
    main(args)


# ---------------------------------------------------------------------------
# src/cur-mot/doc-pipeline.py -- map pdftotext line heights to doc roles
# ---------------------------------------------------------------------------

# height-of-line -> semantic formatter label, per parliament year.
# Trailing counts are occurrence frequencies observed in the data.
formatters = {
    "200506": {
        10.52: "body",            # 264863
        17.71: "h1",              # 3392
        8.86: "fw-fn",            # 3128
        14.94: "h2",              # 2910
        14.39: "fw",              # 1233
        22.14: "header_title",    # 916
        13.28: "body",            # 786
        11.76: "emoji",           # 317
        6.09: "footnote_ref",     # 264
        11.62: "h3",              # 194
        9.41: "table_cell",       # 129
        6.42: "footnote_nr",      # 33
        19.93: "header_title",    # 28
        9.4: "skip",              # 15
        8.81: "footnote_nr",      # 15
        5.09: "footnote_nr",      # 9
        11.07: "body",            # 2
        11.05: "emoji",           # 2
        9.9: "skip",              # 2
        7.75: "footnote_nr-ref",  # 2
        7.08: "footnote_ref",     # 2
        18.23: "body",            # 1
        12.45: "emoji",           # 1
        12.18: "body",            # 1
        "parts": {
            "header_block": ["h1", "header_author", "header_title"],
            "signature_block": ["body"],
        }
    }
}


def main(args):  # shadows doc-formatting-ck main; separate script in the diff
    from common.html_common import add_to_docD, doc_D
    from tqdm import tqdm

    pdf_dumps = sorted(glob(f"riksdagen-motions-pdf/data/{args.parliament_year}/*/*.tsv"))
    tmp_counter = 0
    for d in tqdm(pdf_dumps):
        header_found = False
        current_div = None
        current_fragment = None
        last_formatter = None

        d_base = os.path.basename(d)
        print(d_base)

        df = pd.read_csv(d, sep='\t')
        doc_d = doc_D()
        for i, r in df.iterrows():
            if str(r["conf"]) == "100":
                formatter = formatters[args.parliament_year][r["height"]]
                if formatter == "header_title":
                    header_found = True
                    if last_formatter == formatter:
                        # Continue the running header-title fragment.
                        current_fragment["text"].append(r["text"])
                    else:
                        # Flush the previous fragment, start a header one.
                        # NOTE(review): exact nesting here reconstructed from
                        # a mangled paste — confirm against the original file.
                        if current_fragment is not None:
                            doc_d = add_to_docD(doc_d, current_fragment, current_div)
                        current_fragment = {}
                        current_div = "header_title"
                        if "text" not in current_fragment:
                            current_fragment["text"] = []
                        current_fragment["text"].append(r["text"])
                else:
                    if current_fragment is not None:
                        doc_d = add_to_docD(doc_d, current_fragment, current_div)
                    current_fragment = {}
                    current_div = "unknown"
                last_formatter = formatter

        if header_found:
            tmp_counter += 1
            print(" --", " ".join(doc_d["header_title"]["text"]))

    print(len(pdf_dumps), tmp_counter)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-y", "--parliament-year", required=True)
    args = parser.parse_args()
    main(args)


# ---------------------------------------------------------------------------
# src/cur-mot/dump-pdf-text.py -- pdftotext on each motion pdf
# ---------------------------------------------------------------------------

def main(args):  # shadows doc-pipeline main; separate script in the diff
    from tqdm import tqdm

    pdfs = sorted(glob(f"riksdagen-motions-pdf/data/{args.parliament_year}/*.pdf"))
    for pdf in tqdm(pdfs):
        d = os.path.dirname(pdf)
        b = os.path.basename(pdf)[:-4]  # strip ".pdf"
        subprocess.run([
            "pdftotext", "-tsv", pdf,
            f"{d}/{b}/{b}.tsv"
        ])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-y", "--parliament-year", type=str, required=True)
    args = parser.parse_args()
    main(args)
# ---------------------------------------------------------------------------
# src/cur-mot/find-duplicates.py -- reconstructed from the mangled diff.
# Flags (and optionally moves aside) motions whose <div type='motBody'> is a
# structural duplicate of another motion with the same number.
# ---------------------------------------------------------------------------
import shutil


def elements_equal(e1, e2):
    """Recursively compare two XML elements.

    Attributes are ignored for tags ending in 'pb' (page breaks — presumably
    their facsimile attrs legitimately differ; confirm against the data).
    """
    if e1.tag != e2.tag:
        return False
    if e1.text != e2.text:
        return False
    if e1.tail != e2.tail:
        return False
    if not e1.tag.endswith('pb'):
        if e1.attrib != e2.attrib:
            return False
    if len(e1) != len(e2):
        return False
    return all(elements_equal(c1, c2) for c1, c2 in zip(e1, e2))


def test_duplicates(new_file, list_, dry_run):
    """Move *new_file* aside if its motBody duplicates any file in *list_*."""
    from pyriksdagen.utils import parse_tei  # project import kept local

    root, ns = parse_tei(f"riksdagen-motions/{new_file}")
    new_file_body = root.find(f".//{ns['tei_ns']}div[@type='motBody']")
    for file_ in list_:
        if file_ != new_file:
            root, ns = parse_tei(f"riksdagen-motions/{file_}")
            old_file_body = root.find(f".//{ns['tei_ns']}div[@type='motBody']")
            if elements_equal(new_file_body, old_file_body):
                print(f"Deleting {new_file} -- same as {file_}")
                if not dry_run:  # was `dry_run == False`
                    shutil.move(f"riksdagen-motions/{new_file}", "riksdagen-motions/_duplicates")
                break


def main(args):
    from tqdm import tqdm

    # Index motions: D[parliament_year][motion_nr] -> [paths]
    D = {}
    for motion in tqdm(args.motions):
        m, _, py, file_ = motion.split('/')
        nr = file_[:-4].split('-')[-1]
        D.setdefault(py, {}).setdefault(nr, []).append(motion)

    with open("riksdagen-motions/_tmp.txt", 'r') as inf:
        new_files = inf.readlines()
    new_files = [_.strip() for _ in new_files if _.strip() != '']

    # Only newly added files (listed in _tmp.txt) are checked as duplicates.
    for year, year_d in D.items():
        for nr, mots in year_d.items():
            mots = [_.replace("riksdagen-motions/", "") for _ in mots]
            for mot in mots:
                if mot in new_files:
                    test_duplicates(mot, mots, args.dry_run)


if __name__ == '__main__':
    from pyriksdagen.args import fetch_parser, impute_args
    parser = fetch_parser("motions")
    parser.add_argument("-n", "--dry-run", action='store_true')
    main(impute_args(parser.parse_args()))
__name__ == '__main__': + parser = fetch_parser("motions") + parser.add_argument("-n", "--dry-run", action='store_true') + main(impute_args(parser.parse_args())) diff --git a/src/cur-mot/fix-filenames.py b/src/cur-mot/fix-filenames.py new file mode 100644 index 0000000..9e9be04 --- /dev/null +++ b/src/cur-mot/fix-filenames.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +""" +Format filenames according to the pattern established. +""" +from glob import glob +from tqdm import tqdm +import argparse +import shutil +import os + + + +odnu = { + +"riksdagen-motions-pdf/data/200304/A1.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A_1.pdf", +"riksdagen-motions-pdf/data/200304/A2.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A_2.pdf", +"riksdagen-motions-pdf/data/200304/A205.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A20_5.pdf", +"riksdagen-motions-pdf/data/200304/A206.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A20_6.pdf", +"riksdagen-motions-pdf/data/200304/A207.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A20_7.pdf", +"riksdagen-motions-pdf/data/200304/A208.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A20_8.pdf", +"riksdagen-motions-pdf/data/200304/A209.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A20_9.pdf", +"riksdagen-motions-pdf/data/200304/A210.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_0.pdf", +"riksdagen-motions-pdf/data/200304/A211.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_1.pdf", +"riksdagen-motions-pdf/data/200304/A212.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_2.pdf", +"riksdagen-motions-pdf/data/200304/A213.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_3.pdf", +"riksdagen-motions-pdf/data/200304/A214.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_4.pdf", +"riksdagen-motions-pdf/data/200304/A215.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_5.pdf", +"riksdagen-motions-pdf/data/200304/A216.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A21_6.pdf", +"riksdagen-motions-pdf/data/200304/A217.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_7.pdf", +"riksdagen-motions-pdf/data/200304/A218.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_8.pdf", +"riksdagen-motions-pdf/data/200304/A219.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A21_9.pdf", +"riksdagen-motions-pdf/data/200304/A220.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_0.pdf", +"riksdagen-motions-pdf/data/200304/A221.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_1.pdf", +"riksdagen-motions-pdf/data/200304/A222.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_2.pdf", +"riksdagen-motions-pdf/data/200304/A223.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_3.pdf", +"riksdagen-motions-pdf/data/200304/A224.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_4.pdf", +"riksdagen-motions-pdf/data/200304/A225.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_5.pdf", +"riksdagen-motions-pdf/data/200304/A226.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_6.pdf", +"riksdagen-motions-pdf/data/200304/A227.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_7.pdf", +"riksdagen-motions-pdf/data/200304/A228.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_8.pdf", +"riksdagen-motions-pdf/data/200304/A229.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A22_9.pdf", +"riksdagen-motions-pdf/data/200304/A230.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_0.pdf", +"riksdagen-motions-pdf/data/200304/A231.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_1.pdf", +"riksdagen-motions-pdf/data/200304/A232.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_2.pdf", +"riksdagen-motions-pdf/data/200304/A233.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_3.pdf", +"riksdagen-motions-pdf/data/200304/A234.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_4.pdf", +"riksdagen-motions-pdf/data/200304/A235.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A23_5.pdf", +"riksdagen-motions-pdf/data/200304/A236.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_6.pdf", +"riksdagen-motions-pdf/data/200304/A237.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_7.pdf", +"riksdagen-motions-pdf/data/200304/A238.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_8.pdf", +"riksdagen-motions-pdf/data/200304/A239.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A23_9.pdf", +"riksdagen-motions-pdf/data/200304/A240.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_0.pdf", +"riksdagen-motions-pdf/data/200304/A241.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_1.pdf", +"riksdagen-motions-pdf/data/200304/A242.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_2.pdf", +"riksdagen-motions-pdf/data/200304/A243.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_3.pdf", +"riksdagen-motions-pdf/data/200304/A244.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_4.pdf", +"riksdagen-motions-pdf/data/200304/A245.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_5.pdf", +"riksdagen-motions-pdf/data/200304/A246.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_6.pdf", +"riksdagen-motions-pdf/data/200304/A248.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_8.pdf", +"riksdagen-motions-pdf/data/200304/A249.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A24_9.pdf", +"riksdagen-motions-pdf/data/200304/A250.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_0.pdf", +"riksdagen-motions-pdf/data/200304/A251.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_1.pdf", +"riksdagen-motions-pdf/data/200304/A252.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_2.pdf", +"riksdagen-motions-pdf/data/200304/A253.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_3.pdf", +"riksdagen-motions-pdf/data/200304/A254.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_4.pdf", +"riksdagen-motions-pdf/data/200304/A255.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A25_5.pdf", +"riksdagen-motions-pdf/data/200304/A256.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_6.pdf", +"riksdagen-motions-pdf/data/200304/A257.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_7.pdf", +"riksdagen-motions-pdf/data/200304/A258.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_8.pdf", +"riksdagen-motions-pdf/data/200304/A259.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A25_9.pdf", +"riksdagen-motions-pdf/data/200304/A260.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_0.pdf", +"riksdagen-motions-pdf/data/200304/A261.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_1.pdf", +"riksdagen-motions-pdf/data/200304/A262.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_2.pdf", +"riksdagen-motions-pdf/data/200304/A263.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_3.pdf", +"riksdagen-motions-pdf/data/200304/A264.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_4.pdf", +"riksdagen-motions-pdf/data/200304/A265.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_5.pdf", +"riksdagen-motions-pdf/data/200304/A266.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_6.pdf", +"riksdagen-motions-pdf/data/200304/A267.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_7.pdf", +"riksdagen-motions-pdf/data/200304/A268.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_8.pdf", +"riksdagen-motions-pdf/data/200304/A269.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A26_9.pdf", +"riksdagen-motions-pdf/data/200304/A270.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_0.pdf", +"riksdagen-motions-pdf/data/200304/A271.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_1.pdf", +"riksdagen-motions-pdf/data/200304/A272.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_2.pdf", +"riksdagen-motions-pdf/data/200304/A273.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_3.pdf", +"riksdagen-motions-pdf/data/200304/A274.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A27_4.pdf", +"riksdagen-motions-pdf/data/200304/A275.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_5.pdf", +"riksdagen-motions-pdf/data/200304/A276.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_6.pdf", +"riksdagen-motions-pdf/data/200304/A277.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_7.pdf", +"riksdagen-motions-pdf/data/200304/A278.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_8.pdf", +"riksdagen-motions-pdf/data/200304/A279.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A27_9.pdf", +"riksdagen-motions-pdf/data/200304/A280.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_0.pdf", +"riksdagen-motions-pdf/data/200304/A281.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_1.pdf", +"riksdagen-motions-pdf/data/200304/A282.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_2.pdf", +"riksdagen-motions-pdf/data/200304/A283.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_3.pdf", +"riksdagen-motions-pdf/data/200304/A284.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_4.pdf", +"riksdagen-motions-pdf/data/200304/A285.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_5.pdf", +"riksdagen-motions-pdf/data/200304/A286.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_6.pdf", +"riksdagen-motions-pdf/data/200304/A287.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_7.pdf", +"riksdagen-motions-pdf/data/200304/A288.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_8.pdf", +"riksdagen-motions-pdf/data/200304/A289.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A28_9.pdf", +"riksdagen-motions-pdf/data/200304/A290.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_0.pdf", +"riksdagen-motions-pdf/data/200304/A291.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_1.pdf", +"riksdagen-motions-pdf/data/200304/A292.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_2.pdf", +"riksdagen-motions-pdf/data/200304/A293.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A29_3.pdf", +"riksdagen-motions-pdf/data/200304/A294.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_4.pdf", +"riksdagen-motions-pdf/data/200304/A295.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_5.pdf", +"riksdagen-motions-pdf/data/200304/A296.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_6.pdf", +"riksdagen-motions-pdf/data/200304/A297.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_7.pdf", +"riksdagen-motions-pdf/data/200304/A298.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_8.pdf", +"riksdagen-motions-pdf/data/200304/A299.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A29_9.pdf", +"riksdagen-motions-pdf/data/200304/A3.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A_3.pdf", +"riksdagen-motions-pdf/data/200304/A300.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_0.pdf", +"riksdagen-motions-pdf/data/200304/A301.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_1.pdf", +"riksdagen-motions-pdf/data/200304/A302.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_2.pdf", +"riksdagen-motions-pdf/data/200304/A303.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_3.pdf", +"riksdagen-motions-pdf/data/200304/A304.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_4.pdf", +"riksdagen-motions-pdf/data/200304/A305.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_5.pdf", +"riksdagen-motions-pdf/data/200304/A306.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_6.pdf", +"riksdagen-motions-pdf/data/200304/A307.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_7.pdf", +"riksdagen-motions-pdf/data/200304/A308.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_8.pdf", +"riksdagen-motions-pdf/data/200304/A309.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A30_9.pdf", +"riksdagen-motions-pdf/data/200304/A310.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_0.pdf", +"riksdagen-motions-pdf/data/200304/A311.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A31_1.pdf", +"riksdagen-motions-pdf/data/200304/A312.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_2.pdf", +"riksdagen-motions-pdf/data/200304/A313.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_3.pdf", +"riksdagen-motions-pdf/data/200304/A314.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_4.pdf", +"riksdagen-motions-pdf/data/200304/A315.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_5.pdf", +"riksdagen-motions-pdf/data/200304/A316.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_6.pdf", +"riksdagen-motions-pdf/data/200304/A317.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_7.pdf", +"riksdagen-motions-pdf/data/200304/A318.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_8.pdf", +"riksdagen-motions-pdf/data/200304/A319.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A31_9.pdf", +"riksdagen-motions-pdf/data/200304/A320.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_0.pdf", +"riksdagen-motions-pdf/data/200304/A321.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_1.pdf", +"riksdagen-motions-pdf/data/200304/A322.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_2.pdf", +"riksdagen-motions-pdf/data/200304/A323.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_3.pdf", +"riksdagen-motions-pdf/data/200304/A324.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_4.pdf", +"riksdagen-motions-pdf/data/200304/A325.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_5.pdf", +"riksdagen-motions-pdf/data/200304/A326.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_6.pdf", +"riksdagen-motions-pdf/data/200304/A327.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_7.pdf", +"riksdagen-motions-pdf/data/200304/A328.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_8.pdf", +"riksdagen-motions-pdf/data/200304/A329.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A32_9.pdf", +"riksdagen-motions-pdf/data/200304/A330.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A33_0.pdf", +"riksdagen-motions-pdf/data/200304/A331.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_1.pdf", +"riksdagen-motions-pdf/data/200304/A332.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_2.pdf", +"riksdagen-motions-pdf/data/200304/A333.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_3.pdf", +"riksdagen-motions-pdf/data/200304/A334.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_4.pdf", +"riksdagen-motions-pdf/data/200304/A335.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_5.pdf", +"riksdagen-motions-pdf/data/200304/A336.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_6.pdf", +"riksdagen-motions-pdf/data/200304/A337.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_7.pdf", +"riksdagen-motions-pdf/data/200304/A338.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_8.pdf", +"riksdagen-motions-pdf/data/200304/A339.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A33_9.pdf", +"riksdagen-motions-pdf/data/200304/A340.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_0.pdf", +"riksdagen-motions-pdf/data/200304/A341.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_1.pdf", +"riksdagen-motions-pdf/data/200304/A342.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_2.pdf", +"riksdagen-motions-pdf/data/200304/A343.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_3.pdf", +"riksdagen-motions-pdf/data/200304/A344.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_4.pdf", +"riksdagen-motions-pdf/data/200304/A345.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_5.pdf", +"riksdagen-motions-pdf/data/200304/A346.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_6.pdf", +"riksdagen-motions-pdf/data/200304/A347.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_7.pdf", +"riksdagen-motions-pdf/data/200304/A348.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A34_8.pdf", +"riksdagen-motions-pdf/data/200304/A349.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A34_9.pdf", +"riksdagen-motions-pdf/data/200304/A350.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_0.pdf", +"riksdagen-motions-pdf/data/200304/A351.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_1.pdf", +"riksdagen-motions-pdf/data/200304/A352.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_2.pdf", +"riksdagen-motions-pdf/data/200304/A353.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_3.pdf", +"riksdagen-motions-pdf/data/200304/A354.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_4.pdf", +"riksdagen-motions-pdf/data/200304/A355.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_5.pdf", +"riksdagen-motions-pdf/data/200304/A356.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_6.pdf", +"riksdagen-motions-pdf/data/200304/A357.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_7.pdf", +"riksdagen-motions-pdf/data/200304/A358.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_8.pdf", +"riksdagen-motions-pdf/data/200304/A359.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A35_9.pdf", +"riksdagen-motions-pdf/data/200304/A360.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_0.pdf", +"riksdagen-motions-pdf/data/200304/A361.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_1.pdf", +"riksdagen-motions-pdf/data/200304/A362.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_2.pdf", +"riksdagen-motions-pdf/data/200304/A363.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_3.pdf", +"riksdagen-motions-pdf/data/200304/A364.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_4.pdf", +"riksdagen-motions-pdf/data/200304/A365.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_5.pdf", +"riksdagen-motions-pdf/data/200304/A366.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_6.pdf", +"riksdagen-motions-pdf/data/200304/A367.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_7.pdf", +"riksdagen-motions-pdf/data/200304/A368.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_A36_8.pdf", +"riksdagen-motions-pdf/data/200304/A369.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A36_9.pdf", +"riksdagen-motions-pdf/data/200304/A370.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A37_0.pdf", +"riksdagen-motions-pdf/data/200304/A371.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A37_1.pdf", +"riksdagen-motions-pdf/data/200304/A4.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A_4.pdf", +"riksdagen-motions-pdf/data/200304/A5.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A_5.pdf", +"riksdagen-motions-pdf/data/200304/A6.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A_6.pdf", +"riksdagen-motions-pdf/data/200304/A7.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_A_7.pdf", +"riksdagen-motions-pdf/data/200304/BO1.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_BO_1.pdf", +"riksdagen-motions-pdf/data/200304/Bo10.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo1_0.pdf", +"riksdagen-motions-pdf/data/200304/Bo201.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_1.pdf", +"riksdagen-motions-pdf/data/200304/Bo202.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_2.pdf", +"riksdagen-motions-pdf/data/200304/Bo203.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_3.pdf", +"riksdagen-motions-pdf/data/200304/Bo204.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_4.pdf", +"riksdagen-motions-pdf/data/200304/Bo205.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_5.pdf", +"riksdagen-motions-pdf/data/200304/Bo206.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_6.pdf", +"riksdagen-motions-pdf/data/200304/Bo207.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_7.pdf", +"riksdagen-motions-pdf/data/200304/Bo208.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_8.pdf", +"riksdagen-motions-pdf/data/200304/Bo209.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo20_9.pdf", +"riksdagen-motions-pdf/data/200304/Bo210.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_Bo21_0.pdf", +"riksdagen-motions-pdf/data/200304/Bo211.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_1.pdf", +"riksdagen-motions-pdf/data/200304/Bo212.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_2.pdf", +"riksdagen-motions-pdf/data/200304/Bo213.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_3.pdf", +"riksdagen-motions-pdf/data/200304/Bo214.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_4.pdf", +"riksdagen-motions-pdf/data/200304/Bo215.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_5.pdf", +"riksdagen-motions-pdf/data/200304/Bo216.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_6.pdf", +"riksdagen-motions-pdf/data/200304/Bo217.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_7.pdf", +"riksdagen-motions-pdf/data/200304/Bo218.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_8.pdf", +"riksdagen-motions-pdf/data/200304/Bo219.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo21_9.pdf", +"riksdagen-motions-pdf/data/200304/Bo220.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_0.pdf", +"riksdagen-motions-pdf/data/200304/Bo221.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_1.pdf", +"riksdagen-motions-pdf/data/200304/Bo222.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_2.pdf", +"riksdagen-motions-pdf/data/200304/Bo223.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_3.pdf", +"riksdagen-motions-pdf/data/200304/Bo224.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_4.pdf", +"riksdagen-motions-pdf/data/200304/Bo225.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_5.pdf", +"riksdagen-motions-pdf/data/200304/Bo226.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_6.pdf", +"riksdagen-motions-pdf/data/200304/Bo227.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_7.pdf", +"riksdagen-motions-pdf/data/200304/Bo228.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_8.pdf", 
+"riksdagen-motions-pdf/data/200304/Bo229.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo22_9.pdf", +"riksdagen-motions-pdf/data/200304/Bo230.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_0.pdf", +"riksdagen-motions-pdf/data/200304/Bo231.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_1.pdf", +"riksdagen-motions-pdf/data/200304/Bo232.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_2.pdf", +"riksdagen-motions-pdf/data/200304/Bo233.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_3.pdf", +"riksdagen-motions-pdf/data/200304/Bo234.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_4.pdf", +"riksdagen-motions-pdf/data/200304/Bo235.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_5.pdf", +"riksdagen-motions-pdf/data/200304/Bo236.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_6.pdf", +"riksdagen-motions-pdf/data/200304/Bo237.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_7.pdf", +"riksdagen-motions-pdf/data/200304/Bo238.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_8.pdf", +"riksdagen-motions-pdf/data/200304/Bo239.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo23_9.pdf", +"riksdagen-motions-pdf/data/200304/Bo240.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_0.pdf", +"riksdagen-motions-pdf/data/200304/Bo241.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_1.pdf", +"riksdagen-motions-pdf/data/200304/Bo242.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_2.pdf", +"riksdagen-motions-pdf/data/200304/Bo243.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_3.pdf", +"riksdagen-motions-pdf/data/200304/Bo244.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_4.pdf", +"riksdagen-motions-pdf/data/200304/Bo245.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_5.pdf", +"riksdagen-motions-pdf/data/200304/Bo246.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_6.pdf", +"riksdagen-motions-pdf/data/200304/Bo247.pdf": 
"riksdagen-motions-pdf/data/200304/mot_200304_Bo24_7.pdf", +"riksdagen-motions-pdf/data/200304/Bo248.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_8.pdf", +"riksdagen-motions-pdf/data/200304/Bo249.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo24_9.pdf", +"riksdagen-motions-pdf/data/200304/Bo250.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_0.pdf", +"riksdagen-motions-pdf/data/200304/Bo251.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_1.pdf", +"riksdagen-motions-pdf/data/200304/Bo252.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_2.pdf", +"riksdagen-motions-pdf/data/200304/Bo253.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_3.pdf", +"riksdagen-motions-pdf/data/200304/Bo254.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_4.pdf", +"riksdagen-motions-pdf/data/200304/Bo255.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_5.pdf", +"riksdagen-motions-pdf/data/200304/Bo256.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_6.pdf", +"riksdagen-motions-pdf/data/200304/Bo257.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_7.pdf", +"riksdagen-motions-pdf/data/200304/Bo258.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_8.pdf", +"riksdagen-motions-pdf/data/200304/Bo259.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo25_9.pdf", +"riksdagen-motions-pdf/data/200304/Bo260.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo26_0.pdf", +"riksdagen-motions-pdf/data/200304/Bo261.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo26_1.pdf", +"riksdagen-motions-pdf/data/200304/Bo262.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo26_2.pdf", +"riksdagen-motions-pdf/data/200304/Bo263.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo26_3.pdf", +"riksdagen-motions-pdf/data/200304/Bo264.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo26_4.pdf", +"riksdagen-motions-pdf/data/200304/Bo265.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo26_5.pdf", 
+"riksdagen-motions-pdf/data/200304/Bo266.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo26_6.pdf", +"riksdagen-motions-pdf/data/200304/Bo267.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_Bo26_7.pdf", +"riksdagen-motions-pdf/data/200304/a201.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_a20_1.pdf", +"riksdagen-motions-pdf/data/200304/a202.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_a20_2.pdf", +"riksdagen-motions-pdf/data/200304/a203.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_a20_3.pdf", +"riksdagen-motions-pdf/data/200304/a204.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_a20_4.pdf", +"riksdagen-motions-pdf/data/200304/a247.pdf": "riksdagen-motions-pdf/data/200304/mot_200304_a24_7.pdf" + +} + +undo = {} + +for k, v in odnu.items(): + undo[v] = k + +def main(args): + pdf_path = f"{args.pdf_path}/{args.parliament_year}" + pdf_files = sorted(glob(f"{pdf_path}/*.pdf")) + for f in tqdm(pdf_files): + if f in undo: + #shutil.move(f, undo[f]) + pass + else: + base_f = os.path.basename(f) + n, e = base_f.split('.') + if base_f.startswith("MOT_"): + mot, py, committee, index = n.split("_") + new_f = base_f.lower() + print(f, f"{pdf_path}/mot_{args.parliament_year}_{committee}_{index:0>4}.{e}") + shutil.move(f, f"{pdf_path}/mot_{args.parliament_year}_{committee}_{index:0>4}.{e}") + elif not base_f.startswith("mot"): + split_point = None + + for i, _ in enumerate(n): + try: + assert split_point is None + _ = int(_) + split_point = i + except: + pass + committee = n[:split_point] + index = n[split_point:] + print(f, f"{pdf_path}/mot_{args.parliament_year}_{committee}_{index:0>4}.{e}") + shutil.move(f, f"{pdf_path}/mot_{args.parliament_year}_{committee}_{index:0>4}.{e}") + + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description = __doc__) + parser.add_argument("-p", "--parliament-year", required=True) + parser.add_argument("--pdf-path", default="riksdagen-motions-pdf/data") + args = parser.parse_args() + 
main(args) diff --git a/src/cur-mot/link-head-toc.py b/src/cur-mot/link-head-toc.py new file mode 100644 index 0000000..7e1c917 --- /dev/null +++ b/src/cur-mot/link-head-toc.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +""" +Link TOC entries with head elements +""" + +from common.xml_utils import write_xml +from glob import glob +from lxml import etree +from pyriksdagen.utils import ( + parse_protocol +) +from tqdm import tqdm +import argparse, os + + +def main(main): + data_location = os.environ.get("MOTIONS_PATH", "data") + motions = glob(f"{data_location}/*/*.xml") + motions = [m for m in motions if \ + m.split("/")[-2][:4] >= args.start and\ + m.split("/")[-2][:2]+m.split("/")[-2][-2:] < args.end] + matched = 0 + total = 0 + for mot in tqdm(motions): + root, ns = parse_protocol(mot, get_ns=True) + toc = root.find(f".//{ns['tei_ns']}div[@type='TOC']/{ns['tei_ns']}list") + if toc is not None: + print(toc) + heads = root.findall(f".//{ns['tei_ns']}head") + for head in heads: + total += 1 + match = False + if head.text is not None and head.attrib is not None and f"{ns['xml_ns']}id" in head.attrib: + #print(" ", head.attrib) + for elem in toc: + if elem.text is not None: + if ' '.join([_.strip() for _ in head.text.split("\n")]) in ' '.join([_.strip() for _ in elem.text.split("\n")]): + elem.attrib["corresp"] = head.attrib[f"{ns['xml_ns']}id"] + match = True + matched += 1 + + + print(" ", match, head.text.strip()) + write_xml(root, mot) + print(matched, total, matched/total) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-s", "--start", required=True) + parser.add_argument("-e", "--end", required=True) + args = parser.parse_args() + main(args) + diff --git a/src/cur-mot/list-ocr-corrections.py b/src/cur-mot/list-ocr-corrections.py new file mode 100644 index 0000000..c1b144d --- /dev/null +++ b/src/cur-mot/list-ocr-corrections.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +""" +generate a lits of OCR 
corrections for test suite +""" +from common.args import ( + alto_args, + list_years, + verify_alto_args, +) +from common.xml_utils import ( + parse_xml, +) +from glob import glob +from lxml import etree +from tqdm import tqdm +import argparse +import pandas as pd + + + + +def main(args): + years = list_years(args) + rows = [] + cols = ["mot", "elem_id", "who", "when", "elem_tag", "elem_text"] + for year in years: + print(year) + motions = sorted(glob(f"{args.motionspath}/{year}/*.xml")) + #print(len(motions)) + for mot in tqdm(motions): + #print(mot) + root, ns = parse_xml(mot, get_ns=True) + corrections = root.findall(f".//{ns['tei_ns']}correction") + #if len(corrections) > 0: + # print(" ", len(corrections)) + for c in corrections: + if c.text == "OCR correction": + elem_id = c.attrib['corresp'] + who = c.attrib['who'] + when = c.attrib['when'] + corresp = root.findall(f".//*[@{ns['xml_ns']}id=\"{elem_id}\"]") + if len(corresp) > 0: + # print(corresp) + elem_tag = corresp[0].tag + elem_text = ' '.join([_.strip() for _ in corresp[0].text.splitlines() if _.strip != '']) + else: + print("Elem not found!") + + rows.append([mot, elem_id, who, when, elem_tag, elem_text]) + + df = pd.DataFrame(rows, columns=cols) + df.to_csv(f"{args.test_path}/ocr-corrections.tsv", sep='\t', index=False) + + + + +if __name__ == '__main__': + parser = alto_args(__doc__) + parser.add_argument("-o", "--test-path", default="riksdagen-motions/test/data") + args = parser.parse_args() + main(verify_alto_args(args)) diff --git a/src/cur-mot/list-seg-class-corrections.py b/src/cur-mot/list-seg-class-corrections.py new file mode 100644 index 0000000..54f6f13 --- /dev/null +++ b/src/cur-mot/list-seg-class-corrections.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +""" +generate a lits of OCR corrections for test suite +""" +from common.args import ( + alto_args, + list_years, + verify_alto_args, +) +from common.xml_utils import ( + parse_xml, +) +from glob import glob +from lxml import etree +from 
tqdm import tqdm +import argparse, json +import pandas as pd + + + + +def main(args): + years = list_years(args) + D = {} + cols = ["mot", "elem_id", "who", "when", "elem_tag", "elem_text"] + for year in years: + print(year) + motions = sorted(glob(f"{args.motionspath}/{year}/*.xml")) + #print(len(motions)) + for mot in tqdm(motions): + #print(mot) + root, ns = parse_xml(mot, get_ns=True) + corrections = root.findall(f".//{ns['tei_ns']}correction") + #if len(corrections) > 0: + # print(" ", len(corrections)) + for c in corrections: + if c.text == "segment classification": + elem_id = c.attrib['corresp'] + who = c.attrib['who'] + when = c.attrib['when'] + corresp = root.findall(f".//*[@{ns['xml_ns']}id=\"{elem_id}\"]") + if len(corresp) > 0: + # print(corresp) + elem_tag = corresp[0].tag + elem_attrib = dict(corresp[0].attrib) + else: + print("Elem not found!") + if mot not in D: + D[mot] = {} + if elem_id not in D[mot]: + D[mot][elem_id] = {} + D[mot][elem_id][when] = {"by":who, "elem_tag": elem_tag, "elem_attrib": elem_attrib} + + + + with open(f"{args.test_path}/segment_classification-corrections.json", 'w+') as out: + json.dump(D, out, ensure_ascii=False, indent=4) + + + + +if __name__ == '__main__': + parser = alto_args(__doc__) + parser.add_argument("-o", "--test-path", default="riksdagen-motions/test/data") + args = parser.parse_args() + main(verify_alto_args(args)) diff --git a/src/cur-mot/refine-signature-parsing.py b/src/cur-mot/refine-signature-parsing.py new file mode 100644 index 0000000..b4ed3dc --- /dev/null +++ b/src/cur-mot/refine-signature-parsing.py @@ -0,0 +1,813 @@ +#!/usr/bin/env python3 +## nb not run in the normal swerik env +## Spacy numpy dependency not compatible with our tensorflow/numpy version +""" +Use heuristics and NER to parse and classify signature elements +""" +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # hides all GPUs +os.environ["SPACY_FORCE_CPU"] = "true" # newer spaCy obeys this 
+os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["OMP_NUM_THREADS"] = "1" +os.environ["OPENBLAS_NUM_THREADS"] = "1" +os.environ["MKL_NUM_THREADS"] = "1" +os.environ["NUMEXPR_NUM_THREADS"] = "1" +os.environ["THINC_NO_OPTIMIZE"] = "1" +os.environ["VECLIB_MAXIMUM_THREADS"] = "1" + +import multiprocessing as mp + +# Must come before any Pool, Executor, or spaCy import +if mp.get_start_method(allow_none=True) != "spawn": + mp.set_start_method("spawn", force=True) +from multiprocessing import ( + get_context, + Process, +) +from pyriksdagen.args import ( + fetch_parser, + impute_args +) +from pyriksdagen.io import ( + parse_tei, + write_tei, +) +from tqdm import tqdm +from typing import ( + List, + Optional, + Tuple, +) +import sys, time +#import cupy as cp +import gc +import lxml.etree as etree +import re +import spacy +import thinc +from spacy.matcher import PhraseMatcher +from spacy.util import filter_spans +from spacy.tokens import Span +from spacy.language import Language +import pandas as pd +from concurrent.futures import ProcessPoolExecutor, as_completed +from contextlib import closing +import traceback +UP = "A-ZÅÄÖÉÈÀÂÊÎÔÛÜÖÄÅ" +LO = "a-zåäöéèàâêîôûüöäåç" + +NAME_TOKEN = re.compile(rf"^[{UP}][{LO}{UP}\-'\.:]+$") +SHORT_PAREN = re.compile(r"^\([A-Za-zÅÄÖåäö]{1,3}\)$") +RE_NR_HEAD = re.compile(r"^(?:N:o|Nr)$", re.IGNORECASE) +RE_ALNUM = re.compile(r"^(?:[A-Za-zÅÄÖåäö]+|\d+)$") + +# Allow longer multi-word places after 'i' / 'från' +MAX_PLACE_TOKENS = 6 +LOWER_PLACE_WORDS = { + "län","härad","socken","församling","kommun","stad","köping", + "tingslag","domsaga","landskap","kapellag","bygden" +} +# Optional surname hints (we still allow 2-token names even if 2nd isn’t in this list) +SURNAME_SUFFIXES = ( + "son","sson","dotter","berg","borg","gren","kvist","quist", + "ström","man","mark","lund","holm","beck","blad","feldt","felt", + "fors","vall","hage","dahl" +) +NOBILIARY_PARTICLES = {"von", "de", "af", "van", "di", "du", "v"} +PLACE_HINT = 
re.compile(r"^[A-ZÅÄÖ][a-zåäöéèàâêîôûüöäåç\-]+$") +PLACE_ENDINGS = {"län", "stad", "kommun", "församling", "härad", "socken", "kapellag"} + +def log(msg): + sys.stdout.write(f"[PID {os.getpid()}] {msg}\n") + sys.stdout.flush() + + +def clean_text(t: str) -> str: + # Fix broken hyphenation (Carls- son → Carlsson); normalize whitespace/dashes. + t = re.sub(rf"([{UP}{LO}])-\s+([{UP}{LO}])", r"\1\2", t) + t = t.replace("’","'").replace("–","-").replace("—","-") + t = re.sub(r"\b0\.", "O.", t) + return re.sub(r"\s+"," ", t.strip()) + + +def is_initial(tok: str) -> bool: + t = tok.strip() + return bool(re.fullmatch(rf"[{UP}]\.?", t)) + + +def is_hy_initial(tok: str) -> bool: + t = tok.strip() + return bool(re.fullmatch(rf"-[{UP}]\.?", t)) + + +def is_name_token(tok: str) -> bool: + return bool(NAME_TOKEN.match(tok)) + + +def is_surname_like(tok: str) -> bool: + t = tok.lower().rstrip(".") + return any(t.endswith(s) for s in SURNAME_SUFFIXES) + + +# --- spaCy helper (optional) ------------------------------------------------- +def _spacy_doc(text: str): + try: + import spacy + for m in ("sv_core_news_md",): + try: + return spacy.load(m)(text) + except Exception: + pass + return None + except Exception: + return None + + +def _ent_covering(doc, start_char: int) -> Optional[Tuple[str,int,int]]: + """Return (label, ent_start_char, ent_end_char) for entity starting at or covering start_char.""" + if not doc: return None + for e in doc.ents: + if e.start_char <= start_char < e.end_char and e.label_ in ("PER","LOC","GPE"): + return (e.label_, e.start_char, e.end_char) + return None + + +# --- tokenization with spans ------------------------------------------------- +def _tokenize_with_spans(text: str): + toks = text.split() + spans = [] + pos = 0 + for tok in toks: + start = pos + end = start + len(tok) + spans.append((tok, start, end)) + pos = end + 1 # +1 for the single space between tokens + return toks, spans + + +def _consume_record_id(toks, i): + # Expect head: "Nr" or 
"N:o" + if i >= len(toks) or not RE_NR_HEAD.match(toks[i]): + return None + parts = [toks[i]]; j = i + 1 + # up to two following parts that are letters or digits (to catch "B 236") + take = 0 + while j < len(toks) and take < 2 and RE_ALNUM.match(toks[j]): + parts.append(toks[j]) + j += 1; take += 1 + if take == 0: + return None + return j, " ".join(parts) + + +# --- consume place phrase after 'i' / 'från' -------------------------------- +def _consume_place(toks, i_start,use_spacy_ent, spans) -> int: + """ + Consume tokens after 'i' or 'från' that look like a place. + Stops before what appears to be the next person's name. + """ + i = i_start + 1 + if i >= len(toks): + return i + + # If spaCy already detected a location entity covering the next token + if use_spacy_ent: + label, ent_s, ent_e = use_spacy_ent + if label in ("LOC", "GPE"): + j = i + while j < len(toks) and spans[j][2] <= ent_e: + j += 1 + return max(j, i) + + taken = 0 + j = i + while j < len(toks): + w = toks[j] + + # Allow lowercase place-type words (län, härad, socken, etc.) + if w.lower() in LOWER_PLACE_WORDS: + j += 1 + taken += 1 + continue + + # Stop if we hit "Nr" or record patterns + if RE_NR_HEAD.match(w) or w.lower().startswith("nr"): + break + + # Allow capitalized place components (Stockholms, Västra, Sundbyberg) + if re.match(r"^[A-ZÅÄÖ][a-zåäöéèàâêîôûüöäåç\-]*$", w): + # stop early if this looks like a person surname (e.g. 
ends in -son, -berg) + if is_surname_like(w): + break + j += 1 + taken += 1 + continue + + break + + # Must have taken at least one token to count as a place + if taken == 0: + return i_start + 1 + + return j + + +@Language.factory("known_name_ruler") +def create_known_name_ruler(nlp, name, known_names=None, known_places=None): + """Factory for a component that adds known person/place entities.""" + person_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") + if known_names: + person_matcher.add("PER", [nlp.make_doc(n) for n in known_names]) + + place_matcher = None + if known_places: + place_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") + place_matcher.add("LOC", [nlp.make_doc(p) for p in known_places]) + + # The actual callable component + def known_name_component(doc): + new_ents = [] + + for _, start, end in person_matcher(doc): + new_ents.append(Span(doc, start, end, label="PER")) + + if place_matcher: + for _, start, end in place_matcher(doc): + new_ents.append(Span(doc, start, end, label="LOC")) + + doc.ents = filter_spans(list(doc.ents) + new_ents) + return doc + + return known_name_component + + +def add_known_names_ruler(nlp, known_names, known_places=None): + """Attach known-name/place ruler safely.""" + if "known_name_ruler" not in nlp.pipe_names: + nlp.add_pipe( + "known_name_ruler", + before="ner", + config={"known_names": known_names, "known_places": known_places}, + ) + return nlp + + +def is_name_boundary(tok: str) -> bool: + """ + Decide if token likely begins a new signature block or record section. 
+ """ + # new signature usually starts after a name (capitalized) *and* previous token ended cleanly + return bool( + tok.lower() in {"i", "nr", "från"} or + tok in {",", ";"} or + re.match(r"^[-–]$", tok) or + is_initial(tok) + ) + + +def _merge_adjacent(items): + merged = [] + for it in items: + if merged and merged[-1]["type"] == it["type"]: + # avoid duplicate words + prev = merged[-1]["text"].split() + new = it["text"].split() + if not new or new[0] in prev: + continue + merged[-1]["text"] = " ".join(prev + new) + else: + merged.append(it) + return merged + + +def merge_overlapping_spans(spans): + """Merge overlapping or nested spaCy spans.""" + if not spans: + return [] + spans = sorted(spans, key=lambda x: (x.start_char, x.end_char)) + merged = [spans[0]] + for cur in spans[1:]: + prev = merged[-1] + if cur.start_char <= prev.end_char: + merged[-1] = doc.char_span(prev.start_char, max(prev.end_char, cur.end_char)) + else: + merged.append(cur) + return merged + + +# --- main ordered parser ----------------------------------------------------- +def parse_name_string(text: str, nlp=None, doc=None, known_names=None) -> List[dict]: + text = clean_text(text) + if not text: + return [] + + doc = nlp(text) + toks, spans = _tokenize_with_spans(text) + # Ensure every span is a 3-tuple (tok, start, end) + fixed_spans = [] + for s in spans: + if isinstance(s, (list, tuple)) and len(s) == 3: + fixed_spans.append(tuple(s)) + else: + # fall back to dummy start/end positions + if isinstance(s, str): + fixed_spans.append((s, 0, len(s))) + else: + fixed_spans.append(("", 0, 0)) + spans = fixed_spans + spans = [t if len(t) == 3 else (t[0], 0, 0) for t in spans] + out = [] + i = 0 + + # # build matcher if we have a list of names + # name_spans = [] + # if known_names: + # matcher = PhraseMatcher(nlp.vocab, attr="LOWER") + # patterns = [nlp.make_doc(name.lower()) for name in known_names if isinstance(name, str)] + # matcher.add("KNOWN_NAME", patterns) + # for _, start, end in 
matcher(doc): + # name_spans.append(doc[start:end]) + + # merge spaCy PER entities with known-name matches + name_spans = [ent for ent in doc.ents if ent.label_ == "PER"] + name_spans = sorted(name_spans, key=lambda e: e.start_char) + loc_spans = [ent for ent in doc.ents if ent.label_ in ("LOC", "GPE")] + # index entities by start offset for quick lookup + ent_starts = {e.start_char: e for e in name_spans} + loc_starts = {e.start_char: e for e in loc_spans} + + while i < len(toks): + try: + tok, s, e = spans[i] + if not isinstance(tok, str): + tok = str(tok) + if not isinstance(s, int) or not isinstance(e, int): + s, e = 0, len(tok) + except Exception: + # fallback if the tuple isn't valid + val = spans[i] + if isinstance(val, str): + tok, s, e = val, 0, len(val) + elif isinstance(val, (list, tuple)): + tok = str(val[0]) if len(val) > 0 else "" + s = int(val[1]) if len(val) > 1 and isinstance(val[1], int) else 0 + e = int(val[2]) if len(val) > 2 and isinstance(val[2], int) else s + len(tok) + else: + tok, s, e = "", 0, 0 + + # ---- record id ------------------------------------------------------- + two = " ".join(toks[i:i+2]) + if re.match(r"(?i)\bN[:r]\s*\d+\b", two): + out.append({"type": "record-id", "text": two}) + i += 2 + continue + + # ---- location specifier --------------------------------------------- + if tok.lower() in {"i", "från"}: + use_ent = None + for ent in loc_spans: + if ent.start_char <= spans[i][1] < ent.end_char: + use_ent = (ent.label_, ent.start_char, ent.end_char) + break + + j = _consume_place(toks, i, use_ent, spans) + frag = " ".join(toks[i:j]) + out.append({"type": "location-specifier", "text": frag}) + i = j + continue + + # ---- matched entity ------------------------------------------------- + if s in ent_starts: + ent = ent_starts[s] + out.append({"type": "person", "text": ent.text}) + # advance i safely past the entity span + j = i + while j < len(spans) and spans[j][2] <= ent.end_char: + j += 1 + i = j + continue + + # ---- 
initials-aware name grouping with strong sequential rules ------- + if re.match(r"^[A-ZÅÄÖ]", tok): + j = i + name_tokens = [] + + def is_initial(t): + """A single capital, with or without dot, e.g. 'W' or 'W.'""" + return bool(re.fullmatch(r"[A-ZÅÄÖ]\.?", t)) + + def is_word(t): + """A proper name-like token.""" + return bool(re.match(r"^[A-ZÅÄÖ][a-zåäö\-]+$", t)) + + while j < len(toks): + nxt = toks[j] + + # Hard stop markers + if nxt.lower() in {"i", "från"} or nxt in {"Nr", "nr"}: + break + if re.fullmatch(r"[,;]", nxt): + j += 1 + continue + + # Accept initials or capitalized words + if is_initial(nxt) or is_word(nxt) or re.fullmatch(r"[A-ZÅÄÖ][a-zåäö]{1,3}\.", nxt): + name_tokens.append(nxt) + j += 1 + + # --- Rule 1: "initials + word" followed by another initial → new name + if ( + len(name_tokens) >= 2 + and is_initial(name_tokens[-2]) + and is_word(name_tokens[-1]) + and j < len(toks) + and is_initial(toks[j]) + ): + break + + # --- Rule 2: prevent ending on an initial (with or without dot) + # keep going until we get a surname-like word + if j < len(toks) and is_initial(name_tokens[-1]) and not is_word(toks[j]): + continue + + # --- Rule 3: detect next full-name start ("Firstname" pattern) + if ( + j < len(toks) + and is_word(name_tokens[-1]) + and is_word(toks[j]) + and j + 1 < len(toks) + and (is_initial(toks[j + 1]) or is_word(toks[j + 1])) + ): + break + + continue + + break # anything else stops + + frag = " ".join(name_tokens).strip() + if frag: + out.append({"type": "person", "text": frag}) + i = j + continue + + i += 1 + + del doc + return out + + + +def looks_like_signature_block(sig_block, nlp=None): + texts = " ".join([itm.strip() for itm in sig_block.itertext() if itm is not None and itm.strip()!='']) + #print(texts) + if not texts: + #print("no texts") + return False + + if not any([c.isupper() for c in texts]): + #print("no uppercase") + return False + + words = texts.split() + #print(words) + if len(words) < 4: + # trivial, short, might be 
single name + return True + + # reject only if this looks like pure prose + avg_len = sum(len(w) for w in words) / len(words) + caps_ratio = sum(w and w[0].isupper() for w in words) / len(words) + + # allow if many short capitalized tokens + if caps_ratio > 0.5:# and avg_len < 10: + return True + + if len(words) > 200: + #print("too many words") + #print(texts) + return False # extreme prose + + # Always accept anything that has even 1 typical surname or capitalized run + if any(w.endswith(("sson","berg","man","gren","lund","ström")) for w in words): + return True + + if nlp: + doc = nlp(texts) + if any(ent.label_ == "PER" for ent in doc.ents): + return True + + return True # << default to True to test the rest of the pipeline + + +def expand_signatures(sig_block, ns, nlp, parser_fn=parse_name_string, known_names=None): + changed = False + try: + if not looks_like_signature_block(sig_block, nlp): + return changed + + lists = sig_block.findall(f".//{ns['tei_ns']}list") + for lst in lists: + old_items = list(lst.findall(f"{ns['tei_ns']}item")) + if not old_items: + continue + + full_text = " ".join( + (itm.text or "").strip() + for itm in old_items + if (itm.text or "").strip() + ).strip() + if not full_text: + continue + + parsed = parser_fn(full_text, nlp) + + new_items = [] + for entry in parsed: + el = etree.Element("item") + el.text = entry["text"] + t = entry["type"] + if t == "person": + el.set("type", "signature") + elif t in {"location", "location-specifier"}: + el.set("type", "location-specifier") + elif t == "record_id": + el.set("type", "record-id") + new_items.append(el) + + lst[:] = new_items + changed = True + + except Exception as e: + tb = traceback.format_exc() + print(f"[expand_signatures] ⚠ {e}\n{tb}") + raise # propagate so process_one can report it + + return changed + +def _load_spacy(): + try: + nlp = spacy.load("sv_core_news_md", disable=["parser","lemmatizer"]) + except OSError: + raise Error("you want to run with spacy...install it :D") + 
else: + df0 = pd.read_csv("riksdagen-persons/data/name.csv") + known_names = df0["name"].unique().tolist() + df1 = pd.read_csv("riksdagen-persons/data/location_specifier.csv") + known_places = df1["location"].unique().tolist() + nlp = add_known_names_ruler(nlp, known_names, known_places=known_places) + return nlp, known_names + + +def get_nlp(): + # Each process gets its own nlp instance, loaded once + if not hasattr(get_nlp, "_nlp"): + get_nlp._nlp, get_nlp._known_names = _load_spacy() # returns (nlp, known_names) + return get_nlp._nlp, get_nlp._known_names + + +def process_one(args): + path, cfg = args + try: + nlp, known_names = _load_spacy() + except Exception as e: + return (path, False, f"nlp init failed: {e}") + + root = None + try: + root, ns = parse_tei(path) + changed = expand_signatures(root, ns, nlp, known_names=known_names) + if changed: + write_tei(root, path) + return (path, bool(changed), None) + except Exception as e: + tb = traceback.format_exc() + return (path, False, f"{e}\n{tb}") + finally: + try: + if root is not None: + root.clear() + except Exception: + pass + gc.collect() + +_worker_nlp = None +_worker_known_names = None + +def worker_init(known_names_csv, known_places_csv): + """Runs once in each worker process.""" + # make workers single-threaded (prevents BLAS/OMP deadlocks) + import os, sys + msg = f"[PID {os.getpid()}] worker init starting\n" + sys.stdout.write(msg) + sys.stdout.flush() + os.environ["OMP_NUM_THREADS"] = "1" + os.environ["OPENBLAS_NUM_THREADS"] = "1" + os.environ["MKL_NUM_THREADS"] = "1" + os.environ["VECLIB_MAXIMUM_THREADS"] = "1" + os.environ["NUMEXPR_NUM_THREADS"] = "1" + os.environ["TOKENIZERS_PARALLELISM"] = "false" + os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + os.environ["SPACY_FORCE_CPU"] = "true" + + + global _worker_nlp, _worker_known_names + # load spaCy and rulers *inside* the worker + import spacy, pandas as pd + _worker_nlp = spacy.load("sv_core_news_md", disable=["parser","lemmatizer"]) + + df0 = 
pd.read_csv(known_names_csv) + known_names = df0["name"].dropna().astype(str).unique().tolist() + df1 = pd.read_csv(known_places_csv) + known_places = df1["location"].dropna().astype(str).unique().tolist() + + add_known_names_ruler(_worker_nlp, known_names, known_places=known_places) + _worker_known_names = known_names + + + + print(f"[PID {os.getpid()}] loading spacy...") + _worker_nlp = spacy.load("sv_core_news_md", disable=["parser","lemmatizer"]) + print(f"[PID {os.getpid()}] reading CSVs...") + df0 = pd.read_csv(known_names_csv) + df1 = pd.read_csv(known_places_csv) + print(f"[PID {os.getpid()}] init done") + + +def _get_worker_nlp(): + if _worker_nlp is None: + raise RuntimeError("worker nlp not initialized") + return _worker_nlp, _worker_known_names + + + +def teardown_nlp(): + if hasattr(get_nlp, "_nlp"): + del get_nlp._nlp + del get_nlp._known_names + gc.collect() +def process_one_star(arg): + return process_one(arg) + +def run_batch(paths, args, max_workers=10, task_timeout=60, batch_timeout=600): + """ + Run one batch of XML files in parallel, returning (path, changed, err) for each. + Never hangs: forcibly cleans up all workers on exit. 
+ """ + import multiprocessing as mp + from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError + import psutil, traceback, gc, time, os, signal + + known_names_csv = "riksdagen-persons/data/name.csv" + known_places_csv = "riksdagen-persons/data/location_specifier.csv" + + results = [] + stuck = [] + tasks = [(p, args) for p in paths] + + print(f"[PID {os.getpid()}] ⚙️ Starting run_batch for {len(tasks)} files") + sys.stdout.flush() + + # -------- Global watchdog (batch-level) -------- + def timeout_handler(signum, frame): + raise TimeoutError("⏰ batch timeout reached") + + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(batch_timeout) + + start_time = time.monotonic() + + # -------- Main execution -------- + ex = None + try: + ctx = mp.get_context("spawn") # ✅ safer with spaCy + ex = ProcessPoolExecutor( + max_workers=max_workers, + initializer=worker_init, + initargs=(known_names_csv, known_places_csv), + mp_context=ctx, + ) + + future_to_path = {ex.submit(process_one, t): t[0] for t in tasks} + + for idx, fut in enumerate(tqdm(as_completed(future_to_path, timeout=batch_timeout), total=len(future_to_path)), start=1): + path = future_to_path[fut] + print(path) + try: + + res = fut.result(timeout=task_timeout) + print(" --> OK") + except TimeoutError: + print(f"⚠ Timeout in worker: {path}") + stuck.append(path) + continue + except Exception as e: + tb = traceback.format_exc(limit=1) + print(f"⚠ {path}: {e}") + res = (path, False, f"processing failed: {e}\n{tb}") + results.append(res) + print(f"finished {idx} of {len(future_to_path)}") + print(len(results)) + except TimeoutError as e: + print(f"⏰ Batch timed out after {batch_timeout}s — aborting...") + for path, _ in tasks: + if path not in [r[0] for r in results]: + results.append((path, False, "batch timeout")) + finally: + signal.alarm(0) + print("🧹 Forcing executor shutdown...") + if ex: + try: + ex.shutdown(wait=False, cancel_futures=True) + except Exception: + 
print("shutdown failed") + # -------- Kill any stray python worker processes -------- + time.sleep(0.5) + for proc in psutil.process_iter(): + try: + if proc.pid != os.getpid() and "python" in proc.name().lower(): + if any(x in " ".join(proc.cmdline()) for x in ["refine-signature", "spacy", "ProcessPoolExecutor"]): + proc.terminate() + except Exception: + pass + + gc.collect() + elapsed = time.monotonic() - start_time + print(f"✅ Batch finished in {elapsed:.1f}s — {len(results)} results ({len(stuck)} stuck)\n") + + return results + + + + + +def main(args): + + motions = list(args.motions) + batch_size = 50 + batches = [motions[i:i+batch_size] for i in range(0, len(motions), batch_size)] + + for bi, chunk in enumerate(batches, 1): + print(f"\n⚙️ Processing batch {bi}/{len(batches)} ({len(chunk)} files)") + results = run_batch(chunk, args, max_workers=10, task_timeout=60) + done, changed, failed = 0, 0, 0 + for path, ch, err in results: + done += 1 + if err: + failed += 1 + elif ch: + changed += 1 + print(f"✅ Batch {bi} done: {done} files, {changed} changed, {failed} failed") + + + """ + for bi, chunk in enumerate(batches, 1): + print(f"⚙️ Batch {bi}/{len(batches)} ({len(chunk)} files)") + with ctx.Pool(processes=procs, maxtasksperchild=tasks_per_child) as pool: + for path, changed, err in pool.imap_unordered(process_one, [(m, args) for m in chunk], chunksize=1): + if err: + print(f"⚠ {path}: {err}") + elif changed: + print(f"✅ {path}") + else: + print(f"⏩ {path}") + print(f"✅ Batch {bi} finished") + + for bi, chunk in enumerate(batches, 1): + print(f"⚙️ Processing batch {bi}/{len(batches)} — {len(chunk)} files") + + with closing(ctx.Pool(processes=procs, maxtasksperchild=tasks_per_child)) as pool: + try: + iterator = pool.imap_unordered(process_one, [(m, args) for m in chunk], chunksize=1) + + for path, changed, err in iterator:# tqdm(iterator, total=len(chunk), dynamic_ncols=True): + if err: + print(f"⚠ {path}: {err}") + elif changed: + print(f"✅ {path}") + else: + 
print(f"⏩ {path}") + + # Make sure all results are fully consumed *before* closing the pool + pool.close() + pool.join() + + except KeyboardInterrupt: + print("⚠ Interrupted, terminating workers…") + pool.terminate() + pool.join() + except Exception as e: + print(f"⚠ Batch {bi} crashed: {e}") + pool.terminate() + pool.join() + + teardown_nlp() + gc.collect() + gc.collect() + + """ + + +if __name__ == '__main__': + parser = fetch_parser("motions", docstring=__doc__) + parser.add_argument("--spacy", action='store_true', help="Run with spacy NER") + main(impute_args(parser.parse_args())) diff --git a/src/cur-mot/refine-signature-parsing_new.py b/src/cur-mot/refine-signature-parsing_new.py new file mode 100644 index 0000000..3e09428 --- /dev/null +++ b/src/cur-mot/refine-signature-parsing_new.py @@ -0,0 +1,712 @@ +#!/usr/bin/env python3 +import multiprocessing as mp + +# Must come before any Pool, Executor, or spaCy import +if mp.get_start_method(allow_none=True) != "spawn": + mp.set_start_method("spawn", force=True) +from multiprocessing import ( + get_context, + Process, +) +from concurrent.futures import ProcessPoolExecutor +import gc +import lxml.etree as etree +import pandas as pd +from pyriksdagen.args import ( + fetch_parser, + impute_args, +) +from pyriksdagen.io import ( + parse_tei, + write_tei +) +import re +import spacy +from spacy.language import Language +from spacy.matcher import PhraseMatcher +from spacy.tokens import Span +import time +import tqdm as tqdm + + + + + +UP = "A-ZÅÄÖÉÈÀÂÊÎÔÛÜÖÄÅ" +LO = "a-zåäöéèàâêîôûüöäåç" + +NAME_TOKEN = re.compile(rf"^[{UP}][{LO}{UP}\-'\.:]+$") +SHORT_PAREN = re.compile(r"^\([A-Za-zÅÄÖåäö]{1,3}\)$") +RE_NR_HEAD = re.compile(r"^(?:N:o|Nr)$", re.IGNORECASE) +RE_ALNUM = re.compile(r"^(?:[A-Za-zÅÄÖåäö]+|\d+)$") + +# Allow longer multi-word places after 'i' / 'från' +MAX_PLACE_TOKENS = 6 +LOWER_PLACE_WORDS = { + "län","härad","socken","församling","kommun","stad","köping", + "tingslag","domsaga","landskap","kapellag","bygden" 
+} +# Optional surname hints (we still allow 2-token names even if 2nd isn’t in this list) +SURNAME_SUFFIXES = ( + "son","sson","dotter","berg","borg","gren","kvist","quist", + "ström","man","mark","lund","holm","beck","blad","feldt","felt", + "fors","vall","hage","dahl" +) + +def is_initial_token(t: str) -> bool: + return bool(re.fullmatch(r"[A-ZÅÄÖ]\.?", t)) + +def is_name_word(t: str) -> bool: + return bool(re.match(r"^[A-ZÅÄÖ][a-zåäö\-]+$", t)) + +NAME_WORD_RE = re.compile(rf"^[{UP}][{LO}\-]+$") + +def is_word(tok: str) -> bool: + return bool(NAME_WORD_RE.match(tok)) + +def looks_like_initials_plus_surname(toks, j) -> bool: + """Return True if toks[j:] starts with 1+ initials followed by a name word.""" + i = j + saw_initial = False + while i < len(toks) and is_initial_token(toks[i]): + saw_initial = True + i += 1 + return saw_initial and i < len(toks) and is_name_word(toks[i]) + +NOBILIARY_PARTICLES = {"von", "de", "af", "van", "di", "du", "v"} +PLACE_HINT = re.compile(r"^[A-ZÅÄÖ][a-zåäöéèàâêîôûüöäåç\-]+$") +PLACE_ENDINGS = {"län", "stad", "kommun", "församling", "härad", "socken", "kapellag"} + +def log(msg): + sys.stdout.write(f"[PID {os.getpid()}] {msg}\n") + sys.stdout.flush() + + +def clean_text(t: str) -> str: + # Fix broken hyphenation (Carls- son → Carlsson); normalize whitespace/dashes. 
+ t = re.sub(rf"([{UP}{LO}])-\s+([{UP}{LO}])", r"\1\2", t) + t = t.replace("’","'").replace("–","-").replace("—","-") + t = re.sub(r"\b0\.", "O.", t) + return re.sub(r"\s+"," ", t.strip()) + + +def is_initial(tok: str) -> bool: + t = tok.strip().rstrip(":;,.!?") + return bool(re.fullmatch(rf"[{UP}]\.?", t)) + + +def is_initial_block(tok: str) -> bool: + t = tok.strip().rstrip(":;,.!?") + return bool(re.fullmatch(rf"(?:[{UP}]\.\s*){{1,3}}", t)) + + +def is_hy_initial(tok: str) -> bool: + t = tok.strip() + return bool(re.fullmatch(rf"-[{UP}]\.?", t)) + + +def is_name_token(tok: str) -> bool: + return bool(NAME_TOKEN.match(tok)) + + +def is_surname_like(tok: str) -> bool: + t = tok.lower().rstrip(".") + return any(t.endswith(s) for s in SURNAME_SUFFIXES) + + +if not Span.has_extension("is_known_name"): + Span.set_extension("is_known_name", default=False) +if not Span.has_extension("is_known_place"): + Span.set_extension("is_known_place", default=False) + + +class KnownNameRuler: + def __init__(self, nlp, name, known_names, known_places): + self.known_places = set(known_places) + + # Build a PhraseMatcher once (much faster than text.find in a loop) + self.matcher = PhraseMatcher(nlp.vocab, attr="ORTH") + # Only keep non-empty names + patterns = [nlp.make_doc(n) for n in known_names if n and n.strip()] + # You can shard patterns if extremely large; for now one label + self.matcher.add("KNOWN_PERSON", patterns) + + def __call__(self, doc): + new_ents = list(doc.ents) + + # Run the phrase matcher over the doc + matches = self.matcher(doc) # list of (match_id, start, end) + for _, start, end in matches: + span = doc[start:end] + # Create a PER span (known person) + span = doc.char_span(span.start_char, span.end_char, label="PER", alignment_mode="expand") + if span is not None: + span._.is_known_name = True + new_ents.append(span) + + # Deduplicate & remove overlaps, preferring earlier & longer + new_ents = sorted(new_ents, key=lambda s: (s.start_char, -s.end_char)) + 
filtered = [] + last_end = -1 + for ent in new_ents: + if ent.start_char >= last_end: + filtered.append(ent) + last_end = ent.end_char + # else skip overlaps + + doc.ents = tuple(filtered) + return doc + + +@Language.factory("known_name_ruler") +def create_known_name_ruler(nlp, name, known_names, known_places): + return KnownNameRuler(nlp, name, known_names, known_places) + + +def _load_spacy(): + try: + nlp = spacy.load("sv_core_news_md", disable=["parser","lemmatizer"]) + except OSError: + raise Error("you want to run with spacy...install it :D") + else: + df0 = pd.read_csv("riksdagen-persons/data/name.csv") + known_names = df0["name"].unique().tolist() + df1 = pd.read_csv("riksdagen-persons/data/location_specifier.csv") + known_places = df1["location"].unique().tolist() + if "known_name_ruler" not in nlp.pipe_names: + nlp.add_pipe( + "known_name_ruler", + before="ner", + config={"known_names": known_names, "known_places": known_places}, + ) + return nlp, known_names, known_places + + +def _tokenize_with_spans(text: str): + toks = text.split() + spans = [] + pos = 0 + for tok in toks: + start = pos + end = start + len(tok) + spans.append((tok, start, end)) + pos = end + 1 # +1 for the single space between tokens + return toks, spans + + +def _consume_place(toks, i_start,use_spacy_ent, spans) -> int: + """ + Consume tokens after 'i' or 'från' that look like a place. + Stops before what appears to be the next person's name. + """ + i = i_start + 1 + if i >= len(toks): + return i + + # If spaCy already detected a location entity covering the next token + if use_spacy_ent: + label, ent_s, ent_e = use_spacy_ent + if label in ("LOC", "GPE"): + j = i + while j < len(toks) and spans[j][2] <= ent_e: + j += 1 + return max(j, i) + + taken = 0 + j = i + while j < len(toks): + if taken >= MAX_PLACE_TOKENS: + break + w = toks[j] + + # Allow lowercase place-type words (län, härad, socken, etc.) 
+ if w.lower() in LOWER_PLACE_WORDS: + j += 1 + taken += 1 + continue + + # Stop if we hit "Nr" or record patterns + if RE_NR_HEAD.match(w) or w.lower().startswith("nr"): + break + + # Allow capitalized place components (Stockholms, Västra, Sundbyberg) + if re.match(r"^[A-ZÅÄÖ][a-zåäöéèàâêîôûüöäåç\-]*$", w): + # stop early if this looks like a person surname (e.g. ends in -son, -berg) + if is_surname_like(w): + break + j += 1 + taken += 1 + continue + + break + + # Must have taken at least one token to count as a place + if taken == 0: + return i_start + 1 + + return j + +def _clean_for_compare(s: str) -> str: + return re.sub(r"\s+", " ", s.strip(" \t\n.;:,!?")).strip() + + +def parse_signature_text(text, nlp, known_names, known_places): + #print("parsing signature text") + text = clean_text(text) + #print(text) + if not text: + return [] + + doc = nlp(text) + toks, spans = _tokenize_with_spans(text) + norm_toks = [] + for t in toks: + core = re.sub(r"[:;,.!?]+$", "", t.strip()) # strip trailing punctuation for classification + norm_toks.append(core if core else t) + # Ensure every span is a 3-tuple (tok, start, end) + fixed_spans = [] + for s in spans: + if isinstance(s, (list, tuple)) and len(s) == 3: + fixed_spans.append(tuple(s)) + else: + # fall back to dummy start/end positions + if isinstance(s, str): + fixed_spans.append((s, 0, len(s))) + else: + fixed_spans.append(("", 0, 0)) + spans = fixed_spans + spans = [t if len(t) == 3 else (t[0], 0, 0) for t in spans] + out = [] + i = 0 + + name_spans = [ent for ent in doc.ents if ent.label_ == "PER"] + name_spans = sorted(name_spans, key=lambda e: e.start_char) + loc_spans = [ent for ent in doc.ents if ent.label_ in ("LOC", "GPE")] + # index entities by start offset for quick lookup + ent_starts = {e.start_char: e for e in name_spans} + loc_starts = {e.start_char: e for e in loc_spans} + + print(f"TOKENS ({len(toks)}): {toks}") + + MAX_GLOBAL_STEPS = 5000 + global_steps = 0 + while i < len(toks): + global_steps += 
1 + if global_steps > MAX_GLOBAL_STEPS: + print(f"🚨 Safety break: exceeded {MAX_GLOBAL_STEPS} steps") + break + if global_steps % 1000 == 0: + print(f"[{global_steps}] i={i}/{len(toks)} tok={toks[i]!r}") + #print("--", i, len(toks)) + i_prev=i + try: + tok, s, e = spans[i] + if not isinstance(tok, str): + tok = str(tok) + if not isinstance(s, int) or not isinstance(e, int): + s, e = 0, len(tok) + except Exception: + # fallback if the tuple isn't valid + val = spans[i] + if isinstance(val, str): + tok, s, e = val, 0, len(val) + elif isinstance(val, (list, tuple)): + tok = str(val[0]) if len(val) > 0 else "" + s = int(val[1]) if len(val) > 1 and isinstance(val[1], int) else 0 + e = int(val[2]) if len(val) > 2 and isinstance(val[2], int) else s + len(tok) + else: + tok, s, e = "", 0, 0 + norm_tok = norm_toks[i] + + # if re.match(r".*[:;,.!?]$", tok): + # core = tok.rstrip(":;,.!?") + # toks[i] = core + # if core and re.match(r"^[A-ZÅÄÖ]", core): + # tok = core + + + #print("t", toks) + #print("o", out) + # ---- record id ------------------------------------------------------- + two = " ".join(norm_toks[i:i+2]) + if re.match(r"(?i)\bN[:r]\s*\d+\b", two): + #print(" -> D-bug: match record ID") + out.append({"type": "record-id", "text": two}) + i += 2 + continue + + # ---- location specifier --------------------------------------------- + elif norm_tok.lower() in {"i", "från"}: + #print(" -> D-bug: match i-ort") + j = i + 1 + + # Try to consume a following proper noun or entity + if j < len(toks): + nxt = norm_toks[j] + if re.match(r"^[A-ZÅÄÖ][a-zåäö\-]+$", nxt): + # e.g. 
"i Dahl", "från Stockholm" + j += 1 + elif j + 1 < len(toks): + # handle two-part place names like "i Nya Kopparberg" + nxt2 = norm_toks[j + 1] + if re.match(r"^[A-ZÅÄÖ][a-zåäö\-]+$", nxt) and re.match(r"^[A-ZÅÄÖ][a-zåäö\-]+$", nxt2): + j += 2 + + frag = " ".join(toks[i:j]).strip() + out.append({"type": "location-specifier", "text": frag}) + i = j + continue + + # ---- matched entity ------------------------------------------------- + elif s in ent_starts: + #print(" -> D-bug: match ent") + ent = ent_starts[s] + out.append({"type": "person", "text": ent.text}) + # advance i safely past the entity span + j = i + while j < len(spans) and spans[j][2] <= ent.end_char: + j += 1 + i = j + continue + + # Skip tokens that end with punctuation unless clearly part of a name + elif re.match(r".*[.:;!?]$", tok) and not is_initial(norm_tok) and not is_name_word(norm_tok.rstrip(":;.!?")): + out.append({"type": "other", "text": tok}) + i += 1 + continue + + elif re.fullmatch(r"[,;]", norm_tok): + i += 1 + continue + + elif re.match(r"^[A-ZÅÄÖ]", norm_tok) and norm_tok.lower() not in {"på", "av", "och", "från"}: + #print(" -> D-bug: match initials/name group") + j = i + name_tokens = [] + inner_guard = 0 + MAX_INNER_GUARD = 100 + while j < len(toks): + inner_guard += 1 + j_prev = j + nxt = norm_toks[j] + + # Hard stop markers + if nxt.lower() in {"i", "från", "på", "av"} or nxt in {"Nr", "nr"}: + break + + # --- Rule 0e: standalone colon or dash → consume and end current name + if nxt in {":", "-", "–", "—"}: + j += 1 + continue + + # --- Rule 0a: break on obviously lowercase or non-name words + if re.fullmatch(r"[a-zåäö\-]+", nxt): + break + + # --- Rule 0b: nobiliary particles continue the current name + if nxt.lower() in NOBILIARY_PARTICLES: + name_tokens.append(toks[j]) + j += 1 + continue + + # --- Rule 0c: handle multi-letter pseudo-initials like 'AA.' or 'ÅA.' 
+ if re.fullmatch(r"([A-ZÅÄÖ]{2,3}\.?)", nxt): + name_tokens.append(toks[j]) + j += 1 + continue + + # --- Rule 0d: skip trivial punctuation + if re.fullmatch(r"[,;]", nxt): + j += 1 + while j < len(toks) and toks[j].islower(): + j += 1 + break + + # Accept initials or capitalized words + if is_initial_block(nxt) or is_initial(nxt) or is_word(nxt) or re.fullmatch(r"[A-ZÅÄÖ][a-zåäö]{1,3}\.", nxt): + name_tokens.append(toks[j]) + j += 1 + + # --- Rule 1: initials + word followed by another initial → new name + if ( + is_initial(name_tokens[-1]) + and j < len(toks) + and (is_initial(norm_toks[j]) or is_word(norm_toks[j])) + ): + continue + + # --- Rule 2: don’t end on an initial; continue until surname-like word + if j < len(toks) and is_initial(name_tokens[-1]) and not is_word(norm_toks[j]): + continue + + # --- Rule 2b: no discontiguous initials + if ( + len(name_tokens) >= 2 + and is_initial(name_tokens[-1]) + and not is_initial(name_tokens[-2]) + and j < len(toks) + and is_initial(norm_toks[j]) + ): + break + + # --- Rule 2c: don't end on bare initials unless followed by surname-like word or nobiliary particle + if ( + all(is_initial(t) for t in name_tokens) + and j < len(toks) + and norm_toks[j].lower() not in NOBILIARY_PARTICLES + and not is_word(norm_toks[j]) + ): + break + + # --- Rule 3: detect next full-name start (Firstname + NextWord) + if ( + len(name_tokens) >= 2 + and j < len(toks) + and is_word(norm_toks[j]) + and (j + 1 < len(toks) and (is_word(norm_toks[j + 1]) or is_initial(norm_toks[j + 1]))) + ): + break + + # --- Rule 4b: stop if we've already got a surname-like word + # and the next token is just an initial (to prevent 'Ahlberga P.') + if ( + len(name_tokens) >= 1 + and is_word(name_tokens[-1]) + and j < len(toks) + and is_initial(norm_toks[j]) + and norm_toks[j].lower() not in NOBILIARY_PARTICLES + ): + break + + # --- Rule 5: if next tokens look like a *new* initials+surname combo, split here + if ( + len(name_tokens) >= 2 + and 
looks_like_initials_plus_surname(norm_toks, j) + ): + break + + continue + + # --- Rule 6: if next token(s) form a known name, break here + lookahead = " ".join(norm_toks[j:j+3]) + found_known = False + for kname in known_names: + if lookahead.startswith(kname) or norm_toks[j] in kname.split(): + found_known = True + break + if found_known: + break # break the inner name loop so outer loop can start next name at j + + # --- safety: ensure progress even on punctuation or noise + # --- safety: ensure progress even on punctuation or noise + if j == j_prev: + print(f"⚠️ Inner name-loop stuck at {j}, token={toks[j]!r}") + i = j + 1 + break + + if inner_guard > MAX_INNER_GUARD: + print(f"🚨 Inner loop emergency break at token {j}, {toks[j]!r}") + i = j + 1 # advance outer index so we don’t re-enter + break + + # --- finalize name tokens ----------------------------------------- + frag = " ".join(name_tokens).strip() + if frag: + cur = _clean_for_compare(frag) + last = _clean_for_compare(out[-1]["text"]) if out else None + if not out or cur != last: + out.append({"type": "person", "text": frag}) + + # --- always advance safely ---------------------------------------- + # 1. normally, go to j + # 2. 
if punctuation-ended, also skip that token + if j >= len(toks): + break + if toks[j - 1].endswith((':', ';', '.')): + i = j # skip the punctuation token + else: + i = max(i + 1, j) + + continue + + # ---------------------------------------------------------------------- + # catch-all for tokens that weren't matched by any rule + out.append({"type": "other", "text": tok}) + i += 1 + continue + + + # --- Strong guard to prevent Wallin-type repetition --- + # If we hit the end of text or the token was punctuation-terminated, stop processing this item + if j >= len(toks): + break # end of tokens reached safely + + # Defensive: check the current token only if still within range + if i < len(toks) and toks[i].endswith((':', ';', '.')): + i = j + 1 + continue + + # If nothing consumed, force skip + if j == i: + if i < len(toks): + print(f"⚠️ No progress / stuck on token {i}, tok={toks[i]!r} — skipping") + i += 1 + continue + + # Normal advance + i = max(i, j) + continue + + else: + # Catch-all: preserve unclassified token(s), avoid duplicates (ignore punctuation) + cur = _clean_for_compare(tok) + last = _clean_for_compare(out[-1]["text"]) if out else None + if not out or cur != last: + out.append({"type": "other", "text": tok}) + i += 1 + continue + # ---- end of main while i < len(toks) iteration ---- + if i_prev == i: + print(f"⚠️ No progress at token index {i}, tok={toks[i]!r} — forcing advance") + cur = _clean_for_compare(toks[i]) + last = _clean_for_compare(out[-1]["text"]) if out else None + if not out or cur != last: + out.append({"type": "other", "text": toks[i]}) + i += 1 + continue + + del doc + #print("out --", out) + return out + + +def looks_like_signature_block(sig_block, nlp): + #print("is it a signature block?") + texts = " ".join([l.strip() for itm in sig_block.itertext() for l in itm.splitlines() if l is not None and l.strip()!='']) + #print(texts) + if not texts: + #print("no texts") + return False + + if not any([c.isupper() for c in texts]): + 
#print("no uppercase") + return False + + words = texts.split() + #print(words) + if len(words) < 4: + # trivial, short, might be single name + return True + + # reject only if this looks like pure prose + avg_len = sum(len(w) for w in words) / len(words) + caps_ratio = sum(w and w[0].isupper() for w in words) / len(words) + + # allow if many short capitalized tokens + if caps_ratio > 0.5:# and avg_len < 10: + return True + + if len(words) > 100: + #print("too many words") + #print(texts) + return False # extreme prose + + # Always accept anything that has even 1 typical surname or capitalized run + if any(w.endswith(("sson","berg","man","gren","lund","ström")) for w in words): + return True + + if nlp: + doc = nlp(texts) + if any(ent.label_ == "PER" for ent in doc.ents): + return True + return True # << default to True to test the rest of the pipeline + + +def expand_signatures(root, ns, nlp, known_names, known_places): + #print("expanding signatures") + changed = False + signature_blocks = root.findall(f".//{ns['tei_ns']}div[@type=\"signatureBlock\"]") + for sb in signature_blocks: + if not looks_like_signature_block(sb, nlp): + continue + lists = sb.findall(f".//{ns['tei_ns']}list") + for list_ in lists: + old_items = list(list_.findall(f"{ns['tei_ns']}item")) + if not old_items: + continue + + full_text = ' '.join([l.strip() for t in list_.itertext() for l in t.splitlines() if l.strip() != '']) + + if not full_text or full_text=='': + continue + + parsed = parse_signature_text(full_text, nlp, known_names, known_places) + + new_items = [] + for entry in parsed: + el = etree.Element("item") + el.text = entry["text"] + t = entry["type"] + if t == "person": + el.set("type", "signature") + elif t in {"location", "location-specifier"}: + el.set("type", "location-specifier") + elif t == "record_id": + el.set("type", "record-id") + new_items.append(el) + + list_[:] = new_items + changed = True + return changed + + +def parse_signature_block(mots, start=1): + #print(f" 
parsing signature block") + nlp, known_names, known_places = _load_spacy() + for i, mot in enumerate(mots, start): + print(" ", i, mot) + root,ns = parse_tei(mot) + changed = expand_signatures(root, ns, nlp, known_names, known_places) + if changed: + write_tei(root, mot) + del root + del ns + + +def run_batch(batch, start_i, max_workers=10): + print(f"... run batch with {max_workers} processes") + ctx = mp.get_context("spawn") # ✅ safer with spaCy + ex = ProcessPoolExecutor( + max_workers=max_workers, + #initializer=worker_init, + #initargs=(known_names_csv, known_places_csv), + mp_context=ctx, + ) + ex.submit(parse_signature_block, batch, start_i) + + + + +def main(args): + args.motions = [m for m in args.motions if not m.endswith("-fört.xml") and not m.endswith("-reg.xml")] + if args.use_multithreading: + print("Using multithread") + batches = [args.motions[i:i+args.batch_size] for i in range(0, len(args.motions), args.batch_size)] + + for bi, batch in enumerate(batches, 1): + print(f"\n\n starting batch {bi} of {len(batches)}\n\n") + result = run_batch(batch, bi*args.batch_size, max_workers=args.n_workers) + gc.collect() + else: + parse_signature_block(args.motions) + + + + +if __name__ == '__main__': + parser = fetch_parser("motions", docstring=__doc__) + parser.add_argument("--use-multithreading", action='store_true') + parser.add_argument("--batch-size", type=int, default=10) + parser.add_argument("--n-workers", type=int, default=10) + main(impute_args(parser.parse_args())) + diff --git a/src/cur-mot/rename-doc-pdfs.py b/src/cur-mot/rename-doc-pdfs.py new file mode 100644 index 0000000..49067bc --- /dev/null +++ b/src/cur-mot/rename-doc-pdfs.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +from tqdm import tqdm +import argparse, os, re + + + + +def main(args): + pattern = re.compile(r'(?=\d)(?<=\D)(?=\D*?\d)') + pattern2 = re.compile(r'(?<=\d)(?=\D)') + yearpath=f"{args.doc_pdf_path}/{args.year}" + files = os.listdir(f"{yearpath}") + for f in tqdm(files): + if 
"_" in f: + continue + print(f) + spl = pattern.split(f) + committee = spl[0] + try: + Next = f"{int(spl[1][:-4]):0>4}.pdf" + except: + spl2 = pattern2.split(spl[1]) + Next = f"{int(spl2[0]):0>4}-{spl2[1]}" + #print(f, "--> " f"mot_{args.year}_{committee}_{Next}") + nn = f"mot_{args.year}_{committee}_{Next}" + os.rename(f"{yearpath}/{f}", f"{yearpath}/{nn}") + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--doc-pdf-path", default="riksdagen-motions-doc-pdf/data") + parser.add_argument("-y", "--year", type=str, required=True) + args = parser.parse_args() + main(args) diff --git a/src/cur-mot/unify-signature-block-structure.py b/src/cur-mot/unify-signature-block-structure.py new file mode 100644 index 0000000..314068e --- /dev/null +++ b/src/cur-mot/unify-signature-block-structure.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +""" +Curation of motions from different sources led to inconsistencies in the signature block structure and attribute +""" +from lxml import etree +from pyriksdagen.args import ( + fetch_parser, + impute_args, +) +from pyriksdagen.io import ( + parse_tei, + write_tei, +) +from tqdm import tqdm + + + + +def main(args): + + for motion in tqdm(args.motions): + write = False + root, ns = parse_tei(motion) + blocks = root.findall(f".//{ns['tei_ns']}div[@type=\"signatureBlock\"]") + for block in blocks: + if any(_.tag.endswith("list") for _ in block): + raise ValueError("This shouldn't happen") + list_ = etree.Element("list") + for p in block: + list_.append(p) + for p in list(block): + block.remove(p) + for item in list_: + item.tag = "item" + block.append(list_) + write = True + signs = root.findall(f".//{ns['tei_ns']}div[@type=\"motSignatures\"]") + for sign in signs: + sign.attrib["type"] = "signatureBlock" + for list_ in sign: + for item in list_: + item.attrib["type"] = "signature" + write = True + if write: + write_tei(root, motion) + + + + +if __name__ == '__main__': + parser = 
fetch_parser("motions", docstring=__doc__) + main(impute_args(parser.parse_args())) diff --git a/src/cur-mot/untag-iort-insignature-block.py b/src/cur-mot/untag-iort-insignature-block.py new file mode 100644 index 0000000..5aa91a0 --- /dev/null +++ b/src/cur-mot/untag-iort-insignature-block.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +""" +Untag stray i-ort as type=signature in signature block +""" +from pyriksdagen.args import ( + fetch_parser, + impute_args, +) +from pyriksdagen.io import ( + parse_tei, + write_tei, +) +from tqdm import tqdm +import re + + + + +def main(args): + stray_iort = re.compile(r'^(i|från)\s\S+(\s\S)?$') + for motion in tqdm(args.motions): + write = False + root, ns = parse_tei(motion) + signatures = root.findall(f".//{ns['tei_ns']}p[@type=\"signature\"]") + signatures.extend(root.findall(f".//{ns['tei_ns']}item[@type=\"signature\"]")) + for signature in signatures: + txt = ' '.join([_.strip() for _ in signature.text.splitlines() if _.strip() != ""]) + m = stray_iort.match(txt) + if m and "type" in signature.attrib and signature.attrib["type"] == "signature": + del signature.attrib["type"] + if "who" in signature.attrib and signature.attrib["who"] == "unknown": + del signature.attrib["who"] + write = True + if write: + write_tei(root, motion) + + + + +if __name__ == '__main__': + parser = fetch_parser("motions", docstring=__doc__) + main(impute_args(parser.parse_args())) diff --git a/src/cur-mot/write-ocr-correction-to-revDesc.py b/src/cur-mot/write-ocr-correction-to-revDesc.py new file mode 100644 index 0000000..0c67a8d --- /dev/null +++ b/src/cur-mot/write-ocr-correction-to-revDesc.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +from common.xml_utils import ( + parse_xml, + write_xml, +) +from datetime import datetime +from lxml import etree +from tqdm import tqdm +import argparse, os +import pandas as pd + + +def main(args): + orcid = os.environ.get("ORCID") + today = datetime.now().strftime('%Y-%m-%d') + df = 
pd.read_csv(f"{args.infile}", sep=';') + mots = df["file"].unique() + for mot in tqdm(mots): + root, ns = parse_xml(f"riksdagen-motions/{mot}") + print(root, ns) + try: + revdesc = root.find(f".//{ns['tei_ns']}revisionDesc") + assert revdesc is not None + except: + revdesc = etree.SubElement(root.find(f"{ns['tei_ns']}teiHeader"), "revisionDesc") + + mot_df = df.loc[df["file"] == mot] + for i, r in mot_df.iterrows(): + c = etree.SubElement(revdesc, "correction") + c.text = "OCR correction" + c.attrib["who"] = orcid + c.attrib["when"] = today + c.attrib["corresp"] = r['elem'] + write_xml(root, f"riksdagen-motions/{mot}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--infile", default="riksdagen-motions/_tmp.txt") + args = parser.parse_args() + main(args)