Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions src/cur-mot/check-nrs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Check that the number (filename) is actually in the document.
"""
from common.args import (
alto_args,
list_years,
verify_alto_args,
)
from common.xml_utils import (
parse_xml,
write_xml,
)
from glob import glob
from lxml import etree
from tqdm import tqdm
import argparse, os
import pandas as pd
import regex as re


def _paragraph_text(p):
    """Return the text of element *p* collapsed onto a single line.

    Each line of ``p.text`` is stripped, empty lines are dropped and the rest
    is joined with single spaces.  An element without any text (``p.text`` is
    ``None``) yields ``""`` instead of raising ``AttributeError``.
    """
    if p.text is None:
        return ""
    return " ".join([_.strip() for _ in p.text.splitlines() if _.strip() != ''])


def main(args):
    """Check that each motion's number (taken from its filename) occurs in it.

    For every year returned by ``list_years(args)`` and when ``args.list`` is
    set, every ``*.xml`` file under ``{args.motionspath}/{year}/`` is parsed
    and its ``<p>`` elements are tested in up to three passes:

    1. fuzzy match of ``Nr <number>`` at the start of a paragraph; any hit
       marks the motion as resolved;
    2. (unresolved motions only) fuzzy match of ``Nr <anything>`` at the start
       of a paragraph; hits are recorded as rows;
    3. (unresolved motions only) plain search for ``Nr <anything>`` anywhere
       in a paragraph; hits are recorded and mark the motion as resolved.

    Motions still unresolved afterwards get a row with empty id and text.
    The de-duplicated rows are written to
    ``{args.io_path}/_{year}-nomatch-nr.tsv`` and the counters are printed.

    ``args.fix_listed`` is accepted but currently does nothing.
    """
    # The one motion for which verbose tracing is printed.
    debug_motion = "riksdagen-motions/data/1962/mot-1962--ak--00262.xml"
    # These two patterns do not depend on the motion number: compile them once.
    pat_2 = re.compile(r'((N|n)r[\.\,]?\s\S+){i<=1,d<=1,s<=1,e<=1}')
    pat_3 = re.compile(r'(N|n)r[\.\,]?\s\S+')
    cols = ["mot", "elem_id", "nr", "text"]

    years = list_years(args)
    for year in years:
        print(year)
        if args.list:
            p_count = 0
            matches = 0
            mismatch = 0
            mismatch_2 = 0
            no_match = 0
            motions = sorted(glob(f"{args.motionspath}/{year}/*.xml"))
            rows = []

            for mot in tqdm(motions):
                debug = mot == debug_motion
                root, ns = parse_xml(mot, get_ns=True)

                # The motion number is the last '-'-separated field of the path.
                nr = int(mot.split('-')[-1].replace(".xml", ""))
                pat = re.compile(fr'((N|n)r[\.\,]?\s{nr}[\.\,]?){{i<=1,d<=1,s<=1,e<=1}}')
                ps = root.findall(f".//{ns['tei_ns']}p")
                # Flatten every paragraph once; all three passes reuse the result.
                texts = [_paragraph_text(p) for p in ps]
                id_key = f"{ns['xml_ns']}id"

                M = False
                # Pass 1: the expected number itself.
                for _text in texts:
                    p_count += 1
                    if debug:
                        print("0", _text)
                    if pat.match(_text) is not None:
                        matches += 1
                        M = True
                    elif debug:
                        print("1", _text)

                # Pass 2: some other number at the start of a paragraph.
                # This pass does not mark the motion as resolved, so pass 3
                # still runs for the same motion.
                if not M:
                    for p, _text in zip(ps, texts):
                        m = pat_2.match(_text)
                        if m is not None:
                            mismatch += 1
                            rows.append([mot, p.attrib[id_key], nr, m.group(0)])
                        elif debug:
                            print("2", _text)

                # Pass 3: a number-like string anywhere in a paragraph.
                if not M:
                    for p, _text in zip(ps, texts):
                        m = pat_3.search(_text)
                        if debug:
                            print(m)
                        if m is not None:
                            mismatch_2 += 1
                            M = True
                            rows.append([mot, p.attrib[id_key], nr, _text])
                        elif debug:
                            print("3", _text)

                if not M:
                    no_match += 1
                    rows.append([mot, None, nr, None])

            df = pd.DataFrame(rows, columns=cols)
            df.drop_duplicates(inplace=True)
            df.to_csv(f"{args.io_path}/_{year}-nomatch-nr.tsv", sep="\t", index=False)

            print(" p", p_count)
            print(" ma", matches)
            print(" mi", mismatch)
            print(" mi2", mismatch_2)
            print(" 0", no_match)

        if args.fix_listed:
            pass




if __name__ == '__main__':
    # Build the command line on top of the shared ALTO argument parser.
    cli = alto_args(__file__)
    cli.add_argument("-o", "--io-path", default="input/mot-unmatched-nr")
    # Two plain on/off switches, registered in the same order as before.
    for switch in ("--list", "--fix-listed"):
        cli.add_argument(switch, action='store_true')
    main(verify_alto_args(cli.parse_args()))
36 changes: 36 additions & 0 deletions src/cur-mot/committee-discrepancy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python3

from pyriksdagen.args import (
fetch_parser,
impute_args
)

def abb(filename, list_):
    """Collect the third '-'-separated field of a file's basename.

    The field is appended to *list_* (in place) when it is non-empty and not
    already present.  The path is echoed to stdout first.  Returns *list_*.
    """
    print(filename)
    basename = filename.split('/')[-1]
    field = basename.split('-')[2]
    if field and field not in list_:
        list_.append(field)
    return list_

def main(args):
    """Print the abbreviations found in two sets of file paths.

    The first set is ``args.motions``; the second is read from the text file
    ``args.tmp`` (one path per line, blank lines ignored).  Each set's
    abbreviations, as extracted by ``abb``, are printed as a sorted list.
    """
    from_motions = []
    for path in args.motions:
        # abb() extends the list in place, so its return value is not needed.
        abb(path, from_motions)

    with open(args.tmp, 'r') as handle:
        listed = [line.strip() for line in handle.readlines() if line.strip() != '']

    from_listing = []
    for path in listed:
        from_listing = abb(path, from_listing)

    print(sorted(from_motions))
    print(sorted(from_listing))



if __name__ == '__main__':
    # Standard "motions" parser plus the path of the listing to compare with.
    cli = fetch_parser("motions")
    cli.add_argument("--tmp", default="riksdagen-motions/_tmp.txt")
    main(impute_args(cli.parse_args()))
37 changes: 37 additions & 0 deletions src/cur-mot/denest-pages-pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
Denest pages/ directory in the pdf repository.
"""
import argparse
import os
import shutil
from glob import glob

def main(args):
    """Move the contents of each package's ``pages/`` directory up one level.

    For every directory under
    ``riksdagen-motions-pdf/data/{args.parliament_year}/``:

    * ``*.pdf`` files in ``pages/`` are moved into the package directory;
    * ``*.png`` files are deleted when ``args.remove_png`` is true, otherwise
      moved like the PDFs;
    * every other file in ``pages/`` is deleted;
    * the then-empty ``pages/`` directory is removed.

    Packages without a ``pages/`` directory are skipped, so the script can be
    rerun after an interrupted or completed run without crashing.
    """
    pdf_packages = sorted(glob(f"riksdagen-motions-pdf/data/{args.parliament_year}/*/"))
    for p in pdf_packages:
        pages_dir = f"{p}pages/"
        if not os.path.isdir(pages_dir):
            # Nothing to denest here; os.rmdir() below would otherwise raise.
            continue
        for file_ in glob(f"{pages_dir}*"):
            print(" --", file_)
            keep = file_.endswith(".pdf") or (
                file_.endswith(".png") and not args.remove_png
            )
            if keep:
                shutil.move(file_, p)
                print("mv:", file_, p)
            else:
                os.remove(file_)
                print("rm:", file_)
        os.rmdir(pages_dir)


def _parse_bool(value):
    """Convert a command-line token such as ``"true"`` or ``"0"`` to a bool.

    ``type=bool`` cannot be used for this: ``bool("False")`` is ``True``
    because every non-empty string is truthy.  The empty string still maps to
    ``False``, as it did with ``bool``.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a recognised boolean.
    """
    token = str(value).strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-y", "--parliament-year", type=str, required=True)
    # PNGs are removed by default; pass e.g. "-P false" to move them instead.
    parser.add_argument("-P", "--remove-png", type=_parse_bool, default=True)
    args = parser.parse_args()
    main(args)
121 changes: 121 additions & 0 deletions src/cur-mot/detect-title.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""
annotate titles
"""
from pyriksdagen.args import (
fetch_parser,
impute_args,
)
from pyriksdagen.io import (
parse_tei,
write_tei,
)
from tqdm import tqdm
import regex as re


# (?P<fullname>\b[A-ZÅÄÖ][a-zåäö]+(?:\s+[A-ZÅÄÖ][a-zåäö]+)+\b)

def main(args):
    """Tag title paragraphs in motion files, or self-test the title pattern.

    With ``args.test`` set, the compiled pattern is printed and tried against
    a built-in list of sample strings, reporting which ones match.

    Otherwise every file in ``args.motions`` (minus those whose third
    '/'-separated path component is "fört" or "reg") is parsed; each ``<p>``
    whose flattened text matches the pattern gets ``type="titleString"``, and
    a file is written back only if at least one paragraph was tagged.
    """
    # Forms of address.  ``{e<=1}`` is the `regex` module's fuzzy-matching
    # suffix: the preceding group may contain at most one error.
    salutation = r"""
\b(((g|G)re(f)?ve){e<=1}|
(((F|f)ri)?(H|h)err(e|ar)?){e<=1}|
(H|h)r|
(?i:fru){e<=1}|
(?i:fröken){e<=1})\b
"""
    # Keywords that introduce the subject of a motion.
    subjkw = r"""
\b((O|o)m|
(A|a)ngående|
i(:)?\sanledning)\b
"""

    # Title line: optional leading character, optional "N:o <nr>." prefix,
    # optional "Av"/"Af".  The conditional ``(?(av)...|...)`` then requires,
    # when "Av"/"Af" was seen, either a salutation or a full name followed by
    # a subject keyword; without "Av"/"Af" it requires a salutation followed
    # by a subject keyword.  Doubled braces are literal braces in the f-string.
    pat = re.compile(rf"""
^(\S\s)?
(?P<number>
(
(N(:)?(r|o)(\.)?){{e<=1}}\s
.{{1,3}}(\.)?\s
)?
)
(?:(?P<av>A(v|f)){{e<=1}}\s)?
(?(av)
(?:
(?P<salutation>{salutation}).*
|
(
(?P<fullname>
\b(?:
[A-ZÅÄÖ]{{1,2}}\.?|
[A-ZÅÄÖ][a-zåäö]+
)
(?:\s+
(?:
[A-ZÅÄÖ]{{1,2}}\.?|
[A-ZÅÄÖ][a-zåäö]+
)
)+\b
)
.*
(?P<subjkw>{subjkw})
.*
)
)
|
(?P<salutation>{salutation}).*(?P<subjkw>{subjkw}).*
)
""", re.VERBOSE)


    if args.test:
        # Self-test mode: no files are read or written.
        test_cases = [
                # Should match
                "4f Ola Lasson: Om fattigunderstöd af allmänna medel för den som,",
                "Af RK. P. Arnoldson: Om ändring af 28 $ 1 mom. i Regerings- formen.",
                "N:o 27. Af herr J. Anderson i Tenhult, om höjd tull å sulläder m. m.",
                "Av Fru Mróz Om saken",
                "Af Fru Markowska Angående saken",
                "Av Hr Kozławski komer en bóbr, krwa.",
                "Fru Om saken",
                "Af Margaret Thatcher om",
                # Shouldn't match
                "-----------------",
                "Om saken direkt",
                "av Idi Amin",
                "af de tafel valde het boek om",
                "herr Zdzisław Kosłąwski såg en bóbr",
            ]
        print(pat.pattern)
        for tc in test_cases:
            m = None
            m = pat.match(tc)
            if m:
                print("matched: ", tc)
            else:
                print("didn't match:", tc)

    else:
        # Skip motions whose third path component is "fört" or "reg".
        # NOTE(review): this assumes paths with at least three '/'-separated
        # components; shorter paths would raise IndexError -- confirm.
        args.motions = [_ for _ in args.motions if _.split('/')[2] not in ["fört", "reg"]]
        for motion in tqdm(args.motions):
            # Only rewrite a file when something in it was actually tagged.
            write = False
            root, ns = parse_tei(motion)
            Ps = root.findall(f".//{ns['tei_ns']}p")
            for p in Ps:
                if p.text is not None:
                    # Collapse the paragraph onto one line before matching.
                    t = ' '.join([_.strip() for _ in p.text.splitlines() if _.strip() != ''])
                    # The pattern starts with ``^``, so search() can only
                    # match at the beginning of the flattened text.
                    m = pat.search(t)
                    if m:
                        p.attrib["type"] = "titleString"
                        write = True

            if write:
                write_tei(root, motion)




if __name__ == '__main__':
    # Standard "motions" parser, with a switch for the built-in self-test.
    cli = fetch_parser("motions", docstring=__doc__)
    cli.add_argument("--test", action='store_true')
    parsed = cli.parse_args()
    main(impute_args(parsed))
28 changes: 28 additions & 0 deletions src/cur-mot/doc-formatting-ck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
from glob import glob
from tqdm import tqdm
import argparse
import pandas as pd


def main(args):
    """Print how often each ``height`` value occurs among fully confident rows.

    Every ``*.tsv`` file under
    ``riksdagen-motions-pdf/data/{args.parliament_year}/*/`` is read as a
    tab-separated table.  The distinct ``conf`` values of each file are
    printed, and rows whose ``conf`` equals 100 are tallied by ``height``.
    Finally the tally is printed as ``height : count`` lines, most frequent
    first (ties broken by larger height first).
    """
    # Local import so the block does not depend on a file-level change.
    from collections import Counter

    D = Counter()
    tsvs = sorted(glob(f"riksdagen-motions-pdf/data/{args.parliament_year}/*/*.tsv"))
    for tsv in tqdm(tsvs):
        df = pd.read_csv(tsv, sep='\t')
        print(df["conf"].unique())

        for _, r in df.iterrows():
            if r["conf"] == 100:
                D[r["height"]] += 1

    # A plain loop: printing is a side effect, not something to collect.
    for k, v in sorted(D.items(), key=lambda kv: (kv[1], kv[0]), reverse=True):
        print(k, ":", v)

if __name__ == '__main__':
    # The parliament year selects which data directory is scanned.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("-y", "--parliament-year", type=str, required=True)
    main(cli.parse_args())
Loading