-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathner2tei_index.py
More file actions
83 lines (71 loc) · 3.6 KB
/
ner2tei_index.py
File metadata and controls
83 lines (71 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import csv
from collections import defaultdict
from lxml import etree
# --- Load the NER/Wikidata CSV and build the place-name lookup tables --------
#
# Columns read from each CSV row (as used further down in this script):
#   row[2] -> place name as it appears in the source text ("original")
#   row[3] -> Wikidata identifier (later appended to the wikidata.org URL)
#   row[5] -> geographic coordinates (later written to <geo>)
#   row[6] -> normalized place name (empty for unclassifiable places)

placeNames = []  # Normalized names, no duplicates (sorted once loading is done)
placeNamesCoords = []  # [normalized name, [coordinates, Wikidata id]], one per unique name
placesNamesOriginal = []  # [normalized name, original name], duplicates included

# Set used only for O(1) "already seen?" checks; placeNames keeps the list form.
_seen_place_names = set()

# newline='' is what the csv module expects for files handed to csv.reader.
with open('../pliegos-ner/moreno-ner/nerList_Moreno_Wikidata.csv', encoding='utf-8', newline='') as f:
    csv_file = csv.reader(f)
    next(csv_file, None)  # Skip the header row; do not fail on an empty file
    for line in csv_file:
        # csv.reader yields [] for blank lines (e.g. a trailing newline).
        if not line:
            continue
        normalized = line[6]
        # Unclassifiable places have no normalized name: ignore them entirely.
        if normalized == '':
            continue
        # First occurrence of this normalized name: register it with its data.
        if normalized not in _seen_place_names:
            _seen_place_names.add(normalized)
            placeNames.append(normalized)
            placeNamesCoords.append([normalized, [line[5], line[3]]])
        # Every kept row contributes a (normalized, original) pair.
        placesNamesOriginal.append([normalized, line[2]])

# Sort once after loading instead of after every insertion.
placeNames.sort()

# [normalized name, original name] pairs without duplicates, sorted.
# Tuples are used only temporarily so the pairs can go through a set.
placesNamesDeduplicated = [
    list(pair) for pair in sorted({tuple(p) for p in placesNamesOriginal})
]

# Group the original spellings under their normalized name.
data = defaultdict(list)
for key, value in placesNamesDeduplicated:
    data[key].append(value)

# [normalized name, [original name, ...]] for every normalized name.
placesNamesPairs = [[key, values] for key, values in data.items()]
# --- TEI document skeleton ----------------------------------------------------
# NOTE(review): xmlns is supplied as an ordinary keyword attribute rather than
# through an nsmap — confirm the serialized output is what downstream tools expect.
root = etree.Element("TEI", xmlns="http://www.tei-c.org/ns/1.0")

# <teiHeader>/<fileDesc> holds the title, publication and source statements.
header = etree.SubElement(root, "teiHeader")
file_desc = etree.SubElement(header, "fileDesc")

# <titleStmt>: title of the generated index.
title_stmt = etree.SubElement(file_desc, "titleStmt")
etree.SubElement(title_stmt, "title").text = "Desenrollando el cordel: Índice de lugares"

# <publicationStmt>: publishing authority and licence.
publication_stmt = etree.SubElement(file_desc, "publicationStmt")
etree.SubElement(publication_stmt, "authority").text = "Bibliothèque Universitaire de Genève (BUNIGE)"
availability = etree.SubElement(publication_stmt, "availability", status="restricted")
etree.SubElement(
    availability,
    "licence",
    n="cc by nc sa",
    target="https://creativecommons.org/licenses/by-nc-sa/4.0/",
)

# <sourceDesc> containing a single empty <p>.
source_desc = etree.SubElement(file_desc, "sourceDesc")
etree.SubElement(source_desc, "p")

# <standOff>/<listPlace>: container that receives one <place> per place name.
standoff = etree.SubElement(root, "standOff")
listPlace = etree.SubElement(standoff, "listPlace")
# --- One <place> entry per normalized place name ------------------------------
# Index both lookup tables by normalized name so each place is resolved with a
# dictionary lookup instead of rescanning the whole list.
_variants_by_name = {name: variants for name, variants in placesNamesPairs}
_details_by_name = {name: details for name, details in placeNamesCoords}

for i in placeNames:
    place = etree.SubElement(listPlace, "place")
    # NOTE(review): only spaces are replaced here; an xml:id must be a valid
    # XML name, so names with other punctuation or a leading digit may need
    # further cleaning — confirm against the data.
    place.set("{http://www.w3.org/XML/1998/namespace}id", i.replace(" ", "_"))
    place.set("n", i)

    # Main (normalized) form of the name.
    placeName = etree.SubElement(place, "placeName")
    placeName.set("type", "main")
    placeName.text = i

    # Alternative spellings found in the sources, as one comma-separated
    # <placeName type="alt">. It is emitted only when the normalized form is
    # not itself among the original spellings.
    # NOTE(review): this also drops the other variants whenever the normalized
    # form occurs in the sources — confirm that this is intended.
    variants = _variants_by_name.get(i)
    if variants is not None and i not in variants:
        altName = etree.SubElement(place, "placeName")
        altName.set("type", "alt")
        # Join the names directly; going through str(list) and stripping
        # brackets/quotes mangles names that contain ', [ or ].
        altName.text = ", ".join(variants)

    # <location> is always created; <geo> and the Wikidata <desc>/<ref> are
    # filled in when details are recorded for this name.
    location = etree.SubElement(place, "location")
    details = _details_by_name.get(i)
    if details is not None:
        coordinates, wikidata_id = details
        geo = etree.SubElement(location, "geo")
        geo.text = coordinates
        desc = etree.SubElement(place, "desc")
        ref = etree.SubElement(desc, "ref")
        ref.set("target", "https://www.wikidata.org/wiki/" + wikidata_id)
        ref.text = wikidata_id
# tree = etree.ElementTree(root)
# print(etree.tostring(root, xml_declaration=True, encoding='UTF-8', pretty_print=True))
# tree.write("places.xml", xml_declaration=True, encoding='UTF-8', pretty_print=True)