# TEI_To_Excel.py
import argparse
from openpyxl import load_workbook
import xml.etree.ElementTree as eT
import os
import re
# XML-TEI namespace used by every XPath query below
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Worksheet column letter -> XPath of the TEI element whose text fills that column
COLUMN_XPATHS = {
    'E': ".//tei:fileDesc/tei:titleStmt/tei:title",  # Title
    'G': ".//tei:sourceDesc//tei:publicationStmt/tei:date",  # Date
    'H': ".//tei:publisher",  # Name of the printer
    'I': ".//tei:sourceDesc//tei:pubPlace",  # Place of publication
}


def _read_tei_metadata(xml_path):
    """Return {column letter: text} for the metadata found in one XML-TEI file.

    Elements that are absent, or present but without text, are left out of
    the result so that the caller does not overwrite existing cells with
    empty values. The title (column E) is passed through str.capitalize().

    Raises xml.etree.ElementTree.ParseError if the file is not well-formed.
    """
    root = eT.parse(xml_path).getroot()
    values = {}
    for column, xpath in COLUMN_XPATHS.items():
        node = root.find(xpath, TEI_NS)
        # find() returns None for a missing element and .text is None for an
        # empty one; both cases are skipped instead of raising AttributeError.
        if node is None or node.text is None:
            continue
        values[column] = node.text
    if 'E' in values:
        values['E'] = values['E'].capitalize()
    return values


def _index_rows_by_label(ws):
    """Map each document name found in column B (row 2 downwards) to its row numbers."""
    rows_by_label = {}
    # No max_row: iter_rows then stops at the last used row of the worksheet.
    for (cell,) in ws.iter_rows(min_row=2, min_col=2, max_col=2):
        if cell.value is not None:
            # A name may appear on several rows; every one of them is updated.
            rows_by_label.setdefault(cell.value, []).append(cell.row)
    return rows_by_label


def main():
    """Copy title, date, printer and place from XML-TEI files into the workbook.

    For every ``*.xml`` file in the ``--tei`` folder, the rows of the
    "metadata" worksheet whose column B equals the file name (without
    extension) receive the title (E), date (G), printer (H) and place of
    publication (I). The result is saved to a separate workbook.
    """
    # Creation of the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--tei', type=str, required=True,
                        help="Path to the XML-TEI folder.")
    parser.add_argument('--xlsx', type=str, default='../varios_woodcuts.xlsx',
                        help="Path to the Excel file to read.")
    parser.add_argument('--out', type=str, default='../varios_woodcuts_update.xlsx',
                        help="Path of the updated Excel file to write.")
    args = parser.parse_args()

    # Loading of the Excel file and selection of the worksheet
    wb = load_workbook(filename=args.xlsx)
    ws1 = wb["metadata"]

    # Column B is scanned once; each XML file is then a dictionary lookup.
    rows_by_label = _index_rows_by_label(ws1)

    # Addition of data to XLSX worksheet
    for f in sorted(os.listdir(args.tei)):
        if not f.endswith('.xml'):
            continue
        label, _ext = os.path.splitext(f)  # Splitting of the filename
        rows = rows_by_label.get(label)
        if not rows:
            continue  # No row of the worksheet refers to this document
        xml_path = os.path.join(args.tei, f)  # Path of the XML file
        try:
            values = _read_tei_metadata(xml_path)
        except eT.ParseError as err:
            # One malformed file should not abort the whole batch.
            print("Warning: {} skipped, not well-formed XML ({})".format(xml_path, err))
            continue
        missing = sorted(set(COLUMN_XPATHS) - set(values))
        if missing:
            print("Warning: {}: no value found for column(s) {}".format(
                xml_path, ", ".join(missing)))
        for row in rows:
            for column, text in values.items():
                ws1[column + str(row)] = text

    wb.save(filename=args.out)


if __name__ == "__main__":
    main()