-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdateparse.py
More file actions
346 lines (330 loc) · 14.2 KB
/
dateparse.py
File metadata and controls
346 lines (330 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
import pbw
import config
import re
import convertdate
from datetime import datetime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
def julian_day(year, month, day):
"""Return the Julian day for the given system"""
return convertdate.julian.to_jd(year, month, day)
def julian_eom(year, month):
"""Return the Julian day for the end of the given month"""
if month in convertdate.julian.HAVE_31_DAYS:
return convertdate.julian.to_jd(year, month, 31)
elif month in convertdate.julian.HAVE_30_DAYS:
return convertdate.julian.to_jd(year, month, 30)
elif convertdate.julian.leap(year):
return convertdate.julian.to_jd(year, month, 29)
else:
return convertdate.julian.to_jd(year, month, 28)
def produce_range(datestr):
"""Produce a pair of strings where a dash has been decomposed"""
# Regex all the ones that don't register algorithmically
splitstr = re.match(r'(\d+)\s+(\w+)-(\d+)', datestr)
if splitstr is not None: # 1083 Winter-1085
return "%s %s" % (splitstr.group(1), splitstr.group(2)), splitstr.group(3)
splitstr = re.match(r'(\d+)-(\d+)\s+(\w+)-(\w+)', datestr)
if splitstr is not None: # 1046-1047 December-January
return "%s %s" % (splitstr.group(1), splitstr.group(3)), "%s %s" % (splitstr.group(2), splitstr.group(4))
splitstr = re.match(r'(\d+)\s+(\w+\s+\d+)-(\w+\s+\d+)', datestr)
if splitstr is not None: # 1033 February 20-March 15
return "%s %s" % (splitstr.group(1), splitstr.group(2)), "%s %s" % (splitstr.group(1), splitstr.group(3))
splitstr = re.match(r'(\d+)\s+(\w+(\s+\d+)?) to (\w+(\s+\d+)?)', datestr)
if splitstr is not None: # 1168 October to November 3
return "%s %s" % (splitstr.group(1), splitstr.group(2)), "%s %s" % (splitstr.group(1), splitstr.group(4))
# Now the algorithm.
# First split the thing into space-separated words
words = datestr.split()
# Now find the index of the word with a dash
rng = []
for ridx in range(len(words)):
if '-' in words[ridx]:
# Split it
parts = words[ridx].split('-')
for p in parts:
lst = words[0:ridx]
lst.append(p)
rng.append(' '.join(lst))
return tuple(rng)
def day_string(jd):
# Get the Julian calendar date
moy = ["", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
j = convertdate.julian.from_jd(jd)
return "%d %s %d" % (j[2], moy[j[1]], j[0])
def parse_date(datestr):
# Make a dating node. First try to parse the date
dt = None
dmin = None
dmax = None
try: # 1085
dt = datetime.strptime(datestr, "%Y")
dmin = julian_day(dt.year, dt.month, dt.day)
dmax = julian_day(dt.year, 12, 31)
except ValueError:
pass
if dt is None:
try: # 1085 January
dt = datetime.strptime(datestr, "%Y %B")
dmin = julian_day(dt.year, dt.month, dt.day)
dmax = julian_eom(dt.year, dt.month)
except ValueError:
pass
if dt is None:
try: # 1085 Jan
dt = datetime.strptime(datestr, "%Y %b")
dmin = julian_day(dt.year, dt.month, dt.day)
dmax = julian_eom(dt.year, dt.month)
except ValueError:
pass
if dt is None:
try: # 1085 mid-January
dt = datetime.strptime(datestr, "%Y mid-%B")
dmin = julian_day(dt.year, dt.month, 9)
dmax = julian_day(dt.year, dt.month, 21)
except ValueError:
pass
if dt is None:
try: # 1085 middle of January
dt = datetime.strptime(datestr, "%Y middle of %B")
dmin = julian_day(dt.year, dt.month, 9)
dmax = julian_day(dt.year, dt.month, 21)
except ValueError:
pass
if dt is None:
try: # 1085 January 23
dt = datetime.strptime(datestr, "%Y %B %d")
dmin = julian_day(dt.year, dt.month, dt.day)
dmax = julian_day(dt.year, dt.month, dt.day)
except ValueError:
pass
if dt is None:
try: # 1085 23 January
dt = datetime.strptime(datestr, "%Y %d %B")
dmin = julian_day(dt.year, dt.month, dt.day)
dmax = julian_day(dt.year, dt.month, dt.day)
except ValueError:
pass
if dt is None:
try: # 1085 January23
dt = datetime.strptime(datestr, "%Y %B%d")
dmin = julian_day(dt.year, dt.month, dt.day)
dmax = julian_day(dt.year, dt.month, dt.day)
except ValueError:
pass
if dt is None:
try: # 1085 Jan 23
dt = datetime.strptime(datestr, "%Y %b %d")
dmin = julian_day(dt.year, dt.month, dt.day)
dmax = julian_day(dt.year, dt.month, dt.day)
except ValueError:
pass
if dt is None:
try: # 1085 after January
dt = datetime.strptime(datestr, "%Y after %B")
dmin = julian_day(dt.year, dt.month + 1, 1)
dmax = julian_day(dt.year, 12, 31)
except ValueError:
pass
if dt is None:
try: # c. 1085
dt = datetime.strptime(datestr, "c. %Y")
dmin = julian_day(dt.year - 3, 1, 1)
dmax = julian_day(dt.year + 3, 12, 31)
except ValueError:
pass
if dt is None:
timeofday = re.match(r'(.*?),?\s+(morning|evening)$', datestr)
if timeofday is not None:
return parse_date(timeofday.group(1))
if dt is None:
easterdate = re.match(r'(\d+)\s+easter', datestr, flags=re.I)
if easterdate is not None:
# Get the date for Orthodox Easter that year
dt = convertdate.holidays.easter(int(easterdate.group(1)), church="orthodox")
dmin = julian_day(*dt)
dmax = dmin
print("Easter for year %d falls on %d/%d" % dt)
if dt is None:
lentdate = re.match(r'(\d+)\s+lent', datestr, flags=re.I)
if lentdate is not None:
# Get the date for Orthodox Easter that year
dt = convertdate.holidays.easter(int(lentdate.group(1)), church="orthodox")
dmax = julian_day(*dt) - 1
dmin = dmax - 46
if dt is None:
pentdate = re.match(r'(\d+)\s+pentecost', datestr, flags=re.I)
if pentdate is not None:
# Get the date for Orthodox Easter that year
dt = convertdate.holidays.easter(int(pentdate.group(1)), church="orthodox")
dmin = julian_day(*dt) + 49
dmax = dmin
if dt is None and '-' in datestr or ' to ' in datestr:
# See if we can parse two ends of a range.
daterange = produce_range(datestr)
if len(daterange) == 2:
firstrange = parse_date(daterange[0])
secondrange = parse_date(daterange[1])
dt = firstrange[0]
dmin = dt
dmax = secondrange[1]
# if dmin and dmax:
# print("Parsed string %s as %s - %s" % (datestr, day_string(dmin), day_string(dmax)))
# Handle interstitial qualifiers
if dt is None:
beforeafter = re.match(r'(\d+)\s+(before|after|early|late|around|mid(dle)?|end)( of)?\s+(.*)$',
datestr, flags=re.I)
if beforeafter is not None:
year = int(beforeafter.group(1))
qualifier = beforeafter.group(2)
rest = beforeafter.group(5)
dt = parse_date("%d %s" % (year, rest))
if dt[0] is not None:
if qualifier.lower() == "before":
dmin = julian_day(year, 1, 1)
dmax = dt[0] - 1
elif qualifier.lower() == "after":
dmin = dt[1] + 1
dmax = julian_day(year, 12, 31)
elif qualifier.lower() == "early":
dmin = dt[0]
dmax = dt[0] + (dt[1] - dt[0]) / 2
elif qualifier.lower() == "late":
dmin = dt[0] + (dt[1] - dt[0]) / 2
dmax = dt[1]
elif qualifier.lower() == "around":
magnitude = dt[1] - dt[0]
if magnitude < 10:
# Add 3 days each side if it's a matter of days
dmin = dt[0] - 3
dmax = dt[1] + 3
else: # Add half the timespan
dmin = dt[0] - magnitude / 2
dmax = dt[1] + magnitude / 2
elif "mid" in qualifier.lower():
# Take the middle half of the timespan
magnitude = dt[1] - dt[0]
dmin = dt[0] + magnitude / 4
dmax = dt[1] - magnitude / 4
elif qualifier.lower() == "end":
# Take the last 25% of the timespan
magnitude = dt[1] - dt[0]
dmin = dt[1] - magnitude / 4
dmax = dt[1]
# print("Parsed interstitial %s as %s - %s" % (datestr, day_string(dmin), day_string(dmax)))
if dt is None:
inverted = False
seasonstring = r'(winter|spring|summer|autumn|beginning|early|mid(dle)?|late|end|(first|second) half)'
seasonal = re.match(r'(\d+)\s+%s$' % seasonstring, datestr.lower())
if seasonal is None:
inverted = True
seasonal = re.match(r'%s\s+(\d+)$' % seasonstring, datestr.lower())
if seasonal is not None:
year = int(seasonal.group(4 if inverted else 1))
season = seasonal.group(1 if inverted else 2)
if season == "winter":
dmin = julian_day(year-1, 12, 1)
dmax = julian_day(year, 3, 20)
elif season == "spring":
dmin = julian_day(year, 3, 1)
dmax = julian_day(year, 6, 20)
elif season == "summer":
dmin = julian_day(year, 6, 1)
dmax = julian_day(year, 9, 20)
elif season == "autumn":
dmin = julian_day(year, 9, 1)
dmax = julian_day(year, 12, 20)
elif season == "beginning":
dmin = julian_day(year, 1, 1)
dmax = julian_eom(year, 2)
elif season == "early":
dmin = julian_day(year, 1, 1)
dmax = julian_day(year, 4, 15)
elif "mid" in season:
dmin = julian_day(year, 4, 15)
dmax = julian_day(year, 9, 15)
elif season == "late":
dmin = julian_day(year, 9, 15)
dmax = julian_day(year, 12, 31)
elif season == "end":
dmin = julian_day(year, 11, 1)
dmax = julian_day(year, 12, 31)
elif season == "first half":
dmin = julian_day(year, 1, 1)
dmax = julian_day(year, 6, 30)
elif season == "second half":
dmin = julian_day(year, 7, 1)
dmax = julian_day(year, 12, 31)
return dmin, dmax
def clean_datestring(dstr):
# Get rid of leading space and leading asterisk
datestr = dstr.lstrip().lstrip('*')
datestr = datestr.rstrip('*')
# Get rid of em- or en-dashes
datestr = datestr.replace('–', '-').replace('—', '-')
# Get rid of trailing colons or spaces, condense all spaces to a single space
datestr = re.sub(r'\s+', ' ', re.sub(r':?\s*$', '', datestr)).replace(' - ', '-')
# Get rid of commas and colons after the year
datestr = re.sub(r'^(\d+)[,:;]', r'\1', datestr)
return datestr
def parse_date_info(nunit):
# Get the date string and try to rationalise its format
datestr = clean_datestring(nunit.dates)
if not datestr or datestr == "0":
return
# Get the date type
datetype = nunit.dateTypeKey # 0/1 undef, 2 internal approx., 3 inferred, 4 uncertain, 5 median, 6 wrong
# See if the date has some sort of qualifier
qualified = re.match(r'(.*)\s+\((.*)\)\s*$', datestr)
if qualified is None:
qualified = re.match(r'(.*?)\s*\(?(\?)\)?(.*)$', datestr)
if qualified is not None:
datestr = qualified.group(1)
qualifier = qualified.group(2).lower()
if len(qualified.groups()) == 3:
datestr += qualified.group(3)
# Send it through the cleaner again just in case there is a leftover comma
datestr = clean_datestring(datestr)
if 'uncertain' in qualifier or 'uncetain' in qualifier or 'uncertan' in qualifier or qualifier == '?':
datetype = 4
elif 'mistaken' in qualifier or 'guess' in qualifier:
datetype = 6
elif 'median' in qualifier:
datetype = 5
elif 'august' in qualifier:
# Singleton: after Michael VI accession in 1056
datetype = 2
datestr = "1056 autumn"
elif qualifier == 'confused':
# Singleton: event seems to have happened in 1052
datestr = "1052"
else:
print("Ignoring date qualifier %s" % qualifier)
# Make a dating node. First try to parse the date
dmin, dmax = parse_date(datestr)
if dmin is None or dmax is None:
# Print the date in some kind of standardised format
# print("Parsed date %s of type %d in range %s - %s"
# % (nunit.dates, datetype, day_string(dmin), day_string(dmax)))
print("Unparsed date %s (%s)" % (nunit.dates, datestr))
return dmin, dmax, datetype
if __name__ == '__main__':
# Connect to the SQL DB
engine = create_engine('mysql+pymysql://' + config.dbstring)
smaker = sessionmaker(bind=engine)
mysqlsession = smaker()
# Get all the narrative dates
unparsed = 0
print("Trying narrative factoids")
for nu in mysqlsession.query(pbw.NarrativeUnit).all():
result = parse_date_info(nu)
if result is not None and result[0] is None:
unparsed += 1
print("Trying death factoids")
for df in mysqlsession.query(pbw.DeathFactoid).all():
if df.sourceDate is not None and df.sourceDate != '':
result = parse_date(df.sourceDate)
if result[0] is None:
print("Unparsed date %s" % df.sourceDate)
unparsed += 1
print("Total unparsed: %d" % unparsed)