# identify_ner.py
# Forked from ZoeLeBlanc/visualizing_humanities_data_workshop
# Importing libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import spacy
from progress.bar import IncrementalBar
# Load the large English spaCy model.
# Install it first with: python3 -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

# Read in the full dataset scraped from the DH Humanist website
# (includes data from 1987-2019).
humanist_vols = pd.read_csv('web_scraped_humanist_listserv.csv')

# Subset the data to rows whose `dates` value contains a hyphen.
# `na=False` counts missing dates as non-matching; without it a NaN in the
# mask makes the boolean indexing raise. `.copy()` detaches the subset so the
# column assignments below do not write to a slice of `humanist_vols`.
humanist_vols_dates = humanist_vols[
    humanist_vols.dates.str.contains('-', na=False)
].copy()

# Record the original text length and a lowercased copy of the text.
humanist_vols_dates['original_text_length'] = humanist_vols_dates.text.apply(len)
humanist_vols_dates['text_lowercase'] = humanist_vols_dates.text.str.lower()
def tokenize_text_split_monthly(rows, n_chunks=12):
    """Tokenize one row's lowercased text and split it into at most ``n_chunks`` chunks.

    The tokens of ``rows.text_lowercase`` are cut into consecutive chunks of
    ``round(len(tokens) / n_chunks)`` tokens each. If that yields more than
    ``n_chunks`` chunks, everything from chunk ``n_chunks`` onward is folded
    into the final chunk so no tokens are lost.

    Args:
        rows: A single DataFrame row (a Series) with a ``text_lowercase``
            string field.
        n_chunks: Maximum number of chunks to produce (12 by default, one
            per month).

    Returns:
        The same row with a new ``split_tokens`` field holding a list of
        token lists. Texts shorter than ``n_chunks`` tokens produce fewer
        chunks; an empty text produces an empty list.

    Raises:
        ValueError: If ``n_chunks`` is less than 1.
    """
    if n_chunks < 1:
        raise ValueError('n_chunks must be at least 1')

    tokens = nltk.word_tokenize(rows.text_lowercase)

    # round() returns 0 for very short texts, and range() rejects a zero
    # step, so never let the chunk size drop below one token.
    chunk_size = max(1, round(len(tokens) / n_chunks))
    split_tokens = [
        tokens[start:start + chunk_size]
        for start in range(0, len(tokens), chunk_size)
    ]

    if len(split_tokens) > n_chunks:
        # Rounding down can leave any number of overflow chunks, not just
        # one; keep the first n_chunks - 1 chunks and put every remaining
        # token into the last chunk.
        last_start = (n_chunks - 1) * chunk_size
        split_tokens = split_tokens[:n_chunks - 1] + [tokens[last_start:]]

    rows['split_tokens'] = split_tokens
    return rows
# Tokenize every volume and attach its list of chunks.
humanist_vols_split = humanist_vols_dates.apply(tokenize_text_split_monthly, axis=1)

# Give each chunk its own row. explode() repeats the source index for every
# chunk and emits NaN for an empty chunk list, so drop those text-less rows
# and rebuild a unique index before assigning new columns.
humanist_vols_split = (
    humanist_vols_split.explode('split_tokens')
    .dropna(subset=['split_tokens'])
    .reset_index(drop=True)
)

# Work on an independent copy of the two columns that are still needed, so
# the assignments below do not target a slice of the exploded frame.
humanist_vols_split = humanist_vols_split[['dates', 'split_tokens']].copy()

# Number the chunks 1, 2, 3, ... within each `dates` value.
humanist_vols_split['month'] = humanist_vols_split.groupby('dates').cumcount() + 1

# Re-join each chunk's tokens into a single space-separated string. This is
# an element-wise operation, so use apply() rather than agg().
humanist_vols_split['text_lowercase'] = humanist_vols_split.split_tokens.apply(' '.join)

# Progress bar advanced once per row by get_ner.
processing = IncrementalBar('identifying named entities', max=len(humanist_vols_split.index))
def get_ner(row, types):
    """Add cleaned terms and selected named-entity terms to one row.

    The row's ``split_tokens`` are filtered for punctuation and English
    stopwords, re-joined, and run through the module-level spaCy pipeline
    ``nlp``. A spaCy token is kept when it has an entity type or is
    alphabetic, and is not punctuation, not a stopword, and contains no
    digit. Periods are removed from kept tokens.

    Args:
        row: A single DataFrame row (a Series) with a ``split_tokens`` field
            holding an iterable of token strings.
        types: Container of spaCy entity-type labels; kept tokens whose
            ``ent_type_`` is in it are also recorded as NER terms.

    Returns:
        The same row with two new string fields, ``ner_terms`` and
        ``cleaned_terms``. In both, every term is followed by one space.

    Side effects:
        Advances the module-level ``processing`` progress bar by one step.
    """
    processing.next()

    # Build the stopword set once per call. Calling stopwords.words() inside
    # the filter re-loaded and linearly scanned the list for every token.
    english_stopwords = set(stopwords.words('english'))

    # `in string.punctuation` is a substring test on a str, so it also drops
    # the empty string and punctuation runs such as '()' that occur in it.
    cleaned_text = ' '.join(
        token
        for token in row.split_tokens
        if token not in string.punctuation and token not in english_stopwords
    )

    cleaned_terms = []
    ner_terms = []
    for token in nlp(cleaned_text):
        if not (token.ent_type_ or token.is_alpha):
            continue
        if token.is_punct or token.is_stop or any(ch.isdigit() for ch in token.text):
            continue
        text = token.text.replace('.', '')
        cleaned_terms.append(text)
        if token.ent_type_ in types:
            ner_terms.append(text)

    # Keep the trailing space after every term so the output is unchanged.
    row['ner_terms'] = ''.join(term + ' ' for term in ner_terms)
    row['cleaned_terms'] = ''.join(term + ' ' for term in cleaned_terms)
    return row
# Entity labels whose terms are collected into the `ner_terms` column.
ner_types = ['LOC', 'GPE']

# Run get_ner over every row, expanding the returned rows into columns.
humanist_vols_split = humanist_vols_split.apply(
    get_ner,
    axis=1,
    result_type='expand',
    types=ner_types,
)

# Close the progress bar, then write the result without the index column.
processing.finish()
humanist_vols_split.to_csv(
    'humanist_vols_ner_tokens.csv',
    index=False,
)