-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_cleaning.py
More file actions
66 lines (62 loc) · 2.29 KB
/
data_cleaning.py
File metadata and controls
66 lines (62 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding: utf-8 -*-
import matplotlib.image as img
from matplotlib import pyplot as plt
from pydpc import Cluster
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import random
import cv2
import numpy as np
import scipy
from scipy import spatial
import sklearn
# --- D4H19 dataset: drop 'Unnamed:' columns, add a citizenship flag, save ---
dat = pd.read_csv("D4H19.csv", low_memory=False)
print(dat.columns)

# Discard every column whose header contains 'Unnamed:' (presumably index
# columns left behind by an earlier CSV round-trip -- TODO confirm).
unnamed_cols = [name for name in dat.columns if 'Unnamed:' in name]
dat.drop(columns=unnamed_cols, inplace=True)

# NOTE(review): this flag is True when citizenship EQUALS the country of
# exploitation, which reads as the opposite of "noncitizen". Behaviour is kept
# as-is here; confirm the intended meaning before relying on the column name.
dat['noncitizen'] = dat['citizenship'].eq(dat['CountryOfExploitation'])

# The row index is written as the first (header-less) column of the output.
dat.to_csv("D4H19_cleaned.csv")
# --- UNODC case export: load the raw table and set up the summary dicts ---
import json

cases = pd.read_csv("unodc_export.csv", low_memory=False)

# new_cols:   column name -> distinct keys found in that column's parsed cells
# new_df_len: column name -> total number of nested records in that column
new_cols, new_df_len = {}, {}

print('\n\n\n')
def txt_add(s):
    """Join the items of ``s`` with single spaces and append one trailing space.

    ``s`` is any iterable of strings accepted by ``str.join``; an empty
    iterable yields a single space.
    """
    joined = ' '.join(s)
    return f"{joined} "
# --- Parse the stringified dict / list-of-dict columns of the UNODC export ---
# For every column this adds `<col>_keys` (the keys seen in that cell) and
# `<col>_cnt` (number of records in that cell), and stores the distinct keys
# per column in `new_cols`. List-valued columns are replaced by their parsed
# Python value; dict-valued columns keep their (cleaned) string form.
import ast

# Columns whose cells hold a list of dicts; the remaining ones hold one dict.
_LIST_COLS = ('Victims', 'Defendants', 'Charges')
_DICT_COLS = ('Keywords', 'Procedural_Fields')

for col in _DICT_COLS + _LIST_COLS:
    # Stringify every cell (a missing value becomes the text 'nan') and apply
    # the whitespace clean-up.
    # NOTE(review): as written the first two replaces swap a space for a space
    # and the last one matches two backslashes followed by 'n'; presumably
    # non-breaking/double spaces and a literal "\n" were intended -- confirm.
    cases[col] = cases[col].apply(
        lambda s: str(s).replace(" ", " ").replace(" ", " ").replace(r'\\n', " ")
    )

    if col in _LIST_COLS:
        # Missing cells become a one-record placeholder so they still parse.
        cases[col] = cases[col].apply(
            lambda s: "[{'Error': 'Empty'}]" if s.lower() == 'nan' else s
        )
        # The cell text comes from an external CSV, so it is parsed with
        # ast.literal_eval (literals only) instead of eval(), which would
        # execute arbitrary expressions embedded in the data.
        cases[col] = cases[col].apply(ast.literal_eval)
        cases[col + '_keys'] = cases[col].apply(
            lambda records: [key for record in records for key in record.keys()]
        )
        cases[col + '_cnt'] = cases[col].apply(len)
        new_df_len[col] = sum(cases[col + '_cnt'].values.tolist())
    else:
        cases[col] = cases[col].apply(
            lambda s: "{'Error': 'Empty'}" if s.lower() == 'nan' else s
        )
        cases[col + '_keys'] = cases[col].apply(
            lambda s: list(ast.literal_eval(s).keys())
        )
        cases[col + '_cnt'] = 1

    print(col)
    # Distinct keys across all rows of this column (order is arbitrary).
    distinct_keys = set()
    for row_keys in cases[col + '_keys'].values.tolist():
        distinct_keys.update(row_keys)
    new_cols[col] = list(distinct_keys)
    print(new_cols[col])
    print('\n')
# --- Explode the nested per-case records into one CSV per entity type ---
# For each list-valued column, every record of every case becomes one row of
# "unodc_<col>.csv", tagged with the UNODC_NO of the case it came from. The
# column is then removed from `cases`, and the slimmed-down case table is
# written to "unodc_export_cleaned.csv".
newDF = {}
for col in 'Victims,Defendants,Charges'.split(','):
    newDF[col] = []
    for case_no, records in zip(cases['UNODC_NO'], cases[col]):
        case_no = str(case_no)
        for rec in records:
            # Tag the record that is being appended. The previous code wrote
            # to newDF[col][i] with i = position inside this case's list, so
            # from the second case onward it overwrote the IDs of the first
            # records and left the new ones without any UNODC_NO.
            # A copy is stored so the dict held in `cases` is not mutated.
            newDF[col].append({**rec, 'UNODC_NO': case_no})
    pd.DataFrame(newDF[col]).to_csv("unodc_" + col + ".csv")
    del cases[col]
cases.to_csv("unodc_export_cleaned.csv")