-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathget_relationship_data.py
More file actions
88 lines (72 loc) · 2.85 KB
/
get_relationship_data.py
File metadata and controls
88 lines (72 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import json
from collections import Counter
from typing import List

import numpy as np
import pandas as pd
def _first_value_per_id(grouped, column, convert):
    """Return a Series indexed by id holding ``convert(first value of column)``.

    Only the first row of every id group is read, so ids that occur on
    several CSV rows are handled the same way for every column.
    """
    return grouped[column].apply(lambda values: convert(values.iloc[0]))


def _bucket_per_id(grouped, column, bins, labels):
    """Map every id to the label of the bin its integer ``column`` value falls in.

    Values outside all bins are mapped to NaN by ``pd.cut``.
    """
    counts = _first_value_per_id(grouped, column, int)
    return pd.cut(counts, bins=bins, labels=labels).to_dict()


def _build_nodes(node_ids_by_feature):
    """Create one graph node per distinct node id, grouped by feature.

    ``node_ids_by_feature`` maps feature name -> {developer id -> node id}.
    Nodes of the same feature share one ``grp`` number (0, 1, 2, ... in
    feature order) and are emitted in sorted node-id order.
    """
    nodes = []
    for group_id, ids_by_dev in enumerate(node_ids_by_feature.values()):
        sizes = Counter(ids_by_dev.values())
        for node_id in sorted(sizes):
            nodes.append(
                dict(
                    name=node_id,
                    id=node_id,
                    # Scale the developer count down and keep a floor of 5 so
                    # the circle size looks normal; 5.0 keeps `n` a float.
                    n=max(sizes[node_id] / 5, 5.0),
                    grp=group_id,
                )
            )
    return nodes


def _count_links(node_ids_by_feature, id2following):
    """Count "follows" edges between every pair of feature-bucket nodes.

    For each developer and each account they follow (when that account is
    also present in the data), one edge is counted from each of the
    developer's nodes to each of the followed account's nodes.
    """
    link_counts = Counter()
    for src_ids in node_ids_by_feature.values():
        for dev_id, followings in id2following.items():
            src_node = src_ids[dev_id]
            for dst_ids in node_ids_by_feature.values():
                for followed_id in followings:
                    # The person being followed might not be in the data.
                    if followed_id not in dst_ids:
                        continue
                    link_counts[src_node, dst_ids[followed_id]] += 1
    return link_counts


def main(
    input_path: str = 'cleaned.csv',
    output_path: str = 'characteristic_relationship.json',
    min_link_count: int = 10,
) -> None:
    """Export a node/link graph relating developer characteristics.

    Reads ``input_path`` (CSV with the columns ``id``, ``following_list`` -- a
    JSON-encoded list -- ``commit_n``, ``public_repos``, ``public_gists`` and
    ``followed_n``), buckets each numeric column into labelled ranges, and
    writes ``{"nodes": [...], "links": [...]}`` as JSON to ``output_path``.

    Args:
        input_path: CSV file to read.
        output_path: JSON file to write (UTF-8, non-ASCII kept as-is).
        min_link_count: links seen fewer times than this are left out.
    """
    df = pd.read_csv(input_path)
    grouped = df.groupby('id')

    id2following = _first_value_per_id(grouped, 'following_list', json.loads).to_dict()

    # (output feature name, CSV column, bin edges, bin labels)
    # NOTE: the commit bins start at 100, so values <= 100 get the NaN bucket.
    feature_specs = [
        (
            'n_commits', 'commit_n',
            [100, 200, 500, 700, 1000, np.inf],
            ['100-200', '200-500', '500-700', '700-1000', '1000+'],
        ),
        (
            'public_repos', 'public_repos',
            [-np.inf, 20, 50, 100, np.inf],
            ['0-20', '20-50', '50-100', '100+'],
        ),
        (
            'public_gists', 'public_gists',
            [-np.inf, 3, 50, np.inf],
            ['0-3', '3-50', '50+'],
        ),
        (
            'n_followers', 'followed_n',
            [-np.inf, 100, 500, 1000, np.inf],
            ['0-100', '100-500', '500-1000', '1000+'],
        ),
    ]

    # Node ids are built once and shared by nodes and links so both always
    # agree; a NaN bucket is rendered as the text 'nan'.
    node_ids_by_feature = {}
    for feat_name, column, edges, labels in feature_specs:
        buckets = _bucket_per_id(grouped, column, edges, labels)
        node_ids_by_feature[feat_name] = {
            dev_id: f'{feat_name} {bucket}' for dev_id, bucket in buckets.items()
        }

    nodes = _build_nodes(node_ids_by_feature)
    link_counts = _count_links(node_ids_by_feature, id2following)
    links = [
        dict(source=source, target=target, value=count)
        for (source, target), count in link_counts.items()
        if count >= min_link_count
    ]

    ret = dict(nodes=nodes, links=links)
    # The context manager makes sure the file is flushed and closed.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(ret, out_file, ensure_ascii=False)
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()