-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwrite_data.py
More file actions
183 lines (143 loc) · 6 KB
/
write_data.py
File metadata and controls
183 lines (143 loc) · 6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
from persistance.binary import Binary
from nltk.tokenize import RegexpTokenizer
import os
# Shared tokenizer: every maximal run of \w characters becomes one token.
tokenizer = RegexpTokenizer(r'\w+')
def make_capt_vocab(lines):
    """
    Build the caption vocabulary.

    :param lines: iterable of str, caption texts
    :return: set of str, the distinct tokens the module-level tokenizer
        yields for the lower-cased captions
    """
    return {
        token
        for caption in lines
        for token in tokenizer.tokenize(caption.lower())
    }
def make_attr_vocab(lines):
    """
    Build the attribute vocabulary.

    :param lines: iterable whose items are either a str or a list of str;
        items of any other type are ignored
    :return: set of str, the distinct whitespace-separated words
    """
    vocab = set()
    for line in lines:
        # isinstance (rather than an exact type comparison) so that subclasses
        # of str / list are handled instead of being silently skipped.
        if isinstance(line, str):
            vocab.update(line.split())
        elif isinstance(line, list):
            for phrase in line:
                vocab.update(phrase.split())
    return vocab
def _write_coco_split(split, images, image_id, attr_file, capt_file,
                      vocab_attr, vocab_capt):
    """
    Write the line-aligned outputs of one split and update both vocabularies.

    :param split: str, split name written in front of every image id
    :param images: sequence of image records with the keys 'image_id',
        'captions' and 'categories'
    :param image_id: open text file receiving '<split> <image_id>' lines
    :param attr_file: open text file receiving the attribute lines
    :param capt_file: open text file receiving the tokenized caption lines
    :param vocab_attr: set of str, updated in place with attribute words
    :param vocab_capt: set of str, updated in place with caption tokens
    """
    for image in images:
        categories = image['categories']
        id_line = split + ' ' + str(image['image_id']) + '\n'
        attr_line = ' '.join(cat for cat in categories if cat != '--') + '\n'
        # NOTE(review): unlike the written line, the vocabulary is built from
        # the unfiltered categories, so a '--' placeholder ends up in
        # vocab.attr -- confirm this is intended.
        attr_words = [word for cat in categories for word in cat.split()]
        for capt in image['captions']:
            if capt == '--':
                # Placeholder caption: nothing is written for it.
                continue
            # '\w+' never matches whitespace, so one tokenization serves both
            # the written line and the vocabulary.
            tokens = tokenizer.tokenize(capt.replace('\n', ' ').lower())
            image_id.write(id_line)
            attr_file.write(attr_line)
            capt_file.write(' '.join(tokens) + '\n')
            vocab_capt.update(tokens)
            vocab_attr.update(attr_words)


def write_coco(data_path, out_dir):
    """
    Split the records stored under the 'coco' key into train/dev/test
    (70% / 20% / 10%, in stored order) and write them as text files.

    For every caption that is not the '--' placeholder, one line is written
    to each of '<split>.capt' (lower-cased tokens joined by spaces),
    '<split>.attr' (the image categories without '--') and 'image_id'
    ('<split> <image_id>'), so the three files stay line-aligned.
    'vocab.attr' and 'vocab.capt' list the collected words, one per line,
    in sorted order.

    :param data_path: str, path to prepared bin data files
    :param out_dir: str, path to save the data set; file names are appended
        to it directly, so it has to end with a path separator
    """
    data_dict = Binary.load(data_path)
    images = data_dict['coco']
    # The split boundaries must be derived from the number of images;
    # len(data_dict) would only count the top-level keys of the container.
    n_images = len(images)
    end_train = int(n_images * 0.7)
    end_dev = int(n_images * 0.9)
    splits = (
        ('train', images[:end_train]),
        ('dev', images[end_train:end_dev]),
        ('test', images[end_dev:]),
    )

    vocab_attr = set()
    vocab_capt = set()
    # Context managers guarantee the files are closed even if a record is
    # malformed and an exception is raised half-way through.
    with open(out_dir + 'image_id', 'w') as image_id:
        for split, split_images in splits:
            with open(out_dir + split + '.attr', 'w') as attr_file, \
                    open(out_dir + split + '.capt', 'w') as capt_file:
                _write_coco_split(split, split_images, image_id, attr_file,
                                  capt_file, vocab_attr, vocab_capt)

    # Sorted so that repeated runs produce identical vocabulary files.
    with open(out_dir + 'vocab.attr', 'w') as f:
        for w in sorted(vocab_attr):
            f.write(w + '\n')
    with open(out_dir + 'vocab.capt', 'w') as f:
        for w in sorted(vocab_capt):
            f.write(w + '\n')
def write_coco_a(data_path, out_dir, image_id_file, num_interaction=None):
    """
    Write the interaction attributes stored under the 'coco-a' key, aligned
    with an existing image id listing.

    For every line '<split> <image_id>' of ``image_id_file`` one line is
    written to '<out_dir>coco_a_input/<split>.attr' (split being 'train',
    'dev' or 'test'; lines with another split name are ignored). The line
    holds the image's non-empty interactions joined by ' </s> ', each
    interaction being its attributes (without '--') joined by spaces.
    'coco_a_input/vocab.attr' lists the attribute words, one per line, in
    sorted order; it is built from all interactions, including those cut
    off by ``num_interaction``.

    :param data_path: str, path to prepared bin data files
    :param out_dir: str, path to save the data set; sub-paths are appended
        to it directly, so it has to end with a path separator
    :param image_id_file: str, path to the ms coco dir where the image id is saved
    :param num_interaction: int, limit interactions; a falsy value
        (None or 0) keeps all interactions
    :raises TypeError: if ``num_interaction`` is truthy but not an int
    :raises KeyError: if ``image_id_file`` lists an image id that has no
        'coco-a' record
    """
    # Validate once, up front, before any directory or file is created.
    if num_interaction and not isinstance(num_interaction, int):
        raise TypeError('num_interaction must be an int')

    target_dir = out_dir + 'coco_a_input/'
    # Creates missing parents too and does not fail when rerun on an
    # already existing output directory.
    os.makedirs(target_dir, exist_ok=True)

    coco_a_dict = {}  # image id -> interactions without '--'
    vocab_attr = set()
    data_dict = Binary.load(data_path)
    for image in data_dict['coco-a']:
        key = image['image_id']
        if key in coco_a_dict:
            # Only the first record of an image id is used.
            continue
        clear = [[attr for attr in inter if attr != '--']
                 for inter in image['interactions']]
        for inter in clear:
            for attr in inter:
                vocab_attr.update(attr.split())
        coco_a_dict[key] = clear[:num_interaction] if num_interaction else clear

    with open(target_dir + 'train.attr', 'w') as attr_train, \
            open(target_dir + 'dev.attr', 'w') as attr_dev, \
            open(target_dir + 'test.attr', 'w') as attr_test, \
            open(image_id_file) as f:
        writers = {'train': attr_train, 'dev': attr_dev, 'test': attr_test}
        for line in f:
            new_l = line.split()
            if not new_l:
                # Blank line: there is no image to align with.
                continue
            pre = new_l[0]
            image = int(new_l[1])
            out = writers.get(pre)
            if out is None:
                continue
            attr = ' </s> '.join(' '.join(one) for one in coco_a_dict[image] if one != [])
            out.write(attr + '\n')

    # Sorted so that repeated runs produce identical vocabulary files.
    with open(target_dir + 'vocab.attr', 'w') as f:
        for a in sorted(vocab_attr):
            f.write(a + '\n')
if __name__ == "__main__":
    # Prepared binary that is passed to write_coco_a as its data source.
    path_coco_a = "assets/prepared_data/1.4_5_6.bin"
    # Image id listing the generated attribute files are aligned with.
    image_id_file = 'train_data/1.1_2_3/ms_coco_input/image_id.txt'
    # One data set per (output directory, interaction limit) pair.
    for target_dir, limit in (('train_data/3.1/', 2), ('train_data/3.3/', 4)):
        write_coco_a(path_coco_a, target_dir, image_id_file, num_interaction=limit)