-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_file2db.py
More file actions
150 lines (118 loc) · 4.92 KB
/
run_file2db.py
File metadata and controls
150 lines (118 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" run_file2db is a tool to migrate a labeled dataset in a pickle file to a
mongo db.
It must be invoked using
python run_file2db.py <project_folder>
Created on Dec, 2016
@author: Jesus Cid.
"""
import ast
import time
import sys
import os
import ipdb
# Local imports
from labelfactory.ConfigCfg import ConfigCfg as Cfg
from labelfactory.Log import Log
from labelfactory.labeling.datamanager import DataManager
CF_FNAME = "config.cf"
CF_DEFAULT_PATH = "./config.cf.default"
def main():
    """Migrate the labeled dataset of a labeling project from file to MongoDB.

    The project folder is taken from the first command-line argument or, if
    none is given, requested interactively. The configuration file
    ``CF_FNAME`` inside that folder provides the database settings, the file
    locations and the labeling parameters used to build a ``DataManager``.
    The dataset is then loaded from file and passed to
    ``DataManager.migrate2DB``; the elapsed time is written to the log.

    Raises:
        SystemExit: if the project folder or its configuration file does
            not exist.
    """

    # Python 2/3 compatibility: "raw_input" only exists in Python 2, where
    # "input" would evaluate the typed text. Select the right prompt function.
    if sys.version_info.major == 3:
        raw_input2 = input
    else:
        raw_input2 = raw_input  # noqa: F821 (Python 2 builtin)

    # time.clock() was removed in Python 3.8. Prefer time.perf_counter()
    # (available since Python 3.3) and fall back to time.clock() only on
    # interpreters that lack it (Python 2).
    timer = getattr(time, 'perf_counter', None)
    if timer is None:
        timer = time.clock

    #######
    # Start

    # Get the project folder from the command line, or ask for it.
    if len(sys.argv) > 1:
        project_path = sys.argv[1]
    else:
        project_path = raw_input2("Select the (absolute or relative) path to" +
                                  " the labeling project folder: ")

    # The paths below are built by plain concatenation, so the folder must
    # end with a separator.
    if not project_path.endswith('/'):
        project_path = project_path + '/'

    # Check if the project folder exists. Otherwise exit.
    if not os.path.isdir(project_path):
        sys.exit("Project folder does not exist")

    #########################
    # Read configuration data

    # Check if the configuration file exists.
    config_path = project_path + CF_FNAME
    if not os.path.isfile(config_path):
        sys.exit("Configuration file does not exist")

    # Read data from the configuration file
    cf = Cfg(config_path)

    # Data source and destination (options: file, mongodb)
    source_type = 'file'
    dest_type = 'mongodb'

    # Mongo DB settings
    db_info = {'name': cf.get('DataPaths', 'db_name'),
               'hostname': cf.get('DataPaths', 'db_hostname'),
               'user': cf.get('DataPaths', 'db_user'),
               'pwd': cf.get('DataPaths', 'db_pwd'),
               'label_coll_name': cf.get('DataPaths', 'db_label_coll_name'),
               'history_coll_name': cf.get('DataPaths',
                                           'db_history_coll_name'),
               'port': cf.get('DataPaths', 'db_port'),
               'mode': cf.get('DataPaths', 'db_mode'),
               'file2db_mode': cf.get('DataPaths', 'db_file2db_mode'),
               'db2file_mode': cf.get('DataPaths', 'db_db2file_mode'),
               }

    # Folders and file names used by the file-based data source
    file_info = {'project_path': project_path,
                 'input_folder': cf.get('DataPaths', 'input_folder'),
                 'output_folder': cf.get('DataPaths', 'output_folder'),
                 'used_folder': cf.get('DataPaths', 'used_folder'),
                 'dataset_fname': cf.get('DataPaths', 'dataset_fname'),
                 'labelhistory_fname': cf.get(
                     'DataPaths', 'labelhistory_fname'),
                 'labels_endname': cf.get('DataPaths', 'labels_endname'),
                 'preds_endname': cf.get('DataPaths', 'preds_endname'),
                 'urls_fname': cf.get('DataPaths', 'urls_fname')}

    # Type of wid: if 'yes', the wid is computed as a transformed url.
    #              if 'no', the wid is taken equal to the url.
    compute_wid = cf.get('Labeler', 'compute_wid')

    # List of categories to label. Both options are stored in the
    # configuration file as Python literals, hence the literal_eval.
    categories = ast.literal_eval(cf.get('Labeler', 'categories'))
    parentcat = ast.literal_eval(cf.get('Labeler', 'parentcat'))

    # Possible labels for each category
    yes_label = cf.get('Labeler', 'yes_label')
    no_label = cf.get('Labeler', 'no_label')
    unknown_label = cf.get('Labeler', 'unknown_label')
    error_label = cf.get('Labeler', 'error_label')
    alphabet = {'yes': yes_label, 'no': no_label, 'unknown': unknown_label,
                'error': error_label}

    # In multiclass cases, the reference class is the class used by the active
    # learning algorithm to compute the sample scores.
    ref_class = cf.get('ActiveLearning', 'ref_class')

    ##########
    # Log file

    # Create the log object
    log = Log(project_path + 'log')
    log.info('*****************************')
    log.info('****** WEB LABELER: *********')

    #####################
    # Create main objects

    # Data manager object
    data_mgr = DataManager(source_type, dest_type, file_info, db_info,
                           categories, parentcat, ref_class, alphabet,
                           compute_wid)

    ##############
    # Read dataset

    # Load data from the standard dataset. Only the first of the three
    # returned objects (the labels) is needed for the migration.
    log.info('Carga de datos')
    df_labels, _, _ = data_mgr.loadData(source_type)

    ###############
    # Migrate to DB

    # Pass the loaded labels to the data manager for migration and log the
    # elapsed time.
    log.info("-- Saving data in mongodb")
    start = timer()
    data_mgr.migrate2DB(df_labels)
    log.info(str(timer() - start) + ' seconds')
# Run the migration only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()