labelFactory/run_file2db.py at master · Orieus/labelFactory · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" run_file2db is a tool to migrate a labeled dataset in a pickle file to a
    mongo db.

    It must be invoked using

        python run_file2db.py <project_folder>

    Created on Dec, 2016
    @autor: Jesus Cid.
"""

import ast
import time
import sys
import os
import ipdb

# Local imports
from labelfactory.ConfigCfg import ConfigCfg as Cfg
from labelfactory.Log import Log
from labelfactory.labeling.datamanager import DataManager

# Name of the configuration file; main() looks for it inside the project
# folder.
CF_FNAME = "config.cf"
# Path to a default configuration file. Not referenced elsewhere in this
# script -- presumably kept for consistency with the other labelfactory
# entry points (TODO confirm).
CF_DEFAULT_PATH = "./config.cf.default"


def main():
    """Migrate the labeled dataset of a labeling project from file to mongodb.

    The project folder is taken from the first command-line argument or, when
    no argument is given, requested interactively. The configuration file
    (CF_FNAME) found in that folder provides the database settings, the
    file locations and the labeler settings used to build a DataManager.
    The data are then loaded from the file source and passed to
    DataManager.migrate2DB, and the elapsed time is written to the log.

    Raises:
        SystemExit: if the project folder or its configuration file does not
            exist.
    """

    # Transitional python 2 / python 3 compatibility: python 3 renamed
    # "raw_input" to "input", so bind whichever one is available.
    if sys.version_info.major == 3:
        raw_input2 = input
    else:
        raw_input2 = raw_input

    # time.clock() was removed in python 3.8. Use the high resolution
    # wall-clock timer when available and fall back to time.time() on
    # python 2, which has no perf_counter.
    timer = getattr(time, 'perf_counter', time.time)

    #######
    # Start

    # Get the project folder from the command line or ask the user for it.
    if len(sys.argv) > 1:
        project_path = sys.argv[1]
    else:
        project_path = raw_input2("Select the (absolute or relative) path to" +
                                  " the labeling project folder: ")
    # The rest of the paths are built by plain string concatenation, so the
    # project path must end with a separator.
    if not project_path.endswith('/'):
        project_path = project_path + '/'

    # Check if project folder exists. Otherwise there is nothing to migrate.
    if not os.path.isdir(project_path):
        sys.exit("Project folder does not exist")

    #########################
    # Read configuration data

    # Check if configuration file exists
    config_path = project_path + CF_FNAME
    if not os.path.isfile(config_path):
        sys.exit("Configuration file does not exist")

    # Read data from the configuration file
    cf = Cfg(config_path)

    # Data source and destination (options: file, mongodb)
    source_type = 'file'
    dest_type = 'mongodb'

    # Mongo DB settings
    db_info = {'name': cf.get('DataPaths', 'db_name'),
               'hostname': cf.get('DataPaths', 'db_hostname'),
               'user': cf.get('DataPaths', 'db_user'),
               'pwd': cf.get('DataPaths', 'db_pwd'),
               'label_coll_name': cf.get('DataPaths', 'db_label_coll_name'),
               'history_coll_name': cf.get('DataPaths',
                                           'db_history_coll_name'),
               'port': cf.get('DataPaths', 'db_port'),
               'mode': cf.get('DataPaths', 'db_mode'),
               'file2db_mode': cf.get('DataPaths', 'db_file2db_mode'),
               'db2file_mode': cf.get('DataPaths', 'db_db2file_mode'),
               }

    # Folders and file names used by the file-based data source
    file_info = {'project_path': project_path,
                 'input_folder': cf.get('DataPaths', 'input_folder'),
                 'output_folder': cf.get('DataPaths', 'output_folder'),
                 'used_folder': cf.get('DataPaths', 'used_folder'),
                 'dataset_fname': cf.get('DataPaths', 'dataset_fname'),
                 'labelhistory_fname': cf.get(
                    'DataPaths', 'labelhistory_fname'),
                 'labels_endname': cf.get('DataPaths', 'labels_endname'),
                 'preds_endname': cf.get('DataPaths', 'preds_endname'),
                 'urls_fname': cf.get('DataPaths', 'urls_fname')}

    # Type of wid: if 'yes', the wid is computed as a transformed url.
    #              if 'no', the wid is taken equal to the url.
    compute_wid = cf.get('Labeler', 'compute_wid')

    # List of categories to label. Both options are stored in the
    # configuration file as python literals.
    categories = ast.literal_eval(cf.get('Labeler', 'categories'))
    parentcat = ast.literal_eval(cf.get('Labeler', 'parentcat'))

    # Possible labels for each category
    yes_label = cf.get('Labeler', 'yes_label')
    no_label = cf.get('Labeler', 'no_label')
    unknown_label = cf.get('Labeler', 'unknown_label')
    error_label = cf.get('Labeler', 'error_label')
    alphabet = {'yes': yes_label, 'no': no_label, 'unknown': unknown_label,
                'error': error_label}

    # In multiclass cases, the reference class is the class used by the active
    # learning algorithm to compute the sample scores.
    ref_class = cf.get('ActiveLearning', 'ref_class')

    ##########
    # Log file

    # Create the log object
    log = Log(project_path + 'log')
    log.info('*****************************')
    log.info('****** WEB LABELER: *********')

    #####################
    # Create main objects

    # Data manager object
    data_mgr = DataManager(source_type, dest_type, file_info, db_info,
                           categories, parentcat, ref_class, alphabet,
                           compute_wid)

    ##############
    # Read dataset

    # Load data from the standard dataset. Only the label dataframe is
    # migrated below; predictions and label history are loaded but not used.
    log.info('Carga de datos')
    df_labels, _df_preds, _labelhistory = data_mgr.loadData(source_type)

    ###############
    # Migrate to DB

    # Save the label data into the db and log the elapsed time
    log.info("-- Saving data in mongodb")
    start = timer()
    data_mgr.migrate2DB(df_labels)
    log.info(str(timer() - start) + ' seconds')

# Run the migration only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()