#-------------------------------------------------------------------------------
# Name:        parse_turk_data.py
# Purpose: Takes a .input file and a .results file from mTurk, extracts the data,
# and outputs it nicely.
# Authors: Bridget O'Daniel, Wenli Zhao, Lily Wu
# Created: 21/05/2015
#-------------------------------------------------------------------------------
import sys
sys.path.insert(0, sys.path[0] + '/TurkClasses')  #Allows script to reach TurkClasses folder
from ast import literal_eval
from Task import Task
from Question import Question
from Sentence import Sentence
from Worker import Worker

def parseMeta(string): #separate into sentence number, (question number), field
"""Takes an entry of meta tags of the form "Answer.xxx" and extracts relevant
information, storing it in a tuple.
pre: string is a String of the form "Answer.sentxxx"
post: Returns a tuple of one of the following forms:
1) ()
2) (sentence_num, 'field')
3) (sentence_num, question_num, 'field')"""
    new_string = string[12:]            #Removes the leading quote and "Answer.sent"
    lst = new_string.split("_")
    sentnum = lst[0]                    #Gets the sentence number
    if len(lst) == 3:                   #If field is part of a question...
        q = lst[1][1:]                  #Get the question number (w/o the q)
        field = lst[2][:-1]             #Get the field without the final quote
        return (sentnum, q, field)
    elif len(lst) == 2:                 #If field is part of a scale or qstat
        field = lst[1][:-1]             #Get the field without the final quote
        return (sentnum, field)
else: #If field is submitbutton, empty tuple
return ()
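
# A quick worked example (hypothetical tags; assumes the tags arrive still wrapped
# in the double quotes of the .results header, which is why the slicing above trims
# one leading character and each field's trailing character):
#   parseMeta('"Answer.sent2_q1_body"')  returns  ('2', '1', 'body')
#   parseMeta('"Answer.sent3_scale"')    returns  ('3', 'scale')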
def deleteQuotes(string):
"""Takes an input string and removes the first and last characters.
Presumably quotation marks, but it'll happily cut off other things too."""
    return string[1:-1]

def get_input_file_lines(fname):
    """Returns a list of the lines from the given input file name, leaving off
    the first line of the file."""
    with open("data/" + fname, "r") as inputFile:
        inputFile.readline()                #Ignore first line of input file
        inputlines = inputFile.readlines()  #Read the rest of the lines!
    return inputlines

def make_task_dict(input_lines):
"""Takes the lines from the input file and creates Tasks containing
the proper Sentences from that information. Returns the Tasks as a dictionary,
    where the keys are the taskIDs (strings) and the values are the Task objects."""
tasks = {} #Dictionary to store the {taskID:Task Object}
for task_line in input_lines: #For each task in the input file...
task_line = task_line.split("\t") #Split string into parts
taskID = task_line[0] #First part is the taskID!
task = Task(taskID) #Make a Task object with the ID
s_list = literal_eval(task_line[3]) #Get the list of sentences as a list!!
s_list = [i.strip() for i in s_list] #Clean up the sentences
task = add_sentences_to_task(s_list, task) #Add sentences to the task
        tasks[taskID] = task                #Put the task in the task dictionary
return tasks
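
# Assumed .input line layout (inferred from the indices used above): tab-separated,
# with the taskID in column 0 and a Python-literal list of sentence strings in
# column 3. For example, a made-up line
#   3XYZ<tab>...<tab>...<tab>['A dog barked.', ' It was loud. ']
# would produce tasks['3XYZ'] holding two stripped Sentence objects.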
def add_sentences_to_task(s_list, task):
"""Adds each sentence (string) in s_list to the given task (Task object),
and returns the task."""
for i, sent in enumerate(s_list): #For each sentence...
sentence = Sentence(i, sent) #Make a Sentence object for it
task.add_sentence(sentence) #Add it to the Task's sentence list
return task

def read_results_file(fname):
    """Opens the given file name and extracts the metadata and the lines that
    follow, containing each user's results.
    pre: fname--string that is a valid file name.
    post: Returns meta (a list of metadata strings) and result_lines (a list of
    lines in the file)"""
    with open("data/" + fname, "r") as inFile:
        meta = inFile.readline().strip().split()  #Header line of quoted column tags
        result_lines = inFile.readlines()         #One line per worker submission
    return meta, result_lines
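
# Layout assumption for the .results file (mTurk's tab-separated download): the
# first line is a header of quoted column tags, and each later line is one worker's
# submission. The rest of this script relies on three fixed positions: workerID at
# index 19, taskID at index 29, and the "Answer.sent..." tags starting at index 30.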
def make_category_dict(meta):
"""Takes in the list of metadata strings and parses them. Returns a dictionary
    where the keys are the tag positions (counting from the first Answer tag)
    and the values are tuples that represent the metadata usefully. See
    parseMeta for details."""
categories = {}
for i in range(30, len(meta)): #Skips irrelevant data at the start of the meta
categories[i-30] = parseMeta(meta[i]) #Creates the tuples and puts 'em in there
return categories
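
# For instance (made-up tag): if meta[30] were '"Answer.sent0_scale"', then
# categories[0] would be ('0', 'scale').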
def __add_questions(l, currenttask, categories, worker):
"""Function for actually categorizing questions and adding them. Returns the
changed Task."""
    low, high, body, context = 0, 0, "", ""  #Initialize variables for the loop
workerID = worker.get_ID()
for i in range(30, len(l)): #For all data by the worker....
entry = categories[i-30] #Get relevant category information (what sentence/question/field this info is)
        if l[i] == "":                  #If the info is empty, skip it.
            continue
elif len(entry) == 2: #If the worker did NOT ask a question...
if entry[1] == "scale": #If the info is a scale
currenttask.sentences[int(entry[0])].add_scale((l[i], workerID)) #On the current task, add scale to corresponding sentence
worker.add_scale((l[i],entry[0]))
else: #If the info is a qstat
currenttask.sentences[int(entry[0])].add_qstat(l[i]) #Add it to the proper question
elif len(entry)== 3: #If the worker DID ask a question...
if entry[2] == "low": #Add the field into the proper variable for later Question making
low = int(l[i])
elif entry[2] == "high":
high = int(l[i])
elif entry[2] == "body":
body = l[i]
else: #If we're on context, that means it's the end of a question
context = l[i]
question = Question(int(entry[1]), workerID, low, high, body, context) #Put all current info from variables into a Question object
currenttask.get_sentence(int(entry[0])).add_question(question) #Now add that question to the proper sentence in the task (phew)
return currenttask
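
# Note on ordering: the loop above assumes the header lists each question's fields
# with "context" last, since only the context branch flushes the accumulated
# low/high/body values into a Question object.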
def add_questions_to_sentences(resultlines, categories, tasks):
"""Adds the Questions to the Sentences in the Tasks (phew), reading the worker's
data from resultlines (a list of lines from the result file) and using the
dictionary categories to put the Questions in the correct places.
pre: resultlines is a list of lines from the result file, categories is a
hash table based on metadata strings.
post: Returns the task dictionary."""
workers = []
for rline in resultlines: #For each worker's data...
l = rline.split("\t") #Make it a list of data
        l = [deleteQuotes(item) for item in l]  #Delete all the unnecessary quotes (a list comprehension, so indexing also works on Python 3)
        workerID = l[19]                        #Retrieve workerID (after dequoting, so it is clean)
        worker = Worker(workerID, l[29])
        workers.append(worker)
        currentTask = tasks[l[29]]              #Find the Task that the worker is working on (taskID at l[29])
        currentTask = __add_questions(l, currentTask, categories, worker)  #Add the questions where they should go
    for worker in workers:                      #Attach each worker to their Task
        tasks[worker.get_taskID()].add_worker(worker)
    return tasks

def write_output(input_fname, tasks):
    """Creates an output file by the name of "<file>_output.txt" using the
    dictionary of Task objects.
    WARNING: Will overwrite another file of the same name."""
    context_map = {0: "No", 1: "Vague", 2: "Some", 3: "Immediate"}
    with open(input_fname[:-6] + "_output.txt", "w") as outFile:  #[:-6] strips the ".input" extension
        for key in tasks:
            outFile.write("\n==================TASK==================\n" + key + "\n\n")
            task = tasks[key]
            for sentence in task.get_sentences():
                outFile.write("\n---SENTENCE " + str(sentence.get_num()) + "---\n" + sentence.get_sent() + "\n")
                for question in sentence.get_questions():
                    outFile.write("\t---Question #" + str(question.get_num()) + " " + question.get_ID() + "\n")
                    outFile.write("\tTXT\t" + question.get_text(sentence.get_sent().split()) + "\n")
                    outFile.write("\tBODY\t" + question.get_body() + "\n")
                    outFile.write("\tIN CONTEXT\t" + context_map[int(question.get_context()[:1])] + "\n")
                outFile.write("--Specificity scale: ")
                for scale in sentence.get_scales():
                    outFile.write(str(scale))
                outFile.write("\n\n")
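
# Illustrative sketch of the resulting output (made-up values, not a real run):
#   ==================TASK==================
#   3XYZ
#
#   ---SENTENCE 0---
#   A dog barked.
#       ---Question #1 AWORKER1
#       TXT     ...
#       BODY    Why did the dog bark?
#       IN CONTEXT  Some
#   --Specificity scale: ('4', 'AWORKER1')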
def read_input_file():
'''Uses a text file "input.txt" that lists all desired input files. The mTurk
files should be listed on one line in the following format:
mturk:\t<file>\t<file>
where <file> is the name of BOTH the .input and .results file, listed here
without the extension, ex: week3
Returns the list of tuples where each tuple is of the form
("file.input", "file.results")'''
fname_list = []
with open('data/input.txt', 'r') as inFile:
for line in inFile:
            if line.startswith('mturk'):
items = line.strip().split('\t')[1:]
for item in items:
fname_list.append( (item+'.input', item+'.results') )
return fname_list
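
# For example, a data/input.txt containing the line
#   mturk:<tab>week3<tab>week4
# yields [('week3.input', 'week3.results'), ('week4.input', 'week4.results')].
# (week3 is the example name from the docstring above; week4 is made up.)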
def main(input_fname, result_fname):
'''Takes in two strings representing the .input file and the .results file,
respectively. Returns a dictionary of tasks.'''
input_lines = get_input_file_lines(input_fname) #Get lines from .input file
tasks = make_task_dict(input_lines) #Make a dictionary containing the Tasks with Sentences in them
meta, resultlines = read_results_file(result_fname) #Get the list of metadata and the lines from the .result file
categories = make_category_dict(meta) #A category dictionary parsed from the metadata
tasks = add_questions_to_sentences(resultlines, categories, tasks) #Add Questions to the proper Sentences
    #write_output('output/' + input_fname, tasks)  #Optionally write "output/<file>_output.txt"
return tasks

if __name__ == '__main__':
    for input_fname, result_fname in read_input_file():
        main(input_fname, result_fname)