-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdata.py
More file actions
90 lines (62 loc) · 2.44 KB
/
data.py
File metadata and controls
90 lines (62 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#
# data.py
#
# This file contains code to read
# in sentiment data (for sentiment analysis)
# from the data source specified by the
# DATA_SOURCE constant
#
# imports #
import os
import re
# constants #
DATA_SOURCE = 'subjectivity_clues/subjclueslen1-HLTEMNLP05.tff'
# functions #
def __get_match(pat, str, n=0):
    '''
    Look up the nth regex match of a pattern within a string

    pat : regular expression to search for
    str : text to search in
    n   : zero-based index of the wanted match (0, the first match, by default)

    Matches are collected with re.findall, so when the pattern contains a
    single capturing group the value handed back is that group's text

    Returns the nth match, or None if n is negative or there
    are not that many matches
    '''
    found = re.findall(pat, str)

    # guard: index falls outside the list of matches
    if not (0 <= n < len(found)):
        return None

    return found[n]
def read_dataset():
    '''
    Reads in sentiment data from the file specified
    by the DATA_SOURCE constant, returning it as a
    list of dictionaries (one per non-blank line, in file order)

    Raises RuntimeError if sentiment file does not exist

    Each entry in the list is a dictionary with the following fields:
    - word : the English word associated with this entry
    - type : the type of subject this word represents {weaksubj, strongsubj}
    - len : UNUSED
    - pos : Part-of-Speech for this particular word {adj, noun, verb, anypos, adverb}
    - stemmed : is this the most basic (stemmed) form of the word?
                (the single character that follows "stemmedN=" in the file)
    - polarity : sentiment associated with this word {positive, neutral, negative}

    Every value is the string captured from the line, or None when
    the corresponding field is not present on that line
    '''
    if not os.path.exists(DATA_SOURCE):
        raise RuntimeError('Cannot find data source at "{}"'.format(DATA_SOURCE))

    # (output key, pattern) for each field in the source file; defined once
    # up front because the patterns do not change from line to line
    field_patterns = (
        ('type', r'type=(\w+)'),
        ('len', r'len=(\d+)'),
        # \S+ rather than \w+ so that a word containing a hyphen or an
        # apostrophe is captured whole instead of being cut off at it
        ('word', r'word\d+=(\S+)'),
        ('pos', r'pos\d+=(\w+)'),
        ('stemmed', r'stemmed\d+=(\w)'),
        ('polarity', r'priorpolarity=(\w+)'),
    )

    entries = []
    with open(DATA_SOURCE) as f:
        for line in f:
            # a blank line carries no fields; skip it rather than
            # recording an entry whose every value is None
            if not line.strip():
                continue

            # extract the value of each field in the current line
            # and add the resulting row to the list of data entries
            entries.append({
                field: __get_match(pat, line) for field, pat in field_patterns
            })
    return entries