# Source: text-analyzer/data.py at master (GotoCode/text-analyzer on GitHub)

#
# data.py
#
# This file contains code to read
# in sentiment data (for sentiment analysis)
# from the data source specified by the
# DATA_SOURCE constant
#

# imports #

import os
import re


# constants #

DATA_SOURCE = 'subjectivity_clues/subjclueslen1-HLTEMNLP05.tff'


# functions #

def __get_match(pat, str, n=0):
    '''
    Looks up a single regex hit by its position

    Collects every non-overlapping match of the pattern in
    the input string (exactly as re.findall reports them)
    and hands back the one at index n, counting from zero

    None is returned when n is negative or when fewer than
    n + 1 matches were found
    '''

    found = re.findall(pat, str)

    # out-of-range positions (including negative ones, which would
    # otherwise wrap around when indexing) yield no result
    if n < 0 or n >= len(found):
        return None

    return found[n]

def read_dataset():
    '''
    Reads in sentiment data from the file specified
    by the DATA_SOURCE constant, returning it as a
    list of dictionaries (one per non-blank line)

    Raises RuntimeError if sentiment file does not exist

    Each entry in the list is a dictionary with the following fields:

    - word : the English word associated with this entry
    - type : the type of subject this word represents {weaksubj, strongsubj}
    - len  : UNUSED
    - pos  : Part-of-Speech for this particular word {adj, noun, verb, anypos, adverb}
    - stemmed  : is this the most basic (stemmed) form of the word?
    - polarity : sentiment associated with this word {positive, neutral, negative}

    Every value is the string captured from the line, or None
    when the corresponding field does not appear in that line
    '''

    # fail fast when the data file is missing
    if not os.path.exists(DATA_SOURCE):

        raise RuntimeError('Cannot find data source at "{}"'.format(DATA_SOURCE))

    # (key, pattern) pairs for each field in the source file; defined
    # once here instead of being rebuilt for every line that is read
    field_patterns = (
        ('type', r'type=(\w+)'),
        ('len', r'len=(\d+)'),
        # \S+ rather than \w+ so that tokens containing non-word
        # characters (e.g. hyphenated words) are captured in full
        ('word', r'word\d+=(\S+)'),
        ('pos', r'pos\d+=(\w+)'),
        ('stemmed', r'stemmed\d+=(\w)'),
        ('polarity', r'priorpolarity=(\w+)'),
    )

    entries = []

    with open(DATA_SOURCE) as f:

        for line in f:

            # a blank line carries no fields; skip it rather than
            # recording an entry whose values are all None
            if not line.strip():

                continue

            # extract the first match of every field in the current line
            entries.append(dict(
                (key, __get_match(pat, line)) for key, pat in field_patterns
            ))

    return entries