-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSpellCorrector.py
More file actions
161 lines (116 loc) · 5.79 KB
/
SpellCorrector.py
File metadata and controls
161 lines (116 loc) · 5.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.stem import *
from NGramModel import NGramModel
from collections import Counter
import string
class SpellCorrector:
def __init__(self):
self.allWords = set(words.words()).union(set(string.punctuation))
self.countWords = Counter(self.allWords)
self.numWords = len(self.allWords)
self.snowball = SnowballStemmer("english")
self.porter = PorterStemmer()
self.wnl = WordNetLemmatizer()
self.ngram = NGramModel(2)
self.punctuations = '''’!()-[]{};:'"\,<>./?@#$%^&*_~'''
self.misspelled_dict = {}
def check(self, words_str):
if not isinstance(words_str, str):
raise Exception("check() takes a string")
misspelled = []
count = 0
# Remove punctuation from the string
filtered_str = ""
for char in words_str:
if char not in self.punctuations:
filtered_str = filtered_str + char
# Tokenize the sentence passed
tokens = word_tokenize(filtered_str)
for word in tokens:
# Valid token so continue searching
if self.is_valid_word(word):
count += 1
continue
# If we haven't already seen the word
if word not in self.misspelled_dict:
# Retrieve all the edit distance candidates
edit_candidates = self.all_edits_candidates(word, 2)
edit_candidates = set(edit_candidates)
edit_candidates = list(edit_candidates)
gram_candidates = []
if count >= 1:
gram = [tokens[count - 1].lower(), word.lower()]
gram_candidates = self.ngram.prob(gram)
gram_candidates = set(gram_candidates)
gram_candidates = list(gram_candidates)
sorted(gram_candidates, key=lambda e1: e1[1])
# These are candidates in both so we really want them
absolute_candidates = [e1[0] for e1 in gram_candidates if e1[0] in edit_candidates]
absolute_candidates = set(absolute_candidates)
absolute_candidates = list(absolute_candidates)
if len(absolute_candidates) > 0:
absolute_candidates.sort(key=lambda e1: self.ngram.freq(e1))
absolute_candidates.sort(key=lambda e1: nltk.edit_distance(e1, word))
if len(absolute_candidates) > 5:
absolute_candidates = absolute_candidates[:5]
self.misspelled_dict[word] = absolute_candidates
else:
gram_candidates = [e1 for e1 in gram_candidates if nltk.edit_distance(e1, word) < 3]
gram_candidates = set(gram_candidates)
gram_candidates = list(gram_candidates)
gram_candidates.sort(key=lambda e1: e1[1])
if len(gram_candidates) > 5:
gram_candidates = gram_candidates[:5]
edit_candidates.sort(key=lambda e1: nltk.edit_distance(e1, word))
edit_candidates.sort(key=lambda e1: self.ngram.freq(e1), reverse=True)
if len(edit_candidates) > 5:
edit_candidates = edit_candidates[:5]
# The best candidates from each group
all_candidates = edit_candidates + gram_candidates
all_candidates.sort(key=lambda e1: self.ngram.freq(e1), reverse=True)
all_candidates.sort(key=lambda e1: nltk.edit_distance(e1, word))
candidates = all_candidates[:5]
self.misspelled_dict[word] = candidates
misspelled.append((count, word, self.misspelled_dict[word]))
count += 1
return misspelled
def is_valid_word(self, word):
if word.lower() in self.allWords:
return True
if self.snowball.stem(word.lower()) in self.allWords:
return True
if self.porter.stem(word.lower()) in self.allWords:
return True
if self.wnl.lemmatize(word.lower()) in self.allWords:
return True
return False
def all_edits_candidates(self, word, level):
candidates = []
edit_words = [word.lower()]
# Keep looping, moving more distance away from original word until we have candidates
while level > 0:
raw_candidates = []
# Check all permutations of all words
for e1 in edit_words:
raw_candidates += list(self.edit_distance_candidates(e1))
edit_words = raw_candidates
# Filter possible candidates to valid words only
candidates = [e1 for e1 in raw_candidates if self.is_valid_word(e1)]
level -= 1
return candidates
def edit_distance_candidates(self, word):
letters = 'abcdefghijklmnopqrstuvwxyz'
split_word = [(word[:i], word[i:]) for i in range(len(word) + 1)]
delete_chars = [left + right[1:] for left, right in split_word if right]
switch_chars = [left + right[1] + right[0] + right[2:] for left, right in split_word if len(right)>1]
replace_chars = [left + let + right[1:] for left, right in split_word if right for let in letters]
insert_chars = [left + let + right for left, right in split_word for let in letters]
return set(delete_chars + switch_chars + replace_chars + insert_chars)
def pos_match(self, word, pos):
pos_key = pos[:1].lower()
if pos_key != "n" and pos_key != "v" and pos_key != "a":
return False
synsets = nltk.corpus.wordnet.synsets(word, pos=pos_key)
return len(synsets) > 0