Skip to content

Commit bc6b702

Browse files
committed
added common_words
1 parent 4df2760 commit bc6b702

10 files changed

Lines changed: 123 additions & 33 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
build
44
dist
55
*.egg-info
6-
wordinfo.db
6+
wordinfo.db
7+
venv/*

example.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Short tour of the wordstats API: per-language word lists, per-word info,
# and the precomputed list of words shared by several languages.
from wordstats import LanguageInfo
from wordstats.common_words import common_words

French = LanguageInfo.load("fr")

print("Top 10 most used words in French")
print(French.all_words()[:10])
print(" ")

print("Info about the word 'jamais'")
jamais_info = French.get("jamais")
print(jamais_info)
print(" ")

print("First 10 common words of more than 10 letters common between French and English")

English = LanguageInfo.load("en")
# Materialise the English vocabulary once as a set: the loop below does one
# membership test per French word, and calling all_words() inside the loop
# would redo that work (and a linear scan) on every iteration.
english_words = set(English.all_words())

count = 0
for each in French.all_words():
    if len(each) > 10 and each in english_words:
        print(each)
        count += 1
        # stop after the first ten matches
        if count == 10:
            break
print("")

print("Words (>8chars) common in all the languages*")
for each in common_words():
    if len(each) > 8:
        print(each)

wordstats/common_words.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from wordstats.config import DATA_COMMON_FOLDER
2+
3+
COMMON_WORDS = []
4+
5+
common_words_file_name = DATA_COMMON_FOLDER + "/common.txt"
6+
7+
8+
def common_words():
    """
    Return the list of common words stored in the common-words file.

    The file is read on the first call and its lines (without line
    terminators) are cached in the module-level COMMON_WORDS; later
    calls return that cached list without touching the disk. If the
    file yields no lines the cache stays empty, so the next call reads
    the file again.

    :return: list of words, one per line of the file. This is the
        cached list object itself, not a copy.
    """
    global COMMON_WORDS

    if COMMON_WORDS:
        return COMMON_WORDS

    # The list holds words from several languages, so decode it explicitly
    # as UTF-8 instead of relying on the platform default encoding.
    with open(common_words_file_name, encoding="utf-8") as common_words_file:
        COMMON_WORDS = common_words_file.read().splitlines()

    return COMMON_WORDS
18+
19+
20+
def write_common_words_file():
    """
    Compute common words between all_languages_with_latin_characters (i.e.
    "da","de","en","es","fr","it","nl","no","pl","pt","ro")
    Write them to file

    The first language in the list is used as the reference: its words are
    visited in the order returned by all_words(), and a word is kept only
    if it appears in every other language. Each kept word is written once,
    on its own line, to the common-words file and echoed to stdout.

    :return: None
    """
    from wordstats import LanguageInfo
    from wordstats.language_codes import all_languages_with_latin_characters

    reference_language = LanguageInfo.load(all_languages_with_latin_characters[0])

    # Build every other vocabulary once, as a set, so the per-word membership
    # checks below do not call all_words() again for each (word, language) pair.
    other_vocabularies = [
        set(LanguageInfo.load(each).all_words())
        for each in all_languages_with_latin_characters[1:]
    ]

    # Named so that it does not shadow the module-level common_words() function.
    already_written = set()

    # Words may contain non-ASCII characters; write UTF-8 explicitly.
    with open(common_words_file_name, 'w', encoding="utf-8") as common_words_file:

        for word in reference_language.all_words():
            if word in already_written:
                continue

            # all() stops at the first language that lacks the word.
            if all(word in vocabulary for vocabulary in other_vocabularies):
                already_written.add(word)
                common_words_file.write(word + "\n")
                print(word)

wordstats/config.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,23 @@
1-
db_uri = 'sqlite:///wordinfo.db'
1+
import os
2+
3+
import platform
4+
import tempfile
5+
import logging as log
6+
7+
tempdir = "/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()
8+
9+
db_uri = 'sqlite:///' + tempdir + '/wordinfo.db'
10+
11+
log.debug("running with DB URI: " + db_uri )
212

313
# for mysql, we want to declare the default character encoding
414
# for comm. with the db
515
if db_uri.startswith("mysql"):
616
db_uri += '?charset=utf8'
717

8-
DATA_FOLDER= 'language_data/hermitdave/2016'
18+
DATA_HERMIT_FOLDER = 'language_data/hermitdave/2016'
19+
20+
package_directory = os.path.dirname(os.path.abspath(__file__))
21+
DATA_COMMON_FOLDER = package_directory + os.sep +'language_data/common'
922

1023
MAX_WORDS = 10000

wordstats/language_codes.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Codes of all the languages listed by this module.
all_languages = [
    "da", "de", "el", "en", "es", "fr", "it",
    "nl", "no", "pl", "pt", "ro", "zh-CN",
]

# Same codes, in the same order, minus the two that are not written with
# Latin characters ("el" and "zh-CN").
all_languages_with_latin_characters = [
    code for code in all_languages if code not in ("el", "zh-CN")
]

wordstats/language_data/common/Readme.md

Lines changed: 0 additions & 4 deletions
This file was deleted.

wordstats/language_data/common/find_common.py

Lines changed: 0 additions & 20 deletions
This file was deleted.

wordstats/language_info.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,29 @@
99
from .base_service import BaseService, Base
1010
from .config import MAX_WORDS
1111
from .metrics_computers import *
12+
import logging as log
1213

1314

1415
class LanguageInfo(object):
1516
def __init__(self, language_id):
1617
self.language_id = language_id
1718
self.word_info_dict = dict()
1819

20+
@classmethod
def load(cls, language_code):
    """
    Return the LanguageInfo for language_code, preferring the DB copy.

    The language is first looked up with load_from_db. If that result
    has no words, the language is loaded from the hermit data files
    instead and cached to the DB before being returned.

    :param language_code: code of the language to load, e.g. "fr"
    :return: the loaded language info object
    """
    # imported here rather than at module level; loading_from_hermit
    # itself imports LanguageInfo from this module
    from wordstats.loading_from_hermit import load_language_from_hermit

    log.info(f"loading {language_code} from DB")
    from_db = LanguageInfo.load_from_db(language_code)

    # The DB already holds words for this language: nothing more to do.
    if len(from_db.all_words()) != 0:
        return from_db

    log.info(f"loading {language_code} from file")
    from_file = load_language_from_hermit(language_code)

    log.info(f"caching {language_code} to DB")
    from_file.cache_to_db()

    return from_file
34+
1935
def get(self, word):
2036
"""
2137

wordstats/loading_from_hermit.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
# "hermit dave" has a nice repository of word frequencies
22
# computed for many languages based on movie subtitles
33
from .language_info import LanguageInfo
4-
from .config import DATA_FOLDER
4+
from .config import DATA_HERMIT_FOLDER
55

66

77
def path_of_hermit_language_file(language):
8-
file_name = "{0}/{1}/{1}_50k.txt".format(DATA_FOLDER, language)
8+
file_name = "{0}/{1}/{1}_50k.txt".format(DATA_HERMIT_FOLDER, language)
99
return file_name
1010

1111

1212
def load_language_from_hermit(language, hermit_root_folder=None):
1313

1414
# by default use the hermit folder in the config file
1515
if not hermit_root_folder:
16-
hermit_root_folder = DATA_FOLDER
16+
hermit_root_folder = DATA_HERMIT_FOLDER
1717

1818
file_name = "{0}/{1}/{1}_50k.txt".format(hermit_root_folder, language)
1919
d = LanguageInfo.load_from_file(file_name, language)
@@ -24,7 +24,7 @@ def load_multiple_languages_from_hermit(languages, hermit_root_folder=None):
2424

2525
# by default use the hermit folder in the config file
2626
if not hermit_root_folder:
27-
hermit_root_folder = DATA_FOLDER
27+
hermit_root_folder = DATA_HERMIT_FOLDER
2828

2929
result = dict()
3030
for language in languages:

wordstats/word_info.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def __init__(self, word, language_id, frequency, difficulty, importance, rank, k
4646
self.rank = rank
4747

4848
def __str__(self):
49-
result = "info: {0} ({1}, freq: {2}, imp: {3}, diff: {4}, rank: {5}, klevel: {6})".format(
49+
result = "{0}: (lang: {1}, rank: {5}, freq: {2}, imp: {3}, diff: {4}, klevel: {6})".format(
5050
self.word,
5151
self.language_id,
5252
self.frequency,
@@ -55,7 +55,6 @@ def __str__(self):
5555
self.rank,
5656
self.klevel)
5757

58-
result = result.encode(stdout.encoding)
5958
return result
6059

6160
@classmethod

0 commit comments

Comments
 (0)