forked from abadojack/whatlanggo
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdetect.go
More file actions
135 lines (121 loc) · 3.17 KB
/
detect.go
File metadata and controls
135 lines (121 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package whatlanggo
import (
"sort"
"unicode"
)
// maxDist is the distance calculateDistance charges for each trigram of a
// language profile that does not occur in the text being examined.
const maxDist = 300
// Detect reports the language and script of text, running the detection
// with a zero-valued Options (no whitelist, no blacklist).
func Detect(text string) Info {
	var defaults Options
	return DetectWithOptions(text, defaults)
}
// DetectLang reports only the language of text; it is the Lang field of the
// Info returned by Detect.
func DetectLang(text string) Lang {
	info := Detect(text)
	return info.Lang
}
// DetectLangWithOptions reports only the language of text, honouring the
// supplied options; it is the Lang field of the Info returned by
// DetectWithOptions.
func DetectLangWithOptions(text string, options Options) Lang {
	info := DetectWithOptions(text, options)
	return info.Lang
}
// DetectWithOptions detects the script of text and then the language written
// in that script, honouring the supplied options.
//
// When DetectScript finds no script, the returned Info has a nil Script and
// Lang set to -1. That is the same "no language" value detectLangBaseOnScript
// and detectLangInProfiles return, so an undetected text is never reported as
// whatever language happens to have the zero value.
func DetectWithOptions(text string, options Options) Info {
	script := DetectScript(text)
	if script == nil {
		return Info{Lang: -1}
	}
	return Info{
		Lang:   detectLangBaseOnScript(text, options, script),
		Script: script,
	}
}
// detectLangBaseOnScript maps a detected script to a language.
//
// Scripts that have a profile list are resolved by comparing text against
// that list with detectLangInProfiles; every other known script maps directly
// to a single language. A script with no case below yields -1.
func detectLangBaseOnScript(text string, options Options, script *unicode.RangeTable) Lang {
	// byProfile resolves the language among the candidates of one script.
	byProfile := func(candidates langProfileList) Lang {
		return detectLangInProfiles(text, options, candidates)
	}

	switch script {
	// Scripts resolved through a profile list.
	case unicode.Latin:
		return byProfile(latinLangs)
	case unicode.Cyrillic:
		return byProfile(cyrillicLangs)
	case unicode.Devanagari:
		return byProfile(devanagariLangs)
	case unicode.Hebrew:
		return byProfile(hebrewLangs)
	case unicode.Ethiopic:
		return byProfile(ethiopicLangs)
	case unicode.Arabic:
		return byProfile(arabicLangs)

	// Scripts that map straight to one language.
	case unicode.Bengali:
		return Ben
	case unicode.Georgian:
		return Kat
	case unicode.Greek:
		return Ell
	case unicode.Gujarati:
		return Guj
	case unicode.Gurmukhi:
		return Pan
	case unicode.Han:
		return Cmn
	case unicode.Hangul:
		return Kor
	case unicode.Kannada:
		return Kan
	case unicode.Khmer:
		return Khm
	case unicode.Malayalam:
		return Mal
	case unicode.Myanmar:
		return Mya
	case unicode.Oriya:
		return Ori
	case unicode.Sinhala:
		return Sin
	case unicode.Tamil:
		return Tam
	case unicode.Telugu:
		return Tel
	case unicode.Thai:
		return Tha
	case _HiraganaKatakana:
		return Jpn
	default:
		// No language is associated with this script.
		return -1
	}
}
// detectLangInProfiles returns the language in langProfileList whose trigram
// profile has the smallest calculateDistance to the trigrams of text, or -1
// when the options leave no candidate language.
//
// A non-empty options.Whitelist restricts the candidates to its keys and takes
// precedence over options.Blacklist; otherwise a non-empty options.Blacklist
// removes its keys from the candidates.
//
// When several languages share the smallest distance, the one with the lowest
// Lang value is returned, so the result does not depend on map iteration
// order.
func detectLangInProfiles(text string, options Options, langProfileList langProfileList) Lang {
	trigrams := getTrigramsWithPositions(text)

	type langDistance struct {
		lang Lang
		dist int
	}

	langDistances := make([]langDistance, 0, len(langProfileList))
	for lang, langTrigrams := range langProfileList {
		if len(options.Whitelist) != 0 {
			// Skip non-whitelisted languages.
			if _, ok := options.Whitelist[lang]; !ok {
				continue
			}
		} else if len(options.Blacklist) != 0 {
			// Skip blacklisted languages.
			if _, ok := options.Blacklist[lang]; ok {
				continue
			}
		}

		dist := calculateDistance(langTrigrams, trigrams)
		langDistances = append(langDistances, langDistance{lang, dist})
	}

	if len(langDistances) == 0 {
		return -1
	}

	// The candidates were collected in unspecified map order, so ordering by
	// distance alone would pick an arbitrary winner among equal distances.
	// Breaking ties on the language value makes the ordering total.
	sort.SliceStable(langDistances, func(i, j int) bool {
		a, b := langDistances[i], langDistances[j]
		if a.dist != b.dist {
			return a.dist < b.dist
		}
		return a.lang < b.lang
	})
	return langDistances[0].lang
}
func calculateDistance(langTrigrams []string, textTrigrams map[string]int) int {
var dist, totalDist int
for i, trigram := range langTrigrams {
if n, ok := textTrigrams[trigram]; ok {
dist = abs(n - i)
} else {
dist = maxDist
}
totalDist += dist
}
return totalDist
}