forked from chuanconggao/TopSim
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtopsim-cli
More file actions
executable file
·85 lines (59 loc) · 1.9 KB
/
topsim-cli
File metadata and controls
executable file
·85 lines (59 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#! /usr/bin/env python3
"""
Usage:
topsim-cli <query> [options] [<file>]
topsim-cli --help
Options:
-I Case-sensitive matching.
-k <k> Maximum number of search results. [default: 1]
--tie Include all the results with the same similarity of the "k"-th result. May return more than "k" results.
-s <simfunc> Use "jaccard", "overlap", or "tversky" as similarity function. [default: jaccard]
-e <e> Parameter for "tversky" similarity. [default: 0.001]
--mapping=<mapping> Map each string to a set of either "gram"s or "word"s. [default: gram]
--numgrams=<numgrams> Number of characters for each gram when mapping by "gram". [default: 2]
--quiet Do not print additional information to standard error.
"""
import sys
from functools import partial
import os
import time
import resource
import gc
from docopt import docopt
from topsim import TopSim
# Parse the command line against the usage text in the module docstring.
argv = docopt(__doc__)

# Diagnostics are written to standard error, or discarded when --quiet is
# given; search results are printed separately with the plain print().
print2 = partial(print, file=(open(os.devnull, 'w') if argv["--quiet"] else sys.stderr))

# time.clock() was removed in Python 3.8. time.process_time() is its
# documented replacement for measuring the CPU time of this process.
startTime = time.process_time()


def printResourceUsage():
    """Print the CPU time used since the previous report and the peak memory.

    Runs a garbage collection pass, writes one line of the form
    "<seconds> sec | <megabytes> MB" through print2, and then resets the
    module-level startTime so the next call measures only the following
    phase.
    """
    global startTime

    gc.collect()

    # getrusage() reports ru_maxrss in bytes on macOS but in kilobytes on
    # Linux; normalise to bytes before converting to megabytes.
    # NOTE(review): other platforms are assumed to report kilobytes.
    maxRss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    bytesPerUnit = 1 if sys.platform == "darwin" else 1024

    # Fixed-point formatting: a bare ".2" precision means two significant
    # digits and switches to exponent notation for values of 100 or more.
    print2("{:.2f} sec | {:.2f} MB".format(
        time.process_time() - startTime,
        maxRss * bytesPerUnit / 1024 / 1024
    ))

    startTime = time.process_time()
def _readLines(stream):
    """Return every line of *stream* with trailing CR/LF characters removed."""
    return [line.rstrip('\r\n') for line in stream]


# Load the candidate strings, one per line, from the given file or, when no
# file argument was supplied, from standard input.
if argv["<file>"]:
    # Read inside a context manager so the file handle is closed as soon as
    # the lines are loaded instead of being left open for the whole run.
    with open(argv["<file>"]) as inputFile:
        sRawStrs = _readLines(inputFile)
else:
    # Standard input is owned by the interpreter and is deliberately not closed.
    sRawStrs = _readLines(sys.stdin)
# --- Indexing phase -------------------------------------------------------
# Build the TopSim index over the raw input strings.
# NOTE(review): argv["-I"] is passed as the second positional argument;
# presumably it is the case-sensitivity flag -- confirm against TopSim.
print2("Indexing...", end=" ")
ts = TopSim(
    sRawStrs,
    argv["-I"],
    mapping=argv["--mapping"],
    numGrams=int(argv["--numgrams"]),
)
printResourceUsage()

# --- Search phase ---------------------------------------------------------
# Arguments are passed positionally, in the same order as before: query,
# result limit, tie flag, similarity function name, and the "-e" parameter.
print2("Searching...", end=" ")
runSearch = ts.search
rBest = runSearch(
    argv["<query>"],
    int(argv["-k"]),
    argv["--tie"],
    argv["-s"],
    float(argv["-e"]),
)
printResourceUsage()

# Terminate the diagnostic output with an empty line.
print2()

# --- Output ---------------------------------------------------------------
# Each search result pairs a similarity with a collection of indices into
# sRawStrs. Emit one tab-separated "string<TAB>similarity" line per index,
# lazily, so lines are printed in the order the results are produced.
resultLines = (
    "{}\t{:.4}".format(sRawStrs[lineNumber], similarity)
    for similarity, lineNumbers in rBest
    for lineNumber in lineNumbers
)
for resultLine in resultLines:
    print(resultLine)