-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsherlok.py
More file actions
120 lines (100 loc) · 4.5 KB
/
sherlok.py
File metadata and controls
120 lines (100 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import requests # pip install requests
# For Python3 compatibility
import sys
from six import iteritems
# Python 3 has no separate ``long`` type; alias it to ``int`` so integer
# checks written as ``isinstance(x, (int, long))`` work on both major versions.
if sys.version_info[0] >= 3:
    long = int
class SherlokError(Exception):
    '''
    Error carrying diagnostic details about a failed Sherlok request.

    @param message: short description, forwarded to the Exception base class
    @param errorLog: verbose diagnostic text supplied by the caller
    @param response: the response object associated with the failure, if any
    '''

    def __init__(self, message, errorLog, response=None):
        # Extra context is stored on the instance; only the message goes to
        # the base class, so str(error) stays short.
        self.response = response
        self.errorLog = errorLog
        super(SherlokError, self).__init__(message)
class SherlokResult(object):
    '''
    Container for the outcome of one annotation request.

    @param text: the text that was analysed
    @param annotations: the annotations extracted from that text
    @param refs: referenced feature structures; a new empty dict when omitted
    '''

    def __init__(self, text, annotations, refs=None):
        self.text = text
        self.annotations = annotations
        # Create the dict per instance: a literal ``{}`` default would be one
        # object shared by every result built without explicit refs.
        self.refs = {} if refs is None else refs

    def __iter__(self):
        '''
        @return: an iterator over the annotations
        '''
        # __iter__ must hand back an iterator; returning the list itself makes
        # iter(result) fail with TypeError.
        return iter(self.annotations)
class Sherlok(object):
    '''
    Client that sends text to a Sherlok server and reads back annotations.

    @param pipeline: the Sherlok pipeline to invoke
    @param host: host name of the server (default 'localhost')
    @param port: port of the server (default 9600)
    @param view: name of the view read from the response (default '_InitialView')
    '''

    # Annotation types that are never reported: DocumentAnnotation (contains
    # the request text), Sofa and FSArray.
    _IGNORED_TYPES = (u'DocumentAnnotation', u'Sofa', u'FSArray')
    # Keys that are reported positionally (or dropped) instead of being
    # copied into the attributes dict.
    _POSITION_KEYS = ('sofa', 'begin', 'end')

    def __init__(self, pipeline, host='localhost', port=9600, view='_InitialView'):
        self.pipeline = pipeline
        self.host = host
        self.port = port
        self.view = view

    def annotate(self, text, filter=False, timeout=1.0, nbRetries=5):
        '''
        Send text to the pipeline and collect the annotations it returns.

        @param text: the text to analyse
        @param filter: when not False, keep only annotations of this type
        @param timeout: timeout, in seconds, applied to each HTTP attempt
        @param nbRetries: total number of attempts made before a timeout is
            propagated; must be at least 1
        @return: a SherlokResult whose annotations are tuples
            (begin, end, text, annotation_type, attributes{})
        @raise ValueError: if nbRetries is smaller than 1
        @raise requests.Timeout: if every attempt timed out
        @raise SherlokError: if the server answers with a status other than 200
        '''
        # Without at least one attempt there is no response to inspect; fail
        # with a clear message instead of an unbound-variable error later on.
        if nbRetries < 1:
            raise ValueError(
                'nbRetries must be at least 1, got {!r}'.format(nbRetries))

        resp = self._post(text, timeout, nbRetries)
        if resp.status_code != 200:
            raise SherlokError(
                'Sherlok error: {} {}'.format(resp.status_code, resp.text),
                self._error_log(text, resp), resp)

        payload = resp.json()
        refs = payload['_referenced_fss']
        return_annots = []
        for annot_type, annotations in iteritems(payload['_views'][self.view]):
            # filter?
            if filter is not False and annot_type != filter:
                continue
            if annot_type in self._IGNORED_TYPES:
                continue
            for a in annotations:
                # ``long`` is the module-level alias of ``int`` on Python 3.
                if isinstance(a, (int, long)):  # a ref?
                    a = refs[str(a)]
                begin, end = a['begin'], a['end']
                # additional attributes
                attributes = {k: v for (k, v) in a.items()
                              if k not in self._POSITION_KEYS}
                return_annots.append(
                    (begin, end, text[begin:end], annot_type, attributes))
        return SherlokResult(text, return_annots, refs)

    def _post(self, text, timeout, attempts):
        # POST the text to the pipeline endpoint, retrying on timeouts.
        #
        # On occasions, the post request hangs for a long time. From run to
        # run, it does not seem to hang on the same publications, so it
        # appears to be independent of the request and more likely a
        # communication problem. The timeout avoids hanging for too long;
        # since the issue looks independent of the request, a timed-out
        # attempt is simply retried, and the last timeout is re-raised.
        url = 'http://{}:{}/annotate/{}'.format(
            self.host, self.port, self.pipeline)
        for attempt in range(attempts):
            try:
                return requests.post(url, data={'text': text}, timeout=timeout)
            except requests.Timeout:
                if attempt == attempts - 1:
                    raise

    @staticmethod
    def _error_log(text, resp):
        # Build the verbose dump attached to SherlokError for a failed request.
        parts = [
            "##################### ERROR LOG #########################",
            text,
            "##########################################################",
            resp,
            type(resp),
            dir(resp),
            resp.content,
            resp.headers,
            resp.ok,
            dir(resp.raw),
            resp.reason,
            resp.request,
            resp.status_code,
            resp.text,
            resp.url,
            resp.raise_for_status,
        ]
        return "\n".join(str(part) for part in parts) + "\n"
# def keep_largest(self, annotations):
# largests = []
# for a in annotations:
# print 'largest in obs=', a
# is_larger = False
# for i, largest in enumerate(largests):
# # is a larger than largest?
# if (a[0] >= largest[0]) and (a[1] <= largest[1]):
# del largests[i]
# is_larger = True
# if is_larger:
# largests.append(a)
# return largests