-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrewritingparser.py
More file actions
157 lines (137 loc) · 4.9 KB
/
rewritingparser.py
File metadata and controls
157 lines (137 loc) · 4.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import HTMLParser
import re
import cgi
from htmlentitydefs import name2codepoint
def html_quote(v):
    """Return ``v`` as text suitable for embedding in HTML.

    * ``None`` becomes the empty string.
    * An object exposing ``__html__`` is taken to be markup already: the
      result of calling that method is returned without escaping.
    * Anything else is turned into text (via ``unicode()`` when the object
      defines ``__unicode__``, otherwise ``str()``) unless it already is a
      string, and then escaped with ``cgi.escape`` with quote-escaping
      enabled.
    """
    if v is None:
        return ''
    if hasattr(v, '__html__'):
        return v.__html__()
    if isinstance(v, basestring):
        text = v
    elif hasattr(v, '__unicode__'):
        text = unicode(v)
    else:
        text = str(v)
    # Second argument switches on escaping of double quotes as well, so the
    # result can be used inside an attribute value.
    return cgi.escape(text, 1)
class RewritingParser(HTMLParser.HTMLParser):
    """HTML parser base that copies source text through to an output buffer.

    ``feed()`` keeps the raw source split into lines, and ``write_pos()``
    appends to ``_content`` whatever part of that source lies between the
    last handled position and the parser's current position.  Every
    ``handle_*`` callback defined in this class just calls ``write_pos()``;
    start-tag handlers are not defined here (presumably subclasses provide
    them and use ``write_tag()`` and the attribute helpers to emit modified
    tags -- confirm against the subclasses).

    The attribute helpers (``get_attr``, ``set_attr``, ``del_attr``,
    ``add_class``) work on the list of ``(name, value)`` pairs that
    HTMLParser passes to start-tag handlers and compare names
    case-insensitively against an already-lowercase ``name`` argument.
    """

    # Optional collaborator; when set, its reset() is called by every feed().
    listener = None
    # When true, the next write_pos() call discards the pending source span
    # (and clears this flag) instead of copying it to the output.
    skip_next = False

    def __init__(self):
        # Output fragments in order; joined by _get_text().
        self._content = []
        HTMLParser.HTMLParser.__init__(self)

    def feed(self, data):
        """Remember ``data`` as the source to copy from, then parse it.

        Resets the copy position to the start of ``data`` and, when a
        listener is set, resets the listener before parsing.
        """
        # Remembered so _get_text() can explain str/unicode mixing errors.
        self.data_is_str = isinstance(data, str)
        self.source = data
        self.lines = data.split('\n')
        # (line, column) up to which the source has been handled.  Lines
        # are 1-based (write_pos indexes self.lines with line - 1).
        self.source_pos = 1, 0
        if self.listener:
            self.listener.reset()
        HTMLParser.HTMLParser.feed(self, data)

    _entityref_re = re.compile(r'&([a-zA-Z][-.a-zA-Z\d]*);')
    _charref_re = re.compile(r'&#(\d+|[xX][a-fA-F\d]+);')
    # Matches either kind of reference so unescape() can decode in a single
    # pass; the alternatives mirror the two patterns above.
    _reference_re = re.compile(
        r'&(?:#(?:\d+|[xX][a-fA-F\d]+)|[a-zA-Z][-.a-zA-Z\d]*);')

    def unescape(self, s):
        """Replace named and numeric character references in ``s``.

        All references are decoded in one pass, so the output of one
        substitution is never decoded again: ``&amp;#65;`` yields the
        literal text ``&#65;`` rather than ``A``.  References that cannot
        be decoded are left in place unchanged.
        """
        return self._reference_re.sub(self._sub_reference, s)

    def _sub_reference(self, match):
        """Dispatch one matched reference to the named or numeric decoder."""
        ref = match.group(0)
        # ``ref`` already matched the combined pattern, so the specific
        # pattern for its kind is guaranteed to match it as well.
        if ref.startswith('&#'):
            return self._sub_charref(self._charref_re.match(ref))
        return self._sub_entityref(self._entityref_re.match(ref))

    def _sub_entityref(self, match):
        """Decode a named reference; unknown names are returned untouched."""
        name = match.group(1)
        if name not in name2codepoint:
            # If we don't recognize it, pass it through as though it
            # wasn't an entity ref at all
            return match.group(0)
        return unichr(name2codepoint[name])

    def _sub_charref(self, match):
        """Decode a decimal or hexadecimal numeric reference.

        A number that is not a valid code point is passed through
        unchanged, the same way unknown named references are.
        """
        num = match.group(1)
        if num.lower().startswith('x'):
            num = int(num[1:], 16)
        else:
            num = int(num)
        try:
            return unichr(num)
        except (ValueError, OverflowError):
            # Out-of-range code point: keep the original text instead of
            # aborting the whole parse.
            return match.group(0)

    def handle_misc(self, whatever):
        """Shared callback: copy the pending source span to the output."""
        self.write_pos()

    # All of these parser events are handled identically.
    handle_charref = handle_misc
    handle_entityref = handle_misc
    handle_data = handle_misc
    handle_comment = handle_misc
    handle_decl = handle_misc
    handle_pi = handle_misc
    unknown_decl = handle_misc
    handle_endtag = handle_misc

    def write_tag(self, tag, attrs, startend=False):
        """Write a start tag built from ``tag`` and ``attrs`` to the output.

        Attributes whose name starts with ``form:`` are omitted, values are
        passed through ``html_quote``, and ``startend`` adds a trailing
        `` /`` for a self-closing tag.
        """
        attr_text = ''.join([' %s="%s"' % (n, html_quote(v))
                             for (n, v) in attrs
                             if not n.startswith('form:')])
        if startend:
            attr_text += " /"
        self.write_text('<%s%s>' % (tag, attr_text))

    def skip_output(self):
        """Return true when write_pos() should discard instead of copy.

        Always false here; presumably a hook for subclasses.
        """
        return False

    def write_pos(self):
        """Copy the source between ``source_pos`` and the parser position.

        The span is discarded rather than written when ``skip_output()``
        is true or when ``skip_next`` is set (which is then cleared).  In
        every case ``source_pos`` is advanced to the parser position.
        """
        pos = self.getpos()
        cur_line, cur_offset = pos
        if self.skip_output():
            self.source_pos = pos
            return
        if self.skip_next:
            self.skip_next = False
            self.source_pos = pos
            return
        start_line, start_offset = self.source_pos
        if cur_line == start_line:
            self.write_text(
                self.lines[cur_line - 1][start_offset:cur_offset])
        else:
            # Span crosses lines: rest of the first line, every whole line
            # in between, then the head of the current line.  The newlines
            # removed by split('\n') in feed() are written back explicitly.
            self.write_text(self.lines[start_line - 1][start_offset:])
            self.write_text('\n')
            for line_no in range(start_line + 1, cur_line):
                self.write_text(self.lines[line_no - 1])
                self.write_text('\n')
            self.write_text(self.lines[cur_line - 1][:cur_offset])
        self.source_pos = pos

    def write_text(self, text):
        """Append ``text`` to the output buffer."""
        self._content.append(text)

    def get_attr(self, attr, name, default=None):
        """Return the value of the first attribute called ``name``.

        ``name`` must be lowercase.  ``default`` is returned only when no
        such attribute exists; an attribute that is present without a
        value yields whatever value the pair holds (``None`` for pairs
        produced by HTMLParser).
        """
        for n, value in attr:
            if n.lower() == name:
                return value
        return default

    def set_attr(self, attr, name, value):
        """Set attribute ``name`` in the list ``attr``, in place.

        The first existing match is replaced (and its name normalised to
        ``name``); otherwise a new pair is appended.
        """
        for i, pair in enumerate(attr):
            if pair[0].lower() == name:
                attr[i] = (name, value)
                return
        attr.append((name, value))

    def del_attr(self, attr, name):
        """Remove the first attribute called ``name`` from ``attr``, if any."""
        for i, pair in enumerate(attr):
            if pair[0].lower() == name:
                del attr[i]
                # Stop immediately: the list was just mutated.
                break

    def add_class(self, attr, class_name):
        """Append ``class_name`` to the ``class`` attribute in ``attr``.

        Creates the attribute when it is missing.  A ``class`` attribute
        that is present without a value (value ``None``) is treated as
        empty instead of raising ``TypeError``.
        """
        current = self.get_attr(attr, 'class') or ''
        new = current + ' ' + class_name
        self.set_attr(attr, 'class', new.strip())

    def text(self):
        """Return the finished output stored in ``_text``.

        Raises ``Exception`` when ``_text`` has not been set.  Nothing in
        this class assigns it; presumably a subclass's ``close()`` does.
        """
        try:
            return self._text
        except AttributeError:
            raise Exception(
                "You must .close() a parser instance before getting "
                "the text from it")

    def _get_text(self):
        """Join the output fragments into one string.

        Tuple entries in ``_content`` are skipped.  If joining fails with
        ``UnicodeDecodeError`` the error's ``reason`` is extended with a
        hint about mixing encoded and unicode strings, and the error is
        re-raised.
        """
        try:
            return ''.join([
                t for t in self._content if not isinstance(t, tuple)])
        except UnicodeDecodeError as e:
            if self.data_is_str:
                e.reason += (
                    " the form was passed in as an encoded string, but "
                    "some data or error messages were unicode strings; "
                    "the form should be passed in as a unicode string")
            else:
                e.reason += (
                    " the form was passed in as an unicode string, but "
                    "some data or error message was an encoded string; "
                    "the data and error messages should be passed in as "
                    "unicode strings")
            raise