1 |
arfrever 09/09/08 19:57:03 |
2 |
|
3 |
Added: beautifulsoup-3.1.0.1-python-3.patch |
4 |
Log: |
5 |
Add patch for compatibility with Python 3 (upstream patches don't apply cleanly). Set SUPPORT_PYTHON_ABIS. |
6 |
(Portage version: 14218-svn/cvs/Linux x86_64) |
7 |
|
8 |
Revision Changes Path |
9 |
1.1 dev-python/beautifulsoup/files/beautifulsoup-3.1.0.1-python-3.patch |
10 |
|
11 |
file : http://sources.gentoo.org/viewcvs.py/gentoo-x86/dev-python/beautifulsoup/files/beautifulsoup-3.1.0.1-python-3.patch?rev=1.1&view=markup |
12 |
plain: http://sources.gentoo.org/viewcvs.py/gentoo-x86/dev-python/beautifulsoup/files/beautifulsoup-3.1.0.1-python-3.patch?rev=1.1&content-type=text/plain |
13 |
|
14 |
Index: beautifulsoup-3.1.0.1-python-3.patch |
15 |
=================================================================== |
16 |
--- BeautifulSoup.py |
17 |
+++ BeautifulSoup.py |
18 |
@@ -76,7 +76,7 @@ |
19 |
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. |
20 |
|
21 |
""" |
22 |
-from __future__ import generators |
23 |
+ |
24 |
|
25 |
__author__ = "Leonard Richardson (leonardr@××××××××.org)" |
26 |
__version__ = "3.1.0.1" |
27 |
@@ -84,12 +84,12 @@ |
28 |
__license__ = "New-style BSD" |
29 |
|
30 |
import codecs |
31 |
-import markupbase |
32 |
+import _markupbase |
33 |
import types |
34 |
import re |
35 |
-from HTMLParser import HTMLParser, HTMLParseError |
36 |
+from html.parser import HTMLParser, HTMLParseError |
37 |
try: |
38 |
- from htmlentitydefs import name2codepoint |
39 |
+ from html.entities import name2codepoint |
40 |
except ImportError: |
41 |
name2codepoint = {} |
42 |
try: |
43 |
@@ -98,18 +98,18 @@ |
44 |
from sets import Set as set |
45 |
|
46 |
#These hacks make Beautiful Soup able to parse XML with namespaces |
47 |
-markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match |
48 |
+_markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match |
49 |
|
50 |
DEFAULT_OUTPUT_ENCODING = "utf-8" |
51 |
|
52 |
# First, the classes that represent markup elements. |
53 |
|
54 |
-def sob(unicode, encoding): |
55 |
+def sob(str, encoding): |
56 |
"""Returns either the given Unicode string or its encoding.""" |
57 |
if encoding is None: |
58 |
- return unicode |
59 |
+ return str |
60 |
else: |
61 |
- return unicode.encode(encoding) |
62 |
+ return str.encode(encoding) |
63 |
|
64 |
class PageElement: |
65 |
"""Contains the navigational information for some part of the page |
66 |
@@ -178,8 +178,8 @@ |
67 |
return lastChild |
68 |
|
69 |
def insert(self, position, newChild): |
70 |
- if (isinstance(newChild, basestring) |
71 |
- or isinstance(newChild, unicode)) \ |
72 |
+ if (isinstance(newChild, str) |
73 |
+ or isinstance(newChild, str)) \ |
74 |
and not isinstance(newChild, NavigableString): |
75 |
newChild = NavigableString(newChild) |
76 |
|
77 |
@@ -334,7 +334,7 @@ |
78 |
g = generator() |
79 |
while True: |
80 |
try: |
81 |
- i = g.next() |
82 |
+ i = g.__next__() |
83 |
except StopIteration: |
84 |
break |
85 |
if i: |
86 |
@@ -385,22 +385,22 @@ |
87 |
def toEncoding(self, s, encoding=None): |
88 |
"""Encodes an object to a string in some encoding, or to Unicode. |
89 |
.""" |
90 |
- if isinstance(s, unicode): |
91 |
+ if isinstance(s, str): |
92 |
if encoding: |
93 |
s = s.encode(encoding) |
94 |
elif isinstance(s, str): |
95 |
if encoding: |
96 |
s = s.encode(encoding) |
97 |
else: |
98 |
- s = unicode(s) |
99 |
+ s = str(s) |
100 |
else: |
101 |
if encoding: |
102 |
s = self.toEncoding(str(s), encoding) |
103 |
else: |
104 |
- s = unicode(s) |
105 |
+ s = str(s) |
106 |
return s |
107 |
|
108 |
-class NavigableString(unicode, PageElement): |
109 |
+class NavigableString(str, PageElement): |
110 |
|
111 |
def __new__(cls, value): |
112 |
"""Create a new NavigableString. |
113 |
@@ -410,12 +410,12 @@ |
114 |
passed in to the superclass's __new__ or the superclass won't know |
115 |
how to handle non-ASCII characters. |
116 |
""" |
117 |
- if isinstance(value, unicode): |
118 |
- return unicode.__new__(cls, value) |
119 |
- return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) |
120 |
+ if isinstance(value, str): |
121 |
+ return str.__new__(cls, value) |
122 |
+ return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) |
123 |
|
124 |
def __getnewargs__(self): |
125 |
- return (unicode(self),) |
126 |
+ return (str(self),) |
127 |
|
128 |
def __getattr__(self, attr): |
129 |
"""text.string gives you text. This is for backwards |
130 |
@@ -424,7 +424,7 @@ |
131 |
if attr == 'string': |
132 |
return self |
133 |
else: |
134 |
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) |
135 |
+ raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) |
136 |
|
137 |
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): |
138 |
return self.decode().encode(encoding) |
139 |
@@ -435,23 +435,23 @@ |
140 |
class CData(NavigableString): |
141 |
|
142 |
def decodeGivenEventualEncoding(self, eventualEncoding): |
143 |
- return u'<![CDATA[' + self + u']]>' |
144 |
+ return '<![CDATA[' + self + ']]>' |
145 |
|
146 |
class ProcessingInstruction(NavigableString): |
147 |
|
148 |
def decodeGivenEventualEncoding(self, eventualEncoding): |
149 |
output = self |
150 |
- if u'%SOUP-ENCODING%' in output: |
151 |
+ if '%SOUP-ENCODING%' in output: |
152 |
output = self.substituteEncoding(output, eventualEncoding) |
153 |
- return u'<?' + output + u'?>' |
154 |
+ return '<?' + output + '?>' |
155 |
|
156 |
class Comment(NavigableString): |
157 |
def decodeGivenEventualEncoding(self, eventualEncoding): |
158 |
- return u'<!--' + self + u'-->' |
159 |
+ return '<!--' + self + '-->' |
160 |
|
161 |
class Declaration(NavigableString): |
162 |
def decodeGivenEventualEncoding(self, eventualEncoding): |
163 |
- return u'<!' + self + u'>' |
164 |
+ return '<!' + self + '>' |
165 |
|
166 |
class Tag(PageElement): |
167 |
|
168 |
@@ -460,7 +460,7 @@ |
169 |
def _invert(h): |
170 |
"Cheap function to invert a hash." |
171 |
i = {} |
172 |
- for k,v in h.items(): |
173 |
+ for k,v in list(h.items()): |
174 |
i[v] = k |
175 |
return i |
176 |
|
177 |
@@ -479,23 +479,23 @@ |
178 |
escaped.""" |
179 |
x = match.group(1) |
180 |
if self.convertHTMLEntities and x in name2codepoint: |
181 |
- return unichr(name2codepoint[x]) |
182 |
+ return chr(name2codepoint[x]) |
183 |
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: |
184 |
if self.convertXMLEntities: |
185 |
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] |
186 |
else: |
187 |
- return u'&%s;' % x |
188 |
+ return '&%s;' % x |
189 |
elif len(x) > 0 and x[0] == '#': |
190 |
# Handle numeric entities |
191 |
if len(x) > 1 and x[1] == 'x': |
192 |
- return unichr(int(x[2:], 16)) |
193 |
+ return chr(int(x[2:], 16)) |
194 |
else: |
195 |
- return unichr(int(x[1:])) |
196 |
+ return chr(int(x[1:])) |
197 |
|
198 |
elif self.escapeUnrecognizedEntities: |
199 |
- return u'&%s;' % x |
200 |
+ return '&%s;' % x |
201 |
else: |
202 |
- return u'&%s;' % x |
203 |
+ return '&%s;' % x |
204 |
|
205 |
def __init__(self, parser, name, attrs=None, parent=None, |
206 |
previous=None): |
207 |
@@ -524,7 +524,7 @@ |
208 |
return kval |
209 |
return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", |
210 |
self._convertEntities, val)) |
211 |
- self.attrs = map(convert, self.attrs) |
212 |
+ self.attrs = list(map(convert, self.attrs)) |
213 |
|
214 |
def get(self, key, default=None): |
215 |
"""Returns the value of the 'key' attribute for the tag, or |
216 |
@@ -533,7 +533,7 @@ |
217 |
return self._getAttrMap().get(key, default) |
218 |
|
219 |
def has_key(self, key): |
220 |
- return self._getAttrMap().has_key(key) |
221 |
+ return key in self._getAttrMap() |
222 |
|
223 |
def __getitem__(self, key): |
224 |
"""tag[key] returns the value of the 'key' attribute for the tag, |
225 |
@@ -551,7 +551,7 @@ |
226 |
def __contains__(self, x): |
227 |
return x in self.contents |
228 |
|
229 |
- def __nonzero__(self): |
230 |
+ def __bool__(self): |
231 |
"A tag is non-None even if it has no contents." |
232 |
return True |
233 |
|
234 |
@@ -577,14 +577,14 @@ |
235 |
#We don't break because bad HTML can define the same |
236 |
#attribute multiple times. |
237 |
self._getAttrMap() |
238 |
- if self.attrMap.has_key(key): |
239 |
+ if key in self.attrMap: |
240 |
del self.attrMap[key] |
241 |
|
242 |
def __call__(self, *args, **kwargs): |
243 |
"""Calling a tag like a function is the same as calling its |
244 |
findAll() method. Eg. tag('a') returns a list of all the A tags |
245 |
found within this tag.""" |
246 |
- return apply(self.findAll, args, kwargs) |
247 |
+ return self.findAll(*args, **kwargs) |
248 |
|
249 |
def __getattr__(self, tag): |
250 |
#print "Getattr %s.%s" % (self.__class__, tag) |
251 |
@@ -592,7 +592,7 @@ |
252 |
return self.find(tag[:-3]) |
253 |
elif tag.find('__') != 0: |
254 |
return self.find(tag) |
255 |
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) |
256 |
+ raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag)) |
257 |
|
258 |
def __eq__(self, other): |
259 |
"""Returns true iff this tag has the same name, the same attributes, |
260 |
@@ -868,7 +868,7 @@ |
261 |
if isinstance(markupName, Tag): |
262 |
markup = markupName |
263 |
markupAttrs = markup |
264 |
- callFunctionWithTagData = callable(self.name) \ |
265 |
+ callFunctionWithTagData = hasattr(self.name, '__call__') \ |
266 |
and not isinstance(markupName, Tag) |
267 |
|
268 |
if (not self.name) \ |
269 |
@@ -880,7 +880,7 @@ |
270 |
else: |
271 |
match = True |
272 |
markupAttrMap = None |
273 |
- for attr, matchAgainst in self.attrs.items(): |
274 |
+ for attr, matchAgainst in list(self.attrs.items()): |
275 |
if not markupAttrMap: |
276 |
if hasattr(markupAttrs, 'get'): |
277 |
markupAttrMap = markupAttrs |
278 |
@@ -921,16 +921,16 @@ |
279 |
if self._matches(markup, self.text): |
280 |
found = markup |
281 |
else: |
282 |
- raise Exception, "I don't know how to match against a %s" \ |
283 |
- % markup.__class__ |
284 |
+ raise Exception("I don't know how to match against a %s" \ |
285 |
+ % markup.__class__) |
286 |
return found |
287 |
|
288 |
def _matches(self, markup, matchAgainst): |
289 |
#print "Matching %s against %s" % (markup, matchAgainst) |
290 |
result = False |
291 |
- if matchAgainst == True and type(matchAgainst) == types.BooleanType: |
292 |
+ if matchAgainst == True and type(matchAgainst) == bool: |
293 |
result = markup != None |
294 |
- elif callable(matchAgainst): |
295 |
+ elif hasattr(matchAgainst, '__call__'): |
296 |
result = matchAgainst(markup) |
297 |
else: |
298 |
#Custom match methods take the tag as an argument, but all |
299 |
@@ -938,7 +938,7 @@ |
300 |
if isinstance(markup, Tag): |
301 |
markup = markup.name |
302 |
if markup is not None and not isString(markup): |
303 |
- markup = unicode(markup) |
304 |
+ markup = str(markup) |
305 |
#Now we know that chunk is either a string, or None. |
306 |
if hasattr(matchAgainst, 'match'): |
307 |
# It's a regexp object. |
308 |
@@ -947,10 +947,10 @@ |
309 |
and (markup is not None or not isString(matchAgainst))): |
310 |
result = markup in matchAgainst |
311 |
elif hasattr(matchAgainst, 'items'): |
312 |
- result = markup.has_key(matchAgainst) |
313 |
+ result = matchAgainst in markup |
314 |
elif matchAgainst and isString(markup): |
315 |
- if isinstance(markup, unicode): |
316 |
- matchAgainst = unicode(matchAgainst) |
317 |
+ if isinstance(markup, str): |
318 |
+ matchAgainst = str(matchAgainst) |
319 |
else: |
320 |
matchAgainst = str(matchAgainst) |
321 |
|
322 |
@@ -971,13 +971,13 @@ |
323 |
"""Convenience method that works with all 2.x versions of Python |
324 |
to determine whether or not something is listlike.""" |
325 |
return ((hasattr(l, '__iter__') and not isString(l)) |
326 |
- or (type(l) in (types.ListType, types.TupleType))) |
327 |
+ or (type(l) in (list, tuple))) |
328 |
|
329 |
def isString(s): |
330 |
"""Convenience method that works with all 2.x versions of Python |
331 |
to determine whether or not something is stringlike.""" |
332 |
try: |
333 |
- return isinstance(s, unicode) or isinstance(s, basestring) |
334 |
+ return isinstance(s, str) or isinstance(s, str) |
335 |
except NameError: |
336 |
return isinstance(s, str) |
337 |
|
338 |
@@ -989,7 +989,7 @@ |
339 |
for portion in args: |
340 |
if hasattr(portion, 'items'): |
341 |
#It's a map. Merge it. |
342 |
- for k,v in portion.items(): |
343 |
+ for k,v in list(portion.items()): |
344 |
built[k] = v |
345 |
elif isList(portion) and not isString(portion): |
346 |
#It's a list. Map each item to the default. |
347 |
@@ -1034,7 +1034,7 @@ |
348 |
object, possibly one with a %SOUP-ENCODING% slot into which an |
349 |
encoding will be plugged later.""" |
350 |
if text[:3] == "xml": |
351 |
- text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" |
352 |
+ text = "xml version='1.0' encoding='%SOUP-ENCODING%'" |
353 |
self._toStringSubclass(text, ProcessingInstruction) |
354 |
|
355 |
def handle_comment(self, text): |
356 |
@@ -1044,7 +1044,7 @@ |
357 |
def handle_charref(self, ref): |
358 |
"Handle character references as data." |
359 |
if self.soup.convertEntities: |
360 |
- data = unichr(int(ref)) |
361 |
+ data = chr(int(ref)) |
362 |
else: |
363 |
data = '&#%s;' % ref |
364 |
self.handle_data(data) |
365 |
@@ -1056,7 +1056,7 @@ |
366 |
data = None |
367 |
if self.soup.convertHTMLEntities: |
368 |
try: |
369 |
- data = unichr(name2codepoint[ref]) |
370 |
+ data = chr(name2codepoint[ref]) |
371 |
except KeyError: |
372 |
pass |
373 |
|
374 |
@@ -1147,7 +1147,7 @@ |
375 |
lambda x: '<!' + x.group(1) + '>') |
376 |
] |
377 |
|
378 |
- ROOT_TAG_NAME = u'[document]' |
379 |
+ ROOT_TAG_NAME = '[document]' |
380 |
|
381 |
HTML_ENTITIES = "html" |
382 |
XML_ENTITIES = "xml" |
383 |
@@ -1236,14 +1236,14 @@ |
384 |
def _feed(self, inDocumentEncoding=None, isHTML=False): |
385 |
# Convert the document to Unicode. |
386 |
markup = self.markup |
387 |
- if isinstance(markup, unicode): |
388 |
+ if isinstance(markup, str): |
389 |
if not hasattr(self, 'originalEncoding'): |
390 |
self.originalEncoding = None |
391 |
else: |
392 |
dammit = UnicodeDammit\ |
393 |
(markup, [self.fromEncoding, inDocumentEncoding], |
394 |
smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) |
395 |
- markup = dammit.unicode |
396 |
+ markup = dammit.str |
397 |
self.originalEncoding = dammit.originalEncoding |
398 |
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding |
399 |
if markup: |
400 |
@@ -1269,8 +1269,8 @@ |
401 |
def isSelfClosingTag(self, name): |
402 |
"""Returns true iff the given string is the name of a |
403 |
self-closing tag according to this parser.""" |
404 |
- return self.SELF_CLOSING_TAGS.has_key(name) \ |
405 |
- or self.instanceSelfClosingTags.has_key(name) |
406 |
+ return name in self.SELF_CLOSING_TAGS \ |
407 |
+ or name in self.instanceSelfClosingTags |
408 |
|
409 |
def reset(self): |
410 |
Tag.__init__(self, self, self.ROOT_TAG_NAME) |
411 |
@@ -1305,7 +1305,7 @@ |
412 |
|
413 |
def endData(self, containerClass=NavigableString): |
414 |
if self.currentData: |
415 |
- currentData = u''.join(self.currentData) |
416 |
+ currentData = ''.join(self.currentData) |
417 |
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and |
418 |
not set([tag.name for tag in self.tagStack]).intersection( |
419 |
self.PRESERVE_WHITESPACE_TAGS)): |
420 |
@@ -1368,7 +1368,7 @@ |
421 |
|
422 |
nestingResetTriggers = self.NESTABLE_TAGS.get(name) |
423 |
isNestable = nestingResetTriggers != None |
424 |
- isResetNesting = self.RESET_NESTING_TAGS.has_key(name) |
425 |
+ isResetNesting = name in self.RESET_NESTING_TAGS |
426 |
popTo = None |
427 |
inclusive = True |
428 |
for i in range(len(self.tagStack)-1, 0, -1): |
429 |
@@ -1381,7 +1381,7 @@ |
430 |
if (nestingResetTriggers != None |
431 |
and p.name in nestingResetTriggers) \ |
432 |
or (nestingResetTriggers == None and isResetNesting |
433 |
- and self.RESET_NESTING_TAGS.has_key(p.name)): |
434 |
+ and p.name in self.RESET_NESTING_TAGS): |
435 |
|
436 |
#If we encounter one of the nesting reset triggers |
437 |
#peculiar to this tag, or we encounter another tag |
438 |
@@ -1399,7 +1399,7 @@ |
439 |
if self.quoteStack: |
440 |
#This is not a real tag. |
441 |
#print "<%s> is not real!" % name |
442 |
- attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) |
443 |
+ attrs = ''.join([' %s="%s"' % (x_y[0], x_y[1]) for x_y in attrs]) |
444 |
self.handle_data('<%s%s>' % (name, attrs)) |
445 |
return |
446 |
self.endData() |
447 |
@@ -1493,7 +1493,7 @@ |
448 |
BeautifulStoneSoup before writing your own subclass.""" |
449 |
|
450 |
def __init__(self, *args, **kwargs): |
451 |
- if not kwargs.has_key('smartQuotesTo'): |
452 |
+ if 'smartQuotesTo' not in kwargs: |
453 |
kwargs['smartQuotesTo'] = self.HTML_ENTITIES |
454 |
kwargs['isHTML'] = True |
455 |
BeautifulStoneSoup.__init__(self, *args, **kwargs) |
456 |
@@ -1677,7 +1677,7 @@ |
457 |
parent._getAttrMap() |
458 |
if (isinstance(tag, Tag) and len(tag.contents) == 1 and |
459 |
isinstance(tag.contents[0], NavigableString) and |
460 |
- not parent.attrMap.has_key(tag.name)): |
461 |
+ tag.name not in parent.attrMap): |
462 |
parent[tag.name] = tag.contents[0] |
463 |
BeautifulStoneSoup.popTag(self) |
464 |
|
465 |
@@ -1751,9 +1751,9 @@ |
466 |
self._detectEncoding(markup, isHTML) |
467 |
self.smartQuotesTo = smartQuotesTo |
468 |
self.triedEncodings = [] |
469 |
- if markup == '' or isinstance(markup, unicode): |
470 |
+ if markup == '' or isinstance(markup, str): |
471 |
self.originalEncoding = None |
472 |
- self.unicode = unicode(markup) |
473 |
+ self.str = str(markup) |
474 |
return |
475 |
|
476 |
u = None |
477 |
@@ -1766,7 +1766,7 @@ |
478 |
if u: break |
479 |
|
480 |
# If no luck and we have auto-detection library, try that: |
481 |
- if not u and chardet and not isinstance(self.markup, unicode): |
482 |
+ if not u and chardet and not isinstance(self.markup, str): |
483 |
u = self._convertFrom(chardet.detect(self.markup)['encoding']) |
484 |
|
485 |
# As a last resort, try utf-8 and windows-1252: |
486 |
@@ -1775,7 +1775,7 @@ |
487 |
u = self._convertFrom(proposed_encoding) |
488 |
if u: break |
489 |
|
490 |
- self.unicode = u |
491 |
+ self.str = u |
492 |
if not u: self.originalEncoding = None |
493 |
|
494 |
def _subMSChar(self, match): |
495 |
@@ -1783,7 +1783,7 @@ |
496 |
entity.""" |
497 |
orig = match.group(1) |
498 |
sub = self.MS_CHARS.get(orig) |
499 |
- if type(sub) == types.TupleType: |
500 |
+ if type(sub) == tuple: |
501 |
if self.smartQuotesTo == 'xml': |
502 |
sub = '&#x'.encode() + sub[1].encode() + ';'.encode() |
503 |
else: |
504 |
@@ -1804,7 +1804,7 @@ |
505 |
if self.smartQuotesTo and proposed.lower() in("windows-1252", |
506 |
"iso-8859-1", |
507 |
"iso-8859-2"): |
508 |
- smart_quotes_re = "([\x80-\x9f])" |
509 |
+ smart_quotes_re = b"([\x80-\x9f])" |
510 |
smart_quotes_compiled = re.compile(smart_quotes_re) |
511 |
markup = smart_quotes_compiled.sub(self._subMSChar, markup) |
512 |
|
513 |
@@ -1813,7 +1813,7 @@ |
514 |
u = self._toUnicode(markup, proposed) |
515 |
self.markup = u |
516 |
self.originalEncoding = proposed |
517 |
- except Exception, e: |
518 |
+ except Exception as e: |
519 |
# print "That didn't work!" |
520 |
# print e |
521 |
return None |
522 |
@@ -1842,7 +1842,7 @@ |
523 |
elif data[:4] == '\xff\xfe\x00\x00': |
524 |
encoding = 'utf-32le' |
525 |
data = data[4:] |
526 |
- newdata = unicode(data, encoding) |
527 |
+ newdata = str(data, encoding) |
528 |
return newdata |
529 |
|
530 |
def _detectEncoding(self, xml_data, isHTML=False): |
531 |
@@ -1855,41 +1855,41 @@ |
532 |
elif xml_data[:4] == '\x00\x3c\x00\x3f': |
533 |
# UTF-16BE |
534 |
sniffed_xml_encoding = 'utf-16be' |
535 |
- xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') |
536 |
+ xml_data = str(xml_data, 'utf-16be').encode('utf-8') |
537 |
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ |
538 |
and (xml_data[2:4] != '\x00\x00'): |
539 |
# UTF-16BE with BOM |
540 |
sniffed_xml_encoding = 'utf-16be' |
541 |
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') |
542 |
+ xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8') |
543 |
elif xml_data[:4] == '\x3c\x00\x3f\x00': |
544 |
# UTF-16LE |
545 |
sniffed_xml_encoding = 'utf-16le' |
546 |
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') |
547 |
+ xml_data = str(xml_data, 'utf-16le').encode('utf-8') |
548 |
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ |
549 |
(xml_data[2:4] != '\x00\x00'): |
550 |
# UTF-16LE with BOM |
551 |
sniffed_xml_encoding = 'utf-16le' |
552 |
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') |
553 |
+ xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8') |
554 |
elif xml_data[:4] == '\x00\x00\x00\x3c': |
555 |
# UTF-32BE |
556 |
sniffed_xml_encoding = 'utf-32be' |
557 |
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') |
558 |
+ xml_data = str(xml_data, 'utf-32be').encode('utf-8') |
559 |
elif xml_data[:4] == '\x3c\x00\x00\x00': |
560 |
# UTF-32LE |
561 |
sniffed_xml_encoding = 'utf-32le' |
562 |
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') |
563 |
+ xml_data = str(xml_data, 'utf-32le').encode('utf-8') |
564 |
elif xml_data[:4] == '\x00\x00\xfe\xff': |
565 |
# UTF-32BE with BOM |
566 |
sniffed_xml_encoding = 'utf-32be' |
567 |
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') |
568 |
+ xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8') |
569 |
elif xml_data[:4] == '\xff\xfe\x00\x00': |
570 |
# UTF-32LE with BOM |
571 |
sniffed_xml_encoding = 'utf-32le' |
572 |
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') |
573 |
+ xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8') |
574 |
elif xml_data[:3] == '\xef\xbb\xbf': |
575 |
# UTF-8 with BOM |
576 |
sniffed_xml_encoding = 'utf-8' |
577 |
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') |
578 |
+ xml_data = str(xml_data[3:], 'utf-8').encode('utf-8') |
579 |
else: |
580 |
sniffed_xml_encoding = 'ascii' |
581 |
pass |
582 |
@@ -1954,41 +1954,41 @@ |
583 |
250,251,252,253,254,255) |
584 |
import string |
585 |
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ |
586 |
- ''.join(map(chr, range(256))), ''.join(map(chr, emap))) |
587 |
+ ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) |
588 |
return s.translate(c.EBCDIC_TO_ASCII_MAP) |
589 |
|
590 |
- MS_CHARS = { '\x80' : ('euro', '20AC'), |
591 |
- '\x81' : ' ', |
592 |
- '\x82' : ('sbquo', '201A'), |
593 |
- '\x83' : ('fnof', '192'), |
594 |
- '\x84' : ('bdquo', '201E'), |
595 |
- '\x85' : ('hellip', '2026'), |
596 |
- '\x86' : ('dagger', '2020'), |
597 |
- '\x87' : ('Dagger', '2021'), |
598 |
- '\x88' : ('circ', '2C6'), |
599 |
- '\x89' : ('permil', '2030'), |
600 |
- '\x8A' : ('Scaron', '160'), |
601 |
- '\x8B' : ('lsaquo', '2039'), |
602 |
- '\x8C' : ('OElig', '152'), |
603 |
- '\x8D' : '?', |
604 |
- '\x8E' : ('#x17D', '17D'), |
605 |
- '\x8F' : '?', |
606 |
- '\x90' : '?', |
607 |
- '\x91' : ('lsquo', '2018'), |
608 |
- '\x92' : ('rsquo', '2019'), |
609 |
- '\x93' : ('ldquo', '201C'), |
610 |
- '\x94' : ('rdquo', '201D'), |
611 |
- '\x95' : ('bull', '2022'), |
612 |
- '\x96' : ('ndash', '2013'), |
613 |
- '\x97' : ('mdash', '2014'), |
614 |
- '\x98' : ('tilde', '2DC'), |
615 |
- '\x99' : ('trade', '2122'), |
616 |
- '\x9a' : ('scaron', '161'), |
617 |
- '\x9b' : ('rsaquo', '203A'), |
618 |
- '\x9c' : ('oelig', '153'), |
619 |
- '\x9d' : '?', |
620 |
- '\x9e' : ('#x17E', '17E'), |
621 |
- '\x9f' : ('Yuml', ''),} |
622 |
+ MS_CHARS = { b'\x80' : ('euro', '20AC'), |
623 |
+ b'\x81' : ' ', |
624 |
+ b'\x82' : ('sbquo', '201A'), |
625 |
+ b'\x83' : ('fnof', '192'), |
626 |
+ b'\x84' : ('bdquo', '201E'), |
627 |
+ b'\x85' : ('hellip', '2026'), |
628 |
+ b'\x86' : ('dagger', '2020'), |
629 |
+ b'\x87' : ('Dagger', '2021'), |
630 |
+ b'\x88' : ('circ', '2C6'), |
631 |
+ b'\x89' : ('permil', '2030'), |
632 |
+ b'\x8A' : ('Scaron', '160'), |
633 |
+ b'\x8B' : ('lsaquo', '2039'), |
634 |
+ b'\x8C' : ('OElig', '152'), |
635 |
+ b'\x8D' : '?', |
636 |
+ b'\x8E' : ('#x17D', '17D'), |
637 |
+ b'\x8F' : '?', |
638 |
+ b'\x90' : '?', |
639 |
+ b'\x91' : ('lsquo', '2018'), |
640 |
+ b'\x92' : ('rsquo', '2019'), |
641 |
+ b'\x93' : ('ldquo', '201C'), |
642 |
+ b'\x94' : ('rdquo', '201D'), |
643 |
+ b'\x95' : ('bull', '2022'), |
644 |
+ b'\x96' : ('ndash', '2013'), |
645 |
+ b'\x97' : ('mdash', '2014'), |
646 |
+ b'\x98' : ('tilde', '2DC'), |
647 |
+ b'\x99' : ('trade', '2122'), |
648 |
+ b'\x9a' : ('scaron', '161'), |
649 |
+ b'\x9b' : ('rsaquo', '203A'), |
650 |
+ b'\x9c' : ('oelig', '153'), |
651 |
+ b'\x9d' : '?', |
652 |
+ b'\x9e' : ('#x17E', '17E'), |
653 |
+ b'\x9f' : ('Yuml', ''),} |
654 |
|
655 |
####################################################################### |
656 |
|
657 |
@@ -1997,4 +1997,4 @@ |
658 |
if __name__ == '__main__': |
659 |
import sys |
660 |
soup = BeautifulSoup(sys.stdin) |
661 |
- print soup.prettify() |
662 |
+ print(soup.prettify()) |
663 |
--- BeautifulSoupTests.py |
664 |
+++ BeautifulSoupTests.py |
665 |
@@ -82,7 +82,7 @@ |
666 |
def testFindAllText(self): |
667 |
soup = BeautifulSoup("<html>\xbb</html>") |
668 |
self.assertEqual(soup.findAll(text=re.compile('.*')), |
669 |
- [u'\xbb']) |
670 |
+ ['\xbb']) |
671 |
|
672 |
def testFindAllByRE(self): |
673 |
import re |
674 |
@@ -215,7 +215,7 @@ |
675 |
soup = BeautifulSoup(self.x, parseOnlyThese=strainer) |
676 |
self.assertEquals(len(soup), 10) |
677 |
|
678 |
- strainer = SoupStrainer(text=lambda(x):x[8]=='3') |
679 |
+ strainer = SoupStrainer(text=lambda x:x[8]=='3') |
680 |
soup = BeautifulSoup(self.x, parseOnlyThese=strainer) |
681 |
self.assertEquals(len(soup), 3) |
682 |
|
683 |
@@ -256,7 +256,7 @@ |
684 |
self.assertEqual(copied.decode(), self.soup.decode()) |
685 |
|
686 |
def testUnicodePickle(self): |
687 |
- import cPickle as pickle |
688 |
+ import pickle as pickle |
689 |
html = "<b>" + chr(0xc3) + "</b>" |
690 |
soup = BeautifulSoup(html) |
691 |
dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) |
692 |
@@ -586,23 +586,23 @@ |
693 |
self.assertEquals(soup.decode(), "<<sacré bleu!>>") |
694 |
|
695 |
soup = BeautifulStoneSoup(text, convertEntities=htmlEnt) |
696 |
- self.assertEquals(soup.decode(), u"<<sacr\xe9 bleu!>>") |
697 |
+ self.assertEquals(soup.decode(), "<<sacr\xe9 bleu!>>") |
698 |
|
699 |
# Make sure the "XML", "HTML", and "XHTML" settings work. |
700 |
text = "<™'" |
701 |
soup = BeautifulStoneSoup(text, convertEntities=xmlEnt) |
702 |
- self.assertEquals(soup.decode(), u"<™'") |
703 |
+ self.assertEquals(soup.decode(), "<™'") |
704 |
|
705 |
soup = BeautifulStoneSoup(text, convertEntities=htmlEnt) |
706 |
- self.assertEquals(soup.decode(), u"<\u2122'") |
707 |
+ self.assertEquals(soup.decode(), "<\u2122'") |
708 |
|
709 |
soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt) |
710 |
- self.assertEquals(soup.decode(), u"<\u2122'") |
711 |
+ self.assertEquals(soup.decode(), "<\u2122'") |
712 |
|
713 |
def testNonBreakingSpaces(self): |
714 |
soup = BeautifulSoup("<a> </a>", |
715 |
convertEntities=BeautifulStoneSoup.HTML_ENTITIES) |
716 |
- self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>") |
717 |
+ self.assertEquals(soup.decode(), "<a>\xa0\xa0</a>") |
718 |
|
719 |
def testWhitespaceInDeclaration(self): |
720 |
self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>') |
721 |
@@ -617,27 +617,27 @@ |
722 |
self.assertSoupEquals('<b>hello there</b>') |
723 |
|
724 |
def testEntitiesInAttributeValues(self): |
725 |
- self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', |
726 |
+ self.assertSoupEquals('<x t="xñ">', b'<x t="x\xc3\xb1"></x>', |
727 |
encoding='utf-8') |
728 |
- self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', |
729 |
+ self.assertSoupEquals('<x t="xñ">', b'<x t="x\xc3\xb1"></x>', |
730 |
encoding='utf-8') |
731 |
|
732 |
soup = BeautifulSoup('<x t=">™">', |
733 |
convertEntities=BeautifulStoneSoup.HTML_ENTITIES) |
734 |
- self.assertEquals(soup.decode(), u'<x t=">\u2122"></x>') |
735 |
+ self.assertEquals(soup.decode(), '<x t=">\u2122"></x>') |
736 |
|
737 |
uri = "http://crummy.com?sacré&bleu" |
738 |
link = '<a href="%s"></a>' % uri |
739 |
|
740 |
soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES) |
741 |
self.assertEquals(soup.decode(), |
742 |
- link.replace("é", u"\xe9")) |
743 |
+ link.replace("é", "\xe9")) |
744 |
|
745 |
uri = "http://crummy.com?sacré&bleu" |
746 |
link = '<a href="%s"></a>' % uri |
747 |
soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES) |
748 |
self.assertEquals(soup.a['href'], |
749 |
- uri.replace("é", u"\xe9")) |
750 |
+ uri.replace("é", "\xe9")) |
751 |
|
752 |
def testNakedAmpersands(self): |
753 |
html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES} |
754 |
@@ -663,13 +663,13 @@ |
755 |
smart quote fixes.""" |
756 |
|
757 |
def testUnicodeDammitStandalone(self): |
758 |
- markup = "<foo>\x92</foo>" |
759 |
+ markup = b"<foo>\x92</foo>" |
760 |
dammit = UnicodeDammit(markup) |
761 |
- self.assertEquals(dammit.unicode, "<foo>’</foo>") |
762 |
+ self.assertEquals(dammit.str, "<foo>’</foo>") |
763 |
|
764 |
- hebrew = "\xed\xe5\xec\xf9" |
765 |
+ hebrew = b"\xed\xe5\xec\xf9" |
766 |
dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) |
767 |
- self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') |
768 |
+ self.assertEquals(dammit.str, '\u05dd\u05d5\u05dc\u05e9') |
769 |
self.assertEquals(dammit.originalEncoding, 'iso-8859-8') |
770 |
|
771 |
def testGarbageInGarbageOut(self): |
772 |
@@ -677,13 +677,13 @@ |
773 |
asciiSoup = BeautifulStoneSoup(ascii) |
774 |
self.assertEquals(ascii, asciiSoup.decode()) |
775 |
|
776 |
- unicodeData = u"<foo>\u00FC</foo>" |
777 |
+ unicodeData = "<foo>\u00FC</foo>" |
778 |
utf8 = unicodeData.encode("utf-8") |
779 |
- self.assertEquals(utf8, '<foo>\xc3\xbc</foo>') |
780 |
+ self.assertEquals(utf8, b'<foo>\xc3\xbc</foo>') |
781 |
|
782 |
unicodeSoup = BeautifulStoneSoup(unicodeData) |
783 |
self.assertEquals(unicodeData, unicodeSoup.decode()) |
784 |
- self.assertEquals(unicodeSoup.foo.string, u'\u00FC') |
785 |
+ self.assertEquals(unicodeSoup.foo.string, '\u00FC') |
786 |
|
787 |
utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8') |
788 |
self.assertEquals(utf8, utf8Soup.encode('utf-8')) |
789 |
@@ -696,18 +696,18 @@ |
790 |
|
791 |
def testHandleInvalidCodec(self): |
792 |
for bad_encoding in ['.utf8', '...', 'utF---16.!']: |
793 |
- soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"), |
794 |
+ soup = BeautifulSoup("Räksmörgås".encode("utf-8"), |
795 |
fromEncoding=bad_encoding) |
796 |
self.assertEquals(soup.originalEncoding, 'utf-8') |
797 |
|
798 |
def testUnicodeSearch(self): |
799 |
- html = u'<html><body><h1>Räksmörgås</h1></body></html>' |
800 |
+ html = '<html><body><h1>Räksmörgås</h1></body></html>' |
801 |
soup = BeautifulSoup(html) |
802 |
- self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås') |
803 |
+ self.assertEqual(soup.find(text='Räksmörgås'),'Räksmörgås') |
804 |
|
805 |
def testRewrittenXMLHeader(self): |
806 |
- euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n' |
807 |
- utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n" |
808 |
+ euc_jp = b'<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n' |
809 |
+ utf8 = b"<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n" |
810 |
soup = BeautifulStoneSoup(euc_jp) |
811 |
if soup.originalEncoding != "euc-jp": |
812 |
raise Exception("Test failed when parsing euc-jp document. " |
813 |
@@ -718,12 +718,12 @@ |
814 |
self.assertEquals(soup.originalEncoding, "euc-jp") |
815 |
self.assertEquals(soup.renderContents('utf-8'), utf8) |
816 |
|
817 |
- old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>" |
818 |
+ old_text = b"<?xml encoding='windows-1252'><foo>\x92</foo>" |
819 |
new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>" |
820 |
self.assertSoupEquals(old_text, new_text) |
821 |
|
822 |
def testRewrittenMetaTag(self): |
823 |
- no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' |
824 |
+ no_shift_jis_html = b'''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' |
825 |
soup = BeautifulSoup(no_shift_jis_html) |
826 |
|
827 |
# Beautiful Soup used to try to rewrite the meta tag even if the |
828 |
@@ -733,16 +733,16 @@ |
829 |
soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer) |
830 |
self.assertEquals(soup.contents[0].name, 'pre') |
831 |
|
832 |
- meta_tag = ('<meta content="text/html; charset=x-sjis" ' |
833 |
- 'http-equiv="Content-type" />') |
834 |
+ meta_tag = (b'<meta content="text/html; charset=x-sjis" ' |
835 |
+ b'http-equiv="Content-type" />') |
836 |
shift_jis_html = ( |
837 |
- '<html><head>\n%s\n' |
838 |
- '<meta http-equiv="Content-language" content="ja" />' |
839 |
- '</head><body><pre>\n' |
840 |
- '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' |
841 |
- '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' |
842 |
- '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' |
843 |
- '</pre></body></html>') % meta_tag |
844 |
+ b'<html><head>\n' + meta_tag + b'\n' |
845 |
+ b'<meta http-equiv="Content-language" content="ja" />' |
846 |
+ b'</head><body><pre>\n' |
847 |
+ b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' |
848 |
+ b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' |
849 |
+ b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' |
850 |
+ b'</pre></body></html>') |
851 |
soup = BeautifulSoup(shift_jis_html) |
852 |
if soup.originalEncoding != "shift-jis": |
853 |
raise Exception("Test failed when parsing shift-jis document " |
854 |
@@ -755,59 +755,59 @@ |
855 |
content_type_tag = soup.meta['content'] |
856 |
self.assertEquals(content_type_tag[content_type_tag.find('charset='):], |
857 |
'charset=%SOUP-ENCODING%') |
858 |
- content_type = str(soup.meta) |
859 |
+ content_type = soup.meta.decode() |
860 |
index = content_type.find('charset=') |
861 |
self.assertEqual(content_type[index:index+len('charset=utf8')+1], |
862 |
'charset=utf-8') |
863 |
content_type = soup.meta.encode('shift-jis') |
864 |
- index = content_type.find('charset=') |
865 |
+ index = content_type.find(b'charset=') |
866 |
self.assertEqual(content_type[index:index+len('charset=shift-jis')], |
867 |
'charset=shift-jis'.encode()) |
868 |
|
869 |
self.assertEquals(soup.encode('utf-8'), ( |
870 |
- '<html><head>\n' |
871 |
- '<meta content="text/html; charset=utf-8" ' |
872 |
- 'http-equiv="Content-type" />\n' |
873 |
- '<meta http-equiv="Content-language" content="ja" />' |
874 |
- '</head><body><pre>\n' |
875 |
- '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' |
876 |
- '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' |
877 |
- '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' |
878 |
- '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' |
879 |
- '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' |
880 |
- '</pre></body></html>')) |
881 |
+ b'<html><head>\n' |
882 |
+ b'<meta content="text/html; charset=utf-8" ' |
883 |
+ b'http-equiv="Content-type" />\n' |
884 |
+ b'<meta http-equiv="Content-language" content="ja" />' |
885 |
+ b'</head><body><pre>\n' |
886 |
+ b'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' |
887 |
+ b'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' |
888 |
+ b'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' |
889 |
+ b'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' |
890 |
+ b'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' |
891 |
+ b'</pre></body></html>')) |
892 |
self.assertEquals(soup.encode("shift-jis"), |
893 |
shift_jis_html.replace('x-sjis'.encode(), |
894 |
'shift-jis'.encode())) |
895 |
|
896 |
- isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" |
897 |
+ isolatin = b"""<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" |
898 |
soup = BeautifulSoup(isolatin) |
899 |
|
900 |
utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) |
901 |
- utf8 = utf8.replace("\xe9", "\xc3\xa9") |
902 |
+ utf8 = utf8.replace(b"\xe9", b"\xc3\xa9") |
903 |
self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8') |
904 |
|
905 |
def testHebrew(self): |
906 |
- iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' |
907 |
- utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' |
908 |
+ iso_8859_8= b'<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' |
909 |
+ utf8 = b'<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' |
910 |
soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8") |
911 |
self.assertEquals(soup.encode('utf-8'), utf8) |
912 |
|
913 |
def testSmartQuotesNotSoSmartAnymore(self): |
914 |
- self.assertSoupEquals("\x91Foo\x92 <!--blah-->", |
915 |
+ self.assertSoupEquals(b"\x91Foo\x92 <!--blah-->", |
916 |
'‘Foo’ <!--blah-->') |
917 |
|
918 |
def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self): |
919 |
- smartQuotes = "Il a dit, \x8BSacr&eacute; bleu!\x9b" |
919 |
+ smartQuotes = b"Il a dit, \x8BSacr&eacute; bleu!\x9b" |
921 |
soup = BeautifulSoup(smartQuotes) |
922 |
self.assertEquals(soup.decode(), |
923 |
'Il a dit, &lsaquo;Sacr&eacute; bleu!&rsaquo;') |
924 |
soup = BeautifulSoup(smartQuotes, convertEntities="html") |
925 |
self.assertEquals(soup.encode('utf-8'), |
926 |
- 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') |
927 |
+ b'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') |
928 |
|
929 |
def testDontSeeSmartQuotesWhereThereAreNone(self): |
930 |
- utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" |
931 |
+ utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" |
932 |
self.assertSoupEquals(utf_8, encoding='utf-8') |
933 |
|
934 |
|
935 |
--- setup.py |
936 |
+++ setup.py |
937 |
@@ -19,19 +19,19 @@ |
938 |
suite = loader.loadTestsFromModule(BeautifulSoupTests) |
939 |
suite.run(result) |
940 |
if not result.wasSuccessful(): |
941 |
- print "Unit tests have failed!" |
942 |
+ print("Unit tests have failed!") |
943 |
for l in result.errors, result.failures: |
944 |
for case, error in l: |
945 |
- print "-" * 80 |
946 |
+ print("-" * 80) |
947 |
desc = case.shortDescription() |
948 |
if desc: |
949 |
- print desc |
950 |
- print error |
951 |
- print '''If you see an error like: "'ascii' codec can't encode character...", see\nthe Beautiful Soup documentation:\n http://www.crummy.com/software/BeautifulSoup/documentation.html#Why%20can't%20Beautiful%20Soup%20print%20out%20the%20non-ASCII%20characters%20I%20gave%20it?''' |
952 |
- print "This might or might not be a problem depending on what you plan to do with\nBeautiful Soup." |
953 |
+ print(desc) |
954 |
+ print(error) |
955 |
+ print('''If you see an error like: "'ascii' codec can't encode character...", see\nthe Beautiful Soup documentation:\n http://www.crummy.com/software/BeautifulSoup/documentation.html#Why%20can't%20Beautiful%20Soup%20print%20out%20the%20non-ASCII%20characters%20I%20gave%20it?''') |
956 |
+ print("This might or might not be a problem depending on what you plan to do with\nBeautiful Soup.") |
957 |
if sys.argv[1] == 'sdist': |
958 |
- print |
959 |
- print "I'm not going to make a source distribution since the tests don't pass." |
960 |
+ print() |
961 |
+ print("I'm not going to make a source distribution since the tests don't pass.") |
962 |
sys.exit(1) |
963 |
|
964 |
setup(name="BeautifulSoup", |