diff options
Diffstat (limited to 'module/lib/BeautifulSoup.py')
-rw-r--r-- | module/lib/BeautifulSoup.py | 63 |
1 files changed, 34 insertions, 29 deletions
diff --git a/module/lib/BeautifulSoup.py b/module/lib/BeautifulSoup.py index 55567f588..7278215ca 100644 --- a/module/lib/BeautifulSoup.py +++ b/module/lib/BeautifulSoup.py @@ -79,8 +79,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. from __future__ import generators __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.0.8.1" -__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson" +__version__ = "3.2.1" +__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __license__ = "New-style BSD" from sgmllib import SGMLParser, SGMLParseError @@ -114,6 +114,21 @@ class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" + def _invert(h): + "Cheap function to invert a hash." + i = {} + for k,v in h.items(): + i[v] = k + return i + + XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", + "quot" : '"', + "amp" : "&", + "lt" : "<", + "gt" : ">" } + + XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) + def setup(self, parent=None, previous=None): """Sets up the initial relations between this element and other elements.""" @@ -421,6 +436,16 @@ class PageElement(object): s = unicode(s) return s + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + + ")") + + def _sub_entity(self, x): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + + class NavigableString(unicode, PageElement): def __new__(cls, value): @@ -451,10 +476,12 @@ class NavigableString(unicode, PageElement): return str(self).decode(DEFAULT_OUTPUT_ENCODING) def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): + # Substitute outgoing XML entities. + data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self) if encoding: - return self.encode(encoding) + return data.encode(encoding) else: - return self + return data class CData(NavigableString): @@ -480,21 +507,6 @@ class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" - def _invert(h): - "Cheap function to invert a hash." - i = {} - for k,v in h.items(): - i[v] = k - return i - - XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", - "quot" : '"', - "amp" : "&", - "lt" : "<", - "gt" : ">" } - - XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) - def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML @@ -531,6 +543,8 @@ class Tag(PageElement): self.name = name if attrs is None: attrs = [] + elif isinstance(attrs, dict): + attrs = attrs.items() self.attrs = attrs self.contents = [] self.setup(parent, previous) @@ -679,15 +693,6 @@ class Tag(PageElement): def __unicode__(self): return self.__str__(None) - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - + ")") - - def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): """Returns a string or Unicode representation of this tag and @@ -1295,7 +1300,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): """ nestingResetTriggers = self.NESTABLE_TAGS.get(name) - isNestable = nestingResetTriggers is not None + isNestable = nestingResetTriggers != None isResetNesting = self.RESET_NESTING_TAGS.has_key(name) popTo = None inclusive = True |