diff options
author | mkaay <mkaay@mkaay.de> | 2010-01-27 20:05:23 +0100 |
---|---|---|
committer | mkaay <mkaay@mkaay.de> | 2010-01-27 20:05:23 +0100 |
commit | 6d1ec2baa795b0d90d5da33b0447120b50f10c17 (patch) | |
tree | 5fceb5474c83e709fda396f70121a6923b0be106 /module/unescape.py | |
parent | First Version, works but doesn't reconnect (diff) | |
parent | fix (diff) | |
download | pyload-6d1ec2baa795b0d90d5da33b0447120b50f10c17.tar.xz |
merge
Diffstat (limited to 'module/unescape.py')
-rw-r--r-- | module/unescape.py | 28 |
1 files changed, 21 insertions, 7 deletions
```diff
diff --git a/module/unescape.py b/module/unescape.py
index 462423b03..59f35f36b 100644
--- a/module/unescape.py
+++ b/module/unescape.py
@@ -1,12 +1,25 @@
+from htmlentitydefs import name2codepoint as n2cp
+import re
+
+def substitute_entity(match):
+    ent = match.group(2)
+    if match.group(1) == "#":
+        return unichr(int(ent))
+    else:
+        cp = n2cp.get(ent)
+        if cp:
+            return unichr(cp)
+        else:
+            return match.group()
+
+def unescape(string):
+    entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
+    return entity_re.subn(substitute_entity, string)[0]
+
+"""
 import re
 
 def unescape(text):
-    """Removes HTML or XML character references
-    and entities from a text string.
-    keep &, >, < in the source code.
-    from Fredrik Lundh
-    http://effbot.org/zone/re-sub.htm#unescape-html
-    """
     def fixup(m):
         text = m.group(0)
         if text[:2] == "&#":
@@ -35,4 +48,5 @@ def unescape(text):
                 print "keyerror"
                 pass
         return text # leave as is
-    return str(re.sub("&#?\w+;", fixup, text))
+    return re.sub("&#?\w+;", fixup, text)
+"""
```