summary | refs | log | tree | commit | diff | stats
path: root/module/unescape.py
diff options
context:
space:
mode:
author: mkaay <mkaay@mkaay.de> 2010-01-27 20:05:23 +0100
committer: mkaay <mkaay@mkaay.de> 2010-01-27 20:05:23 +0100
commit: 6d1ec2baa795b0d90d5da33b0447120b50f10c17 (patch)
tree: 5fceb5474c83e709fda396f70121a6923b0be106 /module/unescape.py
parent: First Version, works but doesn't reconnect (diff)
parent: fix (diff)
download: pyload-6d1ec2baa795b0d90d5da33b0447120b50f10c17.tar.xz
merge
Diffstat (limited to 'module/unescape.py')
-rw-r--r-- module/unescape.py 28
1 files changed, 21 insertions, 7 deletions
diff --git a/module/unescape.py b/module/unescape.py
index 462423b03..59f35f36b 100644
--- a/module/unescape.py
+++ b/module/unescape.py
@@ -1,12 +1,25 @@
+from htmlentitydefs import name2codepoint as n2cp
+import re
+
+def substitute_entity(match):
+ ent = match.group(2)
+ if match.group(1) == "#":
+ return unichr(int(ent))
+ else:
+ cp = n2cp.get(ent)
+ if cp:
+ return unichr(cp)
+ else:
+ return match.group()
+
+def unescape(string):
+ entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
+ return entity_re.subn(substitute_entity, string)[0]
+
+"""
import re
def unescape(text):
- """Removes HTML or XML character references
- and entities from a text string.
- keep &amp;, &gt;, &lt; in the source code.
- from Fredrik Lundh
- http://effbot.org/zone/re-sub.htm#unescape-html
- """
def fixup(m):
text = m.group(0)
if text[:2] == "&#":
@@ -35,4 +48,5 @@ def unescape(text):
print "keyerror"
pass
return text # leave as is
- return str(re.sub("&#?\w+;", fixup, text))
+ return re.sub("&#?\w+;", fixup, text)
+"""