summary | refs | log | tree | commit | diff | stats
path: root/module/unescape.py
diff options
context:
space:
mode:
Diffstat (limited to 'module/unescape.py')
-rw-r--r-- module/unescape.py 28
1 files changed, 21 insertions, 7 deletions
diff --git a/module/unescape.py b/module/unescape.py
index 462423b03..59f35f36b 100644
--- a/module/unescape.py
+++ b/module/unescape.py
@@ -1,12 +1,25 @@
+from htmlentitydefs import name2codepoint as n2cp
+import re
+
+def substitute_entity(match):
+    """Return the replacement text for one matched HTML entity.
+
+    `match` is a regex match whose group 1 is "#" for a numeric character
+    reference (empty otherwise) and whose group 2 is the reference body.
+
+    Numeric references are decoded as decimal ("&#65;") or, when the body
+    starts with "x"/"X", as hexadecimal ("&#x41;"). Named references are
+    looked up in the name2codepoint table (n2cp). Anything that cannot be
+    decoded is returned unchanged, so malformed input never raises.
+    """
+    ent = match.group(2)
+    if match.group(1) == "#":
+        try:
+            if ent[:1] in ("x", "X"):
+                return unichr(int(ent[1:], 16))
+            return unichr(int(ent))
+        except ValueError:
+            # Non-numeric body (e.g. "&#abc;") or a code point that
+            # unichr() rejects: leave the original text in place.
+            return match.group()
+    cp = n2cp.get(ent)
+    if cp:
+        return unichr(cp)
+    # Unknown entity name: keep the text as written.
+    return match.group()
+
+def unescape(string):
+    """Return `string` with HTML entities replaced.
+
+    Each occurrence of "&", an optional "#", then 1-5 digits or 1-8 word
+    characters, and a closing ";" is handed to substitute_entity, whose
+    return value is used as the replacement text.
+    """
+    pattern = "&(#?)(\d{1,5}|\w{1,8});"
+    return re.sub(pattern, substitute_entity, string)
+
+"""
import re
def unescape(text):
- """Removes HTML or XML character references
- and entities from a text string.
- keep &, >, < in the source code.
- from Fredrik Lundh
- http://effbot.org/zone/re-sub.htm#unescape-html
- """
def fixup(m):
text = m.group(0)
if text[:2] == "&#":
@@ -35,4 +48,5 @@ def unescape(text):
print "keyerror"
pass
return text # leave as is
- return str(re.sub("&#?\w+;", fixup, text))
+ return re.sub("&#?\w+;", fixup, text)
+"""