summary refs log tree commit diff stats
path: root/module/unescape.py
diff options
context:
space:
mode:
Diffstat (limited to 'module/unescape.py')
-rw-r--r-- module/unescape.py 54
1 files changed, 54 insertions, 0 deletions
diff --git a/module/unescape.py b/module/unescape.py
new file mode 100644
index 000000000..41a23be5b
--- /dev/null
+++ b/module/unescape.py
@@ -0,0 +1,54 @@
+from htmlentitydefs import name2codepoint as n2cp
+from urllib import unquote
+import re
+
+def substitute_entity(match):
+    """Return the replacement text for one HTML entity match.
+
+    Intended as the callback for ``re.sub``/``re.subn``. ``match`` must
+    expose two groups: group 1 is ``"#"`` for a numeric character
+    reference (empty otherwise) and group 2 is the reference body.
+
+    Numeric references may be decimal (``&#65;``) or hexadecimal
+    (``&#x41;``). Named references are looked up in ``n2cp``. A reference
+    that cannot be decoded is returned unchanged, so malformed input
+    never raises.
+    """
+    ent = match.group(2)
+    if match.group(1) == "#":
+        try:
+            if ent[:1] in ("x", "X"):
+                return unichr(int(ent[1:], 16))
+            return unichr(int(ent))
+        except (ValueError, OverflowError):
+            # Body is not a number, or the code point is outside what
+            # unichr() accepts: keep the original text instead of crashing.
+            return match.group()
+    cp = n2cp.get(ent)
+    if not cp:
+        # Unknown entity name: leave the text as it was.
+        return match.group()
+    return unichr(cp)
+
+def unescape(string):
+    """Return ``string`` with percent-escapes and HTML entities decoded.
+
+    The input is first passed through ``unquote``. Every ``&name;`` or
+    ``&#digits;`` sequence in the result is then handed to
+    ``substitute_entity`` for replacement.
+    """
+    # Raw string so the regex escapes are not subject to Python's own
+    # string-escape processing.
+    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")
+    # sub() returns the new string directly; the replacement count that
+    # subn() also reports was never used.
+    return entity_re.sub(substitute_entity, unquote(string))
+
+
+# NOTE: the triple-quoted string below is a bare expression statement: it
+# is not assigned or referenced, so it has no effect at runtime. It holds a
+# disabled alternative unescape(text) built on re.sub with a nested fixup()
+# callback, including debug print statements. If it is ever re-enabled,
+# note that it refers to htmlentitydefs.name2codepoint by module name,
+# whereas the imports visible in this file only bind name2codepoint as
+# n2cp.
+"""
+import re
+
+def unescape(text):
+ def fixup(m):
+ text = m.group(0)
+ if text[:2] == "&#":
+ # character reference
+ try:
+ if text[:3] == "&#x":
+ return unichr(int(text[3:-1], 16))
+ else:
+ return unichr(int(text[2:-1]))
+ except ValueError:
+ print "erreur de valeur"
+ pass
+ else:
+ # named entity
+ try:
+ if text[1:-1] == "amp":
+ text = "&"
+ elif text[1:-1] == "gt":
+ text = ">"
+ elif text[1:-1] == "lt":
+ text = "<"
+ else:
+ print text[1:-1]
+ text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+ except KeyError:
+ print "keyerror"
+ pass
+ return text # leave as is
+ return re.sub("&#?\w+;", fixup, text)
+"""