summaryrefslogtreecommitdiffstats
path: root/module/unescape.py
diff options
context:
space:
mode:
Diffstat (limited to 'module/unescape.py')
-rw-r--r--module/unescape.py38
1 files changed, 38 insertions, 0 deletions
diff --git a/module/unescape.py b/module/unescape.py
new file mode 100644
index 000000000..462423b03
--- /dev/null
+++ b/module/unescape.py
@@ -0,0 +1,38 @@
+import re
+
+def unescape(text):
+    """Replace HTML/XML character references and entities in *text*.
+
+    Numeric references (``&#233;``, ``&#xE9;``) and named entities
+    (``&eacute;``) become the characters they denote; ``&amp;``, ``&gt;``
+    and ``&lt;`` become ``&``, ``>`` and ``<``.  Malformed, out-of-range
+    or unknown references are left untouched.  Returns ``str(...)`` of
+    the substituted text.
+
+    Based on Fredrik Lundh's recipe:
+    http://effbot.org/zone/re-sub.htm#unescape-html
+    """
+    # Imported locally: the module itself only imports ``re``.
+    try:
+        from htmlentitydefs import name2codepoint  # Python 2
+        to_char = unichr
+    except ImportError:
+        from html.entities import name2codepoint  # Python 3
+        to_char = chr
+    basic = {"amp": "&", "gt": ">", "lt": "<"}
+
+    def fixup(m):
+        ref = m.group(0)
+        try:
+            if ref[:3] == "&#x":
+                return to_char(int(ref[3:-1], 16))
+            if ref[:2] == "&#":
+                return to_char(int(ref[2:-1]))
+            name = ref[1:-1]
+            return basic.get(name) or to_char(name2codepoint[name])
+        except (KeyError, ValueError, OverflowError):
+            return ref  # unknown or invalid reference: leave as is
+
+    # NOTE(review): on Python 2, str() raises UnicodeEncodeError when the
+    # result holds non-ASCII characters -- confirm callers expect ASCII.
+    return str(re.sub(r"&#?\w+;", fixup, text))