summaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
authorGravatar spoob <spoob@gmx.de> 2009-06-14 02:32:26 +0200
committerGravatar spoob <spoob@gmx.de> 2009-06-14 02:32:26 +0200
commit6cad171c269c07d5b365ffba7b676f25e98e449f (patch)
tree14980a9a6bf91ddd6030c0e252e4ffe4c435b4d9 /module
parentadded 1kh.de container plugin (diff)
downloadpyload-6cad171c269c07d5b365ffba7b676f25e98e449f.tar.xz
added 1kh.de container plugin
Diffstat (limited to 'module')
-rw-r--r--module/unescape.py38
1 files changed, 38 insertions, 0 deletions
diff --git a/module/unescape.py b/module/unescape.py
new file mode 100644
index 000000000..462423b03
--- /dev/null
+++ b/module/unescape.py
@@ -0,0 +1,38 @@
+import re
+
def unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Numeric references (``&#65;``, ``&#x41;``) and named entities
    (``&eacute;``) are replaced by the character they denote.  The three
    markup-significant entities ``&amp;``, ``&gt;`` and ``&lt;`` are not
    decoded; they are rewritten to ``&amp;amp;``, ``&amp;gt;`` and
    ``&amp;lt;`` respectively.  Anything that cannot be decoded (unknown
    entity name, malformed or out-of-range number) is left untouched.

    Based on Fredrik Lundh's recipe:
    http://effbot.org/zone/re-sub.htm#unescape-html

    NOTE(review): the original docstring said these three entities are
    "kept", yet the code emits a doubly-escaped form.  That behaviour is
    preserved here byte-for-byte; confirm with callers whether it is
    intended or an HTML-escaping artefact in the source.

    :param text: string possibly containing ``&...;`` references.
    :return: the string with references substituted.  A native ``str`` is
        returned whenever possible; on Python 2 a ``unicode`` object is
        returned if the result contains non-ASCII characters.
    """
    # Local imports/aliases so the function is self-contained and runs on
    # both Python 2 (htmlentitydefs, unichr) and Python 3 (html.entities, chr).
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # Entities that must not be turned into raw markup characters.
    protected = {
        "amp": "&amp;amp;",
        "gt": "&amp;gt;",
        "lt": "&amp;lt;",
    }

    def fixup(match):
        token = match.group(0)
        if token.startswith("&#"):
            # Numeric character reference, decimal or hexadecimal.
            digits = token[2:-1]
            try:
                if digits[:1] in ("x", "X"):
                    return to_char(int(digits[1:], 16))
                return to_char(int(digits))
            except (ValueError, OverflowError):
                # Malformed or out-of-range number: leave the text as is.
                return token
        # Named entity.
        name = token[1:-1]
        if name in protected:
            return protected[name]
        try:
            return to_char(name2codepoint[name])
        except KeyError:
            # Unknown entity name: leave the text as is.
            return token

    result = re.sub(r"&#?\w+;", fixup, text)
    try:
        # Keep returning a native str, as the original did.
        return str(result)
    except UnicodeEncodeError:
        # Python 2 only: non-ASCII result cannot be a byte str; returning
        # unicode is better than crashing on every decoded accent.
        return result