# Provenance: module/unescape.py, introduced by commit
# 6cad171c269c07d5b365ffba7b676f25e98e449f ("added 1kh.de container plugin",
# spoob <spoob@gmx.de>, 2009-06-14).
"""Decode HTML/XML character references while keeping markup-significant ones."""

import re

try:  # Python 2
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3
    from html.entities import name2codepoint

try:
    _unichr = unichr  # Python 2: chr() only covers 0-255 there
except NameError:
    _unichr = chr  # Python 3: chr() already returns a text character

# Matches "&name;", "&#123;" and "&#x7b;" style references.
_REFERENCE_RE = re.compile(r"&#?\w+;")

# Named entities that must stay escaped so the result is still safe to embed
# in HTML/XML source.
_KEPT_ENTITIES = frozenset(("amp", "gt", "lt"))


def unescape(text):
    """Replace HTML or XML character references and entities in *text*.

    Numeric references (``&#65;``, ``&#x41;``) and known named entities
    (``&quot;``, ``&eacute;`` ...) are replaced by the character they denote.
    The named entities ``&amp;``, ``&gt;`` and ``&lt;`` are deliberately kept
    in their escaped form.  Anything that cannot be decoded (unknown entity
    name, malformed or out-of-range number) is left exactly as it appeared.

    Based on Fredrik Lundh's recipe:
    http://effbot.org/zone/re-sub.htm#unescape-html

    :param text: string possibly containing character references.
    :return: the string with references decoded.  On Python 2 the result is
        a byte ``str`` when it is pure ASCII and ``unicode`` otherwise.
    """

    def fixup(match):
        ref = match.group(0)
        if ref.startswith("&#"):
            # Numeric character reference, hexadecimal or decimal.
            try:
                if ref[:3].lower() == "&#x":
                    return _unichr(int(ref[3:-1], 16))
                return _unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or not a valid code point: leave as is.
                return ref
        name = ref[1:-1]
        if name in _KEPT_ENTITIES:
            return ref
        try:
            return _unichr(name2codepoint[name])
        except KeyError:
            # Unknown entity name: leave as is.
            return ref

    result = _REFERENCE_RE.sub(fixup, text)
    try:
        return str(result)
    except UnicodeEncodeError:
        # Python 2 only: a non-ASCII result cannot be turned into a byte
        # string implicitly, so hand back the unicode object instead of
        # crashing.
        return result