diff options
author | spoob <spoob@gmx.de> | 2009-06-14 02:32:26 +0200 |
---|---|---|
committer | spoob <spoob@gmx.de> | 2009-06-14 02:32:26 +0200 |
commit | 6cad171c269c07d5b365ffba7b676f25e98e449f (patch) | |
tree | 14980a9a6bf91ddd6030c0e252e4ffe4c435b4d9 /module | |
parent | added 1kh.de container plugin (diff) | |
download | pyload-6cad171c269c07d5b365ffba7b676f25e98e449f.tar.xz |
added 1kh.de container plugin
Diffstat (limited to 'module')
-rw-r--r-- | module/unescape.py | 38 |
1 files changed, 38 insertions, 0 deletions
import re

try:  # Python 2
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3
    from html.entities import name2codepoint

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a text character

# Matches numeric ("&#65;", "&#x41;") and named ("&eacute;") references.
_ENTITY_RE = re.compile(r"&#?\w+;")

# Markup-significant entities that are deliberately left escaped so the
# result can still be embedded in HTML/XML source.
_KEEP_ESCAPED = frozenset(("amp", "gt", "lt"))


def unescape(text):
    """Replace HTML/XML character references and entities in *text*.

    Numeric references (``&#65;``, ``&#x41;``) and known named entities
    (``&eacute;``) are replaced by the character they denote.  The
    entities ``&amp;``, ``&gt;`` and ``&lt;`` are intentionally kept
    escaped.  Anything that cannot be decoded (unknown entity name,
    malformed or out-of-range number) is left untouched.

    Based on Fredrik Lundh's recipe:
    http://effbot.org/zone/re-sub.htm#unescape-html

    Returns the converted string.  On Python 2 a byte ``str`` is returned
    when the result is pure ASCII (as before); if decoding produced
    non-ASCII characters the ``unicode`` result is returned instead of
    raising ``UnicodeEncodeError``.
    """
    def fixup(match):
        ref = match.group(0)
        if ref[:2] == "&#":
            # Numeric character reference, hexadecimal or decimal.
            try:
                if ref[:3] == "&#x":
                    return _unichr(int(ref[3:-1], 16))
                return _unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or outside the valid code point range.
                return ref
        name = ref[1:-1]
        if name in _KEEP_ESCAPED:
            return ref
        try:
            return _unichr(name2codepoint[name])
        except KeyError:
            return ref  # unknown entity: leave as is

    result = _ENTITY_RE.sub(fixup, text)
    try:
        # Preserve the historical plain-``str`` return type when possible.
        return str(result)
    except UnicodeEncodeError:
        # Python 2 only: non-ASCII text cannot be coerced to a byte str.
        return result