diff options
Diffstat (limited to 'module/unescape.py')
-rw-r--r-- | module/unescape.py | 54 |
1 files changed, 54 insertions, 0 deletions
"""Decode URL percent-escapes and HTML character references in text.

Contents of module/unescape.py.
"""

try:  # Python 2 module names, tried first
    from htmlentitydefs import name2codepoint as n2cp
    from urllib import unquote
except ImportError:  # Python 3 relocated both helpers
    from html.entities import name2codepoint as n2cp
    from urllib.parse import unquote
import re

try:
    unichr
except NameError:  # Python 3 has no unichr(); chr() covers all code points
    unichr = chr

# Group 1 is "#" for numeric references and "" for named ones; group 2 is
# the reference body.  Compiled once at import time instead of per call.
_ENTITY_RE = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")
# Strict shapes for the body of a numeric reference.  Validating here keeps
# int() from accepting forms such as "1_0" or "0x1f".
_DECIMAL_REF = re.compile(r"[0-9]+\Z")
_HEX_REF = re.compile(r"[xX]([0-9a-fA-F]+)\Z")


def substitute_entity(match):
    """Return the replacement text for one matched character reference.

    ``match`` must expose group 1 (``"#"`` or ``""``) and group 2 (the
    reference body), as produced by the pattern used in ``unescape``.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) references are turned
    into the corresponding character, and named references are looked up in
    the standard HTML entity table.  Anything that cannot be decoded
    (unknown name, malformed number, code point out of range) is returned
    unchanged rather than raising.
    """
    ent = match.group(2)
    if match.group(1) == "#":
        hex_ref = _HEX_REF.match(ent)
        if hex_ref:
            codepoint = int(hex_ref.group(1), 16)
        elif _DECIMAL_REF.match(ent):
            codepoint = int(ent)
        else:
            # e.g. "&#abc;": not a number, so leave the text untouched.
            return match.group()
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point outside the range the interpreter can represent.
            return match.group()

    cp = n2cp.get(ent)
    if not cp:
        return match.group()
    return unichr(cp)


def unescape(string):
    """Return ``string`` with percent-escapes and HTML references decoded.

    Percent-escapes are decoded first with ``unquote``; character
    references are then replaced in the result via ``substitute_entity``.
    """
    return _ENTITY_RE.sub(substitute_entity, unquote(string))