"""Helpers for decoding URL percent-escapes and HTML entities in text."""

import re

try:  # Python 2 module layout
    from htmlentitydefs import name2codepoint as n2cp
    from urllib import unquote
except ImportError:  # Python 3 module layout
    from html.entities import name2codepoint as n2cp
    from urllib.parse import unquote

try:
    _unichr = unichr  # Python 2: chr() only covers 0-255 there
except NameError:
    _unichr = chr  # Python 3: chr() already returns a Unicode character

# Matches "&name;", "&#123;" and "&#x7B;"-shaped tokens.
# Group 1 is "#" for numeric references (empty otherwise); group 2 is the
# entity body. Compiled once at import time instead of on every call.
_ENTITY_RE = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")

# Validates the body of a numeric reference: either "x"/"X" followed by hex
# digits (captured in group 1) or plain ASCII decimal digits (group 2).
_NUMERIC_BODY_RE = re.compile(r"(?:[xX]([0-9a-fA-F]+)|([0-9]+))\Z")


def substitute_entity(match):
    """Return the replacement text for one entity match.

    Intended as the ``repl`` callback for the entity pattern used by
    ``unescape``.

    Args:
        match: A regex match whose group 1 is ``"#"`` for a numeric
            character reference (empty for a named entity) and whose
            group 2 is the entity body.

    Returns:
        The decoded character for a valid decimal reference (``&#65;``),
        hexadecimal reference (``&#x41;``) or known named entity
        (``&amp;``). Anything that cannot be decoded -- an unknown name, a
        malformed number, or a code point outside the supported range --
        is returned unchanged.
    """
    ent = match.group(2)

    if match.group(1) == "#":
        numeric = _NUMERIC_BODY_RE.match(ent)
        if numeric is None:
            # e.g. "&#abc;": looks like a numeric reference but is not one.
            return match.group()
        hex_digits, dec_digits = numeric.groups()
        if hex_digits is not None:
            codepoint = int(hex_digits, 16)
        else:
            codepoint = int(dec_digits)
        try:
            return _unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point not representable; leave the text as it was.
            return match.group()

    codepoint = n2cp.get(ent)
    if codepoint is None:
        return match.group()
    return _unichr(codepoint)


def unescape(string):
    """Decode URL percent-escapes, then HTML entities, in ``string``.

    The input is first passed through ``unquote`` and the result is then
    scanned for HTML entities, so an entity that was itself
    percent-encoded (``%26amp%3B``) is decoded as well.

    Args:
        string: Text that may contain percent-escapes and/or HTML
            entities.

    Returns:
        The text with percent-escapes and recognised entities replaced by
        the characters they stand for. Unrecognised or malformed entities
        are left untouched.
    """
    return _ENTITY_RE.sub(substitute_entity, unquote(string))