Am I going about this the right way? I parse a lot of HTML, but I do not always know what encoding a page is actually in (a surprising number of pages lie about this in their declared charset). The code below shows what I have done so far, but I'm sure there is a better way. Your suggestions would be highly appreciated.
import logging
import codecs

from utils.error import Error


class UnicodingError(Error):
    """Raised when none of the candidate encodings can decode the input."""


# These encodings should be in most likely order to save time.
# NOTE(review): the order decides the *result*, not only the speed -- the
# first codec that decodes without raising wins. Single-byte code pages that
# define a character for every byte value never fail (cp037 looks like the
# first such entry here -- TODO confirm), so entries listed after one of them
# are effectively unreachable. Consider moving the permissive single-byte
# codecs to the end, or using a statistical detector, if accuracy matters.
encodings = [
    "ascii", "utf_8", "big5", "big5hkscs",
    "cp037", "cp424", "cp437", "cp500", "cp737", "cp775", "cp850", "cp852",
    "cp855", "cp856", "cp857", "cp860", "cp861", "cp862", "cp863", "cp864",
    "cp865", "cp866", "cp869", "cp874", "cp875", "cp932", "cp949", "cp950",
    "cp1006", "cp1026", "cp1140", "cp1250", "cp1251", "cp1252", "cp1253",
    "cp1254", "cp1255", "cp1256", "cp1257", "cp1258",
    "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr",
    "gb2312", "gbk", "gb18030", "hz",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "latin_1", "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9", "iso8859_10",
    "iso8859_13", "iso8859_14", "iso8859_15",
    "johab", "koi8_r", "koi8_u",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish", "ptcp154",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "utf_32", "utf_32_be", "utf_32_le",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8_sig",
]


def unicode(string):
    """Decode a byte string to text by trying each candidate encoding in turn.

    The codecs in the module-level ``encodings`` list are tried in order and
    the result of the first one that decodes without error is returned.

    Note: this function's name shadows the Python 2 builtin ``unicode`` inside
    this module, so the body deliberately decodes via ``string.decode(...)``
    rather than calling ``unicode(string, enc)`` (which would recurse into
    this function with the wrong number of arguments).

    Args:
        string: The raw bytes to decode. A value that is already text is
            returned unchanged.

    Returns:
        The decoded text.

    Raises:
        UnicodingError: If no encoding in ``encodings`` can decode the input.
    """
    # type(u"") is the text type on both Python 2 (unicode) and Python 3 (str);
    # already-decoded input has nothing left to guess.
    text_type = type(u"")
    if isinstance(string, text_type):
        return string

    for enc in encodings:
        logging.debug("unicoder is trying %s encoding", enc)
        try:
            decoded = string.decode(enc)
        except (UnicodeError, LookupError):
            # UnicodeError: the bytes are not valid in this encoding.
            # LookupError: this interpreter does not ship the codec.
            # Either way, move on to the next candidate.
            continue
        logging.info("unicoder is using %s encoding", enc)
        return decoded

    # Every candidate failed (or the list was empty).
    raise UnicodingError("still don't recognise encoding after trying do guess.")