1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
# Table of known byte-order marks (BOM): (signature, codec name, label).
# Every signature is padded to four entries with None so all rows share one
# shape. Longer signatures are listed first: the UTF-32LE mark starts with the
# same two bytes as the UTF-16LE mark, so it has to be tried earlier.
BOM = (
    ((0x00, 0x00, 0xFE, 0xFF), "UTF_32BE", "UTF-32, big-endian"),
    ((0xFF, 0xFE, 0x00, 0x00), "UTF_32LE", "UTF-32, little-endian"),
    ((0x2B, 0x2F, 0x76, 0x38), "UTF_7", "UTF-7"),
    ((0x2B, 0x2F, 0x76, 0x39), "UTF_7", "UTF-7"),
    ((0x2B, 0x2F, 0x76, 0x2B), "UTF_7", "UTF-7"),
    ((0x2B, 0x2F, 0x76, 0x2F), "UTF_7", "UTF-7, Base64 encoded"),
    ((0xDD, 0x73, 0x66, 0x73), "UTF-EBCDIC", "UTF-EBCDIC"),
    ((0x84, 0x31, 0x95, 0x33), "GB-18030", "GB-18030"),
    # EF BB BF is the UTF-8 mark and F7 64 4C the UTF-1 mark (the two labels
    # used to be attached to the wrong signatures).
    ((0xEF, 0xBB, 0xBF, None), "UTF_8", "UTF-8"),
    ((0xF7, 0x64, 0x4C, None), "UTF_1", "UTF-1"),
    ((0x0E, 0xFE, 0xFF, None), "SCSU", "SCSU"),
    ((0xFB, 0xEE, 0x28, None), "BOCU-1", "BOCU-1"),
    ((0xFE, 0xFF, None, None), "UTF_16BE", "UTF-16, big-endian"),
    ((0xFF, 0xFE, None, None), "UTF_16LE", "UTF-16, little-endian"),
)

with open(SOURCE, 'rb') as fd:
    # bytearray yields integers on both Python 2 and Python 3, whereas
    # map(ord, ...) fails on Python 3 where iterating bytes already gives ints.
    bom_head = list(bytearray(fd.read(4)))

# Pad with None so a file shorter than four bytes still unpacks; the padding
# has the same form as the table's short signatures, so a file holding nothing
# but a 2- or 3-byte mark is still recognised.
(b1, b2, b3, b4) = tuple(bom_head + [None] * (4 - len(bom_head)))

# The start of the file seen as a 4-, 3- and 2-byte signature, in the padded
# form used by the table.
bom_candidates = (
    (b1, b2, b3, b4),
    (b1, b2, b3, None),
    (b1, b2, None, None),
)

# The first matching row wins. When nothing matches, encodage and toprint are
# not assigned by this block.
for k in BOM:
    if k[0] in bom_candidates:
        encodage, toprint = k[1], k[2]
        break