encoding.py 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. import codecs
  2. import locale
  3. import re
  4. import sys
  5. from typing import List, Tuple
  6. BOMS = [
  7. (codecs.BOM_UTF8, "utf-8"),
  8. (codecs.BOM_UTF16, "utf-16"),
  9. (codecs.BOM_UTF16_BE, "utf-16-be"),
  10. (codecs.BOM_UTF16_LE, "utf-16-le"),
  11. (codecs.BOM_UTF32, "utf-32"),
  12. (codecs.BOM_UTF32_BE, "utf-32-be"),
  13. (codecs.BOM_UTF32_LE, "utf-32-le"),
  14. ] # type: List[Tuple[bytes, str]]
  15. ENCODING_RE = re.compile(br"coding[:=]\s*([-\w.]+)")
  16. def auto_decode(data):
  17. # type: (bytes) -> str
  18. """Check a bytes string for a BOM to correctly detect the encoding
  19. Fallback to locale.getpreferredencoding(False) like open() on Python3"""
  20. for bom, encoding in BOMS:
  21. if data.startswith(bom):
  22. return data[len(bom) :].decode(encoding)
  23. # Lets check the first two lines as in PEP263
  24. for line in data.split(b"\n")[:2]:
  25. if line[0:1] == b"#" and ENCODING_RE.search(line):
  26. result = ENCODING_RE.search(line)
  27. assert result is not None
  28. encoding = result.groups()[0].decode("ascii")
  29. return data.decode(encoding)
  30. return data.decode(
  31. locale.getpreferredencoding(False) or sys.getdefaultencoding(),
  32. )