图片解析应用
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

36 lines
1.1 KiB

  1. import codecs
  2. import locale
  3. import re
  4. import sys
  5. from typing import List, Tuple
  6. BOMS: List[Tuple[bytes, str]] = [
  7. (codecs.BOM_UTF8, "utf-8"),
  8. (codecs.BOM_UTF16, "utf-16"),
  9. (codecs.BOM_UTF16_BE, "utf-16-be"),
  10. (codecs.BOM_UTF16_LE, "utf-16-le"),
  11. (codecs.BOM_UTF32, "utf-32"),
  12. (codecs.BOM_UTF32_BE, "utf-32-be"),
  13. (codecs.BOM_UTF32_LE, "utf-32-le"),
  14. ]
  15. ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
  16. def auto_decode(data: bytes) -> str:
  17. """Check a bytes string for a BOM to correctly detect the encoding
  18. Fallback to locale.getpreferredencoding(False) like open() on Python3"""
  19. for bom, encoding in BOMS:
  20. if data.startswith(bom):
  21. return data[len(bom) :].decode(encoding)
  22. # Lets check the first two lines as in PEP263
  23. for line in data.split(b"\n")[:2]:
  24. if line[0:1] == b"#" and ENCODING_RE.search(line):
  25. result = ENCODING_RE.search(line)
  26. assert result is not None
  27. encoding = result.groups()[0].decode("ascii")
  28. return data.decode(encoding)
  29. return data.decode(
  30. locale.getpreferredencoding(False) or sys.getdefaultencoding(),
  31. )