m2m模型翻译
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1995 lines
40 KiB

6 months ago
  1. # -*- coding: utf-8 -*-
  2. from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
  3. from encodings.aliases import aliases
  4. from re import IGNORECASE, compile as re_compile
  5. from typing import Dict, List, Set, Union
  6. # Contain for each eligible encoding a list of/item bytes SIG/BOM
  7. ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
  8. "utf_8": BOM_UTF8,
  9. "utf_7": [
  10. b"\x2b\x2f\x76\x38",
  11. b"\x2b\x2f\x76\x39",
  12. b"\x2b\x2f\x76\x2b",
  13. b"\x2b\x2f\x76\x2f",
  14. b"\x2b\x2f\x76\x38\x2d",
  15. ],
  16. "gb18030": b"\x84\x31\x95\x33",
  17. "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
  18. "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
  19. }
  20. TOO_SMALL_SEQUENCE: int = 32
  21. TOO_BIG_SEQUENCE: int = int(10e6)
  22. UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
  23. # Up-to-date Unicode ucd/15.0.0
  24. UNICODE_RANGES_COMBINED: Dict[str, range] = {
  25. "Control character": range(32),
  26. "Basic Latin": range(32, 128),
  27. "Latin-1 Supplement": range(128, 256),
  28. "Latin Extended-A": range(256, 384),
  29. "Latin Extended-B": range(384, 592),
  30. "IPA Extensions": range(592, 688),
  31. "Spacing Modifier Letters": range(688, 768),
  32. "Combining Diacritical Marks": range(768, 880),
  33. "Greek and Coptic": range(880, 1024),
  34. "Cyrillic": range(1024, 1280),
  35. "Cyrillic Supplement": range(1280, 1328),
  36. "Armenian": range(1328, 1424),
  37. "Hebrew": range(1424, 1536),
  38. "Arabic": range(1536, 1792),
  39. "Syriac": range(1792, 1872),
  40. "Arabic Supplement": range(1872, 1920),
  41. "Thaana": range(1920, 1984),
  42. "NKo": range(1984, 2048),
  43. "Samaritan": range(2048, 2112),
  44. "Mandaic": range(2112, 2144),
  45. "Syriac Supplement": range(2144, 2160),
  46. "Arabic Extended-B": range(2160, 2208),
  47. "Arabic Extended-A": range(2208, 2304),
  48. "Devanagari": range(2304, 2432),
  49. "Bengali": range(2432, 2560),
  50. "Gurmukhi": range(2560, 2688),
  51. "Gujarati": range(2688, 2816),
  52. "Oriya": range(2816, 2944),
  53. "Tamil": range(2944, 3072),
  54. "Telugu": range(3072, 3200),
  55. "Kannada": range(3200, 3328),
  56. "Malayalam": range(3328, 3456),
  57. "Sinhala": range(3456, 3584),
  58. "Thai": range(3584, 3712),
  59. "Lao": range(3712, 3840),
  60. "Tibetan": range(3840, 4096),
  61. "Myanmar": range(4096, 4256),
  62. "Georgian": range(4256, 4352),
  63. "Hangul Jamo": range(4352, 4608),
  64. "Ethiopic": range(4608, 4992),
  65. "Ethiopic Supplement": range(4992, 5024),
  66. "Cherokee": range(5024, 5120),
  67. "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
  68. "Ogham": range(5760, 5792),
  69. "Runic": range(5792, 5888),
  70. "Tagalog": range(5888, 5920),
  71. "Hanunoo": range(5920, 5952),
  72. "Buhid": range(5952, 5984),
  73. "Tagbanwa": range(5984, 6016),
  74. "Khmer": range(6016, 6144),
  75. "Mongolian": range(6144, 6320),
  76. "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
  77. "Limbu": range(6400, 6480),
  78. "Tai Le": range(6480, 6528),
  79. "New Tai Lue": range(6528, 6624),
  80. "Khmer Symbols": range(6624, 6656),
  81. "Buginese": range(6656, 6688),
  82. "Tai Tham": range(6688, 6832),
  83. "Combining Diacritical Marks Extended": range(6832, 6912),
  84. "Balinese": range(6912, 7040),
  85. "Sundanese": range(7040, 7104),
  86. "Batak": range(7104, 7168),
  87. "Lepcha": range(7168, 7248),
  88. "Ol Chiki": range(7248, 7296),
  89. "Cyrillic Extended-C": range(7296, 7312),
  90. "Georgian Extended": range(7312, 7360),
  91. "Sundanese Supplement": range(7360, 7376),
  92. "Vedic Extensions": range(7376, 7424),
  93. "Phonetic Extensions": range(7424, 7552),
  94. "Phonetic Extensions Supplement": range(7552, 7616),
  95. "Combining Diacritical Marks Supplement": range(7616, 7680),
  96. "Latin Extended Additional": range(7680, 7936),
  97. "Greek Extended": range(7936, 8192),
  98. "General Punctuation": range(8192, 8304),
  99. "Superscripts and Subscripts": range(8304, 8352),
  100. "Currency Symbols": range(8352, 8400),
  101. "Combining Diacritical Marks for Symbols": range(8400, 8448),
  102. "Letterlike Symbols": range(8448, 8528),
  103. "Number Forms": range(8528, 8592),
  104. "Arrows": range(8592, 8704),
  105. "Mathematical Operators": range(8704, 8960),
  106. "Miscellaneous Technical": range(8960, 9216),
  107. "Control Pictures": range(9216, 9280),
  108. "Optical Character Recognition": range(9280, 9312),
  109. "Enclosed Alphanumerics": range(9312, 9472),
  110. "Box Drawing": range(9472, 9600),
  111. "Block Elements": range(9600, 9632),
  112. "Geometric Shapes": range(9632, 9728),
  113. "Miscellaneous Symbols": range(9728, 9984),
  114. "Dingbats": range(9984, 10176),
  115. "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
  116. "Supplemental Arrows-A": range(10224, 10240),
  117. "Braille Patterns": range(10240, 10496),
  118. "Supplemental Arrows-B": range(10496, 10624),
  119. "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
  120. "Supplemental Mathematical Operators": range(10752, 11008),
  121. "Miscellaneous Symbols and Arrows": range(11008, 11264),
  122. "Glagolitic": range(11264, 11360),
  123. "Latin Extended-C": range(11360, 11392),
  124. "Coptic": range(11392, 11520),
  125. "Georgian Supplement": range(11520, 11568),
  126. "Tifinagh": range(11568, 11648),
  127. "Ethiopic Extended": range(11648, 11744),
  128. "Cyrillic Extended-A": range(11744, 11776),
  129. "Supplemental Punctuation": range(11776, 11904),
  130. "CJK Radicals Supplement": range(11904, 12032),
  131. "Kangxi Radicals": range(12032, 12256),
  132. "Ideographic Description Characters": range(12272, 12288),
  133. "CJK Symbols and Punctuation": range(12288, 12352),
  134. "Hiragana": range(12352, 12448),
  135. "Katakana": range(12448, 12544),
  136. "Bopomofo": range(12544, 12592),
  137. "Hangul Compatibility Jamo": range(12592, 12688),
  138. "Kanbun": range(12688, 12704),
  139. "Bopomofo Extended": range(12704, 12736),
  140. "CJK Strokes": range(12736, 12784),
  141. "Katakana Phonetic Extensions": range(12784, 12800),
  142. "Enclosed CJK Letters and Months": range(12800, 13056),
  143. "CJK Compatibility": range(13056, 13312),
  144. "CJK Unified Ideographs Extension A": range(13312, 19904),
  145. "Yijing Hexagram Symbols": range(19904, 19968),
  146. "CJK Unified Ideographs": range(19968, 40960),
  147. "Yi Syllables": range(40960, 42128),
  148. "Yi Radicals": range(42128, 42192),
  149. "Lisu": range(42192, 42240),
  150. "Vai": range(42240, 42560),
  151. "Cyrillic Extended-B": range(42560, 42656),
  152. "Bamum": range(42656, 42752),
  153. "Modifier Tone Letters": range(42752, 42784),
  154. "Latin Extended-D": range(42784, 43008),
  155. "Syloti Nagri": range(43008, 43056),
  156. "Common Indic Number Forms": range(43056, 43072),
  157. "Phags-pa": range(43072, 43136),
  158. "Saurashtra": range(43136, 43232),
  159. "Devanagari Extended": range(43232, 43264),
  160. "Kayah Li": range(43264, 43312),
  161. "Rejang": range(43312, 43360),
  162. "Hangul Jamo Extended-A": range(43360, 43392),
  163. "Javanese": range(43392, 43488),
  164. "Myanmar Extended-B": range(43488, 43520),
  165. "Cham": range(43520, 43616),
  166. "Myanmar Extended-A": range(43616, 43648),
  167. "Tai Viet": range(43648, 43744),
  168. "Meetei Mayek Extensions": range(43744, 43776),
  169. "Ethiopic Extended-A": range(43776, 43824),
  170. "Latin Extended-E": range(43824, 43888),
  171. "Cherokee Supplement": range(43888, 43968),
  172. "Meetei Mayek": range(43968, 44032),
  173. "Hangul Syllables": range(44032, 55216),
  174. "Hangul Jamo Extended-B": range(55216, 55296),
  175. "High Surrogates": range(55296, 56192),
  176. "High Private Use Surrogates": range(56192, 56320),
  177. "Low Surrogates": range(56320, 57344),
  178. "Private Use Area": range(57344, 63744),
  179. "CJK Compatibility Ideographs": range(63744, 64256),
  180. "Alphabetic Presentation Forms": range(64256, 64336),
  181. "Arabic Presentation Forms-A": range(64336, 65024),
  182. "Variation Selectors": range(65024, 65040),
  183. "Vertical Forms": range(65040, 65056),
  184. "Combining Half Marks": range(65056, 65072),
  185. "CJK Compatibility Forms": range(65072, 65104),
  186. "Small Form Variants": range(65104, 65136),
  187. "Arabic Presentation Forms-B": range(65136, 65280),
  188. "Halfwidth and Fullwidth Forms": range(65280, 65520),
  189. "Specials": range(65520, 65536),
  190. "Linear B Syllabary": range(65536, 65664),
  191. "Linear B Ideograms": range(65664, 65792),
  192. "Aegean Numbers": range(65792, 65856),
  193. "Ancient Greek Numbers": range(65856, 65936),
  194. "Ancient Symbols": range(65936, 66000),
  195. "Phaistos Disc": range(66000, 66048),
  196. "Lycian": range(66176, 66208),
  197. "Carian": range(66208, 66272),
  198. "Coptic Epact Numbers": range(66272, 66304),
  199. "Old Italic": range(66304, 66352),
  200. "Gothic": range(66352, 66384),
  201. "Old Permic": range(66384, 66432),
  202. "Ugaritic": range(66432, 66464),
  203. "Old Persian": range(66464, 66528),
  204. "Deseret": range(66560, 66640),
  205. "Shavian": range(66640, 66688),
  206. "Osmanya": range(66688, 66736),
  207. "Osage": range(66736, 66816),
  208. "Elbasan": range(66816, 66864),
  209. "Caucasian Albanian": range(66864, 66928),
  210. "Vithkuqi": range(66928, 67008),
  211. "Linear A": range(67072, 67456),
  212. "Latin Extended-F": range(67456, 67520),
  213. "Cypriot Syllabary": range(67584, 67648),
  214. "Imperial Aramaic": range(67648, 67680),
  215. "Palmyrene": range(67680, 67712),
  216. "Nabataean": range(67712, 67760),
  217. "Hatran": range(67808, 67840),
  218. "Phoenician": range(67840, 67872),
  219. "Lydian": range(67872, 67904),
  220. "Meroitic Hieroglyphs": range(67968, 68000),
  221. "Meroitic Cursive": range(68000, 68096),
  222. "Kharoshthi": range(68096, 68192),
  223. "Old South Arabian": range(68192, 68224),
  224. "Old North Arabian": range(68224, 68256),
  225. "Manichaean": range(68288, 68352),
  226. "Avestan": range(68352, 68416),
  227. "Inscriptional Parthian": range(68416, 68448),
  228. "Inscriptional Pahlavi": range(68448, 68480),
  229. "Psalter Pahlavi": range(68480, 68528),
  230. "Old Turkic": range(68608, 68688),
  231. "Old Hungarian": range(68736, 68864),
  232. "Hanifi Rohingya": range(68864, 68928),
  233. "Rumi Numeral Symbols": range(69216, 69248),
  234. "Yezidi": range(69248, 69312),
  235. "Arabic Extended-C": range(69312, 69376),
  236. "Old Sogdian": range(69376, 69424),
  237. "Sogdian": range(69424, 69488),
  238. "Old Uyghur": range(69488, 69552),
  239. "Chorasmian": range(69552, 69600),
  240. "Elymaic": range(69600, 69632),
  241. "Brahmi": range(69632, 69760),
  242. "Kaithi": range(69760, 69840),
  243. "Sora Sompeng": range(69840, 69888),
  244. "Chakma": range(69888, 69968),
  245. "Mahajani": range(69968, 70016),
  246. "Sharada": range(70016, 70112),
  247. "Sinhala Archaic Numbers": range(70112, 70144),
  248. "Khojki": range(70144, 70224),
  249. "Multani": range(70272, 70320),
  250. "Khudawadi": range(70320, 70400),
  251. "Grantha": range(70400, 70528),
  252. "Newa": range(70656, 70784),
  253. "Tirhuta": range(70784, 70880),
  254. "Siddham": range(71040, 71168),
  255. "Modi": range(71168, 71264),
  256. "Mongolian Supplement": range(71264, 71296),
  257. "Takri": range(71296, 71376),
  258. "Ahom": range(71424, 71504),
  259. "Dogra": range(71680, 71760),
  260. "Warang Citi": range(71840, 71936),
  261. "Dives Akuru": range(71936, 72032),
  262. "Nandinagari": range(72096, 72192),
  263. "Zanabazar Square": range(72192, 72272),
  264. "Soyombo": range(72272, 72368),
  265. "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
  266. "Pau Cin Hau": range(72384, 72448),
  267. "Devanagari Extended-A": range(72448, 72544),
  268. "Bhaiksuki": range(72704, 72816),
  269. "Marchen": range(72816, 72896),
  270. "Masaram Gondi": range(72960, 73056),
  271. "Gunjala Gondi": range(73056, 73136),
  272. "Makasar": range(73440, 73472),
  273. "Kawi": range(73472, 73568),
  274. "Lisu Supplement": range(73648, 73664),
  275. "Tamil Supplement": range(73664, 73728),
  276. "Cuneiform": range(73728, 74752),
  277. "Cuneiform Numbers and Punctuation": range(74752, 74880),
  278. "Early Dynastic Cuneiform": range(74880, 75088),
  279. "Cypro-Minoan": range(77712, 77824),
  280. "Egyptian Hieroglyphs": range(77824, 78896),
  281. "Egyptian Hieroglyph Format Controls": range(78896, 78944),
  282. "Anatolian Hieroglyphs": range(82944, 83584),
  283. "Bamum Supplement": range(92160, 92736),
  284. "Mro": range(92736, 92784),
  285. "Tangsa": range(92784, 92880),
  286. "Bassa Vah": range(92880, 92928),
  287. "Pahawh Hmong": range(92928, 93072),
  288. "Medefaidrin": range(93760, 93856),
  289. "Miao": range(93952, 94112),
  290. "Ideographic Symbols and Punctuation": range(94176, 94208),
  291. "Tangut": range(94208, 100352),
  292. "Tangut Components": range(100352, 101120),
  293. "Khitan Small Script": range(101120, 101632),
  294. "Tangut Supplement": range(101632, 101760),
  295. "Kana Extended-B": range(110576, 110592),
  296. "Kana Supplement": range(110592, 110848),
  297. "Kana Extended-A": range(110848, 110896),
  298. "Small Kana Extension": range(110896, 110960),
  299. "Nushu": range(110960, 111360),
  300. "Duployan": range(113664, 113824),
  301. "Shorthand Format Controls": range(113824, 113840),
  302. "Znamenny Musical Notation": range(118528, 118736),
  303. "Byzantine Musical Symbols": range(118784, 119040),
  304. "Musical Symbols": range(119040, 119296),
  305. "Ancient Greek Musical Notation": range(119296, 119376),
  306. "Kaktovik Numerals": range(119488, 119520),
  307. "Mayan Numerals": range(119520, 119552),
  308. "Tai Xuan Jing Symbols": range(119552, 119648),
  309. "Counting Rod Numerals": range(119648, 119680),
  310. "Mathematical Alphanumeric Symbols": range(119808, 120832),
  311. "Sutton SignWriting": range(120832, 121520),
  312. "Latin Extended-G": range(122624, 122880),
  313. "Glagolitic Supplement": range(122880, 122928),
  314. "Cyrillic Extended-D": range(122928, 123024),
  315. "Nyiakeng Puachue Hmong": range(123136, 123216),
  316. "Toto": range(123536, 123584),
  317. "Wancho": range(123584, 123648),
  318. "Nag Mundari": range(124112, 124160),
  319. "Ethiopic Extended-B": range(124896, 124928),
  320. "Mende Kikakui": range(124928, 125152),
  321. "Adlam": range(125184, 125280),
  322. "Indic Siyaq Numbers": range(126064, 126144),
  323. "Ottoman Siyaq Numbers": range(126208, 126288),
  324. "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
  325. "Mahjong Tiles": range(126976, 127024),
  326. "Domino Tiles": range(127024, 127136),
  327. "Playing Cards": range(127136, 127232),
  328. "Enclosed Alphanumeric Supplement": range(127232, 127488),
  329. "Enclosed Ideographic Supplement": range(127488, 127744),
  330. "Miscellaneous Symbols and Pictographs": range(127744, 128512),
  331. "Emoticons range(Emoji)": range(128512, 128592),
  332. "Ornamental Dingbats": range(128592, 128640),
  333. "Transport and Map Symbols": range(128640, 128768),
  334. "Alchemical Symbols": range(128768, 128896),
  335. "Geometric Shapes Extended": range(128896, 129024),
  336. "Supplemental Arrows-C": range(129024, 129280),
  337. "Supplemental Symbols and Pictographs": range(129280, 129536),
  338. "Chess Symbols": range(129536, 129648),
  339. "Symbols and Pictographs Extended-A": range(129648, 129792),
  340. "Symbols for Legacy Computing": range(129792, 130048),
  341. "CJK Unified Ideographs Extension B": range(131072, 173792),
  342. "CJK Unified Ideographs Extension C": range(173824, 177984),
  343. "CJK Unified Ideographs Extension D": range(177984, 178208),
  344. "CJK Unified Ideographs Extension E": range(178208, 183984),
  345. "CJK Unified Ideographs Extension F": range(183984, 191472),
  346. "CJK Compatibility Ideographs Supplement": range(194560, 195104),
  347. "CJK Unified Ideographs Extension G": range(196608, 201552),
  348. "CJK Unified Ideographs Extension H": range(201552, 205744),
  349. "Tags": range(917504, 917632),
  350. "Variation Selectors Supplement": range(917760, 918000),
  351. "Supplementary Private Use Area-A": range(983040, 1048576),
  352. "Supplementary Private Use Area-B": range(1048576, 1114112),
  353. }
  354. UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
  355. "Supplement",
  356. "Extended",
  357. "Extensions",
  358. "Modifier",
  359. "Marks",
  360. "Punctuation",
  361. "Symbols",
  362. "Forms",
  363. "Operators",
  364. "Miscellaneous",
  365. "Drawing",
  366. "Block",
  367. "Shapes",
  368. "Supplemental",
  369. "Tags",
  370. ]
  371. RE_POSSIBLE_ENCODING_INDICATION = re_compile(
  372. r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
  373. IGNORECASE,
  374. )
  375. IANA_NO_ALIASES = [
  376. "cp720",
  377. "cp737",
  378. "cp856",
  379. "cp874",
  380. "cp875",
  381. "cp1006",
  382. "koi8_r",
  383. "koi8_t",
  384. "koi8_u",
  385. ]
  386. IANA_SUPPORTED: List[str] = sorted(
  387. filter(
  388. lambda x: x.endswith("_codec") is False
  389. and x not in {"rot_13", "tactis", "mbcs"},
  390. list(set(aliases.values())) + IANA_NO_ALIASES,
  391. )
  392. )
  393. IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
  394. # pre-computed code page that are similar using the function cp_similarity.
  395. IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
  396. "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
  397. "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
  398. "cp1125": ["cp866"],
  399. "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
  400. "cp1250": ["iso8859_2"],
  401. "cp1251": ["kz1048", "ptcp154"],
  402. "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
  403. "cp1253": ["iso8859_7"],
  404. "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
  405. "cp1257": ["iso8859_13"],
  406. "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
  407. "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
  408. "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
  409. "cp850": ["cp437", "cp857", "cp858", "cp865"],
  410. "cp857": ["cp850", "cp858", "cp865"],
  411. "cp858": ["cp437", "cp850", "cp857", "cp865"],
  412. "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
  413. "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
  414. "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
  415. "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
  416. "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
  417. "cp866": ["cp1125"],
  418. "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
  419. "iso8859_11": ["tis_620"],
  420. "iso8859_13": ["cp1257"],
  421. "iso8859_14": [
  422. "iso8859_10",
  423. "iso8859_15",
  424. "iso8859_16",
  425. "iso8859_3",
  426. "iso8859_9",
  427. "latin_1",
  428. ],
  429. "iso8859_15": [
  430. "cp1252",
  431. "cp1254",
  432. "iso8859_10",
  433. "iso8859_14",
  434. "iso8859_16",
  435. "iso8859_3",
  436. "iso8859_9",
  437. "latin_1",
  438. ],
  439. "iso8859_16": [
  440. "iso8859_14",
  441. "iso8859_15",
  442. "iso8859_2",
  443. "iso8859_3",
  444. "iso8859_9",
  445. "latin_1",
  446. ],
  447. "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
  448. "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
  449. "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
  450. "iso8859_7": ["cp1253"],
  451. "iso8859_9": [
  452. "cp1252",
  453. "cp1254",
  454. "cp1258",
  455. "iso8859_10",
  456. "iso8859_14",
  457. "iso8859_15",
  458. "iso8859_16",
  459. "iso8859_3",
  460. "iso8859_4",
  461. "latin_1",
  462. ],
  463. "kz1048": ["cp1251", "ptcp154"],
  464. "latin_1": [
  465. "cp1252",
  466. "cp1254",
  467. "cp1258",
  468. "iso8859_10",
  469. "iso8859_14",
  470. "iso8859_15",
  471. "iso8859_16",
  472. "iso8859_3",
  473. "iso8859_4",
  474. "iso8859_9",
  475. ],
  476. "mac_iceland": ["mac_roman", "mac_turkish"],
  477. "mac_roman": ["mac_iceland", "mac_turkish"],
  478. "mac_turkish": ["mac_iceland", "mac_roman"],
  479. "ptcp154": ["cp1251", "kz1048"],
  480. "tis_620": ["iso8859_11"],
  481. }
  482. CHARDET_CORRESPONDENCE: Dict[str, str] = {
  483. "iso2022_kr": "ISO-2022-KR",
  484. "iso2022_jp": "ISO-2022-JP",
  485. "euc_kr": "EUC-KR",
  486. "tis_620": "TIS-620",
  487. "utf_32": "UTF-32",
  488. "euc_jp": "EUC-JP",
  489. "koi8_r": "KOI8-R",
  490. "iso8859_1": "ISO-8859-1",
  491. "iso8859_2": "ISO-8859-2",
  492. "iso8859_5": "ISO-8859-5",
  493. "iso8859_6": "ISO-8859-6",
  494. "iso8859_7": "ISO-8859-7",
  495. "iso8859_8": "ISO-8859-8",
  496. "utf_16": "UTF-16",
  497. "cp855": "IBM855",
  498. "mac_cyrillic": "MacCyrillic",
  499. "gb2312": "GB2312",
  500. "gb18030": "GB18030",
  501. "cp932": "CP932",
  502. "cp866": "IBM866",
  503. "utf_8": "utf-8",
  504. "utf_8_sig": "UTF-8-SIG",
  505. "shift_jis": "SHIFT_JIS",
  506. "big5": "Big5",
  507. "cp1250": "windows-1250",
  508. "cp1251": "windows-1251",
  509. "cp1252": "Windows-1252",
  510. "cp1253": "windows-1253",
  511. "cp1255": "windows-1255",
  512. "cp1256": "windows-1256",
  513. "cp1254": "Windows-1254",
  514. "cp949": "CP949",
  515. }
  516. COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
  517. "<",
  518. ">",
  519. "=",
  520. ":",
  521. "/",
  522. "&",
  523. ";",
  524. "{",
  525. "}",
  526. "[",
  527. "]",
  528. ",",
  529. "|",
  530. '"',
  531. "-",
  532. }
  533. KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
  534. ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
  535. # Logging LEVEL below DEBUG
  536. TRACE: int = 5
# Language labels that contain the em dash "—" character are to be
# considered alternative sequences to the original language.
  539. FREQUENCIES: Dict[str, List[str]] = {
  540. "English": [
  541. "e",
  542. "a",
  543. "t",
  544. "i",
  545. "o",
  546. "n",
  547. "s",
  548. "r",
  549. "h",
  550. "l",
  551. "d",
  552. "c",
  553. "u",
  554. "m",
  555. "f",
  556. "p",
  557. "g",
  558. "w",
  559. "y",
  560. "b",
  561. "v",
  562. "k",
  563. "x",
  564. "j",
  565. "z",
  566. "q",
  567. ],
  568. "English—": [
  569. "e",
  570. "a",
  571. "t",
  572. "i",
  573. "o",
  574. "n",
  575. "s",
  576. "r",
  577. "h",
  578. "l",
  579. "d",
  580. "c",
  581. "m",
  582. "u",
  583. "f",
  584. "p",
  585. "g",
  586. "w",
  587. "b",
  588. "y",
  589. "v",
  590. "k",
  591. "j",
  592. "x",
  593. "z",
  594. "q",
  595. ],
  596. "German": [
  597. "e",
  598. "n",
  599. "i",
  600. "r",
  601. "s",
  602. "t",
  603. "a",
  604. "d",
  605. "h",
  606. "u",
  607. "l",
  608. "g",
  609. "o",
  610. "c",
  611. "m",
  612. "b",
  613. "f",
  614. "k",
  615. "w",
  616. "z",
  617. "p",
  618. "v",
  619. "ü",
  620. "ä",
  621. "ö",
  622. "j",
  623. ],
  624. "French": [
  625. "e",
  626. "a",
  627. "s",
  628. "n",
  629. "i",
  630. "t",
  631. "r",
  632. "l",
  633. "u",
  634. "o",
  635. "d",
  636. "c",
  637. "p",
  638. "m",
  639. "é",
  640. "v",
  641. "g",
  642. "f",
  643. "b",
  644. "h",
  645. "q",
  646. "à",
  647. "x",
  648. "è",
  649. "y",
  650. "j",
  651. ],
  652. "Dutch": [
  653. "e",
  654. "n",
  655. "a",
  656. "i",
  657. "r",
  658. "t",
  659. "o",
  660. "d",
  661. "s",
  662. "l",
  663. "g",
  664. "h",
  665. "v",
  666. "m",
  667. "u",
  668. "k",
  669. "c",
  670. "p",
  671. "b",
  672. "w",
  673. "j",
  674. "z",
  675. "f",
  676. "y",
  677. "x",
  678. "ë",
  679. ],
  680. "Italian": [
  681. "e",
  682. "i",
  683. "a",
  684. "o",
  685. "n",
  686. "l",
  687. "t",
  688. "r",
  689. "s",
  690. "c",
  691. "d",
  692. "u",
  693. "p",
  694. "m",
  695. "g",
  696. "v",
  697. "f",
  698. "b",
  699. "z",
  700. "h",
  701. "q",
  702. "è",
  703. "à",
  704. "k",
  705. "y",
  706. "ò",
  707. ],
  708. "Polish": [
  709. "a",
  710. "i",
  711. "o",
  712. "e",
  713. "n",
  714. "r",
  715. "z",
  716. "w",
  717. "s",
  718. "c",
  719. "t",
  720. "k",
  721. "y",
  722. "d",
  723. "p",
  724. "m",
  725. "u",
  726. "l",
  727. "j",
  728. "ł",
  729. "g",
  730. "b",
  731. "h",
  732. "ą",
  733. "ę",
  734. "ó",
  735. ],
  736. "Spanish": [
  737. "e",
  738. "a",
  739. "o",
  740. "n",
  741. "s",
  742. "r",
  743. "i",
  744. "l",
  745. "d",
  746. "t",
  747. "c",
  748. "u",
  749. "m",
  750. "p",
  751. "b",
  752. "g",
  753. "v",
  754. "f",
  755. "y",
  756. "ó",
  757. "h",
  758. "q",
  759. "í",
  760. "j",
  761. "z",
  762. "á",
  763. ],
  764. "Russian": [
  765. "о",
  766. "а",
  767. "е",
  768. "и",
  769. "н",
  770. "с",
  771. "т",
  772. "р",
  773. "в",
  774. "л",
  775. "к",
  776. "м",
  777. "д",
  778. "п",
  779. "у",
  780. "г",
  781. "я",
  782. "ы",
  783. "з",
  784. "б",
  785. "й",
  786. "ь",
  787. "ч",
  788. "х",
  789. "ж",
  790. "ц",
  791. ],
  792. # Jap-Kanji
  793. "Japanese": [
  794. "",
  795. "",
  796. "",
  797. "",
  798. "",
  799. "",
  800. "",
  801. "",
  802. "",
  803. "",
  804. "",
  805. "",
  806. "",
  807. "",
  808. "",
  809. "",
  810. "",
  811. "",
  812. "",
  813. "",
  814. "",
  815. "丿",
  816. "",
  817. "",
  818. "",
  819. "",
  820. "",
  821. "",
  822. "",
  823. "",
  824. "",
  825. "",
  826. "",
  827. "",
  828. "",
  829. "",
  830. "",
  831. "",
  832. "",
  833. "",
  834. "",
  835. "",
  836. "",
  837. "",
  838. "",
  839. "",
  840. "",
  841. "",
  842. "",
  843. "",
  844. "",
  845. "",
  846. "",
  847. "",
  848. "",
  849. "",
  850. "",
  851. "",
  852. "",
  853. "",
  854. "",
  855. "",
  856. "",
  857. "",
  858. "",
  859. "",
  860. "",
  861. "",
  862. "",
  863. "",
  864. "",
  865. "",
  866. "",
  867. "",
  868. "",
  869. "",
  870. "",
  871. "",
  872. "",
  873. "",
  874. "",
  875. "",
  876. "",
  877. "",
  878. "",
  879. "广",
  880. "",
  881. "",
  882. "",
  883. "",
  884. "",
  885. "",
  886. "",
  887. "",
  888. "",
  889. "",
  890. "",
  891. "",
  892. "",
  893. "",
  894. ],
  895. # Jap-Katakana
  896. "Japanese—": [
  897. "",
  898. "",
  899. "",
  900. "",
  901. "",
  902. "",
  903. "",
  904. "",
  905. "",
  906. "",
  907. "",
  908. "",
  909. "",
  910. "",
  911. "",
  912. "",
  913. "",
  914. "",
  915. "",
  916. "",
  917. "",
  918. "",
  919. "",
  920. "",
  921. "",
  922. "",
  923. "",
  924. "",
  925. "",
  926. "",
  927. "",
  928. "",
  929. "",
  930. "",
  931. "",
  932. "",
  933. "",
  934. "",
  935. "",
  936. "",
  937. "",
  938. "",
  939. "",
  940. "",
  941. "",
  942. "",
  943. "",
  944. "",
  945. "",
  946. "",
  947. "",
  948. "",
  949. "",
  950. "",
  951. "",
  952. "",
  953. "",
  954. "",
  955. "",
  956. "",
  957. "",
  958. "",
  959. "",
  960. "",
  961. "",
  962. "",
  963. "",
  964. "",
  965. "",
  966. "",
  967. "",
  968. "",
  969. "",
  970. "",
  971. "",
  972. "",
  973. "",
  974. "",
  975. "",
  976. "",
  977. "",
  978. "",
  979. "",
  980. "",
  981. "",
  982. "",
  983. "",
  984. "",
  985. "",
  986. "",
  987. "",
  988. "",
  989. "",
  990. "",
  991. "",
  992. "",
  993. ],
  994. # Jap-Hiragana
  995. "Japanese——": [
  996. "",
  997. "",
  998. "",
  999. "",
  1000. "",
  1001. "",
  1002. "",
  1003. "",
  1004. "",
  1005. "",
  1006. "",
  1007. "",
  1008. "",
  1009. "",
  1010. "",
  1011. "",
  1012. "",
  1013. "",
  1014. "",
  1015. "",
  1016. "",
  1017. "",
  1018. "",
  1019. "",
  1020. "",
  1021. "",
  1022. "",
  1023. "",
  1024. "",
  1025. "",
  1026. "",
  1027. "",
  1028. "",
  1029. "",
  1030. "",
  1031. "",
  1032. "",
  1033. "",
  1034. "",
  1035. "",
  1036. "",
  1037. "",
  1038. "",
  1039. "",
  1040. "",
  1041. "",
  1042. "",
  1043. "",
  1044. "",
  1045. "",
  1046. "",
  1047. "",
  1048. "",
  1049. "",
  1050. "",
  1051. "",
  1052. "",
  1053. "",
  1054. "",
  1055. "",
  1056. "",
  1057. "",
  1058. "",
  1059. "",
  1060. "",
  1061. "",
  1062. "",
  1063. "",
  1064. "",
  1065. "",
  1066. "",
  1067. "",
  1068. "",
  1069. "",
  1070. "",
  1071. "",
  1072. "",
  1073. "",
  1074. "",
  1075. "",
  1076. "",
  1077. "",
  1078. "",
  1079. "",
  1080. "",
  1081. "",
  1082. "",
  1083. "",
  1084. "",
  1085. "",
  1086. "",
  1087. "",
  1088. "",
  1089. ],
  1090. "Portuguese": [
  1091. "a",
  1092. "e",
  1093. "o",
  1094. "s",
  1095. "i",
  1096. "r",
  1097. "d",
  1098. "n",
  1099. "t",
  1100. "m",
  1101. "u",
  1102. "c",
  1103. "l",
  1104. "p",
  1105. "g",
  1106. "v",
  1107. "b",
  1108. "f",
  1109. "h",
  1110. "ã",
  1111. "q",
  1112. "é",
  1113. "ç",
  1114. "á",
  1115. "z",
  1116. "í",
  1117. ],
  1118. "Swedish": [
  1119. "e",
  1120. "a",
  1121. "n",
  1122. "r",
  1123. "t",
  1124. "s",
  1125. "i",
  1126. "l",
  1127. "d",
  1128. "o",
  1129. "m",
  1130. "k",
  1131. "g",
  1132. "v",
  1133. "h",
  1134. "f",
  1135. "u",
  1136. "p",
  1137. "ä",
  1138. "c",
  1139. "b",
  1140. "ö",
  1141. "å",
  1142. "y",
  1143. "j",
  1144. "x",
  1145. ],
  1146. "Chinese": [
  1147. "",
  1148. "",
  1149. "",
  1150. "",
  1151. "",
  1152. "",
  1153. "",
  1154. "",
  1155. "",
  1156. "",
  1157. "",
  1158. "",
  1159. "",
  1160. "",
  1161. "",
  1162. "",
  1163. "",
  1164. "",
  1165. "",
  1166. "",
  1167. "",
  1168. "",
  1169. "",
  1170. "",
  1171. "",
  1172. "",
  1173. "",
  1174. "",
  1175. "",
  1176. "",
  1177. "",
  1178. "",
  1179. "",
  1180. "",
  1181. "",
  1182. "",
  1183. "",
  1184. "",
  1185. "",
  1186. "",
  1187. "",
  1188. "",
  1189. "",
  1190. "",
  1191. "",
  1192. "",
  1193. "",
  1194. "",
  1195. "",
  1196. "",
  1197. "",
  1198. "",
  1199. "",
  1200. "",
  1201. "",
  1202. "",
  1203. "",
  1204. "",
  1205. "",
  1206. "",
  1207. "",
  1208. "",
  1209. "",
  1210. "",
  1211. "",
  1212. "",
  1213. "",
  1214. "",
  1215. "",
  1216. "",
  1217. "",
  1218. "",
  1219. "",
  1220. "",
  1221. "",
  1222. "",
  1223. "",
  1224. "",
  1225. "",
  1226. "",
  1227. "",
  1228. "",
  1229. "",
  1230. "",
  1231. "",
  1232. "",
  1233. "",
  1234. "",
  1235. "",
  1236. "",
  1237. "",
  1238. "",
  1239. "",
  1240. "",
  1241. "",
  1242. "",
  1243. "",
  1244. "",
  1245. "",
  1246. "",
  1247. ],
  1248. "Ukrainian": [
  1249. "о",
  1250. "а",
  1251. "н",
  1252. "і",
  1253. "и",
  1254. "р",
  1255. "в",
  1256. "т",
  1257. "е",
  1258. "с",
  1259. "к",
  1260. "л",
  1261. "у",
  1262. "д",
  1263. "м",
  1264. "п",
  1265. "з",
  1266. "я",
  1267. "ь",
  1268. "б",
  1269. "г",
  1270. "й",
  1271. "ч",
  1272. "х",
  1273. "ц",
  1274. "ї",
  1275. ],
  1276. "Norwegian": [
  1277. "e",
  1278. "r",
  1279. "n",
  1280. "t",
  1281. "a",
  1282. "s",
  1283. "i",
  1284. "o",
  1285. "l",
  1286. "d",
  1287. "g",
  1288. "k",
  1289. "m",
  1290. "v",
  1291. "f",
  1292. "p",
  1293. "u",
  1294. "b",
  1295. "h",
  1296. "å",
  1297. "y",
  1298. "j",
  1299. "ø",
  1300. "c",
  1301. "æ",
  1302. "w",
  1303. ],
  1304. "Finnish": [
  1305. "a",
  1306. "i",
  1307. "n",
  1308. "t",
  1309. "e",
  1310. "s",
  1311. "l",
  1312. "o",
  1313. "u",
  1314. "k",
  1315. "ä",
  1316. "m",
  1317. "r",
  1318. "v",
  1319. "j",
  1320. "h",
  1321. "p",
  1322. "y",
  1323. "d",
  1324. "ö",
  1325. "g",
  1326. "c",
  1327. "b",
  1328. "f",
  1329. "w",
  1330. "z",
  1331. ],
  1332. "Vietnamese": [
  1333. "n",
  1334. "h",
  1335. "t",
  1336. "i",
  1337. "c",
  1338. "g",
  1339. "a",
  1340. "o",
  1341. "u",
  1342. "m",
  1343. "l",
  1344. "r",
  1345. "à",
  1346. "đ",
  1347. "s",
  1348. "e",
  1349. "v",
  1350. "p",
  1351. "b",
  1352. "y",
  1353. "ư",
  1354. "d",
  1355. "á",
  1356. "k",
  1357. "",
  1358. "ế",
  1359. ],
  1360. "Czech": [
  1361. "o",
  1362. "e",
  1363. "a",
  1364. "n",
  1365. "t",
  1366. "s",
  1367. "i",
  1368. "l",
  1369. "v",
  1370. "r",
  1371. "k",
  1372. "d",
  1373. "u",
  1374. "m",
  1375. "p",
  1376. "í",
  1377. "c",
  1378. "h",
  1379. "z",
  1380. "á",
  1381. "y",
  1382. "j",
  1383. "b",
  1384. "ě",
  1385. "é",
  1386. "ř",
  1387. ],
  1388. "Hungarian": [
  1389. "e",
  1390. "a",
  1391. "t",
  1392. "l",
  1393. "s",
  1394. "n",
  1395. "k",
  1396. "r",
  1397. "i",
  1398. "o",
  1399. "z",
  1400. "á",
  1401. "é",
  1402. "g",
  1403. "m",
  1404. "b",
  1405. "y",
  1406. "v",
  1407. "d",
  1408. "h",
  1409. "u",
  1410. "p",
  1411. "j",
  1412. "ö",
  1413. "f",
  1414. "c",
  1415. ],
  1416. "Korean": [
  1417. "",
  1418. "",
  1419. "",
  1420. "",
  1421. "",
  1422. "",
  1423. "",
  1424. "",
  1425. "",
  1426. "",
  1427. "",
  1428. "",
  1429. "",
  1430. "",
  1431. "",
  1432. "",
  1433. "",
  1434. "",
  1435. "",
  1436. "",
  1437. "",
  1438. "",
  1439. "",
  1440. "",
  1441. "",
  1442. "",
  1443. ],
  1444. "Indonesian": [
  1445. "a",
  1446. "n",
  1447. "e",
  1448. "i",
  1449. "r",
  1450. "t",
  1451. "u",
  1452. "s",
  1453. "d",
  1454. "k",
  1455. "m",
  1456. "l",
  1457. "g",
  1458. "p",
  1459. "b",
  1460. "o",
  1461. "h",
  1462. "y",
  1463. "j",
  1464. "c",
  1465. "w",
  1466. "f",
  1467. "v",
  1468. "z",
  1469. "x",
  1470. "q",
  1471. ],
  1472. "Turkish": [
  1473. "a",
  1474. "e",
  1475. "i",
  1476. "n",
  1477. "r",
  1478. "l",
  1479. "ı",
  1480. "k",
  1481. "d",
  1482. "t",
  1483. "s",
  1484. "m",
  1485. "y",
  1486. "u",
  1487. "o",
  1488. "b",
  1489. "ü",
  1490. "ş",
  1491. "v",
  1492. "g",
  1493. "z",
  1494. "h",
  1495. "c",
  1496. "p",
  1497. "ç",
  1498. "ğ",
  1499. ],
  1500. "Romanian": [
  1501. "e",
  1502. "i",
  1503. "a",
  1504. "r",
  1505. "n",
  1506. "t",
  1507. "u",
  1508. "l",
  1509. "o",
  1510. "c",
  1511. "s",
  1512. "d",
  1513. "p",
  1514. "m",
  1515. "ă",
  1516. "f",
  1517. "v",
  1518. "î",
  1519. "g",
  1520. "b",
  1521. "ș",
  1522. "ț",
  1523. "z",
  1524. "h",
  1525. "â",
  1526. "j",
  1527. ],
  1528. "Farsi": [
  1529. "ا",
  1530. "ی",
  1531. "ر",
  1532. "د",
  1533. "ن",
  1534. "ه",
  1535. "و",
  1536. "م",
  1537. "ت",
  1538. "ب",
  1539. "س",
  1540. "ل",
  1541. "ک",
  1542. "ش",
  1543. "ز",
  1544. "ف",
  1545. "گ",
  1546. "ع",
  1547. "خ",
  1548. "ق",
  1549. "ج",
  1550. "آ",
  1551. "پ",
  1552. "ح",
  1553. "ط",
  1554. "ص",
  1555. ],
  1556. "Arabic": [
  1557. "ا",
  1558. "ل",
  1559. "ي",
  1560. "م",
  1561. "و",
  1562. "ن",
  1563. "ر",
  1564. "ت",
  1565. "ب",
  1566. "ة",
  1567. "ع",
  1568. "د",
  1569. "س",
  1570. "ف",
  1571. "ه",
  1572. "ك",
  1573. "ق",
  1574. "أ",
  1575. "ح",
  1576. "ج",
  1577. "ش",
  1578. "ط",
  1579. "ص",
  1580. "ى",
  1581. "خ",
  1582. "إ",
  1583. ],
  1584. "Danish": [
  1585. "e",
  1586. "r",
  1587. "n",
  1588. "t",
  1589. "a",
  1590. "i",
  1591. "s",
  1592. "d",
  1593. "l",
  1594. "o",
  1595. "g",
  1596. "m",
  1597. "k",
  1598. "f",
  1599. "v",
  1600. "u",
  1601. "b",
  1602. "h",
  1603. "p",
  1604. "å",
  1605. "y",
  1606. "ø",
  1607. "æ",
  1608. "c",
  1609. "j",
  1610. "w",
  1611. ],
  1612. "Serbian": [
  1613. "а",
  1614. "и",
  1615. "о",
  1616. "е",
  1617. "н",
  1618. "р",
  1619. "с",
  1620. "у",
  1621. "т",
  1622. "к",
  1623. "ј",
  1624. "в",
  1625. "д",
  1626. "м",
  1627. "п",
  1628. "л",
  1629. "г",
  1630. "з",
  1631. "б",
  1632. "a",
  1633. "i",
  1634. "e",
  1635. "o",
  1636. "n",
  1637. "ц",
  1638. "ш",
  1639. ],
  1640. "Lithuanian": [
  1641. "i",
  1642. "a",
  1643. "s",
  1644. "o",
  1645. "r",
  1646. "e",
  1647. "t",
  1648. "n",
  1649. "u",
  1650. "k",
  1651. "m",
  1652. "l",
  1653. "p",
  1654. "v",
  1655. "d",
  1656. "j",
  1657. "g",
  1658. "ė",
  1659. "b",
  1660. "y",
  1661. "ų",
  1662. "š",
  1663. "ž",
  1664. "c",
  1665. "ą",
  1666. "į",
  1667. ],
  1668. "Slovene": [
  1669. "e",
  1670. "a",
  1671. "i",
  1672. "o",
  1673. "n",
  1674. "r",
  1675. "s",
  1676. "l",
  1677. "t",
  1678. "j",
  1679. "v",
  1680. "k",
  1681. "d",
  1682. "p",
  1683. "m",
  1684. "u",
  1685. "z",
  1686. "b",
  1687. "g",
  1688. "h",
  1689. "č",
  1690. "c",
  1691. "š",
  1692. "ž",
  1693. "f",
  1694. "y",
  1695. ],
  1696. "Slovak": [
  1697. "o",
  1698. "a",
  1699. "e",
  1700. "n",
  1701. "i",
  1702. "r",
  1703. "v",
  1704. "t",
  1705. "s",
  1706. "l",
  1707. "k",
  1708. "d",
  1709. "m",
  1710. "p",
  1711. "u",
  1712. "c",
  1713. "h",
  1714. "j",
  1715. "b",
  1716. "z",
  1717. "á",
  1718. "y",
  1719. "ý",
  1720. "í",
  1721. "č",
  1722. "é",
  1723. ],
  1724. "Hebrew": [
  1725. "י",
  1726. "ו",
  1727. "ה",
  1728. "ל",
  1729. "ר",
  1730. "ב",
  1731. "ת",
  1732. "מ",
  1733. "א",
  1734. "ש",
  1735. "נ",
  1736. "ע",
  1737. "ם",
  1738. "ד",
  1739. "ק",
  1740. "ח",
  1741. "פ",
  1742. "ס",
  1743. "כ",
  1744. "ג",
  1745. "ט",
  1746. "צ",
  1747. "ן",
  1748. "ז",
  1749. "ך",
  1750. ],
  1751. "Bulgarian": [
  1752. "а",
  1753. "и",
  1754. "о",
  1755. "е",
  1756. "н",
  1757. "т",
  1758. "р",
  1759. "с",
  1760. "в",
  1761. "л",
  1762. "к",
  1763. "д",
  1764. "п",
  1765. "м",
  1766. "з",
  1767. "г",
  1768. "я",
  1769. "ъ",
  1770. "у",
  1771. "б",
  1772. "ч",
  1773. "ц",
  1774. "й",
  1775. "ж",
  1776. "щ",
  1777. "х",
  1778. ],
  1779. "Croatian": [
  1780. "a",
  1781. "i",
  1782. "o",
  1783. "e",
  1784. "n",
  1785. "r",
  1786. "j",
  1787. "s",
  1788. "t",
  1789. "u",
  1790. "k",
  1791. "l",
  1792. "v",
  1793. "d",
  1794. "m",
  1795. "p",
  1796. "g",
  1797. "z",
  1798. "b",
  1799. "c",
  1800. "č",
  1801. "h",
  1802. "š",
  1803. "ž",
  1804. "ć",
  1805. "f",
  1806. ],
  1807. "Hindi": [
  1808. "",
  1809. "",
  1810. "",
  1811. "",
  1812. "",
  1813. "",
  1814. "",
  1815. "",
  1816. "",
  1817. "",
  1818. "",
  1819. "",
  1820. "",
  1821. "",
  1822. "",
  1823. "",
  1824. "",
  1825. "",
  1826. "",
  1827. "",
  1828. "",
  1829. "",
  1830. "",
  1831. "",
  1832. "",
  1833. "",
  1834. ],
  1835. "Estonian": [
  1836. "a",
  1837. "i",
  1838. "e",
  1839. "s",
  1840. "t",
  1841. "l",
  1842. "u",
  1843. "n",
  1844. "o",
  1845. "k",
  1846. "r",
  1847. "d",
  1848. "m",
  1849. "v",
  1850. "g",
  1851. "p",
  1852. "j",
  1853. "h",
  1854. "ä",
  1855. "b",
  1856. "õ",
  1857. "ü",
  1858. "f",
  1859. "c",
  1860. "ö",
  1861. "y",
  1862. ],
  1863. "Thai": [
  1864. "",
  1865. "",
  1866. "",
  1867. "",
  1868. "",
  1869. "",
  1870. "",
  1871. "",
  1872. "",
  1873. "",
  1874. "",
  1875. "",
  1876. "",
  1877. "",
  1878. "",
  1879. "",
  1880. "",
  1881. "",
  1882. "",
  1883. "",
  1884. "",
  1885. "",
  1886. "",
  1887. "",
  1888. "",
  1889. "",
  1890. ],
  1891. "Greek": [
  1892. "α",
  1893. "τ",
  1894. "ο",
  1895. "ι",
  1896. "ε",
  1897. "ν",
  1898. "ρ",
  1899. "σ",
  1900. "κ",
  1901. "η",
  1902. "π",
  1903. "ς",
  1904. "υ",
  1905. "μ",
  1906. "λ",
  1907. "ί",
  1908. "ό",
  1909. "ά",
  1910. "γ",
  1911. "έ",
  1912. "δ",
  1913. "ή",
  1914. "ω",
  1915. "χ",
  1916. "θ",
  1917. "ύ",
  1918. ],
  1919. "Tamil": [
  1920. "",
  1921. "",
  1922. "",
  1923. "",
  1924. "",
  1925. "",
  1926. "",
  1927. "",
  1928. "",
  1929. "",
  1930. "",
  1931. "",
  1932. "",
  1933. "",
  1934. "",
  1935. "",
  1936. "",
  1937. "",
  1938. "",
  1939. "",
  1940. "",
  1941. "",
  1942. "",
  1943. "",
  1944. ],
  1945. "Kazakh": [
  1946. "а",
  1947. "ы",
  1948. "е",
  1949. "н",
  1950. "т",
  1951. "р",
  1952. "л",
  1953. "і",
  1954. "д",
  1955. "с",
  1956. "м",
  1957. "қ",
  1958. "к",
  1959. "о",
  1960. "б",
  1961. "и",
  1962. "у",
  1963. "ғ",
  1964. "ж",
  1965. "ң",
  1966. "з",
  1967. "ш",
  1968. "й",
  1969. "п",
  1970. "г",
  1971. "ө",
  1972. ],
  1973. }
  1974. LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)