from __future__ import absolute_import

import gzip
import io
import platform
import struct

from kafka.vendor import six
from kafka.vendor.six.moves import range

_XERIAL_V1_HEADER = (-126, b'S', b'N', b'A', b'P', b'P', b'Y', 0, 1, 1)
_XERIAL_V1_FORMAT = 'bccccccBii'
ZSTD_MAX_OUTPUT_SIZE = 1024 * 1024
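# Sanity check (sketch, not part of the original module): the xerial header
# format above packs to exactly 16 bytes, matching the 16-byte header
# documented in snappy_encode below.
#
#   assert struct.calcsize('!' + _XERIAL_V1_FORMAT) == 16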
try:
    import snappy
except ImportError:
    snappy = None

try:
    import zstandard as zstd
except ImportError:
    zstd = None

try:
    import lz4.frame as lz4

    def _lz4_compress(payload, **kwargs):
        # Kafka does not support LZ4 dependent blocks
        try:
            # For lz4>=0.12.0
            kwargs.pop('block_linked', None)
            return lz4.compress(payload, block_linked=False, **kwargs)
        except TypeError:
            # For earlier versions of lz4
            kwargs.pop('block_mode', None)
            return lz4.compress(payload, block_mode=1, **kwargs)

except ImportError:
    lz4 = None

try:
    import lz4f
except ImportError:
    lz4f = None

try:
    import lz4framed
except ImportError:
    lz4framed = None

try:
    import xxhash
except ImportError:
    xxhash = None

PYPY = bool(platform.python_implementation() == 'PyPy')
def has_gzip():
    return True


def has_snappy():
    return snappy is not None


def has_zstd():
    return zstd is not None


def has_lz4():
    if lz4 is not None:
        return True
    if lz4f is not None:
        return True
    if lz4framed is not None:
        return True
    return False
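# Example (sketch, not part of the original module): probing which codecs the
# current environment supports, using only the has_* helpers defined above.
#
#   available = [name for name, check in (('gzip', has_gzip),
#                                         ('snappy', has_snappy),
#                                         ('zstd', has_zstd),
#                                         ('lz4', has_lz4)) if check()]
#   print('available codecs:', available)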
def gzip_encode(payload, compresslevel=None):
    if not compresslevel:
        compresslevel = 9

    buf = io.BytesIO()

    # Gzip context manager introduced in python 2.7
    # so old-fashioned way until we decide to not support 2.6
    gzipper = gzip.GzipFile(fileobj=buf, mode="w", compresslevel=compresslevel)
    try:
        gzipper.write(payload)
    finally:
        gzipper.close()

    return buf.getvalue()


def gzip_decode(payload):
    buf = io.BytesIO(payload)

    # Gzip context manager introduced in python 2.7
    # so old-fashioned way until we decide to not support 2.6
    gzipper = gzip.GzipFile(fileobj=buf, mode='r')
    try:
        return gzipper.read()
    finally:
        gzipper.close()
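# Example (sketch, not part of the original module): gzip round trip using
# only the standard library; gzip support is always available.
#
#   raw = b'kafka message payload' * 100
#   assert gzip_decode(gzip_encode(raw, compresslevel=6)) == raw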
def snappy_encode(payload, xerial_compatible=True, xerial_blocksize=32 * 1024):
    """Encodes the given data with snappy compression.

    If xerial_compatible is set then the stream is encoded in a fashion
    compatible with the xerial snappy library.

    The block size (xerial_blocksize) controls how frequently the blocking
    occurs; 32k is the default in the xerial library.

    The format winds up being:

        +-------------+------------+--------------+------------+--------------+
        |   Header    | Block1 len | Block1 data  | Blockn len | Blockn data  |
        +-------------+------------+--------------+------------+--------------+
        |  16 bytes   |  BE int32  | snappy bytes |  BE int32  | snappy bytes |
        +-------------+------------+--------------+------------+--------------+

    It is important to note that the blocksize is the amount of uncompressed
    data presented to snappy at each block, whereas the blocklen is the number
    of bytes that will be present in the stream; so the length will always be
    <= blocksize.
    """
    if not has_snappy():
        raise NotImplementedError("Snappy codec is not available")

    if not xerial_compatible:
        return snappy.compress(payload)

    out = io.BytesIO()
    for fmt, dat in zip(_XERIAL_V1_FORMAT, _XERIAL_V1_HEADER):
        out.write(struct.pack('!' + fmt, dat))

    # Chunk through buffers to avoid creating intermediate slice copies
    if PYPY:
        # on pypy, snappy.compress() on a sliced buffer consumes the entire
        # buffer... likely a python-snappy bug, so just use a slice copy
        chunker = lambda payload, i, size: payload[i:size + i]
    elif six.PY2:
        # Sliced buffer avoids additional copies
        # pylint: disable-msg=undefined-variable
        chunker = lambda payload, i, size: buffer(payload, i, size)
    else:
        # snappy.compress does not like raw memoryviews, so we have to convert
        # tobytes, which is a copy... oh well. it's the thought that counts.
        # pylint: disable-msg=undefined-variable
        chunker = lambda payload, i, size: memoryview(payload)[i:size + i].tobytes()

    for chunk in (chunker(payload, i, xerial_blocksize)
                  for i in range(0, len(payload), xerial_blocksize)):
        block = snappy.compress(chunk)
        block_size = len(block)
        out.write(struct.pack('!i', block_size))
        out.write(block)

    return out.getvalue()
def _detect_xerial_stream(payload):
    """Detects if the data given might have been encoded with the blocking
    mode of the xerial snappy library.

    This mode writes a magic header of the format:

        +--------+--------------+------------+---------+--------+
        | Marker | Magic String | Null / Pad | Version | Compat |
        +--------+--------------+------------+---------+--------+
        |  byte  |   c-string   |    byte    |  int32  | int32  |
        +--------+--------------+------------+---------+--------+
        |  -126  |   'SNAPPY'   |     \0     |         |        |
        +--------+--------------+------------+---------+--------+

    The pad appears to be there to ensure that SNAPPY is a valid cstring.
    The version is the version of this format as written by xerial; in the
    wild this is currently 1, so we only support v1.

    Compat is there to claim the minimum supported version that can read a
    xerial block stream; presently in the wild this is 1.
    """
    if len(payload) > 16:
        header = struct.unpack('!' + _XERIAL_V1_FORMAT, bytes(payload)[:16])
        return header == _XERIAL_V1_HEADER
    return False
def snappy_decode(payload):
    if not has_snappy():
        raise NotImplementedError("Snappy codec is not available")

    if _detect_xerial_stream(payload):
        # TODO ? Should become a fileobj ?
        out = io.BytesIO()
        byt = payload[16:]
        length = len(byt)
        cursor = 0

        while cursor < length:
            block_size = struct.unpack_from('!i', byt[cursor:])[0]
            # Skip the block size
            cursor += 4
            end = cursor + block_size
            out.write(snappy.decompress(byt[cursor:end]))
            cursor = end

        out.seek(0)
        return out.read()
    else:
        return snappy.decompress(payload)
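# Example (sketch, not part of the original module): xerial-framed snappy
# round trip. Requires the optional python-snappy package.
#
#   if has_snappy():
#       raw = b'x' * (64 * 1024)  # spans two 32 KiB xerial blocks
#       blob = snappy_encode(raw)
#       assert _detect_xerial_stream(blob)
#       assert snappy_decode(blob) == raw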
if lz4:
    lz4_encode = _lz4_compress  # pylint: disable-msg=no-member
elif lz4f:
    lz4_encode = lz4f.compressFrame  # pylint: disable-msg=no-member
elif lz4framed:
    lz4_encode = lz4framed.compress  # pylint: disable-msg=no-member
else:
    lz4_encode = None
def lz4f_decode(payload):
    """Decode payload using interoperable LZ4 framing. Requires Kafka >= 0.10"""
    # pylint: disable-msg=no-member
    ctx = lz4f.createDecompContext()
    data = lz4f.decompressFrame(payload, ctx)
    lz4f.freeDecompContext(ctx)

    # lz4f python module does not expose how much of the payload was
    # actually read if the decompression was only partial.
    if data['next'] != 0:
        raise RuntimeError('lz4f unable to decompress full payload')
    return data['decomp']
if lz4:
    lz4_decode = lz4.decompress  # pylint: disable-msg=no-member
elif lz4f:
    lz4_decode = lz4f_decode
elif lz4framed:
    lz4_decode = lz4framed.decompress  # pylint: disable-msg=no-member
else:
    lz4_decode = None
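# Example (sketch, not part of the original module): LZ4 frame round trip
# through whichever backend was imported above; guarded so it is a no-op when
# no lz4 library is installed.
#
#   if has_lz4():
#       raw = b'kafka message payload' * 100
#       assert lz4_decode(lz4_encode(raw)) == raw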
def lz4_encode_old_kafka(payload):
    """Encode payload for 0.8/0.9 brokers -- requires an incorrect header checksum."""
    assert xxhash is not None
    data = lz4_encode(payload)
    header_size = 7
    flg = data[4]
    if not isinstance(flg, int):
        flg = ord(flg)

    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        # Old kafka does not accept the content-size field
        # so we need to discard it and reset the header flag
        flg -= 8
        data = bytearray(data)
        data[4] = flg
        data = bytes(data)
        payload = data[header_size + 8:]
    else:
        payload = data[header_size:]

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size - 1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size - 1],
        hc,
        payload
    ])
def lz4_decode_old_kafka(payload):
    assert xxhash is not None
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(payload[4], int):
        flg = payload[4]
    else:
        flg = ord(payload[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This should be the correct hc
    hc = xxhash.xxh32(payload[4:header_size - 1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    munged_payload = b''.join([
        payload[0:header_size - 1],
        hc,
        payload[header_size:]
    ])
    return lz4_decode(munged_payload)
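# Example (sketch, not part of the original module): round trip through the
# legacy 0.8/0.9 broker framing. Requires both an lz4 backend and xxhash for
# the header checksum munging.
#
#   if has_lz4() and xxhash is not None:
#       raw = b'legacy broker payload'
#       assert lz4_decode_old_kafka(lz4_encode_old_kafka(raw)) == raw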
def zstd_encode(payload):
    if not zstd:
        raise NotImplementedError("Zstd codec is not available")
    return zstd.ZstdCompressor().compress(payload)


def zstd_decode(payload):
    if not zstd:
        raise NotImplementedError("Zstd codec is not available")
    try:
        return zstd.ZstdDecompressor().decompress(payload)
    except zstd.ZstdError:
        return zstd.ZstdDecompressor().decompress(payload, max_output_size=ZSTD_MAX_OUTPUT_SIZE)
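# Example (sketch, not part of the original module): zstd round trip.
# Requires the optional zstandard package.
#
#   if has_zstd():
#       raw = b'kafka message payload' * 100
#       assert zstd_decode(zstd_encode(raw)) == raw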