from __future__ import absolute_import, division

import collections
import copy
import logging
import random
import socket
import threading
import time
import weakref

# selectors in stdlib as of py3.4
try:
    import selectors  # pylint: disable=import-error
except ImportError:
    # vendored backport module
    from kafka.vendor import selectors34 as selectors

from kafka.vendor import six

from kafka.cluster import ClusterMetadata
from kafka.conn import BrokerConnection, ConnectionStates, collect_hosts, get_ip_port_afi
from kafka import errors as Errors
from kafka.future import Future
from kafka.metrics import AnonMeasurable
from kafka.metrics.stats import Avg, Count, Rate
from kafka.metrics.stats.rate import TimeUnit
from kafka.protocol.metadata import MetadataRequest
from kafka.util import Dict, WeakMethod
# Although this looks unused, it actually monkey-patches socket.socketpair()
# and should be left in as long as we're using socket.socketpair() in this file
from kafka.vendor import socketpair
from kafka.version import __version__

if six.PY2:
    ConnectionError = None


log = logging.getLogger('kafka.client')

class KafkaClient(object):
    """
    A network client for asynchronous request/response network I/O.

    This is an internal class used to implement the user-facing producer and
    consumer clients.

    This class is not thread-safe!

    Attributes:
        cluster (:any:`ClusterMetadata`): Local cache of cluster metadata, retrieved
            via MetadataRequests during :meth:`~kafka.KafkaClient.poll`.

    Keyword Arguments:
        bootstrap_servers: 'host[:port]' string (or list of 'host[:port]'
            strings) that the client should contact to bootstrap initial
            cluster metadata. This does not have to be the full node list.
            It just needs to have at least one broker that will respond to a
            Metadata API Request. Default port is 9092. If no servers are
            specified, will default to localhost:9092.
        client_id (str): a name for this client. This string is passed in
            each request to servers and can be used to identify specific
            server-side log entries that correspond to this client. Also
            submitted to GroupCoordinator for logging with respect to
            consumer group administration. Default: 'kafka-python-{version}'
        reconnect_backoff_ms (int): The amount of time in milliseconds to
            wait before attempting to reconnect to a given host.
            Default: 50.
        reconnect_backoff_max_ms (int): The maximum amount of time in
            milliseconds to backoff/wait when reconnecting to a broker that has
            repeatedly failed to connect. If provided, the backoff per host
            will increase exponentially for each consecutive connection
            failure, up to this maximum. Once the maximum is reached,
            reconnection attempts will continue periodically with this fixed
            rate. To avoid connection storms, a randomization factor of 0.2
            will be applied to the backoff resulting in a random range between
            20% below and 20% above the computed value. Default: 1000.
        request_timeout_ms (int): Client request timeout in milliseconds.
            Default: 30000.
        connections_max_idle_ms: Close idle connections after the number of
            milliseconds specified by this config. The broker closes idle
            connections after connections.max.idle.ms, so this avoids hitting
            unexpected socket disconnected errors on the client.
            Default: 540000
        retry_backoff_ms (int): Milliseconds to backoff when retrying on
            errors. Default: 100.
        max_in_flight_requests_per_connection (int): Requests are pipelined
            to kafka brokers up to this number of maximum requests per
            broker connection. Default: 5.
        receive_buffer_bytes (int): The size of the TCP receive buffer
            (SO_RCVBUF) to use when reading data. Default: None (relies on
            system defaults). Java client defaults to 32768.
        send_buffer_bytes (int): The size of the TCP send buffer
            (SO_SNDBUF) to use when sending data. Default: None (relies on
            system defaults). Java client defaults to 131072.
        socket_options (list): List of tuple-arguments to socket.setsockopt
            to apply to broker connection sockets. Default:
            [(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)]
        metadata_max_age_ms (int): The period of time in milliseconds after
            which we force a refresh of metadata even if we haven't seen any
            partition leadership changes to proactively discover any new
            brokers or partitions. Default: 300000
        security_protocol (str): Protocol used to communicate with brokers.
            Valid values are: PLAINTEXT, SSL, SASL_PLAINTEXT, SASL_SSL.
            Default: PLAINTEXT.
        ssl_context (ssl.SSLContext): Pre-configured SSLContext for wrapping
            socket connections. If provided, all other ssl_* configurations
            will be ignored. Default: None.
        ssl_check_hostname (bool): Flag to configure whether SSL handshake
            should verify that the certificate matches the broker's hostname.
            Default: True.
        ssl_cafile (str): Optional filename of CA file to use in certificate
            verification. Default: None.
        ssl_certfile (str): Optional filename of file in PEM format containing
            the client certificate, as well as any CA certificates needed to
            establish the certificate's authenticity. Default: None.
        ssl_keyfile (str): Optional filename containing the client private key.
            Default: None.
        ssl_password (str): Optional password to be used when loading the
            certificate chain. Default: None.
        ssl_crlfile (str): Optional filename containing the CRL to check for
            certificate expiration. By default, no CRL check is done. When
            providing a file, only the leaf certificate will be checked against
            this CRL. The CRL can only be checked with Python 3.4+ or 2.7.9+.
            Default: None.
        ssl_ciphers (str): optionally set the available ciphers for ssl
            connections. It should be a string in the OpenSSL cipher list
            format. If no cipher can be selected (because compile-time options
            or other configuration forbids use of all the specified ciphers),
            an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers
        api_version (tuple): Specify which Kafka API version to use. If set
            to None, KafkaClient will attempt to infer the broker version by
            probing various APIs. Example: (0, 10, 2). Default: None
        api_version_auto_timeout_ms (int): number of milliseconds to wait
            before raising a timeout exception from the constructor when
            checking the broker api version. Only applies if api_version is None
        selector (selectors.BaseSelector): Provide a specific selector
            implementation to use for I/O multiplexing.
            Default: selectors.DefaultSelector
        metrics (kafka.metrics.Metrics): Optionally provide a metrics
            instance for capturing network IO stats. Default: None.
        metric_group_prefix (str): Prefix for metric names. Default: ''
        sasl_mechanism (str): Authentication mechanism when security_protocol
            is configured for SASL_PLAINTEXT or SASL_SSL. Valid values are:
            PLAIN, GSSAPI, OAUTHBEARER, SCRAM-SHA-256, SCRAM-SHA-512.
        sasl_plain_username (str): username for sasl PLAIN and SCRAM authentication.
            Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
        sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication.
            Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
        sasl_kerberos_service_name (str): Service name to include in GSSAPI
            sasl mechanism handshake. Default: 'kafka'
        sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI
            sasl mechanism handshake. Default: one of bootstrap servers
        sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider
            instance. (See kafka.oauth.abstract). Default: None
    """
    DEFAULT_CONFIG = {
        'bootstrap_servers': 'localhost',
        'bootstrap_topics_filter': set(),
        'client_id': 'kafka-python-' + __version__,
        'request_timeout_ms': 30000,
        'wakeup_timeout_ms': 3000,
        'connections_max_idle_ms': 9 * 60 * 1000,
        'reconnect_backoff_ms': 50,
        'reconnect_backoff_max_ms': 1000,
        'max_in_flight_requests_per_connection': 5,
        'receive_buffer_bytes': None,
        'send_buffer_bytes': None,
        'socket_options': [(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)],
        'sock_chunk_bytes': 4096,  # undocumented experimental option
        'sock_chunk_buffer_count': 1000,  # undocumented experimental option
        'retry_backoff_ms': 100,
        'metadata_max_age_ms': 300000,
        'security_protocol': 'PLAINTEXT',
        'ssl_context': None,
        'ssl_check_hostname': True,
        'ssl_cafile': None,
        'ssl_certfile': None,
        'ssl_keyfile': None,
        'ssl_password': None,
        'ssl_crlfile': None,
        'ssl_ciphers': None,
        'api_version': None,
        'api_version_auto_timeout_ms': 2000,
        'selector': selectors.DefaultSelector,
        'metrics': None,
        'metric_group_prefix': '',
        'sasl_mechanism': None,
        'sasl_plain_username': None,
        'sasl_plain_password': None,
        'sasl_kerberos_service_name': 'kafka',
        'sasl_kerberos_domain_name': None,
        'sasl_oauth_token_provider': None
    }

    def __init__(self, **configs):
        self.config = copy.copy(self.DEFAULT_CONFIG)
        for key in self.config:
            if key in configs:
                self.config[key] = configs[key]

        # these properties need to be set before anything else in __init__
        # because they are used when the __del__ method is called
        self._closed = False
        self._wake_r, self._wake_w = socket.socketpair()
        self._selector = self.config['selector']()

        self.cluster = ClusterMetadata(**self.config)
        self._topics = set()  # empty set will fetch all topic metadata
        self._metadata_refresh_in_progress = False
        self._conns = Dict()  # object to support weakrefs
        self._api_versions = None
        self._connecting = set()
        self._sending = set()
        self._refresh_on_disconnects = True
        self._last_bootstrap = 0
        self._bootstrap_fails = 0
        self._wake_r.setblocking(False)
        self._wake_w.settimeout(self.config['wakeup_timeout_ms'] / 1000.0)
        self._wake_lock = threading.Lock()

        self._lock = threading.RLock()

        # when requests complete, they are transferred to this queue prior to
        # invocation. The purpose is to avoid invoking them while holding the
        # lock above.
        self._pending_completion = collections.deque()

        self._selector.register(self._wake_r, selectors.EVENT_READ)
        self._idle_expiry_manager = IdleConnectionManager(self.config['connections_max_idle_ms'])
        self._sensors = None
        if self.config['metrics']:
            self._sensors = KafkaClientMetrics(self.config['metrics'],
                                               self.config['metric_group_prefix'],
                                               weakref.proxy(self._conns))

        self._num_bootstrap_hosts = len(collect_hosts(self.config['bootstrap_servers']))

        # Check Broker Version if not set explicitly
        if self.config['api_version'] is None:
            check_timeout = self.config['api_version_auto_timeout_ms'] / 1000
            self.config['api_version'] = self.check_version(timeout=check_timeout)

    def _can_bootstrap(self):
        effective_failures = self._bootstrap_fails // self._num_bootstrap_hosts
        backoff_factor = 2 ** effective_failures
        backoff_ms = min(self.config['reconnect_backoff_ms'] * backoff_factor,
                         self.config['reconnect_backoff_max_ms'])

        backoff_ms *= random.uniform(0.8, 1.2)

        next_at = self._last_bootstrap + backoff_ms / 1000.0
        now = time.time()
        if next_at > now:
            return False
        return True
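
    # Worked example of the backoff above (a sketch using the defaults
    # reconnect_backoff_ms=50 and reconnect_backoff_max_ms=1000): after 3
    # effective failures the base backoff is min(50 * 2**3, 1000) = 400ms,
    # jittered to a uniform value in [320ms, 480ms]. From 5 effective failures
    # onward the exponential term is capped at the 1000ms maximum (800-1200ms
    # after jitter).
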
    def _can_connect(self, node_id):
        if node_id not in self._conns:
            if self.cluster.broker_metadata(node_id):
                return True
            return False
        conn = self._conns[node_id]
        return conn.disconnected() and not conn.blacked_out()

    def _conn_state_change(self, node_id, sock, conn):
        with self._lock:
            if conn.connecting():
                # SSL connections can enter this state 2x (second during Handshake)
                if node_id not in self._connecting:
                    self._connecting.add(node_id)
                try:
                    self._selector.register(sock, selectors.EVENT_WRITE, conn)
                except KeyError:
                    self._selector.modify(sock, selectors.EVENT_WRITE, conn)

                if self.cluster.is_bootstrap(node_id):
                    self._last_bootstrap = time.time()

            elif conn.connected():
                log.debug("Node %s connected", node_id)
                if node_id in self._connecting:
                    self._connecting.remove(node_id)

                try:
                    self._selector.modify(sock, selectors.EVENT_READ, conn)
                except KeyError:
                    self._selector.register(sock, selectors.EVENT_READ, conn)

                if self._sensors:
                    self._sensors.connection_created.record()

                self._idle_expiry_manager.update(node_id)

                if self.cluster.is_bootstrap(node_id):
                    self._bootstrap_fails = 0
                else:
                    for node_id in list(self._conns.keys()):
                        if self.cluster.is_bootstrap(node_id):
                            self._conns.pop(node_id).close()

            # Connection failures imply that our metadata is stale, so let's refresh
            elif conn.state is ConnectionStates.DISCONNECTED:
                if node_id in self._connecting:
                    self._connecting.remove(node_id)

                try:
                    self._selector.unregister(sock)
                except KeyError:
                    pass

                if self._sensors:
                    self._sensors.connection_closed.record()

                idle_disconnect = False
                if self._idle_expiry_manager.is_expired(node_id):
                    idle_disconnect = True
                self._idle_expiry_manager.remove(node_id)

                # If the connection has already been popped from self._conns,
                # we can assume the disconnect was intentional and not a failure
                if node_id not in self._conns:
                    pass

                elif self.cluster.is_bootstrap(node_id):
                    self._bootstrap_fails += 1

                elif self._refresh_on_disconnects and not self._closed and not idle_disconnect:
                    log.warning("Node %s connection failed -- refreshing metadata", node_id)
                    self.cluster.request_update()

    def maybe_connect(self, node_id, wakeup=True):
        """Queues a node for asynchronous connection during the next .poll()"""
        if self._can_connect(node_id):
            self._connecting.add(node_id)
            # Wakeup signal is useful in case another thread is
            # blocked waiting for incoming network traffic while holding
            # the client lock in poll().
            if wakeup:
                self.wakeup()
            return True
        return False

    def _should_recycle_connection(self, conn):
        # Never recycle unless disconnected
        if not conn.disconnected():
            return False

        # Otherwise, only recycle when broker metadata has changed
        broker = self.cluster.broker_metadata(conn.node_id)
        if broker is None:
            return False

        host, _, afi = get_ip_port_afi(broker.host)
        if conn.host != host or conn.port != broker.port:
            log.info("Broker metadata change detected for node %s"
                     " from %s:%s to %s:%s", conn.node_id, conn.host, conn.port,
                     broker.host, broker.port)
            return True

        return False

    def _maybe_connect(self, node_id):
        """Idempotent non-blocking connection attempt to the given node id."""
        with self._lock:
            conn = self._conns.get(node_id)

            if conn is None:
                broker = self.cluster.broker_metadata(node_id)
                assert broker, 'Broker id %s not in current metadata' % (node_id,)

                log.debug("Initiating connection to node %s at %s:%s",
                          node_id, broker.host, broker.port)
                host, port, afi = get_ip_port_afi(broker.host)
                cb = WeakMethod(self._conn_state_change)
                conn = BrokerConnection(host, broker.port, afi,
                                        state_change_callback=cb,
                                        node_id=node_id,
                                        **self.config)
                self._conns[node_id] = conn

            # Check if existing connection should be recreated because host/port changed
            elif self._should_recycle_connection(conn):
                self._conns.pop(node_id)
                return False

            elif conn.connected():
                return True

            conn.connect()
            return conn.connected()

    def ready(self, node_id, metadata_priority=True):
        """Check whether a node is connected and ok to send more requests.

        Arguments:
            node_id (int): the id of the node to check
            metadata_priority (bool): Mark node as not-ready if a metadata
                refresh is required. Default: True

        Returns:
            bool: True if we are ready to send to the given node
        """
        self.maybe_connect(node_id)
        return self.is_ready(node_id, metadata_priority=metadata_priority)

    def connected(self, node_id):
        """Return True iff the node_id is connected."""
        conn = self._conns.get(node_id)
        if conn is None:
            return False
        return conn.connected()

    def _close(self):
        if not self._closed:
            self._closed = True
            self._wake_r.close()
            self._wake_w.close()
            self._selector.close()

    def close(self, node_id=None):
        """Close one or all broker connections.

        Arguments:
            node_id (int, optional): the id of the node to close
        """
        with self._lock:
            if node_id is None:
                self._close()
                conns = list(self._conns.values())
                self._conns.clear()
                for conn in conns:
                    conn.close()
            elif node_id in self._conns:
                self._conns.pop(node_id).close()
            else:
                log.warning("Node %s not found in current connection list; skipping", node_id)
                return

    def __del__(self):
        self._close()

    def is_disconnected(self, node_id):
        """Check whether the node connection has been disconnected or failed.

        A disconnected node has either been closed or has failed. Connection
        failures are usually transient and can be resumed in the next ready()
        call, but there are cases where transient failures need to be caught
        and re-acted upon.

        Arguments:
            node_id (int): the id of the node to check

        Returns:
            bool: True iff the node exists and is disconnected
        """
        conn = self._conns.get(node_id)
        if conn is None:
            return False
        return conn.disconnected()

    def connection_delay(self, node_id):
        """
        Return the number of milliseconds to wait, based on the connection
        state, before attempting to send data. When disconnected, this respects
        the reconnect backoff time. When connecting, returns 0 to allow
        non-blocking connect to finish. When connected, returns a very large
        number to handle slow/stalled connections.

        Arguments:
            node_id (int): The id of the node to check

        Returns:
            int: The number of milliseconds to wait.
        """
        conn = self._conns.get(node_id)
        if conn is None:
            return 0
        return conn.connection_delay()

    def is_ready(self, node_id, metadata_priority=True):
        """Check whether a node is ready to send more requests.

        In addition to connection-level checks, this method is also used to
        block additional requests from being sent during a metadata refresh.

        Arguments:
            node_id (int): id of the node to check
            metadata_priority (bool): Mark node as not-ready if a metadata
                refresh is required. Default: True

        Returns:
            bool: True if the node is ready and metadata is not refreshing
        """
        if not self._can_send_request(node_id):
            return False

        # if we need to update our metadata now declare all requests unready to
        # make metadata requests first priority
        if metadata_priority:
            if self._metadata_refresh_in_progress:
                return False
            if self.cluster.ttl() == 0:
                return False
        return True

    def _can_send_request(self, node_id):
        conn = self._conns.get(node_id)
        if not conn:
            return False
        return conn.connected() and conn.can_send_more()

    def send(self, node_id, request, wakeup=True):
        """Send a request to a specific node. Bytes are placed on an
        internal per-connection send-queue. Actual network I/O will be
        triggered in a subsequent call to .poll()

        Arguments:
            node_id (int): destination node
            request (Struct): request object (not-encoded)
            wakeup (bool): optional flag to disable thread-wakeup

        Raises:
            AssertionError: if node_id is not in current cluster metadata

        Returns:
            Future: resolves to Response struct or Error
        """
        conn = self._conns.get(node_id)
        if not conn or not self._can_send_request(node_id):
            self.maybe_connect(node_id, wakeup=wakeup)
            return Future().failure(Errors.NodeNotReadyError(node_id))

        # conn.send will queue the request internally
        # we will need to call send_pending_requests()
        # to trigger network I/O
        future = conn.send(request, blocking=False)
        self._sending.add(conn)

        # Wakeup signal is useful in case another thread is
        # blocked waiting for incoming network traffic while holding
        # the client lock in poll().
        if wakeup:
            self.wakeup()

        return future
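
    # Note the two-step pattern implied by send() above (a sketch, not extra
    # API): send() only queues the request and returns a Future; a subsequent
    # poll() performs the socket I/O and resolves it:
    #
    #     future = client.send(node_id, request)
    #     client.poll(future=future)   # loops until future.is_done
    #     if future.succeeded():
    #         response = future.value
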
    def poll(self, timeout_ms=None, future=None):
        """Try to read and write to sockets.

        This method will also attempt to complete node connections, refresh
        stale metadata, and run previously-scheduled tasks.

        Arguments:
            timeout_ms (int, optional): maximum amount of time to wait (in ms)
                for at least one response. Must be non-negative. The actual
                timeout will be the minimum of timeout, request timeout and
                metadata timeout. Default: request_timeout_ms
            future (Future, optional): if provided, blocks until future.is_done

        Returns:
            list: responses received (can be empty)
        """
        if future is not None:
            timeout_ms = 100
        elif timeout_ms is None:
            timeout_ms = self.config['request_timeout_ms']
        elif not isinstance(timeout_ms, (int, float)):
            raise TypeError('Invalid type for timeout: %s' % type(timeout_ms))

        # Loop for futures, break after first loop if None
        responses = []
        while True:
            with self._lock:
                if self._closed:
                    break

                # Attempt to complete pending connections
                for node_id in list(self._connecting):
                    self._maybe_connect(node_id)

                # Send a metadata request if needed
                metadata_timeout_ms = self._maybe_refresh_metadata()

                # If we got a future that is already done, don't block in _poll
                if future is not None and future.is_done:
                    timeout = 0
                else:
                    idle_connection_timeout_ms = self._idle_expiry_manager.next_check_ms()
                    timeout = min(
                        timeout_ms,
                        metadata_timeout_ms,
                        idle_connection_timeout_ms,
                        self.config['request_timeout_ms'])
                    # if there are no requests in flight, do not block longer than the retry backoff
                    if self.in_flight_request_count() == 0:
                        timeout = min(timeout, self.config['retry_backoff_ms'])
                    timeout = max(0, timeout)  # avoid negative timeouts

                self._poll(timeout / 1000)

            # called without the lock to avoid deadlock potential
            # if handlers need to acquire locks
            responses.extend(self._fire_pending_completed_requests())

            # If all we had was a timeout (future is None) - only do one poll
            # If we do have a future, we keep looping until it is done
            if future is None or future.is_done:
                break

        return responses
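
    # Worked timeout example for poll() above (a sketch with default config,
    # ignoring the idle-connection check): if the caller passes
    # timeout_ms=1000, metadata stays fresh for another 250ms, and no requests
    # are in flight, then timeout = min(1000, 250, 30000) = 250ms, further
    # capped by retry_backoff_ms=100, so _poll() blocks for at most 100ms.
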
    def _register_send_sockets(self):
        while self._sending:
            conn = self._sending.pop()
            try:
                key = self._selector.get_key(conn._sock)
                events = key.events | selectors.EVENT_WRITE
                self._selector.modify(key.fileobj, events, key.data)
            except KeyError:
                self._selector.register(conn._sock, selectors.EVENT_WRITE, conn)

    def _poll(self, timeout):
        # This needs to be locked, but since it is only called from within the
        # locked section of poll(), there is no additional lock acquisition here
        processed = set()

        # Send pending requests first, before polling for responses
        self._register_send_sockets()

        start_select = time.time()
        ready = self._selector.select(timeout)
        end_select = time.time()
        if self._sensors:
            self._sensors.select_time.record((end_select - start_select) * 1000000000)

        for key, events in ready:
            if key.fileobj is self._wake_r:
                self._clear_wake_fd()
                continue

            # Send pending requests if socket is ready to write
            if events & selectors.EVENT_WRITE:
                conn = key.data
                if conn.connecting():
                    conn.connect()
                else:
                    if conn.send_pending_requests_v2():
                        # If send is complete, we don't need to track write readiness
                        # for this socket anymore
                        if key.events ^ selectors.EVENT_WRITE:
                            self._selector.modify(
                                key.fileobj,
                                key.events ^ selectors.EVENT_WRITE,
                                key.data)
                        else:
                            self._selector.unregister(key.fileobj)

            if not (events & selectors.EVENT_READ):
                continue
            conn = key.data
            processed.add(conn)

            if not conn.in_flight_requests:
                # if we got an EVENT_READ but there were no in-flight requests, one of
                # two things has happened:
                #
                # 1. The remote end closed the connection (because it died, or because
                #    a firewall timed out, or whatever)
                # 2. The protocol is out of sync.
                #
                # either way, we can no longer safely use this connection
                #
                # Do a 1-byte read to check protocol didn't get out of sync, and then close the conn
                try:
                    unexpected_data = key.fileobj.recv(1)
                    if unexpected_data:  # anything other than a 0-byte read means protocol issues
                        log.warning('Protocol out of sync on %r, closing', conn)
                except socket.error:
                    pass
                conn.close(Errors.KafkaConnectionError('Socket EVENT_READ without in-flight-requests'))
                continue

            self._idle_expiry_manager.update(conn.node_id)
            self._pending_completion.extend(conn.recv())

        # Check for additional pending SSL bytes
        if self.config['security_protocol'] in ('SSL', 'SASL_SSL'):
            # TODO: optimize
            for conn in self._conns.values():
                if conn not in processed and conn.connected() and conn._sock.pending():
                    self._pending_completion.extend(conn.recv())

        for conn in six.itervalues(self._conns):
            if conn.requests_timed_out():
                log.warning('%s timed out after %s ms. Closing connection.',
                            conn, conn.config['request_timeout_ms'])
                conn.close(error=Errors.RequestTimedOutError(
                    'Request timed out after %s ms' %
                    conn.config['request_timeout_ms']))

        if self._sensors:
            self._sensors.io_time.record((time.time() - end_select) * 1000000000)

        self._maybe_close_oldest_connection()

    def in_flight_request_count(self, node_id=None):
        """Get the number of in-flight requests for a node or all nodes.

        Arguments:
            node_id (int, optional): a specific node to check. If unspecified,
                return the total for all nodes

        Returns:
            int: pending in-flight requests for the node, or all nodes if None
        """
        if node_id is not None:
            conn = self._conns.get(node_id)
            if conn is None:
                return 0
            return len(conn.in_flight_requests)
        else:
            return sum([len(conn.in_flight_requests)
                        for conn in list(self._conns.values())])

    def _fire_pending_completed_requests(self):
        responses = []
        while True:
            try:
                # We rely on deque.popleft remaining threadsafe
                # to allow both the heartbeat thread and the main thread
                # to process responses
                response, future = self._pending_completion.popleft()
            except IndexError:
                break
            future.success(response)
            responses.append(response)

        return responses

    def least_loaded_node(self):
        """Choose the node with fewest outstanding requests, with fallbacks.

        This method will prefer a node with an existing connection and no
        in-flight-requests. If no such node is found, a node will be chosen
        randomly from disconnected nodes that are not "blacked out" (i.e.,
        are not subject to a reconnect backoff). If no node metadata has been
        obtained, will return a bootstrap node (subject to exponential backoff).

        Returns:
            node_id or None if no suitable node was found
        """
        nodes = [broker.nodeId for broker in self.cluster.brokers()]
        random.shuffle(nodes)

        inflight = float('inf')
        found = None
        for node_id in nodes:
            conn = self._conns.get(node_id)
            connected = conn is not None and conn.connected()
            blacked_out = conn is not None and conn.blacked_out()
            curr_inflight = len(conn.in_flight_requests) if conn is not None else 0
            if connected and curr_inflight == 0:
                # if we find an established connection
                # with no in-flight requests, we can stop right away
                return node_id
            elif not blacked_out and curr_inflight < inflight:
                # otherwise if this is the best we have found so far, record that
                inflight = curr_inflight
                found = node_id

        return found

    def set_topics(self, topics):
        """Set specific topics to track for metadata.

        Arguments:
            topics (list of str): topics to check for metadata

        Returns:
            Future: resolves after metadata request/response
        """
        if set(topics).difference(self._topics):
            future = self.cluster.request_update()
        else:
            future = Future().success(set(topics))
        self._topics = set(topics)
        return future

    def add_topic(self, topic):
        """Add a topic to the list of topics tracked via metadata.

        Arguments:
            topic (str): topic to track

        Returns:
            Future: resolves after metadata request/response
        """
        if topic in self._topics:
            return Future().success(set(self._topics))

        self._topics.add(topic)
        return self.cluster.request_update()

    # This method should be locked when running multi-threaded
    def _maybe_refresh_metadata(self, wakeup=False):
        """Send a metadata request if needed.

        Returns:
            int: milliseconds until next refresh
        """
        ttl = self.cluster.ttl()
        wait_for_in_progress_ms = self.config['request_timeout_ms'] if self._metadata_refresh_in_progress else 0
        metadata_timeout = max(ttl, wait_for_in_progress_ms)

        if metadata_timeout > 0:
            return metadata_timeout

        # Beware that the behavior of this method and the computation of
        # timeouts for poll() are highly dependent on the behavior of
        # least_loaded_node()
        node_id = self.least_loaded_node()
        if node_id is None:
            log.debug("Give up sending metadata request since no node is available")
            return self.config['reconnect_backoff_ms']

        if self._can_send_request(node_id):
            topics = list(self._topics)
            if not topics and self.cluster.is_bootstrap(node_id):
                topics = list(self.config['bootstrap_topics_filter'])

            if self.cluster.need_all_topic_metadata or not topics:
                topics = [] if self.config['api_version'] < (0, 10) else None
            api_version = 0 if self.config['api_version'] < (0, 10) else 1
            request = MetadataRequest[api_version](topics)
            log.debug("Sending metadata request %s to node %s", request, node_id)
            future = self.send(node_id, request, wakeup=wakeup)
            future.add_callback(self.cluster.update_metadata)
            future.add_errback(self.cluster.failed_update)

            self._metadata_refresh_in_progress = True

            def refresh_done(val_or_error):
                self._metadata_refresh_in_progress = False
            future.add_callback(refresh_done)
            future.add_errback(refresh_done)
            return self.config['request_timeout_ms']

        # If there's any connection establishment underway, wait until it completes. This prevents
        # the client from unnecessarily connecting to additional nodes while a previous connection
        # attempt has not been completed.
        if self._connecting:
            return self.config['reconnect_backoff_ms']

        if self.maybe_connect(node_id, wakeup=wakeup):
            log.debug("Initializing connection to node %s for metadata request", node_id)
            return self.config['reconnect_backoff_ms']

        # connected but can't send more, OR connecting.
        # In either case we just need to wait for a network event
        # to let us know the selected connection might be usable again.
        return float('inf')

    def get_api_versions(self):
        """Return the ApiVersions map, if available.

        Note: A call to check_version must previously have succeeded and returned
        version 0.10.0 or later

        Returns: a dict mapping {api_key : (min_version, max_version)},
            or None if ApiVersion is not supported by the kafka cluster.
        """
        return self._api_versions

    def check_version(self, node_id=None, timeout=2, strict=False):
        """Attempt to guess the version of a Kafka broker.

        Note: It is possible that this method blocks longer than the
            specified timeout. This can happen if the entire cluster
            is down and the client enters a bootstrap backoff sleep.
            This is only possible if node_id is None.

        Returns: version tuple, i.e. (0, 10), (0, 9), (0, 8, 2), ...

        Raises:
            NodeNotReadyError (if node_id is provided)
            NoBrokersAvailable (if node_id is None)
            UnrecognizedBrokerVersion: please file bug if seen!
            AssertionError (if strict=True): please file bug if seen!
        """
        self._lock.acquire()
        end = time.time() + timeout
        while time.time() < end:

            # It is possible that least_loaded_node falls back to bootstrap,
            # which can block for an increasing backoff period
            try_node = node_id or self.least_loaded_node()
            if try_node is None:
                self._lock.release()
                raise Errors.NoBrokersAvailable()
            self._maybe_connect(try_node)
            conn = self._conns[try_node]

            # We will intentionally cause socket failures
            # These should not trigger metadata refresh
            self._refresh_on_disconnects = False
            try:
                remaining = end - time.time()
                version = conn.check_version(timeout=remaining, strict=strict, topics=list(self.config['bootstrap_topics_filter']))
                if version >= (0, 10, 0):
                    # cache the api versions map if it's available (starting
                    # in 0.10 cluster version)
                    self._api_versions = conn.get_api_versions()
                self._lock.release()
                return version
            except Errors.NodeNotReadyError:
                # Only raise to user if this is a node-specific request
                if node_id is not None:
                    self._lock.release()
                    raise
            finally:
                self._refresh_on_disconnects = True

        # Timeout
        else:
            self._lock.release()
            raise Errors.NoBrokersAvailable()
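
    # wakeup()/_clear_wake_fd() below implement the classic self-pipe trick:
    # poll() blocks in self._selector.select() while holding the client lock,
    # so another thread that queues work (e.g. via send()) writes a single
    # byte to _wake_w; that makes _wake_r readable, select() returns
    # immediately, and the byte is drained by _clear_wake_fd().
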
    def wakeup(self):
        with self._wake_lock:
            try:
                self._wake_w.sendall(b'x')
            except socket.timeout:
                log.warning('Timeout to send to wakeup socket!')
                raise Errors.KafkaTimeoutError()
            except socket.error:
                log.warning('Unable to send to wakeup socket!')

    def _clear_wake_fd(self):
        # reading from wake socket should only happen in a single thread
        while True:
            try:
                self._wake_r.recv(1024)
            except socket.error:
                break

    def _maybe_close_oldest_connection(self):
        expired_connection = self._idle_expiry_manager.poll_expired_connection()
        if expired_connection:
            conn_id, ts = expired_connection
            idle_ms = (time.time() - ts) * 1000
            log.info('Closing idle connection %s, last active %d ms ago', conn_id, idle_ms)
            self.close(node_id=conn_id)

    def bootstrap_connected(self):
        """Return True if a bootstrap node is connected"""
        for node_id in self._conns:
            if not self.cluster.is_bootstrap(node_id):
                continue
            if self._conns[node_id].connected():
                return True
        else:
            return False


# OrderedDict requires python2.7+
try:
    from collections import OrderedDict
except ImportError:
    # If we don't have OrderedDict, we'll fall back to dict with O(n) priority reads
    OrderedDict = dict


class IdleConnectionManager(object):
    def __init__(self, connections_max_idle_ms):
        if connections_max_idle_ms > 0:
            self.connections_max_idle = connections_max_idle_ms / 1000
        else:
            self.connections_max_idle = float('inf')
        self.next_idle_close_check_time = None
        self.update_next_idle_close_check_time(time.time())
        self.lru_connections = OrderedDict()

    def update(self, conn_id):
        # order should reflect last-update
        if conn_id in self.lru_connections:
            del self.lru_connections[conn_id]
        self.lru_connections[conn_id] = time.time()

    def remove(self, conn_id):
        if conn_id in self.lru_connections:
            del self.lru_connections[conn_id]

    def is_expired(self, conn_id):
        if conn_id not in self.lru_connections:
            return None
        return time.time() >= self.lru_connections[conn_id] + self.connections_max_idle

    def next_check_ms(self):
        now = time.time()
        if not self.lru_connections:
            return float('inf')
        elif self.next_idle_close_check_time <= now:
            return 0
        else:
            return int((self.next_idle_close_check_time - now) * 1000)

    def update_next_idle_close_check_time(self, ts):
        self.next_idle_close_check_time = ts + self.connections_max_idle

    def poll_expired_connection(self):
        if time.time() < self.next_idle_close_check_time:
            return None

        if not len(self.lru_connections):
            return None

        oldest_conn_id = None
        oldest_ts = None
        if OrderedDict is dict:
            for conn_id, ts in self.lru_connections.items():
                if oldest_conn_id is None or ts < oldest_ts:
                    oldest_conn_id = conn_id
                    oldest_ts = ts
        else:
            (oldest_conn_id, oldest_ts) = next(iter(self.lru_connections.items()))

        self.update_next_idle_close_check_time(oldest_ts)

        if time.time() >= oldest_ts + self.connections_max_idle:
            return (oldest_conn_id, oldest_ts)
        else:
            return None
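
# How IdleConnectionManager is driven (a sketch of the flow already wired up
# in KafkaClient above, not additional API): connection activity calls
# update(), and each _poll() pass calls _maybe_close_oldest_connection():
#
#     mgr = IdleConnectionManager(540000)   # connections_max_idle_ms
#     mgr.update('node-1')                  # record activity for a connection
#     expired = mgr.poll_expired_connection()
#     if expired:
#         conn_id, last_active = expired    # close conn_id as idle
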

class KafkaClientMetrics(object):
    def __init__(self, metrics, metric_group_prefix, conns):
        self.metrics = metrics
        self.metric_group_name = metric_group_prefix + '-metrics'

        self.connection_closed = metrics.sensor('connections-closed')
        self.connection_closed.add(metrics.metric_name(
            'connection-close-rate', self.metric_group_name,
            'Connections closed per second in the window.'), Rate())
        self.connection_created = metrics.sensor('connections-created')
        self.connection_created.add(metrics.metric_name(
            'connection-creation-rate', self.metric_group_name,
            'New connections established per second in the window.'), Rate())

        self.select_time = metrics.sensor('select-time')
        self.select_time.add(metrics.metric_name(
            'select-rate', self.metric_group_name,
            'Number of times the I/O layer checked for new I/O to perform per'
            ' second'), Rate(sampled_stat=Count()))
        self.select_time.add(metrics.metric_name(
            'io-wait-time-ns-avg', self.metric_group_name,
            'The average length of time the I/O thread spent waiting for a'
            ' socket ready for reads or writes in nanoseconds.'), Avg())
        self.select_time.add(metrics.metric_name(
            'io-wait-ratio', self.metric_group_name,
            'The fraction of time the I/O thread spent waiting.'),
            Rate(time_unit=TimeUnit.NANOSECONDS))

        self.io_time = metrics.sensor('io-time')
        self.io_time.add(metrics.metric_name(
            'io-time-ns-avg', self.metric_group_name,
            'The average length of time for I/O per select call in nanoseconds.'),
            Avg())
        self.io_time.add(metrics.metric_name(
            'io-ratio', self.metric_group_name,
            'The fraction of time the I/O thread spent doing I/O'),
            Rate(time_unit=TimeUnit.NANOSECONDS))

        metrics.add_metric(metrics.metric_name(
            'connection-count', self.metric_group_name,
            'The current number of active connections.'), AnonMeasurable(
                lambda config, now: len(conns)))