m2m模型翻译
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

449 lines
16 KiB

6 months ago
  1. # Human friendly input/output in Python.
  2. #
  3. # Author: Peter Odding <peter@peterodding.com>
  4. # Last Change: December 1, 2020
  5. # URL: https://humanfriendly.readthedocs.io
  6. """
  7. Simple text manipulation functions.
  8. The :mod:`~humanfriendly.text` module contains simple functions to manipulate text:
  9. - The :func:`concatenate()` and :func:`pluralize()` functions make it easy to
  10. generate human friendly output.
  11. - The :func:`format()`, :func:`compact()` and :func:`dedent()` functions
  12. provide a clean and simple to use syntax for composing large text fragments
  13. with interpolated variables.
  14. - The :func:`tokenize()` function parses simple user input.
  15. """
  16. # Standard library modules.
  17. import numbers
  18. import random
  19. import re
  20. import string
  21. import textwrap
# Public identifiers that require documentation (this tuple also defines
# what ``from humanfriendly.text import *`` exports).
__all__ = (
    'compact',
    'compact_empty_lines',
    'concatenate',
    'dedent',
    'format',
    'generate_slug',
    'is_empty_line',
    'join_lines',
    'pluralize',
    'pluralize_raw',
    'random_string',
    'split',
    'split_paragraphs',
    'tokenize',
    'trim_empty_lines',
)
  40. def compact(text, *args, **kw):
  41. '''
  42. Compact whitespace in a string.
  43. Trims leading and trailing whitespace, replaces runs of whitespace
  44. characters with a single space and interpolates any arguments using
  45. :func:`format()`.
  46. :param text: The text to compact (a string).
  47. :param args: Any positional arguments are interpolated using :func:`format()`.
  48. :param kw: Any keyword arguments are interpolated using :func:`format()`.
  49. :returns: The compacted text (a string).
  50. Here's an example of how I like to use the :func:`compact()` function, this
  51. is an example from a random unrelated project I'm working on at the moment::
  52. raise PortDiscoveryError(compact("""
  53. Failed to discover port(s) that Apache is listening on!
  54. Maybe I'm parsing the wrong configuration file? ({filename})
  55. """, filename=self.ports_config))
  56. The combination of :func:`compact()` and Python's multi line strings allows
  57. me to write long text fragments with interpolated variables that are easy
  58. to write, easy to read and work well with Python's whitespace
  59. sensitivity.
  60. '''
  61. non_whitespace_tokens = text.split()
  62. compacted_text = ' '.join(non_whitespace_tokens)
  63. return format(compacted_text, *args, **kw)
  64. def compact_empty_lines(text):
  65. """
  66. Replace repeating empty lines with a single empty line (similar to ``cat -s``).
  67. :param text: The text in which to compact empty lines (a string).
  68. :returns: The text with empty lines compacted (a string).
  69. """
  70. i = 0
  71. lines = text.splitlines(True)
  72. while i < len(lines):
  73. if i > 0 and is_empty_line(lines[i - 1]) and is_empty_line(lines[i]):
  74. lines.pop(i)
  75. else:
  76. i += 1
  77. return ''.join(lines)
  78. def concatenate(items, conjunction='and', serial_comma=False):
  79. """
  80. Concatenate a list of items in a human friendly way.
  81. :param items:
  82. A sequence of strings.
  83. :param conjunction:
  84. The word to use before the last item (a string, defaults to "and").
  85. :param serial_comma:
  86. :data:`True` to use a `serial comma`_, :data:`False` otherwise
  87. (defaults to :data:`False`).
  88. :returns:
  89. A single string.
  90. >>> from humanfriendly.text import concatenate
  91. >>> concatenate(["eggs", "milk", "bread"])
  92. 'eggs, milk and bread'
  93. .. _serial comma: https://en.wikipedia.org/wiki/Serial_comma
  94. """
  95. items = list(items)
  96. if len(items) > 1:
  97. final_item = items.pop()
  98. formatted = ', '.join(items)
  99. if serial_comma:
  100. formatted += ','
  101. return ' '.join([formatted, conjunction, final_item])
  102. elif items:
  103. return items[0]
  104. else:
  105. return ''
  106. def dedent(text, *args, **kw):
  107. """
  108. Dedent a string (remove common leading whitespace from all lines).
  109. Removes common leading whitespace from all lines in the string using
  110. :func:`textwrap.dedent()`, removes leading and trailing empty lines using
  111. :func:`trim_empty_lines()` and interpolates any arguments using
  112. :func:`format()`.
  113. :param text: The text to dedent (a string).
  114. :param args: Any positional arguments are interpolated using :func:`format()`.
  115. :param kw: Any keyword arguments are interpolated using :func:`format()`.
  116. :returns: The dedented text (a string).
  117. The :func:`compact()` function's documentation contains an example of how I
  118. like to use the :func:`compact()` and :func:`dedent()` functions. The main
  119. difference is that I use :func:`compact()` for text that will be presented
  120. to the user (where whitespace is not so significant) and :func:`dedent()`
  121. for data file and code generation tasks (where newlines and indentation are
  122. very significant).
  123. """
  124. dedented_text = textwrap.dedent(text)
  125. trimmed_text = trim_empty_lines(dedented_text)
  126. return format(trimmed_text, *args, **kw)
  127. def format(text, *args, **kw):
  128. """
  129. Format a string using the string formatting operator and/or :meth:`str.format()`.
  130. :param text: The text to format (a string).
  131. :param args: Any positional arguments are interpolated into the text using
  132. the string formatting operator (``%``). If no positional
  133. arguments are given no interpolation is done.
  134. :param kw: Any keyword arguments are interpolated into the text using the
  135. :meth:`str.format()` function. If no keyword arguments are given
  136. no interpolation is done.
  137. :returns: The text with any positional and/or keyword arguments
  138. interpolated (a string).
  139. The implementation of this function is so trivial that it seems silly to
  140. even bother writing and documenting it. Justifying this requires some
  141. context :-).
  142. **Why format() instead of the string formatting operator?**
  143. For really simple string interpolation Python's string formatting operator
  144. is ideal, but it does have some strange quirks:
  145. - When you switch from interpolating a single value to interpolating
  146. multiple values you have to wrap them in tuple syntax. Because
  147. :func:`format()` takes a `variable number of arguments`_ it always
  148. receives a tuple (which saves me a context switch :-). Here's an
  149. example:
  150. >>> from humanfriendly.text import format
  151. >>> # The string formatting operator.
  152. >>> print('the magic number is %s' % 42)
  153. the magic number is 42
  154. >>> print('the magic numbers are %s and %s' % (12, 42))
  155. the magic numbers are 12 and 42
  156. >>> # The format() function.
  157. >>> print(format('the magic number is %s', 42))
  158. the magic number is 42
  159. >>> print(format('the magic numbers are %s and %s', 12, 42))
  160. the magic numbers are 12 and 42
  161. - When you interpolate a single value and someone accidentally passes in a
  162. tuple your code raises a :exc:`~exceptions.TypeError`. Because
  163. :func:`format()` takes a `variable number of arguments`_ it always
  164. receives a tuple so this can never happen. Here's an example:
  165. >>> # How expecting to interpolate a single value can fail.
  166. >>> value = (12, 42)
  167. >>> print('the magic value is %s' % value)
  168. Traceback (most recent call last):
  169. File "<stdin>", line 1, in <module>
  170. TypeError: not all arguments converted during string formatting
  171. >>> # The following line works as intended, no surprises here!
  172. >>> print(format('the magic value is %s', value))
  173. the magic value is (12, 42)
  174. **Why format() instead of the str.format() method?**
  175. When you're doing complex string interpolation the :meth:`str.format()`
  176. function results in more readable code, however I frequently find myself
  177. adding parentheses to force evaluation order. The :func:`format()` function
  178. avoids this because of the relative priority between the comma and dot
  179. operators. Here's an example:
  180. >>> "{adjective} example" + " " + "(can't think of anything less {adjective})".format(adjective='silly')
  181. "{adjective} example (can't think of anything less silly)"
  182. >>> ("{adjective} example" + " " + "(can't think of anything less {adjective})").format(adjective='silly')
  183. "silly example (can't think of anything less silly)"
  184. >>> format("{adjective} example" + " " + "(can't think of anything less {adjective})", adjective='silly')
  185. "silly example (can't think of anything less silly)"
  186. The :func:`compact()` and :func:`dedent()` functions are wrappers that
  187. combine :func:`format()` with whitespace manipulation to make it easy to
  188. write nice to read Python code.
  189. .. _variable number of arguments: https://docs.python.org/2/tutorial/controlflow.html#arbitrary-argument-lists
  190. """
  191. if args:
  192. text %= args
  193. if kw:
  194. text = text.format(**kw)
  195. return text
  196. def generate_slug(text, delimiter="-"):
  197. """
  198. Convert text to a normalized "slug" without whitespace.
  199. :param text: The original text, for example ``Some Random Text!``.
  200. :param delimiter: The delimiter used to separate words
  201. (defaults to the ``-`` character).
  202. :returns: The slug text, for example ``some-random-text``.
  203. :raises: :exc:`~exceptions.ValueError` when the provided
  204. text is nonempty but results in an empty slug.
  205. """
  206. slug = text.lower()
  207. escaped = delimiter.replace("\\", "\\\\")
  208. slug = re.sub("[^a-z0-9]+", escaped, slug)
  209. slug = slug.strip(delimiter)
  210. if text and not slug:
  211. msg = "The provided text %r results in an empty slug!"
  212. raise ValueError(format(msg, text))
  213. return slug
  214. def is_empty_line(text):
  215. """
  216. Check if a text is empty or contains only whitespace.
  217. :param text: The text to check for "emptiness" (a string).
  218. :returns: :data:`True` if the text is empty or contains only whitespace,
  219. :data:`False` otherwise.
  220. """
  221. return len(text) == 0 or text.isspace()
  222. def join_lines(text):
  223. """
  224. Remove "hard wrapping" from the paragraphs in a string.
  225. :param text: The text to reformat (a string).
  226. :returns: The text without hard wrapping (a string).
  227. This function works by removing line breaks when the last character before
  228. a line break and the first character after the line break are both
  229. non-whitespace characters. This means that common leading indentation will
  230. break :func:`join_lines()` (in that case you can use :func:`dedent()`
  231. before calling :func:`join_lines()`).
  232. """
  233. return re.sub(r'(\S)\n(\S)', r'\1 \2', text)
  234. def pluralize(count, singular, plural=None):
  235. """
  236. Combine a count with the singular or plural form of a word.
  237. :param count: The count (a number).
  238. :param singular: The singular form of the word (a string).
  239. :param plural: The plural form of the word (a string or :data:`None`).
  240. :returns: The count and singular or plural word concatenated (a string).
  241. See :func:`pluralize_raw()` for the logic underneath :func:`pluralize()`.
  242. """
  243. return '%s %s' % (count, pluralize_raw(count, singular, plural))
  244. def pluralize_raw(count, singular, plural=None):
  245. """
  246. Select the singular or plural form of a word based on a count.
  247. :param count: The count (a number).
  248. :param singular: The singular form of the word (a string).
  249. :param plural: The plural form of the word (a string or :data:`None`).
  250. :returns: The singular or plural form of the word (a string).
  251. When the given count is exactly 1.0 the singular form of the word is
  252. selected, in all other cases the plural form of the word is selected.
  253. If the plural form of the word is not provided it is obtained by
  254. concatenating the singular form of the word with the letter "s". Of course
  255. this will not always be correct, which is why you have the option to
  256. specify both forms.
  257. """
  258. if not plural:
  259. plural = singular + 's'
  260. return singular if float(count) == 1.0 else plural
  261. def random_string(length=(25, 100), characters=string.ascii_letters):
  262. """random_string(length=(25, 100), characters=string.ascii_letters)
  263. Generate a random string.
  264. :param length: The length of the string to be generated (a number or a
  265. tuple with two numbers). If this is a tuple then a random
  266. number between the two numbers given in the tuple is used.
  267. :param characters: The characters to be used (a string, defaults
  268. to :data:`string.ascii_letters`).
  269. :returns: A random string.
  270. The :func:`random_string()` function is very useful in test suites; by the
  271. time I included it in :mod:`humanfriendly.text` I had already included
  272. variants of this function in seven different test suites :-).
  273. """
  274. if not isinstance(length, numbers.Number):
  275. length = random.randint(length[0], length[1])
  276. return ''.join(random.choice(characters) for _ in range(length))
  277. def split(text, delimiter=','):
  278. """
  279. Split a comma-separated list of strings.
  280. :param text: The text to split (a string).
  281. :param delimiter: The delimiter to split on (a string).
  282. :returns: A list of zero or more nonempty strings.
  283. Here's the default behavior of Python's built in :meth:`str.split()`
  284. function:
  285. >>> 'foo,bar, baz,'.split(',')
  286. ['foo', 'bar', ' baz', '']
  287. In contrast here's the default behavior of the :func:`split()` function:
  288. >>> from humanfriendly.text import split
  289. >>> split('foo,bar, baz,')
  290. ['foo', 'bar', 'baz']
  291. Here is an example that parses a nested data structure (a mapping of
  292. logging level names to one or more styles per level) that's encoded in a
  293. string so it can be set as an environment variable:
  294. >>> from pprint import pprint
  295. >>> encoded_data = 'debug=green;warning=yellow;error=red;critical=red,bold'
  296. >>> parsed_data = dict((k, split(v, ',')) for k, v in (split(kv, '=') for kv in split(encoded_data, ';')))
  297. >>> pprint(parsed_data)
  298. {'debug': ['green'],
  299. 'warning': ['yellow'],
  300. 'error': ['red'],
  301. 'critical': ['red', 'bold']}
  302. """
  303. return [token.strip() for token in text.split(delimiter) if token and not token.isspace()]
  304. def split_paragraphs(text):
  305. """
  306. Split a string into paragraphs (one or more lines delimited by an empty line).
  307. :param text: The text to split into paragraphs (a string).
  308. :returns: A list of strings.
  309. """
  310. paragraphs = []
  311. for chunk in text.split('\n\n'):
  312. chunk = trim_empty_lines(chunk)
  313. if chunk and not chunk.isspace():
  314. paragraphs.append(chunk)
  315. return paragraphs
  316. def tokenize(text):
  317. """
  318. Tokenize a text into numbers and strings.
  319. :param text: The text to tokenize (a string).
  320. :returns: A list of strings and/or numbers.
  321. This function is used to implement robust tokenization of user input in
  322. functions like :func:`.parse_size()` and :func:`.parse_timespan()`. It
  323. automatically coerces integer and floating point numbers, ignores
  324. whitespace and knows how to separate numbers from strings even without
  325. whitespace. Some examples to make this more concrete:
  326. >>> from humanfriendly.text import tokenize
  327. >>> tokenize('42')
  328. [42]
  329. >>> tokenize('42MB')
  330. [42, 'MB']
  331. >>> tokenize('42.5MB')
  332. [42.5, 'MB']
  333. >>> tokenize('42.5 MB')
  334. [42.5, 'MB']
  335. """
  336. tokenized_input = []
  337. for token in re.split(r'(\d+(?:\.\d+)?)', text):
  338. token = token.strip()
  339. if re.match(r'\d+\.\d+', token):
  340. tokenized_input.append(float(token))
  341. elif token.isdigit():
  342. tokenized_input.append(int(token))
  343. elif token:
  344. tokenized_input.append(token)
  345. return tokenized_input
  346. def trim_empty_lines(text):
  347. """
  348. Trim leading and trailing empty lines from the given text.
  349. :param text: The text to trim (a string).
  350. :returns: The trimmed text (a string).
  351. """
  352. lines = text.splitlines(True)
  353. while lines and is_empty_line(lines[0]):
  354. lines.pop(0)
  355. while lines and is_empty_line(lines[-1]):
  356. lines.pop(-1)
  357. return ''.join(lines)