图片解析应用
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

103 lines
3.3 KiB

  1. # -*- coding: utf-8 -*-
  2. #*****************************************************************************
  3. # Copyright (C) 2006 Jorgen Stenarson. <jorgen.stenarson@bostream.nu>
  4. #
  5. # Distributed under the terms of the BSD License. The full license is in
  6. # the file COPYING, distributed as part of this software.
  7. #*****************************************************************************
  8. from __future__ import print_function, unicode_literals, absolute_import
  9. import re, operator
  10. def str_find_all(str, ch):
  11. result = []
  12. index = 0
  13. while index >= 0:
  14. index = str.find(ch, index)
  15. if index >= 0:
  16. result.append(index)
  17. index += 1
  18. return result
  19. word_pattern = re.compile("(x*)")
  20. def markwords(str, iswordfun):
  21. markers = {True : "x", False : "o"}
  22. return "".join([markers[iswordfun(ch)] for ch in str])
  23. def split_words(str, iswordfun):
  24. return [x for x in word_pattern.split(markwords(str,iswordfun)) if x != ""]
  25. def mark_start_segment(str, is_segment):
  26. def mark_start(s):
  27. if s[0:1] == "x":
  28. return "s" + s[1:]
  29. else:
  30. return s
  31. return "".join(map(mark_start, split_words(str, is_segment)))
  32. def mark_end_segment(str, is_segment):
  33. def mark_start(s):
  34. if s[0:1] == "x":
  35. return s[:-1] + "s"
  36. else:
  37. return s
  38. return "".join(map(mark_start, split_words(str, is_segment)))
  39. def mark_start_segment_index(str, is_segment):
  40. return str_find_all(mark_start_segment(str, is_segment), "s")
  41. def mark_end_segment_index(str, is_segment):
  42. return [x + 1 for x in str_find_all(mark_end_segment(str, is_segment), "s")]
  43. ################ Following are used in lineobj ###########################
  44. def is_word_token(str):
  45. return not is_non_word_token(str)
  46. def is_non_word_token(str):
  47. if len(str) != 1 or str in " \t\n":
  48. return True
  49. else:
  50. return False
  51. def next_start_segment(str, is_segment):
  52. str = "".join(str)
  53. result = []
  54. for start in mark_start_segment_index(str, is_segment):
  55. result[len(result):start] = [start for x in range(start - len(result))]
  56. result[len(result):len(str)] = [len(str) for x in range(len(str) - len(result) + 1)]
  57. return result
  58. def next_end_segment(str, is_segment):
  59. str = "".join(str)
  60. result = []
  61. for start in mark_end_segment_index(str, is_segment):
  62. result[len(result):start] = [start for x in range(start - len(result))]
  63. result[len(result):len(str)] = [len(str) for x in range(len(str) - len(result) + 1)]
  64. return result
  65. def prev_start_segment(str, is_segment):
  66. str = "".join(str)
  67. result = []
  68. prev = 0
  69. for start in mark_start_segment_index(str, is_segment):
  70. result[len(result):start+1] = [prev for x in range(start - len(result) + 1)]
  71. prev=start
  72. result[len(result):len(str)] = [prev for x in range(len(str) - len(result) + 1)]
  73. return result
  74. def prev_end_segment(str, is_segment):
  75. str = "".join(str)
  76. result = []
  77. prev = 0
  78. for start in mark_end_segment_index(str, is_segment):
  79. result[len(result):start + 1] = [prev for x in range(start - len(result) + 1)]
  80. prev=start
  81. result[len(result):len(str)] = [len(str) for x in range(len(str) - len(result) + 1)]
  82. return result