chroma新增、删除、知识库应用
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

158 lines
6.6 KiB

  1. # coding:utf8
  2. import os, sys
  3. import io
  4. from jsonpath_ng import jsonpath, parse
  5. import uuid
  6. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
  7. cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
  8. par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
  9. sys.path.append(cur_dir)
  10. sys.path.append(par_dir)
  11. import json
  12. from django.http import HttpResponse
  13. from text_analysis.tools import to_kafka
  14. from django.views.decorators.csrf import csrf_exempt
  15. from log_util.set_logger import set_logger
  16. logging = set_logger('logs/results.log')
  17. import traceback
  18. import queue
  19. import requests
  20. from text_analysis.tools.tool import parse_data
  21. from text_analysis.chroma1 import LangChainChroma
  22. import time
  23. from kazoo.client import KazooClient
  24. from kazoo.protocol.states import EventType
  25. import queue
  26. task_queue = queue.PriorityQueue()
  27. stop_dict={}
  28. from text_analysis.read_config import load_config
  29. config=load_config()
  30. @csrf_exempt
  31. def createChroma(request):
  32. if request.method == 'POST':
  33. try:
  34. raw_data = json.loads(request.body)
  35. if "trace" in raw_data.keys() and raw_data["trace"]==True:
  36. task_queue.put((-1, time.time(),raw_data))
  37. else:
  38. task_queue.put((1, time.time(),raw_data))
  39. return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
  40. except:
  41. logging.error(traceback.format_exc())
  42. return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
  43. else:
  44. return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
  45. def upload():
  46. while True:
  47. try:
  48. if task_queue.qsize()>0:
  49. p,t,raw_data = task_queue.get(timeout=1)
  50. logging.info("当前任务队列长度{}".format(task_queue.qsize()+1))
  51. output=raw_data["output"]
  52. res_tmp={key: "" for key in output}
  53. if "id" in res_tmp.keys():
  54. res_tmp["id"]=str(uuid.uuid4())
  55. res_tmp["isLast"]=1
  56. task_id=raw_data["scenes_id"]
  57. task_version=raw_data["version"]
  58. logging.info("任务数据为:{}".format(raw_data))
  59. logging.info("当前version信息为:{}".format(stop_dict))
  60. if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
  61. logging.info("已暂停任务,数据过滤掉")
  62. continue
  63. # chunkSize=parse_data(raw_data,raw_data["input"]["chunkSize"])
  64. if ':$[' not in raw_data["input"]["content"]:
  65. content=raw_data["input"]["content"]
  66. else:
  67. content=parse_data(raw_data,raw_data["input"]["content"])
  68. if ':$[' not in raw_data["input"]["fieldName"]:
  69. fieldName=raw_data["input"]["fieldName"]
  70. else:
  71. fieldName=parse_data(raw_data,raw_data["input"]["fieldName"])
  72. if ':$[' not in raw_data["input"]["dataId"]:
  73. dataId=raw_data["input"]["dataId"]
  74. else:
  75. dataId=parse_data(raw_data,raw_data["input"]["dataId"])
  76. # dataId=raw_data["dataId"]
  77. if content and fieldName and dataId:
  78. vector_db=LangChainChroma(fieldName)
  79. docs=vector_db.text_splitter.split_text(content)
  80. res,db_count=vector_db.add_documents(docs,dataId)
  81. vector_db.db_close()
  82. logging.info('当前数据划分{}个块。数据库{}共有{}个块'.format(len(res), fieldName,db_count))
  83. # res=LC.addChroma(content,fieldName,logging,chunkSize)
  84. res_tmp['resultsID']=res
  85. raw_data["result"] = {"successCode": "", "errorLog": "", "results": ""}
  86. if res:
  87. res_tmp["status"]=1
  88. raw_data["result"]["successCode"] = "1"
  89. raw_data["result"]["status"] = 1
  90. raw_data["result"]["message"] = "成功"
  91. else:
  92. res_tmp["status"]=3
  93. raw_data["result"]["successCode"] = "0"
  94. raw_data["result"]["status"] = 2
  95. raw_data["result"]["message"] = "异常"
  96. else:
  97. res_tmp["status"] = 3
  98. raw_data["result"]["successCode"] = "0"
  99. raw_data["result"]["errorLog"] = "请检查content/fieldName/dataId,要求非空"
  100. raw_data["result"]["status"] = 2
  101. raw_data["result"]["message"] = "请检查content/fieldName/dataId,要求非空"
  102. res_tmp_json = json.dumps(res_tmp, ensure_ascii=False)
  103. raw_data["result"]["results"]=res_tmp_json
  104. logging.info("结果数据为:{}".format(raw_data))
  105. to_kafka.send_kafka(raw_data, logging)
  106. else:
  107. # 暂无任务,进入休眠
  108. time.sleep(10)
  109. except:
  110. raw_data["result"]={}
  111. raw_data["result"]["successCode"] = "0"
  112. raw_data["result"]["errorLog"] = traceback.format_exc()
  113. res_tmp["status"] = 3
  114. raw_data["result"]["status"] = 2
  115. raw_data["result"]["message"] = "异常"
  116. raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
  117. logging.error(traceback.format_exc())
  118. to_kafka.send_kafka(raw_data, logging)
  119. def zk_monitoring():
  120. try:
  121. #线上环境
  122. zk = KazooClient(hosts=config['zookeeper']['zkhost'])
  123. #测试环境
  124. # zk = KazooClient(hosts='172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181')
  125. zk.start()
  126. # 设置监听器
  127. @zk.DataWatch("/analyze")
  128. def watch_node(data, stat, event):
  129. if event is not None and event.type == EventType.CHANGED:
  130. data, stat = zk.get("/analyze")
  131. logging.info("执行删除操作:{}".format(data))
  132. d = json.loads(data)
  133. id = d["scenes_id"]
  134. stop_dict[id] = {}
  135. stop_dict[id]["version"] = d["version"]
  136. stop_dict[id]["operation"] = d["operation"]
  137. # 保持程序运行以监听节点变化
  138. try:
  139. while True:
  140. time.sleep(1)
  141. except:
  142. logging.info("Stopping...")
  143. # 关闭连接
  144. zk.stop()
  145. zk.close()
  146. except:
  147. logging.error(traceback.format_exc())