Language recognition application

master
maojian 6 months ago
commit
859a791e3a
  1. 26
      config.ini
  2. BIN
      log_util/__pycache__/set_logger.cpython-36.pyc
  3. BIN
      log_util/__pycache__/set_logger.cpython-37.pyc
  4. BIN
      log_util/__pycache__/set_logger.cpython-38.pyc
  5. 33
      log_util/set_logger.py
  6. 0
      logs/results.log
  7. 22
      manage.py
  8. 34
      src.py
  9. 1
      start.sh
  10. 1
      stop_uwsgi.sh
  11. 36
      test.py
  12. 0
      text_analysis/__init__.py
  13. BIN
      text_analysis/__pycache__/__init__.cpython-36.pyc
  14. BIN
      text_analysis/__pycache__/__init__.cpython-37.pyc
  15. BIN
      text_analysis/__pycache__/__init__.cpython-38.pyc
  16. BIN
      text_analysis/__pycache__/read_config.cpython-38.pyc
  17. BIN
      text_analysis/__pycache__/settings.cpython-36.pyc
  18. BIN
      text_analysis/__pycache__/settings.cpython-37.pyc
  19. BIN
      text_analysis/__pycache__/settings.cpython-38.pyc
  20. BIN
      text_analysis/__pycache__/urls.cpython-36.pyc
  21. BIN
      text_analysis/__pycache__/urls.cpython-37.pyc
  22. BIN
      text_analysis/__pycache__/urls.cpython-38.pyc
  23. BIN
      text_analysis/__pycache__/views.cpython-36.pyc
  24. BIN
      text_analysis/__pycache__/views.cpython-37.pyc
  25. BIN
      text_analysis/__pycache__/views.cpython-38.pyc
  26. BIN
      text_analysis/__pycache__/views.cpython-39.pyc
  27. BIN
      text_analysis/__pycache__/wsgi.cpython-36.pyc
  28. 140
      text_analysis/bak/views.py0831
  29. 151
      text_analysis/bak/views.py0922_1
  30. 184
      text_analysis/bak/views.py0922_2
  31. 189
      text_analysis/bak/views.py1031
  32. 187
      text_analysis/bak/views.py_1109
  33. 208
      text_analysis/bak/views.py_1220
  34. 219
      text_analysis/bak/views.py_20240517
  35. 231
      text_analysis/bak/views.py_20240607
  36. 264
      text_analysis/bak/views.py_20240705
  37. 266
      text_analysis/bak/views.py_20240819
  38. 186
      text_analysis/bak/views.py_old
  39. 6
      text_analysis/eg.py
  40. 10
      text_analysis/read_config.py
  41. 14
      text_analysis/request.py
  42. 148
      text_analysis/settings.py
  43. BIN
      text_analysis/tools/__pycache__/cusException.cpython-36.pyc
  44. BIN
      text_analysis/tools/__pycache__/mysql_helper.cpython-36.pyc
  45. BIN
      text_analysis/tools/__pycache__/process.cpython-36.pyc
  46. BIN
      text_analysis/tools/__pycache__/to_kafka.cpython-36.pyc
  47. BIN
      text_analysis/tools/__pycache__/to_kafka.cpython-37.pyc
  48. BIN
      text_analysis/tools/__pycache__/to_kafka.cpython-38.pyc
  49. BIN
      text_analysis/tools/__pycache__/tool.cpython-36.pyc
  50. BIN
      text_analysis/tools/__pycache__/tool.cpython-37.pyc
  51. BIN
      text_analysis/tools/__pycache__/tool.cpython-38.pyc
  52. BIN
      text_analysis/tools/__pycache__/tools.cpython-36.pyc
  53. 129
      text_analysis/tools/bak/tool.py0822
  54. 25
      text_analysis/tools/cusException.py
  55. 67
      text_analysis/tools/kakfa_util.py
  56. 338
      text_analysis/tools/mysql_helper.py
  57. 51
      text_analysis/tools/process.py
  58. 171
      text_analysis/tools/seleniumTest.py
  59. 25
      text_analysis/tools/to_kafka.py
  60. 132
      text_analysis/tools/tool.py
  61. 13
      text_analysis/urls.py
  62. 268
      text_analysis/views.py
  63. 266
      text_analysis/views.py_20240819
  64. 271
      text_analysis/views_20240903.py
  65. 16
      text_analysis/wsgi.py
  66. 8
      uwsgi.ini
  67. 58
      wsgi.log
  68. 35
      wsgi.py
  69. 30
      wsgi.py_0228

26
config.ini

@ -0,0 +1,26 @@
[zookeeper]
;ZooKeeper hosts
zkhost=node-01:12181,node-02:12181,node-03:12181
;node
node=/analyze
[kafka]
;broker addresses
bootstrap_servers=node-01:19092,node-02:19092,node-03:19092
;topic
topic=produce_analyze
[gofast]
;gofast URL prefix
;url=https://caiji.percent.cn/
url=http://8.152.196.157:8081/
[asr]
;audio upload
mp3_upload=http://voice.pontoaplus.com/apis/file/asr/upload
;audio result retrieval
mp3_getResult=http://voice.pontoaplus.com/apis/file/asr/getResult
;video upload
video_upload=http://voice.pontoaplus.com/apis/media/analysis/upload
;video result retrieval
video_getResult=http://voice.pontoaplus.com/apis/media/analysis/getResult
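
The sections above are plain INI, so they can be read with the standard library. A minimal sketch, assuming text_analysis/read_config.py (listed above, body not shown) wraps configparser in roughly this way:

import configparser

def load_config(path="config.ini"):
    # ';'-prefixed lines such as ';topic' are treated as comments by default
    cfg = configparser.ConfigParser()
    cfg.read(path, encoding="utf-8")
    return cfg

cfg = load_config()
print(cfg["kafka"]["bootstrap_servers"])  # node-01:19092,node-02:19092,node-03:19092
print(cfg["asr"]["video_upload"])         # video upload endpoint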

BIN
log_util/__pycache__/set_logger.cpython-36.pyc

BIN
log_util/__pycache__/set_logger.cpython-37.pyc

BIN
log_util/__pycache__/set_logger.cpython-38.pyc

33
log_util/set_logger.py

@ -0,0 +1,33 @@
#coding:utf8
import logging
import os
import sys
from logging.handlers import TimedRotatingFileHandler
import re
# cur_dir = os.path.dirname( os.path.abspath(__file__)) or os.getcwd()
# sys.path.append(cur_dir + '/log_util')

def set_logger(filename):
    # Create a logger object, keyed by the given name.
    logger = logging.getLogger(filename)
    # log_path = os.path.join(cur_dir, filename)
    # Set the logging level.
    logger.setLevel(logging.INFO)
    # interval: rotation period.
    # when="MIDNIGHT", interval=1 rotates at midnight, producing one file per day.
    # backupCount: number of rotated log files to keep.
    file_handler = TimedRotatingFileHandler(
        filename=filename, when="MIDNIGHT", encoding="utf-8", interval=1, backupCount=3
    )
    # With filename="mylog", this suffix yields files such as mylog.2020-02-25.log.
    file_handler.suffix = "%Y-%m-%d.log"
    # extMatch is a compiled regex used to match the rotated-file suffix.
    # Note: suffix and extMatch must agree, otherwise expired logs are never deleted.
    file_handler.extMatch = re.compile(r"^\d{4}-\d{2}-\d{2}\.log$")
    # Define the log output format.
    file_handler.setFormatter(
        logging.Formatter(
            "[%(asctime)s] [%(process)d] [%(levelname)s] - %(module)s.%(funcName)s (%(filename)s:%(lineno)d) - %(message)s"
        )
    )
    logger.addHandler(file_handler)
    return logger
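
A short usage sketch of the helper above; the path matches the logs/results.log file added in this commit:

from log_util.set_logger import set_logger

logging = set_logger('logs/results.log')  # the same call the views modules make at import time
logging.info("service started")
# Note: logging.getLogger(filename) returns a shared instance, so calling
# set_logger twice with the same name attaches a second handler and
# duplicates every line; the modules below create it once per process.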

0
logs/results.log

22
manage.py

@ -0,0 +1,22 @@
#!/usr/bin/env python
import os
import sys
import threading
from text_analysis.views import upload, getResult
import django

if __name__ == "__main__":
    # Run the upload and result-polling loops as daemon threads
    # alongside the Django management command.
    t = threading.Thread(target=upload, name='upload')
    t.daemon = True
    t.start()
    r = threading.Thread(target=getResult, name='getResult')
    r.daemon = True
    r.start()
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
    django.setup()
    from django.core.management import execute_from_command_line
    execute_from_command_line(sys.argv)
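
For a quick local check, the same entry point can be driven with Django's development server; the port below is an assumption borrowed from stop_uwsgi.sh, which frees 9014 (production uses start.sh to launch uwsgi against wsgi.py):

python manage.py runserver 0.0.0.0:9014

Since both worker threads are daemons, they exit together with the server process.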

34
src.py

@ -0,0 +1,34 @@
#coding:utf8
import requests

def upload():
    url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
    # form-data parameters
    data = {
        'fromLanguage': 'zh'
    }
    # file parameter
    files = {
        'file': open('test.mp4', 'rb')
    }
    response = requests.post(url, data=data, files=files)
    print(response.text)
    # Result: {"code":200,"message":"SUCCESS","data":"3a42ea9594b641c39e40d1497ca29be9"}

def getResults():
    url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
    # parameters
    # 'taskId': '3a42ea9594b641c39e40d1497ca29be9'
    params = {
        'taskId': '4ef21e404b7240acb14bbd5fe63227fc'
    }
    response = requests.get(url, params=params)
    # print the response
    print(response.text)
    # {"code":200,"message":"SUCCESS","data":{"sentences":[{"silence_duration":0,"end_time":5108,"speech_rate":150,"begin_time":1130,"channel_id":0,"emotion_value":"5.0","text":"视频解析、语音识别。"}]...

getResults()
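
The two calls above are asynchronous: upload returns a taskId in "data", and getResult has to be polled until the job finishes. A minimal polling sketch under that assumption (the data.code convention — "1" done, "0" still processing — is taken from the views modules below):

import time
import requests

def transcribe(path, language='zh', interval=3, timeout=600):
    # Upload the media file, then poll getResult until the task completes.
    base = "https://realtime.pdeepmatrix.com/apis/media/analysis"
    with open(path, 'rb') as f:
        task = requests.post(base + "/upload",
                             data={'fromLanguage': language},
                             files={'file': f}).json()["data"]
    deadline = time.time() + timeout
    while time.time() < deadline:
        d = requests.get(base + "/getResult", params={'taskId': task}).json()
        if d.get("code") == 200 and d["data"]["code"] == "1":
            return "".join(s["text"] for s in (d["data"]["sentences"] or []))
        time.sleep(interval)
    raise TimeoutError(task)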

1
start.sh

@ -0,0 +1 @@
../../environment/python3.8/bin/uwsgi --ini uwsgi.ini --file wsgi.py --daemonize wsgi.log

1
stop_uwsgi.sh

@ -0,0 +1 @@
lsof -i:9014 |grep -v 'PID' | awk '{print $2}'| xargs kill -9

36
test.py

@ -0,0 +1,36 @@
#coding=utf8
import sys
import requests
import json
import time
# #url = 'http://0.0.0.0:5033'
# """
# url = 'http://20.0.2.6:5055/classify_event'
# url = 'http://20.0.2.6:5055/is_about_china'
# url = 'http://20.0.2.6:5055/associated_words'
# """
# url = 'http://127.0.0.1:9008/paper'
#
# # url_file ="http://172.18.1.130:9985/group33/default/20230415/09/15/1/“GF-1”影像质量评价及矿区土地利用分类潜力研究_陈明.docx"
# url_file="/opt/Project_kongtianyuan/inputfile/"
# filename = "“GF-1”影像质量评价及矿区土地利用分类潜力研究"
#
# data = {"url":url_file,"filename":filename}
# data_str = json.dumps(data)
#
# r = requests.post(url,data=str(data_str))
# print(r.text)
# # res =json.loads(r.text)
# # print(res)
from datetime import datetime
import os
path = datetime.now().strftime('%Y-%m-%d')
if not os.path.exists(path):
    os.makedirs(path)

0
text_analysis/__init__.py

BIN
text_analysis/__pycache__/__init__.cpython-36.pyc

BIN
text_analysis/__pycache__/__init__.cpython-37.pyc

BIN
text_analysis/__pycache__/__init__.cpython-38.pyc

BIN
text_analysis/__pycache__/read_config.cpython-38.pyc

BIN
text_analysis/__pycache__/settings.cpython-36.pyc

BIN
text_analysis/__pycache__/settings.cpython-37.pyc

BIN
text_analysis/__pycache__/settings.cpython-38.pyc

BIN
text_analysis/__pycache__/urls.cpython-36.pyc

BIN
text_analysis/__pycache__/urls.cpython-37.pyc

BIN
text_analysis/__pycache__/urls.cpython-38.pyc

BIN
text_analysis/__pycache__/views.cpython-36.pyc

BIN
text_analysis/__pycache__/views.cpython-37.pyc

BIN
text_analysis/__pycache__/views.cpython-38.pyc

BIN
text_analysis/__pycache__/views.cpython-39.pyc

BIN
text_analysis/__pycache__/wsgi.cpython-36.pyc

140
text_analysis/bak/views.py0831

@ -0,0 +1,140 @@
#coding:utf8
import os, sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import get_data
import time
from datetime import datetime
import os

# task queue
global task_queue
task_queue = queue.Queue()
# data queue
global data_queue
data_queue = queue.Queue()

@csrf_exempt
def ASR(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                index = raw_data["metadata"]["index"]
                datasource = raw_data["metadata"]["admin"]["datasource"]
                if datasource not in raw_data["data"].keys():
                    logging.info("Data source not found! - {}".format(raw_data))
                    continue
                allFile = raw_data["data"][datasource]
                currentFile = eval(allFile)[index]
                file = currentFile["fileUrl"]
                fileName = currentFile["fileName"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload {}".format(fileName))
                # call the video upload endpoint
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': 'zh'}
                files = {'file': open(path + '/' + fileName, 'rb')}
                response = requests.post(url, data=data, files=files)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed, API response {}".format(d))
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            logging.error(traceback.format_exc())

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                print(raw_data)
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                params = {'taskId': dataKey}
                response = requests.get(url, params=params)
                # print(response.text)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1":
                        for sentence in d["data"]["sentences"]:
                            results += sentence["text"]
                        raw_data["result"]["results"] = results
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "0":
                        # still processing; put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Video not processed yet, requeued to wait {}-{}".format(raw_data, d))
                    else:
                        # processing failed
                        raw_data["result"]["successCode"] = "0"
                        raw_data["result"]["errorLog"] = response.text
                        logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                        to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)
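
For reference, a minimal request the ASR view above accepts, reconstructed from the fields upload() reads (metadata.index, metadata.admin.datasource, and a data entry holding a JSON-encoded file list that is passed through eval()). The host, port, and URL path are assumptions; the route lives in text_analysis/urls.py, which is not shown:

import json
import requests

payload = {
    "metadata": {"index": 0, "admin": {"datasource": "videoFiles"}},
    # the data entry is a string that upload() eval()s into a list of dicts
    "data": {"videoFiles": '[{"fileUrl": "http://example/group1/a.mp4", "fileName": "a.mp4"}]'},
}
r = requests.post("http://127.0.0.1:9014/ASR", data=json.dumps(payload))  # endpoint assumed
print(r.text)  # {"code": 1, "msg": "Request OK!"}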

151
text_analysis/bak/views.py0922_1

@ -0,0 +1,151 @@
#coding:utf8
import os, sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import get_data
import time
from datetime import datetime
import os

# task queue
global task_queue
task_queue = queue.Queue()
# data queue
global data_queue
data_queue = queue.Queue()

@csrf_exempt
def ASR(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                index = raw_data["metadata"]["index"]
                datasource = raw_data["metadata"]["admin"]["datasource"]
                if datasource not in raw_data["data"].keys():
                    logging.info("Data source not found! - {}".format(raw_data))
                    continue
                allFile = raw_data["data"][datasource]
                currentFile = eval(allFile)[index]
                currentFile["content"] = ""
                file = currentFile["fileUrl"]
                if "http" not in file:
                    file = "https://caiji.percent.cn/" + file.lstrip("/")
                fileName = currentFile["fileName"]
                language = raw_data["metadata"]["admin"]["fromLanguage"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload - {}".format(fileName))
                # call the video upload endpoint
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': language}
                f = open(path + '/' + fileName, 'rb')
                files = {'file': f}
                response = requests.post(url, data=data, files=files)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": currentFile, "dataKey": result}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed {}-{}".format(raw_data, d))
                f.close()
                # TODO: delete the video file
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            logging.error(traceback.format_exc())

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                # print(raw_data)
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                params = {'taskId': dataKey}
                response = requests.get(url, params=params)
                # print(response.text)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1":
                        for sentence in d["data"]["sentences"]:
                            results += sentence["text"]
                        raw_data["result"]["results"]["content"] = results
                        raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "0":
                        # still processing; put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Video not processed yet, requeued to wait {}-{}".format(raw_data, d))
                    else:
                        # processing failed
                        raw_data["result"]["successCode"] = "0"
                        raw_data["result"]["errorLog"] = response.text
                        raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
                        logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                        to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
                    logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = ""
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)

184
text_analysis/bak/views.py0922_2

@ -0,0 +1,184 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import get_data
import time
from datetime import datetime
import os

# task queue
global task_queue
task_queue = queue.Queue()
# data queue
global data_queue
data_queue = queue.Queue()

@csrf_exempt
def ASR(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                # index = raw_data["metadata"]["index"]
                # datasource = raw_data["metadata"]["admin"]["datasource"]
                # if datasource not in raw_data["data"].keys():
                #     logging.info("Data source not found! - {}".format(raw_data))
                #     continue
                # allFile = raw_data["data"][datasource]
                # currentFile = eval(allFile)[index]
                url = raw_data["metadata"]["admin"]["fileUrl"]
                if '$.' in url:
                    # resolve the value dynamically via a JsonPath expression
                    datasources = str(url).split(':')
                    # element 0 is the data source, element 1 is the JsonPath expression
                    datasourcestr = raw_data["data"][datasources[0]]
                    # print(datasourcestr)
                    datasource = json.loads(datasourcestr)
                    # build the JsonPath expression object
                    expr = parse(datasources[1])
                    # use the expression to select JSON elements
                    match = [match.value for match in expr.find(datasource)]
                    video_url = match[0]
                fileName = video_url.rsplit('/')[-1]
                if "http" not in video_url:
                    file = "https://caiji.percent.cn/" + video_url.lstrip("/")
                # print(file)
                # name = raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                #     # resolve the value dynamically via a JsonPath expression
                #     datasources = str(name).split(':')
                #     # element 0 is the data source, element 1 is the JsonPath expression
                #     datasourcestr = raw_data["data"][datasources[0]]
                #     datasource = json.loads(datasourcestr)
                #     # build the JsonPath expression object
                #     expr = parse(datasources[1])
                #     # use the expression to select JSON elements
                #     match = [match.value for match in expr.find(datasource)]
                #     fileName = match[0]
                currentFile = {"content": "", "fileName": fileName, "fileUrl": file}
                language = raw_data["metadata"]["admin"]["fromLanguage"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload - {}".format(fileName))
                # call the video upload endpoint
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': language}
                f = open(path + '/' + fileName, 'rb')
                files = {'file': f}
                response = requests.post(url, data=data, files=files)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": currentFile, "dataKey": result}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed {}-{}".format(raw_data, d))
                f.close()
                # TODO: delete the video file
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            logging.error(traceback.format_exc())

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                # print(raw_data)
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                params = {'taskId': dataKey}
                response = requests.get(url, params=params)
                # print(response.text)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1":
                        for sentence in d["data"]["sentences"]:
                            results += sentence["text"]
                        raw_data["result"]["results"]["content"] = results
                        raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "0":
                        # still processing; put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Video not processed yet, requeued to wait {}-{}".format(raw_data, d))
                    else:
                        # processing failed
                        raw_data["result"]["successCode"] = "0"
                        raw_data["result"]["errorLog"] = response.text
                        raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
                        logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                        to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
                    logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = ""
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)
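
A standalone sketch of the datasource:$.expr convention this revision introduces, using jsonpath_ng exactly as upload() does; the data-source name and document shape are illustrative:

import json
from jsonpath_ng import parse

raw_data = {
    "data": {"crawler": '{"files": [{"fileUrl": "/group1/a.mp4"}]}'},   # illustrative
    "metadata": {"admin": {"fileUrl": "crawler:$.files[0].fileUrl"}},
}
url = raw_data["metadata"]["admin"]["fileUrl"]
datasources = str(url).split(':')             # ["crawler", "$.files[0].fileUrl"]
doc = json.loads(raw_data["data"][datasources[0]])
match = [m.value for m in parse(datasources[1]).find(doc)]
print(match[0])                               # /group1/a.mp4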

189
text_analysis/bak/views.py1031

@ -0,0 +1,189 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import get_data
import time
from datetime import datetime
import os

# task queue
global task_queue
task_queue = queue.Queue()
# data queue
global data_queue
data_queue = queue.Queue()

@csrf_exempt
def ASR(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                # index = raw_data["metadata"]["index"]
                # datasource = raw_data["metadata"]["admin"]["datasource"]
                # if datasource not in raw_data["data"].keys():
                #     logging.info("Data source not found! - {}".format(raw_data))
                #     continue
                # allFile = raw_data["data"][datasource]
                # currentFile = eval(allFile)[index]
                url = raw_data["input"]["fileUrl"]
                if '$.' in url:
                    # resolve the value dynamically via a JsonPath expression
                    datasources = str(url).split(':')
                    # element 0 is the data source, element 1 is the JsonPath expression
                    datasourcestr = raw_data["data"][datasources[0]]
                    # print(datasourcestr)
                    datasource = json.loads(datasourcestr)
                    # build the JsonPath expression object
                    expr = parse(datasources[1])
                    # use the expression to select JSON elements
                    match = [match.value for match in expr.find(datasource)]
                    video_url = match[0]
                fileName = video_url.rsplit('/')[-1]
                if "http" not in video_url:
                    file = "https://caiji.percent.cn/" + video_url.lstrip("/")
                # print(file)
                # name = raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                #     # resolve the value dynamically via a JsonPath expression
                #     datasources = str(name).split(':')
                #     # element 0 is the data source, element 1 is the JsonPath expression
                #     datasourcestr = raw_data["data"][datasources[0]]
                #     datasource = json.loads(datasourcestr)
                #     # build the JsonPath expression object
                #     expr = parse(datasources[1])
                #     # use the expression to select JSON elements
                #     match = [match.value for match in expr.find(datasource)]
                #     fileName = match[0]
                currentFile = {"fileName": fileName, "fileUrl": file}
                language = raw_data["input"]["fromLanguage"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload - {}".format(fileName))
                # call the video upload endpoint
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': language}
                f = open(path + '/' + fileName, 'rb')
                files = {'file': f}
                response = requests.post(url, data=data, files=files)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result, "file": currentFile}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed {}-{}".format(raw_data, d))
                f.close()
                # TODO: delete the video file
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            logging.error(traceback.format_exc())

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                # print(raw_data)
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                params = {'taskId': dataKey}
                response = requests.get(url, params=params)
                # print(response.text)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1":
                        for sentence in d["data"]["sentences"]:
                            results += sentence["text"]
                        if "content" in res_tmp.keys():
                            res_tmp["content"] = results
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "0":
                        # still processing; put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Video not processed yet, requeued to wait {}-{}".format(raw_data, d))
                    else:
                        # processing failed
                        raw_data["result"]["successCode"] = "0"
                        raw_data["result"]["errorLog"] = response.text
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                        to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                    logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = ""
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)
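
The output-template mechanic added in this revision, in isolation: getResult() blanks every key the caller listed under "output", then fills "id" and "content" when those keys are present. A small sketch with assumed keys:

import uuid

output = {"id": "", "content": "", "title": ""}   # keys requested by the caller (assumed)
res_tmp = {key: "" for key in output}             # blank template with the same keys
if "id" in res_tmp.keys():
    res_tmp["id"] = str(uuid.uuid4())             # fresh id per result
if "content" in res_tmp.keys():
    res_tmp["content"] = "transcribed text..."    # filled from the ASR sentences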

187
text_analysis/bak/views.py_1109

@ -0,0 +1,187 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os

# task queue
global task_queue
task_queue = queue.Queue()
# data queue
global data_queue
data_queue = queue.Queue()

@csrf_exempt
def ASRNew(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                # index = raw_data["metadata"]["index"]
                # datasource = raw_data["metadata"]["admin"]["datasource"]
                # if datasource not in raw_data["data"].keys():
                #     logging.info("Data source not found! - {}".format(raw_data))
                #     continue
                # allFile = raw_data["data"][datasource]
                # currentFile = eval(allFile)[index]
                url = raw_data["input"]["fileUrl"]
                if "json" in url:
                    parm = url.split("#")
                    data1 = parse_data(raw_data, parm[0])
                    data1_json = json.loads(data1)
                    expr = parse(parm[2])
                    match = [match.value for match in expr.find(data1_json)]
                    video_url = match[0]
                else:
                    video_url = parse_data(raw_data, url)
                fileName = video_url.rsplit('/')[-1]
                if "http" not in video_url:
                    file = "https://caiji.percent.cn/" + video_url.lstrip("/")
                else:
                    file = video_url
                # name = raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                #     # resolve the value dynamically via a JsonPath expression
                #     datasources = str(name).split(':')
                #     # element 0 is the data source, element 1 is the JsonPath expression
                #     datasourcestr = raw_data["data"][datasources[0]]
                #     datasource = json.loads(datasourcestr)
                #     # build the JsonPath expression object
                #     expr = parse(datasources[1])
                #     # use the expression to select JSON elements
                #     match = [match.value for match in expr.find(datasource)]
                #     fileName = match[0]
                currentFile = {"fileName": fileName, "fileUrl": file}
                language = raw_data["input"]["fromLanguage"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload - {}".format(fileName))
                # call the video upload endpoint
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': language}
                f = open(path + '/' + fileName, 'rb')
                files = {'file': f}
                response = requests.post(url, data=data, files=files)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result, "file": currentFile}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed {}-{}".format(raw_data, d))
                f.close()
                # TODO: delete the video file
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            logging.error(traceback.format_exc())

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                # print(raw_data)
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                params = {'taskId': dataKey}
                response = requests.get(url, params=params)
                # print(response.text)
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1":
                        for sentence in d["data"]["sentences"]:
                            results += sentence["text"]
                        if "content" in res_tmp.keys():
                            res_tmp["content"] = results
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "0":
                        # still processing; put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Video not processed yet, requeued to wait {}-{}".format(raw_data, d))
                    else:
                        # processing failed
                        raw_data["result"]["successCode"] = "0"
                        raw_data["result"]["errorLog"] = response.text
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                        to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                    logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = ""
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)
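
The new fileUrl convention splits on "#": part 0 locates a JSON string via parse_data (defined in text_analysis/tools/tool.py, not shown here) and part 2 is a JsonPath applied inside it, so the shape is presumably source#json#expr. A hedged sketch of just the split-and-extract step, with parse_data stubbed out:

import json
from jsonpath_ng import parse

def parse_data(raw_data, ref):
    # Stub: the real helper lives in text_analysis/tools/tool.py (not shown).
    # Here it simply resolves "source" to raw_data["data"]["source"].
    return raw_data["data"][ref]

raw_data = {"data": {"source": '{"video": {"url": "/group1/b.mp4"}}'}}
url = "source#json#$.video.url"               # assumed shape: source#json#<JsonPath>
parm = url.split("#")
doc = json.loads(parse_data(raw_data, parm[0]))
match = [m.value for m in parse(parm[2]).find(doc)]
print(match[0])                               # /group1/b.mp4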

208
text_analysis/bak/views.py_1220

@ -0,0 +1,208 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os

# task queue
global task_queue
task_queue = queue.Queue()
# data queue
global data_queue
data_queue = queue.Queue()

@csrf_exempt
def ASRNew(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # index = raw_data["metadata"]["index"]
                # datasource = raw_data["metadata"]["admin"]["datasource"]
                # if datasource not in raw_data["data"].keys():
                #     logging.info("Data source not found! - {}".format(raw_data))
                #     continue
                # allFile = raw_data["data"][datasource]
                # currentFile = eval(allFile)[index]
                logging.info("Task data: {}".format(raw_data))
                url = raw_data["input"]["fileUrl"]
                if "json" in url:
                    parm = url.split("#")
                    data1 = parse_data(raw_data, parm[0])
                    data1_json = json.loads(data1)
                    expr = parse(parm[2])
                    match = [match.value for match in expr.find(data1_json)]
                    video_url = match[0]
                else:
                    video_url = parse_data(raw_data, url)
                fileName = video_url.rsplit('/')[-1]
                if "http" not in video_url:
                    file = "https://caiji.percent.cn/" + video_url.lstrip("/")
                else:
                    file = video_url
                # name = raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                #     # resolve the value dynamically via a JsonPath expression
                #     datasources = str(name).split(':')
                #     # element 0 is the data source, element 1 is the JsonPath expression
                #     datasourcestr = raw_data["data"][datasources[0]]
                #     datasource = json.loads(datasourcestr)
                #     # build the JsonPath expression object
                #     expr = parse(datasources[1])
                #     # use the expression to select JSON elements
                #     match = [match.value for match in expr.find(datasource)]
                #     fileName = match[0]
                currentFile = {"fileName": fileName, "fileUrl": file}
                language = raw_data["input"]["fromLanguage"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload - {}".format(fileName))
                # call the video upload endpoint
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': language}
                f = open(path + '/' + fileName, 'rb')
                files = {'file': f}
                response = requests.post(url, data=data, files=files)
                logging.info("Upload API response: {}-{}".format(response, response.text))
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result, "file": currentFile}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed {}-{}".format(raw_data, d))
                f.close()
                # TODO: delete the video file
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"] = {}
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                logging.info("Task data: {}".format(raw_data))
                # print(raw_data)
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                params = {'taskId': dataKey}
                response = requests.get(url, params=params)
                logging.info("ASR site response: {}-{}".format(response, response.text))
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1" and d["data"]["sentences"]:
                        for sentence in d["data"]["sentences"]:
                            if results:
                                results += ' ' + sentence["text"]
                            else:
                                results = sentence["text"]
                        if "content" in res_tmp.keys():
                            res_tmp["content"] = results
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "1" and not d["data"]["sentences"]:
                        results = ""
                        if "content" in res_tmp.keys():
                            res_tmp["content"] = results
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "0":
                        # still processing; put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Video not processed yet, requeued to wait {}-{}".format(raw_data, d))
                    else:
                        # processing failed
                        raw_data["result"]["successCode"] = "0"
                        raw_data["result"]["errorLog"] = response.text
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                        to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                    logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)

219
text_analysis/bak/views.py_20240517

@ -0,0 +1,219 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os

# task queue
global task_queue
task_queue = queue.Queue()
# data queue
global data_queue
data_queue = queue.Queue()

@csrf_exempt
def ASRNew(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # index = raw_data["metadata"]["index"]
                # datasource = raw_data["metadata"]["admin"]["datasource"]
                # if datasource not in raw_data["data"].keys():
                #     logging.info("Data source not found! - {}".format(raw_data))
                #     continue
                # allFile = raw_data["data"][datasource]
                # currentFile = eval(allFile)[index]
                logging.info("Task data: {}".format(raw_data))
                url = raw_data["input"]["fileUrl"]
                if "json" in url:
                    parm = url.split("#")
                    data1 = parse_data(raw_data, parm[0])
                    data1_json = json.loads(data1)
                    expr = parse(parm[2])
                    match = [match.value for match in expr.find(data1_json)]
                    video_url = match[0]
                else:
                    video_url = parse_data(raw_data, url)
                fileName = video_url.rsplit('/')[-1]
                if "http" not in video_url:
                    file = "https://caiji.percent.cn/" + video_url.lstrip("/")
                else:
                    file = video_url
                # name = raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                #     # resolve the value dynamically via a JsonPath expression
                #     datasources = str(name).split(':')
                #     # element 0 is the data source, element 1 is the JsonPath expression
                #     datasourcestr = raw_data["data"][datasources[0]]
                #     datasource = json.loads(datasourcestr)
                #     # build the JsonPath expression object
                #     expr = parse(datasources[1])
                #     # use the expression to select JSON elements
                #     match = [match.value for match in expr.find(datasource)]
                #     fileName = match[0]
                currentFile = {"fileName": fileName, "fileUrl": file}
                language = raw_data["input"]["fromLanguage"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload - {}".format(fileName))
                # call the upload endpoint
                # video=1 for video, 0 for audio.
                video = 1
                if fileName[-3:] == "m4a" or fileName[-3:] == "mp3" or fileName[-3:] == "wav":
                    url = "https://realtime.pdeepmatrix.com/apis/file/asr/upload"
                    video = 0
                else:
                    url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': language}
                f = open(path + '/' + fileName, 'rb')
                files = {'file': f}
                response = requests.post(url, data=data, files=files, verify=False)
                logging.info("Upload API response: {}-{}".format(response, response.text))
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result, "video": video, "file": currentFile}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed {}-{}".format(raw_data, d))
                f.close()
                # TODO: delete the video file
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"] = {}
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                logging.info("Task data: {}".format(raw_data))
                # print(raw_data)
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                params = {'taskId': dataKey}
                language = raw_data["input"]["fromLanguage"]
                data = {'fromLanguage': language, 'taskId': dataKey}
                if raw_data["result"]["video"] == 1:
                    url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                    response = requests.get(url, params=params, verify=False)
                else:
                    url = "https://realtime.pdeepmatrix.com/apis/file/asr/getResult"
                    response = requests.post(url, data=data, verify=False)
                logging.info("ASR site response: {}-{}".format(response, response.text))
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1" and d["data"]["sentences"]:
                        for sentence in d["data"]["sentences"]:
                            if results:
                                results += ' ' + sentence["text"]
                            else:
                                results = sentence["text"]
                        if "content" in res_tmp.keys():
                            res_tmp["content"] = results
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "1" and not d["data"]["sentences"]:
                        results = ""
                        if "content" in res_tmp.keys():
                            res_tmp["content"] = results
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "0":
                        # still processing; put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Video not processed yet, requeued to wait {}-{}".format(raw_data, d))
                    else:
                        # processing failed
                        raw_data["result"]["successCode"] = "0"
                        raw_data["result"]["errorLog"] = response.text
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                        to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                    logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)
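
The routing rule added in this revision, isolated: files ending in m4a/mp3/wav go to the file/asr endpoints, everything else to media/analysis. A compact near-equivalent using os.path.splitext (this sketch also lowercases the extension, which the original fileName[-3:] comparison does not):

import os

AUDIO_EXTS = {".m4a", ".mp3", ".wav"}

def pick_endpoints(fileName):
    # Returns (upload_url, get_result_url, video_flag) as upload()/getResult() expect.
    base = "https://realtime.pdeepmatrix.com/apis"
    if os.path.splitext(fileName)[1].lower() in AUDIO_EXTS:
        return base + "/file/asr/upload", base + "/file/asr/getResult", 0
    return base + "/media/analysis/upload", base + "/media/analysis/getResult", 1

print(pick_endpoints("clip.mp3"))   # audio route, video=0
print(pick_endpoints("clip.mp4"))   # video route, video=1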

231
text_analysis/bak/views.py_20240607

@ -0,0 +1,231 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os

# task queue
global task_queue
task_queue = queue.Queue()
# data queue
global data_queue
data_queue = queue.Queue()

@csrf_exempt
def ASRNew(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # index = raw_data["metadata"]["index"]
                # datasource = raw_data["metadata"]["admin"]["datasource"]
                # if datasource not in raw_data["data"].keys():
                #     logging.info("Data source not found! - {}".format(raw_data))
                #     continue
                # allFile = raw_data["data"][datasource]
                # currentFile = eval(allFile)[index]
                logging.info("Task data: {}".format(raw_data))
                url = raw_data["input"]["fileUrl"]
                if "json" in url:
                    parm = url.split("#")
                    data1 = parse_data(raw_data, parm[0])
                    data1_json = json.loads(data1)
                    expr = parse(parm[2])
                    match = [match.value for match in expr.find(data1_json)]
                    video_url = match[0]
                else:
                    video_url = parse_data(raw_data, url)
                fileName = video_url.rsplit('/')[-1]
                if "http" not in video_url:
                    file = "https://caiji.percent.cn/" + video_url.lstrip("/")
                else:
                    file = video_url
                # name = raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                #     # resolve the value dynamically via a JsonPath expression
                #     datasources = str(name).split(':')
                #     # element 0 is the data source, element 1 is the JsonPath expression
                #     datasourcestr = raw_data["data"][datasources[0]]
                #     datasource = json.loads(datasourcestr)
                #     # build the JsonPath expression object
                #     expr = parse(datasources[1])
                #     # use the expression to select JSON elements
                #     match = [match.value for match in expr.find(datasource)]
                #     fileName = match[0]
                currentFile = {"fileName": fileName, "fileUrl": file}
                language = raw_data["input"]["fromLanguage"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload - {}".format(fileName))
                # call the upload endpoint
                # video=1 for video, 0 for audio.
                video = 1
                if fileName[-3:] == "m4a" or fileName[-3:] == "mp3" or fileName[-3:] == "wav":
                    url = "https://realtime.pdeepmatrix.com/apis/file/asr/upload"
                    video = 0
                else:
                    url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': language}
                f = open(path + '/' + fileName, 'rb')
                files = {'file': f}
                response = requests.post(url, data=data, files=files, verify=False)
                logging.info("Upload API response: {}-{}".format(response, response.text))
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result, "video": video, "file": currentFile}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed {}-{}".format(raw_data, d))
                f.close()
                # TODO: delete the video file
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"] = {}
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["status"] = 2
            raw_data["result"]["message"] = "Video/audio upload error"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                logging.info("Task data: {}".format(raw_data))
                # print(raw_data)
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                params = {'taskId': dataKey}
                language = raw_data["input"]["fromLanguage"]
                data = {'fromLanguage': language, 'taskId': dataKey}
                if raw_data["result"]["video"] == 1:
                    url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                    response = requests.get(url, params=params, verify=False)
                else:
                    url = "https://realtime.pdeepmatrix.com/apis/file/asr/getResult"
                    response = requests.post(url, data=data, verify=False)
                logging.info("ASR site response: {}-{}".format(response, response.text))
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1" and d["data"]["sentences"]:
                        for sentence in d["data"]["sentences"]:
                            if results:
                                results += ' ' + sentence["text"]
                            else:
                                results = sentence["text"]
                        if "content" in res_tmp.keys():
                            res_tmp["content"] = results
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        raw_data["result"]["status"] = 1
                        raw_data["result"]["message"] = "Success"
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "1" and not d["data"]["sentences"]:
                        results = ""
                        if "content" in res_tmp.keys():
                            res_tmp["content"] = results
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        raw_data["result"]["status"] = 1
                        raw_data["result"]["message"] = "Success"
                        logging.info("Video transcription retrieved successfully {}".format(raw_data))
                        to_kafka.send_kafka(raw_data, logging)
                    elif d["data"]["code"] == "0":
                        # still processing; put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Video not processed yet, requeued to wait {}-{}".format(raw_data, d))
                    else:
                        # processing failed
                        raw_data["result"]["successCode"] = "0"
                        raw_data["result"]["errorLog"] = response.text
                        raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                        raw_data["result"]["status"] = 2
                        raw_data["result"]["message"] = "Video/audio processing error"
                        logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                        to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                    raw_data["result"]["status"] = 2
                    raw_data["result"]["message"] = "Video/audio processing error"
                    logging.info("Failed to retrieve transcription, data {}, API response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["status"] = 2
            raw_data["result"]["message"] = "Video/audio processing error"
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)

264
text_analysis/bak/views.py_20240705

@ -0,0 +1,264 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os
from kazoo.client import KazooClient
from kazoo.protocol.states import EventType

# task queue
# global task_queue
task_queue = queue.Queue()
# data queue
# global data_queue
data_queue = queue.Queue()
stop_dict = {}

@csrf_exempt
def ASRNew(request):
    if request.method == 'POST':
        try:
            raw_data = json.loads(request.body)
            task_queue.put(raw_data)
            return HttpResponse(json.dumps({"code": 1, "msg": "Request OK!"}, ensure_ascii=False))
        except:
            logging.error(traceback.format_exc())
            return HttpResponse(json.dumps({"code": 0, "msg": "Malformed request JSON!"}, ensure_ascii=False))
    else:
        return HttpResponse(json.dumps({"code": 0, "msg": "Wrong request method, use POST"}, ensure_ascii=False))

def upload():
    while True:
        try:
            if task_queue.qsize() > 0:
                logging.info("Task queue length: {}".format(task_queue.qsize()))
                raw_data = task_queue.get()
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                logging.info("Task data: {}".format(raw_data))
                logging.info("Current version info: {}".format(stop_dict))
                task_id = raw_data["scenes_id"]
                task_version = raw_data["version"]
                if task_id in stop_dict.keys() and task_version != stop_dict[task_id]["version"]:
                    logging.info("Task paused, skipping upload. {}".format(raw_data))
                    continue
                url = raw_data["input"]["fileUrl"]
                if "json" in url:
                    parm = url.split("#")
                    data1 = parse_data(raw_data, parm[0])
                    data1_json = json.loads(data1)
                    expr = parse(parm[2])
                    match = [match.value for match in expr.find(data1_json)]
                    video_url = match[0]
                else:
                    video_url = parse_data(raw_data, url)
                fileName = video_url.rsplit('/')[-1]
                if "http" not in video_url:
                    file = "https://caiji.percent.cn/" + video_url.lstrip("/")
                else:
                    file = video_url
                # name = raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                #     # resolve the value dynamically via a JsonPath expression
                #     datasources = str(name).split(':')
                #     # element 0 is the data source, element 1 is the JsonPath expression
                #     datasourcestr = raw_data["data"][datasources[0]]
                #     datasource = json.loads(datasourcestr)
                #     # build the JsonPath expression object
                #     expr = parse(datasources[1])
                #     # use the expression to select JSON elements
                #     match = [match.value for match in expr.find(datasource)]
                #     fileName = match[0]
                currentFile = {"fileName": fileName, "fileUrl": file}
                language = raw_data["input"]["fromLanguage"]
                # fetch the video from gofast
                myfile = requests.get(file)
                starttime = datetime.now().strftime('%Y-%m-%d')
                path = 'inputdata/' + starttime
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(path + '/' + fileName, 'wb') as f:
                    f.write(myfile.content)
                logging.info("Video downloaded from gofast, starting upload - {}".format(fileName))
                # call the upload endpoint
                # video=1 for video, 0 for audio.
                video = 1
                if fileName[-3:] == "m4a" or fileName[-3:] == "mp3" or fileName[-3:] == "wav":
                    url = "https://realtime.pdeepmatrix.com/apis/file/asr/upload"
                    video = 0
                else:
                    url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
                data = {'fromLanguage': language}
                f = open(path + '/' + fileName, 'rb')
                files = {'file': f}
                response = requests.post(url, data=data, files=files, verify=False)
                logging.info("Upload API response: {}-{}".format(response, response.text))
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key for fetching results
                    result = d["data"]
                    raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result, "video": video, "file": currentFile}
                    data_queue.put(raw_data)
                    logging.info("Video uploaded successfully {}".format(raw_data))
                    # to_kafka.send_kafka(raw_data,logging)
                else:
                    logging.info("Video upload failed {}-{}".format(raw_data, d))
                f.close()
                # TODO: delete the video file
            else:
                # no tasks; sleep
                time.sleep(10)
        except:
            raw_data["result"] = {}
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["status"] = 2
            raw_data["result"]["message"] = "Video/audio upload error"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)

def getResult():
    while True:
        # poll for results every 3 seconds
        time.sleep(3)
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                logging.info("Task data: {}".format(raw_data))
                task_id = raw_data["scenes_id"]
                task_version = raw_data["version"]
                if task_id in stop_dict.keys() and task_version != stop_dict[task_id]["version"]:
                    logging.info("Task paused, skipping result fetch. {}".format(raw_data))
                    continue
                output = raw_data["output"]
                res_tmp = {key: "" for key in output}
                if "id" in res_tmp.keys():
                    res_tmp["id"] = str(uuid.uuid4())
                # call the result endpoint with the video key
                dataKey = raw_data["result"]["dataKey"]
                params = {'taskId': dataKey}
                language = raw_data["input"]["fromLanguage"]
                data = {'fromLanguage': language, 'taskId': dataKey}
                if raw_data["result"]["video"] == 1:
                    url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
                    response = requests.get(url, params=params, verify=False)
                else:
                    url = "https://realtime.pdeepmatrix.com/apis/file/asr/getResult"
                    response = requests.post(url, data=data, verify=False)
                logging.info("ASR site response: {}-{}".format(response, response.text))
                d = json.loads(response.text)
                if "code" in d.keys() and d["code"] == 200:
                    results = ""
                    if d["data"]["code"] == "1" and d["data"]["sentences"]:
for sentence in d["data"]["sentences"]:
if results:
results += ' ' + sentence["text"]
else:
results = sentence["text"]
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "1" and not d["data"]["sentences"]:
results =""
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "0":
# 正在解析中,将任务再次放回数据队列
data_queue.put(raw_data)
logging.info("视频未解析完毕,放回队列等待{}-{}".format(raw_data, d))
else:
# 解析失败
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=2
raw_data["result"]["message"]="视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"] = 2
raw_data["result"]["message"] = "视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
# 暂无任务,进入休眠
time.sleep(10)
except:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = traceback.format_exc()
raw_data["result"]["status"] = 2
raw_data["result"]["message"] = "视频/音频解析异常"
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
logging.error(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
def zk_monitoring():
try:
#线上环境
zk = KazooClient(hosts='172.18.1.146:2181,172.18.1.147:2181,172.18.1.148:2181')
#测试环境
# zk = KazooClient(hosts='172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181')
zk.start()
# 设置监听器
@zk.DataWatch("/analyze")
def watch_node(data, stat, event):
if event is not None and event.type == EventType.CHANGED:
data, stat = zk.get("/analyze")
logging.info("执行删除操作:{}".format(data))
d = json.loads(data)
id = d["scenes_id"]
stop_dict[id] = {}
stop_dict[id]["version"] = d["version"]
stop_dict[id]["operation"] = d["operation"]
# 保持程序运行以监听节点变化
try:
while True:
time.sleep(1)
except:
logging.info("Stopping...")
# 关闭连接
zk.stop()
zk.close()
except:
logging.error(traceback.format_exc())
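
zk_monitoring only writes stop_dict; both worker loops enforce it with the same two-line check against scenes_id and version. Pulled out as a pure function for clarity (a sketch; is_stale is not a name used in this repo):

def is_stale(task, stop_dict):
    # A task is stale when its scene has an entry in stop_dict whose
    # version differs from the version the task was submitted with.
    sid = task["scenes_id"]
    return sid in stop_dict and task["version"] != stop_dict[sid]["version"]

stop_dict = {3: {"version": "v2", "operation": "stop"}}
assert is_stale({"scenes_id": 3, "version": "v1"}, stop_dict) is True   # superseded version: filtered out
assert is_stale({"scenes_id": 3, "version": "v2"}, stop_dict) is False  # current version: processed
assert is_stale({"scenes_id": 8, "version": "v1"}, stop_dict) is False  # unknown scene: processed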

266
text_analysis/bak/views.py_20240819

@@ -0,0 +1,266 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os
from kazoo.client import KazooClient
from kazoo.protocol.states import EventType
# 任务队列
# global task_queue
task_queue = queue.Queue()
# 数据队列
# global data_queue
data_queue = queue.Queue()
stop_dict={}
@csrf_exempt
def ASRNew(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
task_queue.put(raw_data)
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def upload():
while True:
try:
if task_queue.qsize() > 0:
logging.info("取任务队列长度{}".format(task_queue.qsize()))
raw_data = task_queue.get()
output=raw_data["output"]
res_tmp={key: "" for key in output}
if "id" in res_tmp.keys():
res_tmp["id"]=str(uuid.uuid4())
logging.info("任务数据为:{}".format(raw_data))
logging.info("当前version信息为:{}".format(stop_dict))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
logging.info("已暂停任务上传,过滤掉。{}".format(raw_data))
continue
url=raw_data["input"]["fileUrl"]
if "json" in url:
parm = url.split("#")
data1 = parse_data(raw_data, parm[0])
data1_json = json.loads(data1)
expr = parse(parm[2])
match = [match.value for match in expr.find(data1_json)]
video_url = match[0]
else:
video_url = parse_data(raw_data, url)
fileName=video_url.rsplit('/')[-1]
if "http" not in video_url:
file = "https://caiji.percent.cn/" + video_url.lstrip("/")
else:
file=video_url
# name=raw_data["metadata"]["admin"]["fileName"]
# if '$.' in name:
# # json.path表达式动态获取value
# datasources = str(name).split(':')
# # 0是数据源,1是JsonPath 表达式
# datasourcestr = raw_data["data"][datasources[0]]
# datasource = json.loads(datasourcestr)
# # 创建 JsonPath 表达式对象
# expr = parse(datasources[1])
# # 使用表达式来选择 JSON 元素
# match = [match.value for match in expr.find(datasource)]
# fileName = match[0]
currentFile={"fileName":fileName,"fileUrl":file}
language = raw_data["input"]["fromLanguage"]
# 从gofast获取视频
myfile = requests.get(file)
starttime = datetime.now().strftime('%Y-%m-%d')
path = 'inputdata/' + starttime
if not os.path.exists(path):
os.makedirs(path)
with open(path + '/' + fileName, 'wb') as f:
f.write(myfile.content)
logging.info("视频从gofast下载完毕,开始上传-{}".format(fileName))
# 访问视频上传接口
# video=1视频,0音频。
video=1
if fileName[-3:]=="m4a" or fileName[-3:]=="mp3" or fileName[-3:]=="wav":
url="https://realtime.pdeepmatrix.com/apis/file/asr/upload"
video=0
else:
url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
data = {'fromLanguage': language}
f = open(path + '/' + fileName, 'rb')
files = {'file': f}
response = requests.post(url, data=data, files=files,verify=False)
logging.info("上传后接口返回值:{}-{}".format(response,response.text))
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
# 接口返回值data中存放视频获取结果的key
result = d["data"]
raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result,"video":video,"file":currentFile}
data_queue.put(raw_data)
logging.info("视频上传成功{}".format(raw_data))
# to_kafka.send_kafka(raw_data,logging)
else:
logging.info("视频上传失败{}-{}".format(raw_data, d))
f.close()
# Todo删除视频文件
else:
# 暂无任务,进入休眠
time.sleep(10)
except:
raw_data["result"]={}
raw_data["result"]["successCode"] = "0"
raw_data["result"]["status"]=2
raw_data["result"]["message"]="视频/音频上传异常"
raw_data["result"]["errorLog"] = traceback.format_exc()
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
logging.error(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
def getResult():
while True:
# 3秒钟结果获取一次
time.sleep(3)
try:
if data_queue.qsize() > 0:
logging.info("取数据队列长度{}".format(data_queue.qsize()))
raw_data = data_queue.get()
logging.info("任务数据为:{}".format(raw_data))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
logging.info("已暂停获取结果任务,过滤掉。{}".format(raw_data))
continue
output=raw_data["output"]
res_tmp={key: "" for key in output}
if "id" in res_tmp.keys():
res_tmp["id"]=str(uuid.uuid4())
res_tmp["isLast"]=1
res_tmp["fileName"]=raw_data["result"]["file"]["fileName"]
# 根据视频key访问获取结果接口
dataKey = raw_data["result"]["dataKey"]
params = {'taskId': dataKey}
language = raw_data["input"]["fromLanguage"]
data = {'fromLanguage': language,'taskId': dataKey}
if raw_data["result"]["video"]==1:
url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
response = requests.get(url, params=params, verify=False)
else:
url ="https://realtime.pdeepmatrix.com/apis/file/asr/getResult"
response = requests.post(url, data=data, verify=False)
logging.info("ASR网站返回值:{}-{}".format(response,response.text))
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
results = ""
if d["data"]["code"] == "1" and d["data"]["sentences"]:
for sentence in d["data"]["sentences"]:
if results:
results += ' ' + sentence["text"]
else:
results = sentence["text"]
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "1" and not d["data"]["sentences"]:
results =""
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "0":
# 正在解析中,将任务再次放回数据队列
data_queue.put(raw_data)
logging.info("视频未解析完毕,放回队列等待{}-{}".format(raw_data, d))
else:
# 解析失败
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=2
raw_data["result"]["message"]="视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"] = 2
raw_data["result"]["message"] = "视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
# 暂无任务,进入休眠
time.sleep(10)
except:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = traceback.format_exc()
raw_data["result"]["status"] = 2
raw_data["result"]["message"] = "视频/音频解析异常"
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
logging.error(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
def zk_monitoring():
try:
#线上环境
zk = KazooClient(hosts='172.18.1.146:2181,172.18.1.147:2181,172.18.1.148:2181')
#测试环境
# zk = KazooClient(hosts='172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181')
zk.start()
# 设置监听器
@zk.DataWatch("/analyze")
def watch_node(data, stat, event):
if event is not None and event.type == EventType.CHANGED:
data, stat = zk.get("/analyze")
logging.info("执行删除操作:{}".format(data))
d = json.loads(data)
id = d["scenes_id"]
stop_dict[id] = {}
stop_dict[id]["version"] = d["version"]
stop_dict[id]["operation"] = d["operation"]
# 保持程序运行以监听节点变化
try:
while True:
time.sleep(1)
except:
logging.info("Stopping...")
# 关闭连接
zk.stop()
zk.close()
except:
logging.error(traceback.format_exc())
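
Relative to views.py_20240705, the main change in this 20240819 backup is in getResult: the output template now also carries isLast and the original file name. The template itself is a dict comprehension over the requested output keys; a standalone sketch with an assumed output list:

import uuid

output = ["id", "content", "isLast", "fileName"]   # example; in the service this is raw_data["output"]
res_tmp = {key: "" for key in output}              # every requested key defaults to ""
if "id" in res_tmp:
    res_tmp["id"] = str(uuid.uuid4())              # fresh id per result
res_tmp["isLast"] = 1                              # single-shot result, so always the final chunk
res_tmp["fileName"] = "demo.mp4"                   # in the service: raw_data["result"]["file"]["fileName"]
print(res_tmp)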

186
text_analysis/bak/views.py_old

@@ -0,0 +1,186 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
# from text_analysis.tools.tool import get_data
import time
from datetime import datetime
import os
# 任务队列
global task_queue
task_queue = queue.Queue()
# 数据队列
global data_queue
data_queue = queue.Queue()
@csrf_exempt
def ASR(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
task_queue.put(raw_data)
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def upload():
while True:
try:
if task_queue.qsize() > 0:
logging.info("取任务队列长度{}".format(task_queue.qsize()))
raw_data = task_queue.get()
# index = raw_data["metadata"]["index"]
# datasource = raw_data["metadata"]["admin"]["datasource"]
# if datasource not in raw_data["data"].keys():
# logging.info("找不到相关数据源!—{}".format(raw_data))
# continue
# allFile = raw_data["data"][datasource]
# currentFile = eval(allFile)[index]
url=raw_data["metadata"]["admin"]["fileUrl"]
if '$.' in url:
# json.path表达式动态获取value
datasources = str(url).split(':')
# 0是数据源,1是JsonPath 表达式
datasourcestr = raw_data["data"][datasources[0]]
# print(datasourcestr)
datasource = json.loads(datasourcestr)
# 创建 JsonPath 表达式对象
expr = parse(datasources[1])
# 使用表达式来选择 JSON 元素
match = [match.value for match in expr.find(datasource)]
video_url = match[0]
fileName=video_url.rsplit('/')[-1]
if "http" not in video_url:
file = "https://caiji.percent.cn/" + video_url.lstrip("/")
else:
file=video_url
# print(file)
# name=raw_data["metadata"]["admin"]["fileName"]
# if '$.' in name:
# # json.path表达式动态获取value
# datasources = str(name).split(':')
# # 0是数据源,1是JsonPath 表达式
# datasourcestr = raw_data["data"][datasources[0]]
# datasource = json.loads(datasourcestr)
# # 创建 JsonPath 表达式对象
# expr = parse(datasources[1])
# # 使用表达式来选择 JSON 元素
# match = [match.value for match in expr.find(datasource)]
# fileName = match[0]
currentFile={"content":"","fileName":fileName,"fileUrl":file}
language = raw_data["metadata"]["admin"]["fromLanguage"]
# 从gofast获取视频
myfile = requests.get(file)
starttime = datetime.now().strftime('%Y-%m-%d')
path = 'inputdata/' + starttime
if not os.path.exists(path):
os.makedirs(path)
with open(path + '/' + fileName, 'wb') as f:
f.write(myfile.content)
logging.info("视频从gofast下载完毕,开始上传-{}".format(fileName))
# 访问视频上传接口
url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
data = {'fromLanguage': language}
f = open(path + '/' + fileName, 'rb')
files = {'file': f}
response = requests.post(url, data=data, files=files)
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
# 接口返回值data中存放视频获取结果的key
result = d["data"]
raw_data["result"] = {"successCode": "1", "errorLog": "", "results": currentFile, "dataKey": result}
data_queue.put(raw_data)
logging.info("视频上传成功{}".format(raw_data))
# to_kafka.send_kafka(raw_data,logging)
else:
logging.info("视频上传失败{}-{}".format(raw_data, d))
f.close()
# Todo删除视频文件
else:
# 暂无任务,进入休眠
time.sleep(10)
except:
logging.error(traceback.format_exc())
def getResult():
while True:
# 3秒钟结果获取一次
time.sleep(3)
try:
if data_queue.qsize() > 0:
logging.info("取数据队列长度{}".format(data_queue.qsize()))
raw_data = data_queue.get()
# print(raw_data)
# 根据视频key访问获取结果接口
dataKey = raw_data["result"]["dataKey"]
url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
params = {'taskId': dataKey}
response = requests.get(url, params=params)
# print(response.text)
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
results = ""
if d["data"]["code"] == "1":
for sentence in d["data"]["sentences"]:
results += sentence["text"]
raw_data["result"]["results"]["content"] = results
raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "0":
# 正在解析中,将任务再次放回数据队列
data_queue.put(raw_data)
logging.info("视频未解析完毕,放回队列等待{}-{}".format(raw_data, d))
else:
# 解析失败
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(raw_data["result"]["results"], ensure_ascii=False)
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
# 暂无任务,进入休眠
time.sleep(10)
except:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = traceback.format_exc()
raw_data["result"]["results"] = ""
logging.error(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
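
Note one behavioral difference from the newer revisions: this old getResult concatenates sentence texts with no separator, while the later versions insert a space between sentences. Both reduce to a join (a sketch with an example payload shape):

sentences = [{"text": "hello"}, {"text": "world"}]    # shape taken from the getResult payload
new_style = " ".join(s["text"] for s in sentences)    # later revisions' behavior
old_style = "".join(s["text"] for s in sentences)     # this file's behavior
assert new_style == "hello world"
assert old_style == "helloworld"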

6
text_analysis/eg.py

@@ -0,0 +1,6 @@
#coding:utf8
from views import data_queue
print(list(data_queue.queue))
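
list(data_queue.queue) copies the queue's underlying deque without consuming any items, which is why it works as a one-off debugging aid. If workers are running concurrently, holding the queue's lock during the copy avoids reading the deque mid-mutation (a sketch, not part of the repo):

import queue

data_queue = queue.Queue()
data_queue.put({"dataKey": "abc"})

with data_queue.mutex:                  # guard the underlying deque while copying
    snapshot = list(data_queue.queue)
print(snapshot)                         # items remain queued; nothing is popped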

10
text_analysis/read_config.py

@@ -0,0 +1,10 @@
import configparser
# Load the configuration file
def load_config():
    configFile = './config.ini'
    # Create the parser object
    con = configparser.ConfigParser()
    # Read the file
    con.read(configFile, encoding='utf-8')
    return con
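
load_config returns a ConfigParser, so callers read values by section and key exactly like nested dictionary access. A usage sketch against the sections referenced elsewhere in this commit (views.py and to_kafka.py); the exact contents of config.ini are assumed:

from text_analysis.read_config import load_config

config = load_config()
topic = config["kafka"]["topic"]            # kafka topic used by to_kafka.send_kafka
gofast_base = config["gofast"]["url"]       # base url prefixed to relative file paths
mp3_upload = config["asr"]["mp3_upload"]    # audio upload endpoint
zkhost = config["zookeeper"]["zkhost"]      # zookeeper hosts used by zk_monitoring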

14
text_analysis/request.py

@@ -0,0 +1,14 @@
#coding:utf8
# import leida_ner_bert_crf
import json
import requests
url = "http://172.18.1.166:9000/leidaduikang"
payload = json.dumps({"inputUrl": "/home/bfdadmin/leidabert/Project_leidaduikang/AInputdata/content_100.xlsx"})
headers = {'user-agent': "vscode-restclient", 'content-type': "application/json"}
response = requests.request("POST", url, timeout=1000000, data=payload, headers=headers)
print(response.text)
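
The same call can also let requests do the JSON encoding, which sets the Content-Type header automatically and avoids hand-escaped payload strings (a sketch of the equivalent request):

import requests

url = "http://172.18.1.166:9000/leidaduikang"
body = {"inputUrl": "/home/bfdadmin/leidabert/Project_leidaduikang/AInputdata/content_100.xlsx"}
response = requests.post(url, json=body, timeout=1000000)   # json= serializes and sets the header
print(response.text)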

148
text_analysis/settings.py

@@ -0,0 +1,148 @@
"""
Django settings for Zhijian_Project_WebService project.
Generated by 'django-admin startproject' using Django 1.8.
For more information on this file, see
https://docs.djangoproject.com/en/1.8/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.8/ref/settings/
"""
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = '330r)_!^qhd7$!w4)$y@4=p2bd*vlxf%4z(bx-fx-1i3txagvz'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = ['*']
# Application definition
INSTALLED_APPS = (
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
)
MIDDLEWARE = [
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
# 'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'django.middleware.security.SecurityMiddleware',
]
ROOT_URLCONF = 'text_analysis.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'text_analysis.wsgi.application'
# Database
# https://docs.djangoproject.com/en/1.8/ref/settings/#databases
# DATABASES = {
# 'default': {
# 'ENGINE': 'django.db.backends.sqlite3',
# 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
# }
# }
# Internationalization
# https://docs.djangoproject.com/en/1.8/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_L10N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.8/howto/static-files/
STATIC_URL = '/static/'
# U_LOGFILE_SIZE = 1 * 1024 * 1024  # max size of a single log file (1 MB)
# U_LOGFILE_COUNT = 7  # number of rotated log files to keep
#
# LOGGING = {
#     'version': 1,
#     'disable_existing_loggers': True,  # disable all previously configured loggers
# 'filters': {
# 'require_debug_false': {
# '()': 'django.utils.log.RequireDebugFalse'
# }
# },
# 'formatters': {
# 'verbose': {
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] %(module)s %(process)d %(thread)d %(message)s'
# },
# 'simple': {
# 'format': '%(levelname)s %(asctime)s @ %(process)d %(message)s'
# },
# 'complete': {
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] (%(pathname)s/%(funcName)s:%(lineno)d) - %(message)s'
# },
# 'online': {
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] - %(message)s'
# }
# },
# 'handlers': {
# 'text': {
# 'level': 'DEBUG',
# #'class': 'logging.handlers.RotatingFileHandler',
# 'class': 'logging.handlers.TimedRotatingFileHandler',
# 'when': 'H',
# 'interval': 1,
# 'backupCount': U_LOGFILE_COUNT,
# 'formatter': 'complete',
# 'filename': os.path.join(BASE_DIR, 'logs/resultNew.log').replace('\\', '/'),
# }
# },
# 'loggers': {
# 'text': {
# 'handlers': ['text'],
# 'level': 'DEBUG',
# 'propagate': False,
# }
# }
# }
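
The commented-out LOGGING block would rotate the log hourly through TimedRotatingFileHandler; the project builds its logger in log_util/set_logger.py instead. If the Django-native route were ever wanted, a minimal working subset of the block above would look like this (a sketch; the values are taken from the commented code):

U_LOGFILE_COUNT = 7  # rotated log files to keep

LOGGING = {
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'complete': {
            'format': '[%(levelname)s %(asctime)s @ %(process)d] (%(pathname)s/%(funcName)s:%(lineno)d) - %(message)s'
        },
    },
    'handlers': {
        'text': {
            'level': 'DEBUG',
            'class': 'logging.handlers.TimedRotatingFileHandler',
            'when': 'H',                 # rotate every hour
            'interval': 1,
            'backupCount': U_LOGFILE_COUNT,
            'formatter': 'complete',
            'filename': os.path.join(BASE_DIR, 'logs/resultNew.log').replace('\\', '/'),
        }
    },
    'loggers': {
        'text': {'handlers': ['text'], 'level': 'DEBUG', 'propagate': False}
    }
}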

BIN
text_analysis/tools/__pycache__/cusException.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/mysql_helper.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/process.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/to_kafka.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/to_kafka.cpython-37.pyc

BIN
text_analysis/tools/__pycache__/to_kafka.cpython-38.pyc

BIN
text_analysis/tools/__pycache__/tool.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/tool.cpython-37.pyc

BIN
text_analysis/tools/__pycache__/tool.cpython-38.pyc

BIN
text_analysis/tools/__pycache__/tools.cpython-36.pyc

129
text_analysis/tools/bak/tool.py0822

@@ -0,0 +1,129 @@
#coding:utf8
import re
def get_data(inputdata):
"""
重新组装参数
:param inputdata:原json数据
:return: 组装的prompt及其他参数
"""
res={}
return res
def get_content(inputdata,logging):
"""
重新组装参数
:param inputdata:原json数据
:return: 组装的prompt及其他参数
"""
res={}
admin=inputdata["metadata"]["admin"]
data=inputdata["data"]
prompt=admin["prompt"]
if_user=re.findall("{{(.*)}}",prompt)
if_data=re.findall("@@(.*)@@",prompt)
if if_user != []:
user_data=inputdata["metadata"]["user"]
if if_user[0] in user_data.keys():
tmp=user_data[if_user[0]]
prompt=re.sub("{{(.*)}}",tmp,prompt)
if if_data!=[] and if_data[0] in data.keys():
tmp1=data[if_data[0]]
prompt=re.sub("@@(.*)@@",tmp1,prompt)
res["prompt"]=prompt
res["authorization"]=admin["authorization"]
res["model"]=admin["model"]
res["temperature"]=admin["temperature"]
res["authorization"]=admin["authorization"]
res["top_p"]=admin["top_p"]
res["n"]=admin["n"]
return res
if __name__=="__main__":
inputdata={
"metadata":{
"output":{
"output_type":"table",
"label_col":[
"软件著作抽取结果"
]
},
"input":{
"input_type":"text",
"label":[
"7_软件著作过滤器"
]
},
"address":"http://172.18.1.181:9011/chatGpt/",
"admin":{
"authorization":"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",
"top_p":"1",
"user_input":[
{
"keyname":"tag",
"keydesc":""
}
],
"temperature":"0.2",
"model":"gpt-3.5-turbo-16k",
"prompt":"请在下面这句话中提取出:证书号、软件名称、著作权人,以json格式输出,找不到的字段赋值为空字符串,不要有多余的文字输出,只输出json结构。@@7_软件著作过滤器@@",
"n":"1"
},
"index":1
},
"data":{
"1_项目文件上传":"[{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/05/1/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileId\":\"cd6592f0389bb1da25afbb44901f9cde\",\"fileName\":\"1-基于时间序列遥感 影像洪涝检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/08/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileId\":\"944eec1cf98f216ea953459dac4dd505\",\"fileName\":\"3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/09/1/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileId\":\"eb378cb9ee914323f601500378dfad76\",\"fileName\":\"4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\" }]",
"2_文件分类信息":"{\"软件著作\":4}",
"3_OCR识别内容":"{\"content\":\" 22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号:软著登字第1623261号\\n软件名称:\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人:中国科学院遥感与数字地球研究所\\n开发完成日期:2016年08月01日\\n首次发表日期:未发表\\n权利取得方式:原始取得\\n权利范围:全部权利\\n登记号:2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定,经中国版权保护中心审核,对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}",
"businessKey":"185aef3b1c810799a6be8314abf6512c",
"7_软件著作过滤器":"{\"content\":\" 22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号:软著登字第1623261号\\n软件名称:\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人:中国科学院遥感与数字地球研究所\\n开发完成日期:2016年08月01日\\n首次发表日期:未发表\\n权利取得方式:原始取得\\n权利范围:全部权利\\n登记号:2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定,经中国版权保护中心审核,对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}"
},
"created":1691004265000,
"module":"OCR",
"start_tag":"false",
"last_edit":1692464331000,
"next_app_id":[
{
"start_id":86,
"edge_id":49,
"end_id":90
}
],
"transfer_id":11,
"blueprint_id":3,
"scenes_id":3,
"scenario":{
"dataloss":1,
"autoCommitTriggerLast":1,
"maxErrors":3,
"autoCommit":1,
"freshVariables":1
},
"wait_condition":[
],
"scheduling":{
"interval":-1,
"type":"single"
},
"name":"软件著作抽取",
"businessKey":"185aef3b1c810799a6be8314abf6512c",
"id":86,
"describe":"软件著作抽取"
}
a=get_content(inputdata,"")
print(a)
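
get_content fills two kinds of placeholders in the prompt: {{name}} is looked up in metadata.user, and @@name@@ is looked up in data. The substitution reduces to two regex passes; a standalone sketch with made-up values:

import re

prompt = "请抽取{{tag}}字段:@@7_软件著作过滤器@@"
user_data = {"tag": "证书号"}
data = {"7_软件著作过滤器": "证书号:软著登字第1623261号"}

m_user = re.findall(r"{{(.*)}}", prompt)
if m_user and m_user[0] in user_data:
    prompt = re.sub(r"{{(.*)}}", user_data[m_user[0]], prompt)
m_data = re.findall(r"@@(.*)@@", prompt)
if m_data and m_data[0] in data:
    prompt = re.sub(r"@@(.*)@@", data[m_data[0]], prompt)
print(prompt)   # 请抽取证书号字段:证书号:软著登字第1623261号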

25
text_analysis/tools/cusException.py

@@ -0,0 +1,25 @@
# -*- coding:utf-8 -*-
class pt_v_Exception(Exception):
def __str__(self):
return 'pt规则未在缓存中命中'
class dt_v_Exception(Exception):
def __str__(self):
return 'dt规则未在缓存中命中'
class dt_v_attr_Exception(Exception):
def __str__(self):
return 'dt_attrcode规则未在缓存中命中'
class dt_v_codeid_Exception(Exception):
def __str__(self):
return 'dt_codeid规则未在缓存中命中'
class dt_v_senti_Exception(Exception):
def __str__(self):
return 'dt_senti规则未在缓存中命中'
class dt_v_res_Exception(Exception):
def __str__(self):
return 'dt_resverse规则未在缓存中命中'
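
Each class carries its message in __str__, so call sites raise them bare and log str(e). A usage sketch (the cache lookup itself is hypothetical, not a function in this repo):

from text_analysis.tools.cusException import pt_v_Exception

def lookup_pt(cache, key):
    # Hypothetical lookup that signals a cache miss with the custom exception
    if key not in cache:
        raise pt_v_Exception()
    return cache[key]

try:
    lookup_pt({}, "rule-1")
except pt_v_Exception as e:
    print(str(e))   # pt规则未在缓存中命中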

67
text_analysis/tools/kakfa_util.py

@@ -0,0 +1,67 @@
# coding=utf-8
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import traceback
import time
import datetime
import queue
from logUtil import get_logger
logger = get_logger("crawlWebsrcCode.log")
"""
Kafka produce/consume helpers
"""
def kafkaProduce(topic,resultData,address):
producer = KafkaProducer(bootstrap_servers = '{}'.format(address),request_timeout_ms=120000)
topics = topic.split(',')
for tc in topics:
future = producer.send(tc,resultData)
result = future.get(timeout=60)
producer.flush()
print (result)
# Write one line of text to a file
def writeTxt(filePath, result):
    with open(filePath, 'a', encoding='utf-8') as f:
        f.write(result.encode('utf-8').decode('unicode_escape') + '\n')
def KafkaConsume(topic,address,group_id,task_queue,logger):
'''
kafka
:param topic:
:param address:
:param group_id:
:param task_queue:
:return:
'''
try:
consumer = KafkaConsumer(topic, auto_offset_reset='earliest',fetch_max_bytes=1024768000,fetch_max_wait_ms=5000, bootstrap_servers=address,group_id = group_id)
i = 1
while True:
for msg in consumer:
print('第{}条数据'.format(i))
data = str(msg.value, encoding = "utf-8")
print(data)
task_queue.put(data)
i = i+1
else:
print('暂无任务------')
time.sleep(10)
except Exception as e:
print('kafka未知异常----')
traceback.print_exc()
if __name__ == '__main__':
# resultData = {'id': '中文', 'url': 'https://zh.wikipedia.org/zh/%E8%94%A1%E8%8B%B1%E6%96%87'}
# kafkaProduce('test', json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),'121.4.41.194:8008')
task_queue = queue.Queue()
KafkaConsume('fq-Taobao-eccontent','39.129.129.172:6666,39.129.129.172:6668,39.129.129.172:6669,39.129.129.172:6670,39.129.129.172:6671','news_sche_8',task_queue,logger)
# KafkaConsume('zxbnewstopic','120.133.14.71:9992','group3',task_queue,logger)
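
One caveat in KafkaConsume: the else branch of the for loop only runs when the consumer iterator stops, and by default KafkaConsumer blocks forever, so the idle branch is effectively dead code. Passing consumer_timeout_ms makes the iterator return after a quiet period, so the sleep path actually executes (a sketch, assuming kafka-python):

from kafka import KafkaConsumer
import time

def consume_with_idle(topic, address, group_id, task_queue, logger):
    consumer = KafkaConsumer(topic,
                             bootstrap_servers=address,
                             group_id=group_id,
                             auto_offset_reset='earliest',
                             consumer_timeout_ms=5000)   # iterator stops after 5s of silence
    while True:
        got_any = False
        for msg in consumer:                             # drains messages, then exits on timeout
            task_queue.put(str(msg.value, encoding="utf-8"))
            got_any = True
        if not got_any:
            logger.info("no pending messages, sleeping")
            time.sleep(10)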

338
text_analysis/tools/mysql_helper.py

@@ -0,0 +1,338 @@
# coding:utf8
import os, sys
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
import re
# from log_util.set_logger import set_logger
# logging = set_logger('logs/error.log')
import pymysql.cursors
import traceback
def mysqlConn(data,logging):
res={"successCode":"1","errorLog":"","results":""}
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "SHOW TABLES"
cursor.execute(sql)
tables = cursor.fetchall()
if tables:
table_names = list(map(lambda x: list(x.values())[0], tables))
res["results"] = table_names
else:
res["successCode"] = "0"
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def getTableColumnNames(data,logging):
res={"successCode":"1","errorLog":"","results":""}
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "DESCRIBE "+p_table
cursor.execute(sql)
tables = cursor.fetchall()
if tables:
table_names = list(map(lambda x: x['Field'], tables))
res["results"] = table_names
else:
res["successCode"] = "0"
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def mysqlInsert(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
p_columnName=data["columnName"]
cN='('+','.join(p_columnName)+') '
p_values=data["values"]
val=tuple(p_values)
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "insert into " + p_table + cN + "values ("+ ','.join(['%s'] * len(val)) + ")"
cursor.execute(sql,val)
db.commit()
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def mysqlUpdate(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
# p_set=data["Set"]
p_set=get_updateSet(input)
# where=process_where(data["Filter"])
where=get_filter(data["Filter"])
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "UPDATE " + p_table + p_set + where
print(sql)
cursor.execute(sql)
db.commit()
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def mysqlExecute(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
execute=data["Execute"]
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
cursor.execute(execute)
if 'select' in execute.lower():
result = cursor.fetchall()
res["results"]=json.dumps(result,ensure_ascii=False)
else:
db.commit()
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
# def process_where(data):
#     '''
#     Build the WHERE clause
#     :param data: data["Filter"], e.g. {"key":"age","value":"20","operator":">"},{"logicalSymbol":"and"},{"key":"weight","value":"50","operator":"<"}
#     :return: WHERE age>20 and weight<50
#     '''
#     if data=="" or data==[]:
#         return ""
#     where = " WHERE "
#     for line in data:
#         if "key" in line.keys():
#             val = line["value"]
#             if isinstance(val, str):
#                 val = "\'" + val + "\'"
#             tmp = str(line["key"]) + " " + line["operator"] + " " + str(val)
#             where += tmp
#         else:
#             where += " " + line["logicalSymbol"] + " "
#     return where
#
# def process_filter(data):
#     '''
#     Build a single key/operator/value triple
#     :param data: data["Filter"], e.g. {"key":"age","value":"20","operator":"="}
#     :return: age=20
#     '''
#     if data=="" or data==[]:
#         return ""
#     res=data["key"]+" "+data["operator"]+" "+data["value"]
#     return res
def get_updateSet(input):
metadata=input["metadata"]
user=metadata["user"]
sets=metadata["admin"]["Set"]
res=[]
for line in sets:
part=line.split("=")
tmp = []
for p in part:
user_match=re.findall('##(.*?)##', p)
if user_match!=[]:
tmp.append(user[user_match[0]])
res.append(str(tmp[0])+"="+str(tmp[1]))
result=" SET "+",".join(res)
return result
def get_filter(data):
if "OR" not in data.keys():
return ""
op_or=data["OR"]
res = ""
if len(op_or) == 1:
tmp = []
line = op_or[0]["AND"]
for single_line in line:
val = single_line["value"]
if isinstance(val, str):
val = "\'" + val + "\'"
tmp.append(str(single_line["key"]) + single_line["operator"] + str(val))
if single_line != line[-1]:
tmp.append("and")
res = " WHERE "+" ".join(tmp)
elif len(op_or) > 1:
tmp = []
for single_and in op_or:
line = single_and["AND"]
for sigle_line in line:
val = sigle_line["value"]
if isinstance(val, str):
val = "\'" + val + "\'"
tmp.append(str(sigle_line["key"]) + sigle_line["operator"] + str(val))
if sigle_line != line[-1]:
tmp.append("and")
if single_and != op_or[-1]:
tmp.append("or")
res = " WHERE "+" ".join(tmp)
return res
def mysqlQuery(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
p_columnNames=data["columnNames"]
# p_filter=data["Filter"]
column='*'
if len(p_columnNames)==1:
column=p_columnNames[0]
elif len(p_columnNames)>1:
column=','.join(p_columnNames)
where=get_filter(data["Filter"])
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "SELECT " + column +" From "+ p_table + where
# print(sql)
cursor.execute(sql)
result = cursor.fetchall()
res["results"]=json.dumps(result,ensure_ascii=False)
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def mysqlDelete(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
# where=process_where(data["Filter"])
where=get_filter(data["Filter"])
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "DELETE From "+ p_table + where
cursor.execute(sql)
db.commit()
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
if __name__=="__main__":
input={"metadata":{"admin":{
"type":"query",
"Table":"student",
"columnNames":["name","age"],
"Set":["##tag1##=##value1##","##tag2##=##value2##"],
"Filter":{
"OR":[
{
"AND":[{"key":"age","value":20,"operator":">"},{"key":"weight","value":50,"operator":"<"}]
},
{
"AND":[{"key":"name","value":"ff","operator":"="}]
}
]
},
"Host":"172.26.28.30",
"Port":"3306",
"Database":"test",
"User":"crawl",
"Password":"crawl123"
}},
"user": {
"tag1": "age",
"tag2": "weight",
"value1": 2,
"value2": 100
}
}
res=mysqlUpdate(input,"")
print(res)
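
get_filter flattens the OR-of-AND filter tree into a WHERE clause without parentheses, which still groups correctly because SQL gives AND higher precedence than OR. With the Filter from the __main__ block above (a usage sketch):

from text_analysis.tools.mysql_helper import get_filter

flt = {"OR": [
    {"AND": [{"key": "age", "value": 20, "operator": ">"},
             {"key": "weight", "value": 50, "operator": "<"}]},
    {"AND": [{"key": "name", "value": "ff", "operator": "="}]},
]}
print(get_filter(flt))
# " WHERE age>20 and weight<50 or name='ff'"
# which SQL evaluates as (age>20 AND weight<50) OR name='ff'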

51
text_analysis/tools/process.py

@@ -0,0 +1,51 @@
#coding:utf8
import os, sys
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from text_analysis.tools import to_kafka
from tools.mysql_helper import mysqlConn,mysqlInsert,mysqlQuery,mysqlExecute,mysqlUpdate,mysqlDelete,getTableColumnNames
import traceback
import time
from log_util.set_logger import set_logger
logging=set_logger('results.log')
from views import task_queue
def process_data():
while True:
try:
# print("task_queue:",task_queue)
if task_queue.qsize() >0:
try:
raw_data = task_queue.get()
res = ""
logging.info("启动数据处理线程——")
logging.info(raw_data)
flag = raw_data["metadata"]["admin"]["type"]
                    # type is one of: execute, query, insert, update, delete
if flag == 'insert':
res = mysqlInsert(raw_data, logging)
elif flag == 'execute':
res = mysqlExecute(raw_data, logging)
elif flag == 'update':
res = mysqlUpdate(raw_data, logging)
elif flag == 'query':
res = mysqlQuery(raw_data, logging)
elif flag == 'delete':
res = mysqlDelete(raw_data, logging)
raw_data["result"] = res
logging.info("************写入kafka***********")
to_kafka.send_kafka(raw_data)
except:
raw_data["result"] = {"successCode": "0", "errorLog": "", "results": ""}
raw_data["result"]["errorLog"] = traceback.format_exc()
to_kafka.send_kafka(raw_data)
else:
logging.info("暂无任务,进入休眠--")
print("222222222222222222222222")
time.sleep(10)
except:
logging.error(traceback.format_exc())
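
The if/elif chain maps the type field onto one mysql_helper function per SQL verb. The same dispatch can be table-driven, which also gives unknown types an explicit error instead of silently passing an empty result along (a sketch using the handler names from mysql_helper):

from text_analysis.tools.mysql_helper import mysqlInsert, mysqlExecute, mysqlUpdate, mysqlQuery, mysqlDelete

HANDLERS = {
    'insert': mysqlInsert,
    'execute': mysqlExecute,
    'update': mysqlUpdate,
    'query': mysqlQuery,
    'delete': mysqlDelete,
}

def dispatch(raw_data, logging):
    flag = raw_data["metadata"]["admin"]["type"]
    handler = HANDLERS.get(flag)
    if handler is None:
        return {"successCode": "0", "errorLog": "unknown type: " + flag, "results": ""}
    return handler(raw_data, logging)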

171
text_analysis/tools/seleniumTest.py

@@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
import time
import threading
from selenium import webdriver
import json
from urllib.parse import urljoin
from kakfa_util import KafkaConsume
from kakfa_util import kafkaProduce
from logUtil import get_logger
from Go_fastDfs import uploadFile
import traceback
import queue
import configparser
import os, sys
import re
logger = get_logger("./logs/crawlWebsrcCode.log")
# Load the configuration file
configFile = './config.ini'
# Create the parser object
con = configparser.ConfigParser()
# Read the file
con.read(configFile, encoding='utf-8')
kafkaConfig = dict(con.items('kafka'))  # kafka settings
goFastdfsConfig = dict(con.items('goFastdfs'))  # goFastdfs settings
class Spider(object):
def __init__(self,url):
self.chromeOptions = self.get_profile()
self.browser = self.get_browser()
self.url = url
def get_profile(self):
chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_argument('--headless')  # headless Chrome
        chromeOptions.add_argument('--disable-gpu')  # disable the GPU
        # chromeOptions.add_argument('window-size=1280,800')  # set the browser resolution
chromeOptions.add_argument("--no-sandbox")
return chromeOptions
def get_browser(self):
browser = webdriver.Chrome("D:\\工作使用\\zhaoshang\\chromedriver.exe",chrome_options=self.chromeOptions)
return browser
def _get_page(self,path):
'''
:param path:
:return:
'''
self.browser.get(self.url)
time.sleep(5)
logger.info("休眠结束")
        # Scroll down 10000 px at a time to reach the bottom of the page
        scrollTop = 10000
        for num in range(1,10):
            js = "var q=document.documentElement.scrollTop={}".format(scrollTop*num)
            logger.info("第{}次滚动".format(num))
            self.browser.execute_script(js)
            time.sleep(5)
        # Run the Chrome DevTools command to capture the page as MHTML
        res = self.browser.execute_cdp_cmd('Page.captureSnapshot', {})
        # Get the article title
title = '无标题'
try:
title = self.browser.find_element_by_css_selector("title").get_attribute("textContent")
except Exception as e:
logger.error('获取标题异常----')
traceback.print_exc()
pathName = '{}{}.mhtml'.format(path,title)
with open(pathName, 'w',newline='') as f:
f.write(res['data'])
return pathName,title
if __name__ == '__main__':
    # Initialize the task queue
    task_queue = queue.Queue()
    # Thread that consumes the kafka topic
    logger.info("开启读取kafka线程---")
    t = threading.Thread(target=KafkaConsume, name='LoopThread',args=(kafkaConfig['read_topic'], kafkaConfig['address'], kafkaConfig['group_id'], task_queue,logger))
    t.daemon = True
    t.start()
    # Fetch each task and save the page with its original formatting preserved
while True:
try:
if task_queue.qsize() >0:
taskStr = task_queue.get()
logger.info('当前任务:{}'.format(taskStr))
task = json.loads(taskStr)
p1 = u'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
pattern1 = re.compile(p1)
matcher1 = re.search(p1, task['url'])
if matcher1:
l = Spider(task['url'])
pathName,title = l._get_page(goFastdfsConfig['path'])
l.browser.quit()
                    # Upload to gofast and write the result to kafka
if '404 Not Found' in title:
logger.error('页面404,无效')
resultData = {
'code': 500,
'id': task['id'],
'message': '页面404'
}
kafkaProduce(kafkaConfig['data_topics'],
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),
kafkaConfig['address'])
time.sleep(2)
continue
try:
uploadStr = uploadFile('{}upload'.format(goFastdfsConfig['uploadaddress']),pathName,logger)
uploadJson = json.loads(uploadStr)
except Exception as e:
logger.error('文件上传异常----')
traceback.print_exc()
resultData = {
'code': 500,
'id': task['id'],
'message': '文件上传失败'
}
kafkaProduce(kafkaConfig['data_topics'],
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),
kafkaConfig['address'])
time.sleep(2)
continue
resultData = {
'code':200,
'id':task['id'],
'url':goFastdfsConfig['downloadaddress']+uploadJson['path'],
'title':title,
'delMd5':uploadJson['md5'],
'uploadTime':uploadJson['mtime'],
'message':'成功'
}
kafkaProduce(kafkaConfig['data_topics'],json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),kafkaConfig['address'])
logger.info('数据写入成功')
                    # Delete the local file
if (os.path.exists(pathName)):
os.remove(pathName)
logger.info('清除文件:{}'.format(pathName))
else:
logger.info('要删除的文件不存在:{}'.format(pathName))
                else:
                    logger.error('非正确url:{}'.format(task['url']))
resultData = {
'code': 500,
'id': task['id'],
'message': '非正确url'
}
kafkaProduce(kafkaConfig['data_topics'],
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),
kafkaConfig['address'])
time.sleep(2)
continue
else:
logger.info("暂无任务,进入休眠--")
time.sleep(10)
except Exception as e:
logger.error('未知异常----')
traceback.print_exc()
resultData = {
'code': 500,
'id': task['id'],
'message': '未知异常'
}
kafkaProduce(kafkaConfig['data_topics'],
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),
kafkaConfig['address'])
time.sleep(2)

25
text_analysis/tools/to_kafka.py

@@ -0,0 +1,25 @@
#coding:utf8
import traceback
import json
from kafka import KafkaProducer
from text_analysis.read_config import load_config
config=load_config()
def send_kafka(data, logging):
    try:
        topic = config["kafka"]["topic"]
        data1 = json.dumps(data, ensure_ascii=False)
        kafkaProduce(topic, bytes(data1, encoding='utf-8'))
        logging.info("数据推入kafka!")
    except Exception as e:
        logging.info(traceback.format_exc())
        logging.info('写入kafka失败')
def kafkaProduce(topic, resultData):
    # Note: creates a new producer per call; reuse a shared producer if throughput matters
    producer = KafkaProducer(bootstrap_servers='{}'.format(config["kafka"]["bootstrap_servers"]), max_request_size=52428800)
    topics = topic.split(',')
    for tc in topics:
        producer.send(tc, resultData)
    producer.flush()
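
Because kafkaProduce builds a producer per call and never closes it, connections accumulate under load. A module-level producer created once and flushed per send is the usual pattern (a sketch against the same config keys):

from kafka import KafkaProducer
from text_analysis.read_config import load_config

config = load_config()
_producer = None

def get_producer():
    global _producer
    if _producer is None:   # create one shared producer lazily
        _producer = KafkaProducer(
            bootstrap_servers=config["kafka"]["bootstrap_servers"],
            max_request_size=52428800)
    return _producer

def send(topic, payload_bytes):
    producer = get_producer()
    for tc in topic.split(','):
        producer.send(tc, payload_bytes)
    producer.flush()        # block until the batch is handed to the brokers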

132
text_analysis/tools/tool.py

@@ -0,0 +1,132 @@
#coding:utf8
import re
import json
from jsonpath_ng import jsonpath, parse
def parse_data(raw_data,url):
all_result = raw_data['data']
param_split = str(url).split(":")
datasourcestr = all_result[param_split[0]]
datasource = json.loads(datasourcestr)
    # Build the JsonPath expression object
    expr = parse(param_split[1])
    # Evaluate the expression against the JSON document
    match = [match.value for match in expr.find(datasource)]
val = match[0]
return val
def get_content(inputdata,logging):
"""
:param inputdata:json数据
:return: prompt及其他参数
"""
res={}
admin=inputdata["metadata"]["admin"]
data=inputdata["data"]
prompt=admin["prompt"]
if_user=re.findall("{{(.*)}}",prompt)
if_data=re.findall("@@(.*)@@",prompt)
if if_user != []:
user_data=inputdata["metadata"]["user"]
if if_user[0] in user_data.keys():
tmp=user_data[if_user[0]]
prompt=re.sub("{{(.*)}}",tmp,prompt)
if if_data!=[] and if_data[0] in data.keys():
tmp1=data[if_data[0]]
prompt=re.sub("@@(.*)@@",tmp1,prompt)
res["prompt"]=prompt
res["authorization"]=admin["authorization"]
res["model"]=admin["model"]
res["temperature"]=admin["temperature"]
res["authorization"]=admin["authorization"]
res["top_p"]=admin["top_p"]
res["n"]=admin["n"]
return res
if __name__=="__main__":
inputdata={
"metadata":{
"output":{
"output_type":"table",
"label_col":[
"软件著作抽取结果"
]
},
"input":{
"input_type":"text",
"label":[
"7_软件著作过滤器"
]
},
"address":"http://172.18.1.181:9011/chatGpt/",
"admin":{
"authorization":"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",
"top_p":"1",
"user_input":[
{
"keyname":"tag",
"keydesc":""
}
],
"temperature":"0.2",
"model":"gpt-3.5-turbo-16k",
"prompt":"请在下面这句话中提取出:证书号、软件名称、著作权人,以json格式输出,找不到的字段赋值为空字符串,不要有多余的文字输出,只输出json结构。@@7_软件著作过滤器@@",
"n":"1"
},
"index":1
},
"data":{
"1_项目文件上传":"[{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/05/1/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileId\":\"cd6592f0389bb1da25afbb44901f9cde\",\"fileName\":\"1-基于时间序列遥感 影像洪涝检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/08/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileId\":\"944eec1cf98f216ea953459dac4dd505\",\"fileName\":\"3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/09/1/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileId\":\"eb378cb9ee914323f601500378dfad76\",\"fileName\":\"4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\" }]",
"2_文件分类信息":"{\"软件著作\":4}",
"3_OCR识别内容":"{\"content\":\" 22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号:软著登字第1623261号\\n软件名称:\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人:中国科学院遥感与数字地球研究所\\n开发完成日期:2016年08月01日\\n首次发表日期:未发表\\n权利取得方式:原始取得\\n权利范围:全部权利\\n登记号:2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定,经中国版权保护中心审核,对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}",
"businessKey":"185aef3b1c810799a6be8314abf6512c",
"7_软件著作过滤器":"{\"content\":\" 22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号:软著登字第1623261号\\n软件名称:\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人:中国科学院遥感与数字地球研究所\\n开发完成日期:2016年08月01日\\n首次发表日期:未发表\\n权利取得方式:原始取得\\n权利范围:全部权利\\n登记号:2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定,经中国版权保护中心审核,对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}"
},
"created":1691004265000,
"module":"OCR",
"start_tag":"false",
"last_edit":1692464331000,
"next_app_id":[
{
"start_id":86,
"edge_id":49,
"end_id":90
}
],
"transfer_id":11,
"blueprint_id":3,
"scenes_id":3,
"scenario":{
"dataloss":1,
"autoCommitTriggerLast":1,
"maxErrors":3,
"autoCommit":1,
"freshVariables":1
},
"wait_condition":[
],
"scheduling":{
"interval":-1,
"type":"single"
},
"name":"软件著作抽取",
"businessKey":"185aef3b1c810799a6be8314abf6512c",
"id":86,
"describe":"软件著作抽取"
}
a=get_content(inputdata,"")
print(a)
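
parse_data expects url strings of the form "<datasource>:<jsonpath>": the part before the colon names a key in raw_data["data"] whose value is itself a JSON string, and the remainder is a JsonPath evaluated against it. A standalone sketch with a made-up payload (note the sketch splits only once, which the plain split(":") in parse_data does not, so a path containing a colon would break there):

import json
from jsonpath_ng import parse

raw_data = {"data": {"1_文件上传": json.dumps({"fileUrl": "/group33/demo.mp4"})}}
url = "1_文件上传:$.fileUrl"                  # "<datasource>:<jsonpath>"

source, path = url.split(":", 1)              # split once in case the path itself has a colon
doc = json.loads(raw_data["data"][source])
value = [m.value for m in parse(path).find(doc)][0]
print(value)                                  # /group33/demo.mp4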

13
text_analysis/urls.py

@@ -0,0 +1,13 @@
from django.conf.urls import include, url
from django.contrib import admin
from text_analysis import views
urlpatterns = [
url(r'^ASRNew',views.ASRNew, name='ASRNew'),
# url(r'^mysqlConnection',views.mysqlConnection, name='mysqlConnection'),
# url(r'^mysqlField', views.mysqlField, name='mysqlField')
]
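
The single route forwards POSTs to views.ASRNew, which queues the body for the upload/getResult workers. A request sketch; the field values are examples, the host and port are assumptions, and trace=True is the flag the current views.py uses for queue priority:

import json
import requests

task = {
    "scenes_id": 3,
    "version": "v1",
    "trace": True,                                    # optional: traced tasks jump the queue
    "output": ["id", "content", "isLast", "fileName"],
    "input": {"fileUrl": "1_文件上传:$.fileUrl",       # resolved by parse_data
              "fromLanguage": "zh"},
    "data": {"1_文件上传": json.dumps({"fileUrl": "/group33/demo.mp4"})},
}
r = requests.post("http://127.0.0.1:9000/ASRNew", json=task)   # host/port assumed
print(r.json())   # {"code": 1, "msg": "请求正常!"}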

268
text_analysis/views.py

@@ -0,0 +1,268 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os
from kazoo.client import KazooClient
from kazoo.protocol.states import EventType
# Task queue (queue is already imported above; traced requests get higher priority)
task_queue = queue.PriorityQueue()
# Data queue
data_queue = queue.Queue()
stop_dict={}
from text_analysis.read_config import load_config
config=load_config()
@csrf_exempt
def ASRNew(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
if "trace" in raw_data.keys() and raw_data["trace"]==True:
task_queue.put((-1,time.time(), raw_data))
else:
task_queue.put((1, time.time(),raw_data))
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def upload():
while True:
try:
if task_queue.qsize()>0:
p,t,raw_data = task_queue.get(timeout=1)
logging.info("当前任务队列长度{}".format(task_queue.qsize()+1))
output=raw_data["output"]
res_tmp={key: "" for key in output}
if "id" in res_tmp.keys():
res_tmp["id"]=str(uuid.uuid4())
logging.info("任务数据为:{}".format(raw_data))
logging.info("当前version信息为:{}".format(stop_dict))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
logging.info("已暂停任务上传,过滤掉。{}".format(raw_data))
continue
url=raw_data["input"]["fileUrl"]
if "json" in url:
parm = url.split("#")
data1 = parse_data(raw_data, parm[0])
data1_json = json.loads(data1)
expr = parse(parm[2])
match = [match.value for match in expr.find(data1_json)]
video_url = match[0]
else:
video_url = parse_data(raw_data, url)
fileName=video_url.rsplit('/')[-1]
if "http" not in video_url:
file = config["gofast"]["url"] + video_url.lstrip("/")
else:
file=video_url
currentFile={"fileName":fileName,"fileUrl":file}
language = raw_data["input"]["fromLanguage"]
                # Fetch the media file from gofast
myfile = requests.get(file)
starttime = datetime.now().strftime('%Y-%m-%d')
path = 'inputdata/' + starttime
if not os.path.exists(path):
os.makedirs(path)
with open(path + '/' + fileName, 'wb') as f:
f.write(myfile.content)
logging.info("视频从gofast下载完毕,开始上传-{}".format(fileName))
                # Call the media upload endpoint
                # video=1 means video, 0 means audio
video=1
if fileName[-3:]=="m4a" or fileName[-3:]=="mp3" or fileName[-3:]=="wav":
# url="https://realtime.pdeepmatrix.com/apis/file/asr/upload"
url=config["asr"]["mp3_upload"]
video=0
else:
# url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
url = config["asr"]["video_upload"]
data = {'fromLanguage': language}
f = open(path + '/' + fileName, 'rb')
files = {'file': f}
response = requests.post(url, data=data, files=files,verify=False)
logging.info("上传后接口返回值:{}-{}".format(response,response.text))
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
                    # the "data" field of the response holds the key used to fetch the result
result = d["data"]
raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result,"video":video,"file":currentFile}
data_queue.put(raw_data)
logging.info("视频上传成功{}".format(raw_data))
# to_kafka.send_kafka(raw_data,logging)
else:
logging.info("视频上传失败{}-{}".format(raw_data, d))
f.close()
                # TODO: delete the downloaded media file
else:
                # No pending task, sleep
time.sleep(10)
        except:
            raw_data["result"] = {}
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["status"] = 2
            raw_data["result"]["message"] = "Video/audio upload exception"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)
def getResult():
    while True:
        # Poll for results every 3 seconds
        time.sleep(3)
        # Pre-bind so the except block below never references an unbound name
        raw_data, res_tmp = {}, {}
        try:
            if data_queue.qsize() > 0:
                logging.info("Data queue length: {}".format(data_queue.qsize()))
                raw_data = data_queue.get()
                logging.info("Task data: {}".format(raw_data))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
                if task_id in stop_dict.keys() and task_version != stop_dict[task_id]["version"]:
                    logging.info("Task is paused, skipping result fetch. {}".format(raw_data))
                    continue
output=raw_data["output"]
res_tmp={key: "" for key in output}
if "id" in res_tmp.keys():
res_tmp["id"]=str(uuid.uuid4())
res_tmp["isLast"]=1
res_tmp["fileName"]=raw_data["result"]["file"]["fileName"]
                # Query the result endpoint with the media task key
dataKey = raw_data["result"]["dataKey"]
params = {'taskId': dataKey}
language = raw_data["input"]["fromLanguage"]
data = {'fromLanguage': language,'taskId': dataKey}
if raw_data["result"]["video"]==1:
#url="https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
# url = "http://172.18.1.155:6611/apis/media/analysis/getResult?taskId={}".format(dataKey)
url=config["asr"]["video_getResult"]+"?taskId={}".format(dataKey)
response = requests.get(url, verify=False)
else:
# url ="https://realtime.pdeepmatrix.com/apis/file/asr/getResult"
# url ="http://172.18.1.155:6611/apis/file/asr/getResult"
url=config["asr"]["mp3_getResult"]
response = requests.post(url, data=data, verify=False)
logging.info("ASR网站返回值:{}-{}".format(response,response.text))
d = json.loads(response.text)
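                # A sketch of the response shape the branches below expect (values are
                # illustrative): {"code": 200, "data": {"code": "1",
                # "sentences": [{"text": "hello"}, {"text": "world"}]}};
                # data.code "1" means finished, "0" means still processing.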
if "code" in d.keys() and d["code"] == 200:
results = ""
if d["data"]["code"] == "1" and d["data"]["sentences"]:
for sentence in d["data"]["sentences"]:
if results:
results += ' ' + sentence["text"]
else:
results = sentence["text"]
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "1" and not d["data"]["sentences"]:
results =""
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "0":
# 正在解析中,将任务再次放回数据队列
data_queue.put(raw_data)
logging.info("视频未解析完毕,放回队列等待{}-{}".format(raw_data, d))
else:
# 解析失败
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=2
raw_data["result"]["message"]="视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
                else:
                    raw_data["result"]["successCode"] = "0"
                    raw_data["result"]["errorLog"] = response.text
                    raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
                    raw_data["result"]["status"] = 2
                    raw_data["result"]["message"] = "Video/audio recognition exception"
                    logging.info("Failed to fetch recognition result, data {}, response {}".format(raw_data, d))
                    to_kafka.send_kafka(raw_data, logging)
            else:
                # No pending data, sleep
                time.sleep(10)
        except:
            raw_data.setdefault("result", {})
            raw_data["result"]["successCode"] = "0"
            raw_data["result"]["errorLog"] = traceback.format_exc()
            raw_data["result"]["status"] = 2
            raw_data["result"]["message"] = "Video/audio recognition exception"
            raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
            logging.error(traceback.format_exc())
            to_kafka.send_kafka(raw_data, logging)
def zk_monitoring():
    try:
        # Production environment
        zk = KazooClient(hosts=config['zookeeper']['zkhost'])
        # Test environment
        # zk = KazooClient(hosts='172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181')
        zk.start()
        # Register the watcher
        @zk.DataWatch("/analyze")
        def watch_node(data, stat, event):
            if event is not None and event.type == EventType.CHANGED:
                data, stat = zk.get("/analyze")
                # logging.info("Processing stop/delete notification: {}".format(data))
                try:
                    d = json.loads(data)
                    sid = d["scenes_id"]
                    stop_dict[sid] = {}
                    stop_dict[sid]["version"] = d["version"]
                    stop_dict[sid]["operation"] = d["operation"]
                except:
                    pass
        # Keep the thread alive so the watcher keeps firing
        try:
            while True:
                time.sleep(1)
        except:
            logging.info("Stopping...")
            # Close the connection
            zk.stop()
            zk.close()
except:
logging.error(traceback.format_exc())
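# A minimal sketch of the producer side of the /analyze watch (an assumption:
# the publisher lives outside this repo; the payload shape is inferred from
# watch_node above).
#
#   from kazoo.client import KazooClient
#   zk = KazooClient(hosts="127.0.0.1:2181")  # placeholder host
#   zk.start()
#   zk.set("/analyze", json.dumps(
#       {"scenes_id": 1, "version": 2, "operation": "stop"}).encode("utf-8"))
#   zk.stop()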

266
text_analysis/views.py_20240819

@ -0,0 +1,266 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os
from kazoo.client import KazooClient
from kazoo.protocol.states import EventType
# Task queue
# global task_queue
task_queue = queue.Queue()
# Data queue
# global data_queue
data_queue = queue.Queue()
stop_dict={}
@csrf_exempt
def ASRNew(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
task_queue.put(raw_data)
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def upload():
while True:
try:
if task_queue.qsize() > 0:
logging.info("取任务队列长度{}".format(task_queue.qsize()))
raw_data = task_queue.get()
output=raw_data["output"]
res_tmp={key: "" for key in output}
if "id" in res_tmp.keys():
res_tmp["id"]=str(uuid.uuid4())
logging.info("任务数据为:{}".format(raw_data))
logging.info("当前version信息为:{}".format(stop_dict))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
logging.info("已暂停任务上传,过滤掉。{}".format(raw_data))
continue
url=raw_data["input"]["fileUrl"]
if "json" in url:
parm = url.split("#")
data1 = parse_data(raw_data, parm[0])
data1_json = json.loads(data1)
expr = parse(parm[2])
match = [match.value for match in expr.find(data1_json)]
video_url = match[0]
else:
video_url = parse_data(raw_data, url)
fileName=video_url.rsplit('/')[-1]
if "http" not in video_url:
file = "https://caiji.percent.cn/" + video_url.lstrip("/")
else:
file=video_url
                # name=raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                # # Resolve the value dynamically via a JsonPath expression
                # datasources = str(name).split(':')
                # # index 0 is the data source, index 1 the JsonPath expression
                # datasourcestr = raw_data["data"][datasources[0]]
                # datasource = json.loads(datasourcestr)
                # # Build the JsonPath expression object
                # expr = parse(datasources[1])
                # # Apply the expression to select the JSON element
                # match = [match.value for match in expr.find(datasource)]
                # fileName = match[0]
currentFile={"fileName":fileName,"fileUrl":file}
language = raw_data["input"]["fromLanguage"]
                # Fetch the media file from gofast
myfile = requests.get(file)
starttime = datetime.now().strftime('%Y-%m-%d')
path = 'inputdata/' + starttime
if not os.path.exists(path):
os.makedirs(path)
with open(path + '/' + fileName, 'wb') as f:
f.write(myfile.content)
logging.info("视频从gofast下载完毕,开始上传-{}".format(fileName))
# 访问视频上传接口
# video=1视频,0音频。
video=1
if fileName[-3:]=="m4a" or fileName[-3:]=="mp3" or fileName[-3:]=="wav":
url="https://realtime.pdeepmatrix.com/apis/file/asr/upload"
video=0
else:
url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
data = {'fromLanguage': language}
f = open(path + '/' + fileName, 'rb')
files = {'file': f}
response = requests.post(url, data=data, files=files,verify=False)
logging.info("上传后接口返回值:{}-{}".format(response,response.text))
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
                    # d["data"] holds the key used later to fetch the recognition result
result = d["data"]
raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result,"video":video,"file":currentFile}
data_queue.put(raw_data)
logging.info("视频上传成功{}".format(raw_data))
# to_kafka.send_kafka(raw_data,logging)
else:
logging.info("视频上传失败{}-{}".format(raw_data, d))
f.close()
# Todo删除视频文件
else:
# 暂无任务,进入休眠
time.sleep(10)
except:
raw_data["result"]={}
raw_data["result"]["successCode"] = "0"
raw_data["result"]["status"]=2
raw_data["result"]["message"]="视频/音频上传异常"
raw_data["result"]["errorLog"] = traceback.format_exc()
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
logging.error(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
def getResult():
while True:
        # Poll for results every 3 seconds
time.sleep(3)
try:
if data_queue.qsize() > 0:
logging.info("取数据队列长度{}".format(data_queue.qsize()))
raw_data = data_queue.get()
logging.info("任务数据为:{}".format(raw_data))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
logging.info("已暂停获取结果任务,过滤掉。{}".format(raw_data))
continue
output=raw_data["output"]
res_tmp={key: "" for key in output}
if "id" in res_tmp.keys():
res_tmp["id"]=str(uuid.uuid4())
res_tmp["isLast"]=1
res_tmp["fileName"]=raw_data["result"]["file"]["fileName"]
                # Query the result endpoint with the media task key
dataKey = raw_data["result"]["dataKey"]
params = {'taskId': dataKey}
language = raw_data["input"]["fromLanguage"]
data = {'fromLanguage': language,'taskId': dataKey}
if raw_data["result"]["video"]==1:
url = "https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
response = requests.get(url, params=params, verify=False)
else:
url ="https://realtime.pdeepmatrix.com/apis/file/asr/getResult"
response = requests.post(url, data=data, verify=False)
logging.info("ASR网站返回值:{}-{}".format(response,response.text))
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
results = ""
if d["data"]["code"] == "1" and d["data"]["sentences"]:
for sentence in d["data"]["sentences"]:
if results:
results += ' ' + sentence["text"]
else:
results = sentence["text"]
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "1" and not d["data"]["sentences"]:
results =""
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "0":
                        # Still processing: put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Not finished yet, requeued to wait {}-{}".format(raw_data, d))
else:
                        # Recognition failed
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=2
raw_data["result"]["message"]="视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"] = 2
raw_data["result"]["message"] = "视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
                # No pending data, sleep
time.sleep(10)
except:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = traceback.format_exc()
raw_data["result"]["status"] = 2
raw_data["result"]["message"] = "视频/音频解析异常"
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
logging.error(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
def zk_monitoring():
try:
        # Production environment
        zk = KazooClient(hosts='172.18.1.146:2181,172.18.1.147:2181,172.18.1.148:2181')
        # Test environment
        # zk = KazooClient(hosts='172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181')
        zk.start()
        # Register the watcher
@zk.DataWatch("/analyze")
def watch_node(data, stat, event):
if event is not None and event.type == EventType.CHANGED:
data, stat = zk.get("/analyze")
logging.info("执行删除操作:{}".format(data))
d = json.loads(data)
id = d["scenes_id"]
stop_dict[id] = {}
stop_dict[id]["version"] = d["version"]
stop_dict[id]["operation"] = d["operation"]
        # Keep the thread alive so the watcher keeps firing
try:
while True:
time.sleep(1)
except:
logging.info("Stopping...")
            # Close the connection
zk.stop()
zk.close()
except:
logging.error(traceback.format_exc())

271
text_analysis/views_20240903.py

@ -0,0 +1,271 @@
# coding:utf8
import os, sys
import io
from jsonpath_ng import jsonpath, parse
import uuid
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging = set_logger('logs/results.log')
import traceback
import queue
import requests
from text_analysis.tools.tool import parse_data
import time
from datetime import datetime
import os
from kazoo.client import KazooClient
from kazoo.protocol.states import EventType
# Task queue
# global task_queue
task_queue = queue.Queue()
# Data queue
# global data_queue
data_queue = queue.Queue()
stop_dict={}
@csrf_exempt
def ASRNew(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
task_queue.put(raw_data)
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def upload():
while True:
try:
if task_queue.qsize() > 0:
logging.info("取任务队列长度{}".format(task_queue.qsize()))
raw_data = task_queue.get()
output=raw_data["output"]
res_tmp={key: "" for key in output}
if "id" in res_tmp.keys():
res_tmp["id"]=str(uuid.uuid4())
logging.info("任务数据为:{}".format(raw_data))
logging.info("当前version信息为:{}".format(stop_dict))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
logging.info("已暂停任务上传,过滤掉。{}".format(raw_data))
continue
url=raw_data["input"]["fileUrl"]
if "json" in url:
parm = url.split("#")
data1 = parse_data(raw_data, parm[0])
data1_json = json.loads(data1)
expr = parse(parm[2])
match = [match.value for match in expr.find(data1_json)]
video_url = match[0]
else:
video_url = parse_data(raw_data, url)
fileName=video_url.rsplit('/')[-1]
if "http" not in video_url:
file = "https://caiji.percent.cn/" + video_url.lstrip("/")
else:
file=video_url
                # name=raw_data["metadata"]["admin"]["fileName"]
                # if '$.' in name:
                # # Resolve the value dynamically via a JsonPath expression
                # datasources = str(name).split(':')
                # # index 0 is the data source, index 1 the JsonPath expression
                # datasourcestr = raw_data["data"][datasources[0]]
                # datasource = json.loads(datasourcestr)
                # # Build the JsonPath expression object
                # expr = parse(datasources[1])
                # # Apply the expression to select the JSON element
                # match = [match.value for match in expr.find(datasource)]
                # fileName = match[0]
currentFile={"fileName":fileName,"fileUrl":file}
language = raw_data["input"]["fromLanguage"]
                # Fetch the media file from gofast
myfile = requests.get(file)
starttime = datetime.now().strftime('%Y-%m-%d')
path = 'inputdata/' + starttime
if not os.path.exists(path):
os.makedirs(path)
with open(path + '/' + fileName, 'wb') as f:
f.write(myfile.content)
logging.info("视频从gofast下载完毕,开始上传-{}".format(fileName))
# 访问视频上传接口
# video=1视频,0音频。
video=1
if fileName[-3:]=="m4a" or fileName[-3:]=="mp3" or fileName[-3:]=="wav":
# url="https://realtime.pdeepmatrix.com/apis/file/asr/upload"
url="http://172.18.1.155:6611/apis/file/asr/upload "
video=0
else:
# url = "https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
url = "http://172.18.1.155:6611/apis/media/analysis/upload"
data = {'fromLanguage': language}
f = open(path + '/' + fileName, 'rb')
files = {'file': f}
response = requests.post(url, data=data, files=files,verify=False)
logging.info("上传后接口返回值:{}-{}".format(response,response.text))
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
                    # d["data"] holds the key used later to fetch the recognition result
result = d["data"]
raw_data["result"] = {"successCode": "1", "errorLog": "", "results": "", "dataKey": result,"video":video,"file":currentFile}
data_queue.put(raw_data)
logging.info("视频上传成功{}".format(raw_data))
# to_kafka.send_kafka(raw_data,logging)
else:
logging.info("视频上传失败{}-{}".format(raw_data, d))
f.close()
# Todo删除视频文件
else:
# 暂无任务,进入休眠
time.sleep(10)
except:
raw_data["result"]={}
raw_data["result"]["successCode"] = "0"
raw_data["result"]["status"]=2
raw_data["result"]["message"]="视频/音频上传异常"
raw_data["result"]["errorLog"] = traceback.format_exc()
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
logging.error(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
def getResult():
while True:
        # Poll for results every 3 seconds
time.sleep(3)
try:
if data_queue.qsize() > 0:
logging.info("取数据队列长度{}".format(data_queue.qsize()))
raw_data = data_queue.get()
logging.info("任务数据为:{}".format(raw_data))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
logging.info("已暂停获取结果任务,过滤掉。{}".format(raw_data))
continue
output=raw_data["output"]
res_tmp={key: "" for key in output}
if "id" in res_tmp.keys():
res_tmp["id"]=str(uuid.uuid4())
res_tmp["isLast"]=1
res_tmp["fileName"]=raw_data["result"]["file"]["fileName"]
                # Query the result endpoint with the media task key
dataKey = raw_data["result"]["dataKey"]
params = {'taskId': dataKey}
language = raw_data["input"]["fromLanguage"]
data = {'fromLanguage': language,'taskId': dataKey}
if raw_data["result"]["video"]==1:
#url="https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
url = "http://172.18.1.155:6611/apis/media/analysis/getResult?taskId={}".format(dataKey)
response = requests.get(url, verify=False)
else:
# url ="https://realtime.pdeepmatrix.com/apis/file/asr/getResult"
url ="http://172.18.1.155:6611/apis/file/asr/getResult"
response = requests.post(url, data=data, verify=False)
logging.info("ASR网站返回值:{}-{}".format(response,response.text))
d = json.loads(response.text)
if "code" in d.keys() and d["code"] == 200:
results = ""
if d["data"]["code"] == "1" and d["data"]["sentences"]:
for sentence in d["data"]["sentences"]:
if results:
results += ' ' + sentence["text"]
else:
results = sentence["text"]
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "1" and not d["data"]["sentences"]:
results =""
if "content" in res_tmp.keys():
res_tmp["content"]=results
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=1
raw_data["result"]["message"]="成功"
logging.info("视频解析获取结果成功{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
elif d["data"]["code"] == "0":
                        # Still processing: put the task back on the data queue
                        data_queue.put(raw_data)
                        logging.info("Not finished yet, requeued to wait {}-{}".format(raw_data, d))
else:
                        # Recognition failed
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"]=2
raw_data["result"]["message"]="视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = response.text
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
raw_data["result"]["status"] = 2
raw_data["result"]["message"] = "视频/音频解析异常"
logging.info("视频解析获取结果失败,数据{},接口返回值{}".format(raw_data, d))
to_kafka.send_kafka(raw_data, logging)
else:
                # No pending data, sleep
time.sleep(10)
except:
raw_data["result"]["successCode"] = "0"
raw_data["result"]["errorLog"] = traceback.format_exc()
raw_data["result"]["status"] = 2
raw_data["result"]["message"] = "视频/音频解析异常"
raw_data["result"]["results"] = json.dumps(res_tmp, ensure_ascii=False)
logging.error(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
def zk_monitoring():
try:
        # Production environment
        zk = KazooClient(hosts='172.18.1.146:2181,172.18.1.147:2181,172.18.1.148:2181')
        # Test environment
        # zk = KazooClient(hosts='172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181')
        zk.start()
        # Register the watcher
@zk.DataWatch("/analyze")
def watch_node(data, stat, event):
if event is not None and event.type == EventType.CHANGED:
data, stat = zk.get("/analyze")
logging.info("执行删除操作:{}".format(data))
d = json.loads(data)
id = d["scenes_id"]
stop_dict[id] = {}
stop_dict[id]["version"] = d["version"]
stop_dict[id]["operation"] = d["operation"]
        # Keep the thread alive so the watcher keeps firing
try:
while True:
time.sleep(1)
except:
logging.info("Stopping...")
            # Close the connection
zk.stop()
zk.close()
except:
logging.error(traceback.format_exc())

16
text_analysis/wsgi.py

@ -0,0 +1,16 @@
"""
WSGI config for Zhijian_Project_WebService project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
application = get_wsgi_application()

8
uwsgi.ini

@ -0,0 +1,8 @@
[uwsgi]
http = 0.0.0.0:9014
chdir = ../asrNew
wsgi-file = ../asrNew/wsgi.py
processes = 1
threads = 2
listen = 1024
http-timeout = 21600
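; A sketch of reconciling the backlog warning recorded in wsgi.log below
; ("Listen queue size is greater than the system max net.core.somaxconn (128)"):
; either raise the kernel limit on the host, e.g. "sysctl -w net.core.somaxconn=1024"
; (an assumption about the deployment, not part of this commit), or keep the
; backlog within the kernel default:
; listen = 128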

58
wsgi.log

@ -0,0 +1,58 @@
*** Starting uWSGI 2.0.21 (64bit) on [Thu Jan 2 14:58:11 2025] ***
compiled with version: 11.2.0 on 24 October 2023 19:53:56
os: Linux-3.10.0-1127.19.1.el7.x86_64 #1 SMP Tue Aug 25 17:23:54 UTC 2020
nodename: node-04
machine: x86_64
clock source: unix
pcre jit disabled
detected number of CPU cores: 64
current working directory: /opt/analyze/apps/asrNew
detected binary path: /opt/analyze/environment/python3.8/bin/uwsgi
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
chdir() to ../asrNew
*** WARNING: you are running uWSGI without its master process manager ***
your processes number limit is 1031041
your memory page size is 4096 bytes
detected max file descriptor number: 65535
lock engine: pthread robust mutexes
thunder lock: disabled (you can enable it with --thunder-lock)
Listen queue size is greater than the system max net.core.somaxconn (128).
*** Starting uWSGI 2.0.21 (64bit) on [Thu Jan 2 15:05:08 2025] ***
compiled with version: 11.2.0 on 24 October 2023 19:53:56
os: Linux-3.10.0-1127.19.1.el7.x86_64 #1 SMP Tue Aug 25 17:23:54 UTC 2020
nodename: node-04
machine: x86_64
clock source: unix
pcre jit disabled
detected number of CPU cores: 64
current working directory: /opt/analyze/apps/asrNew
detected binary path: /opt/analyze/environment/python3.8/bin/uwsgi
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
chdir() to ../asrNew
*** WARNING: you are running uWSGI without its master process manager ***
your processes number limit is 1031041
your memory page size is 4096 bytes
detected max file descriptor number: 65535
lock engine: pthread robust mutexes
thunder lock: disabled (you can enable it with --thunder-lock)
uWSGI http bound on 0.0.0.0:9014 fd 4
spawned uWSGI http 1 (pid: 32756)
uwsgi socket 0 bound to TCP address 127.0.0.1:39733 (port auto-assigned) fd 3
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
Python version: 3.8.16 (default, Jun 12 2023, 18:09:05) [GCC 11.2.0]
Python main interpreter initialized at 0x22e11b0
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
python threads support enabled
your server socket listen backlog is limited to 1024 connections
your mercy for graceful operations on workers is 60 seconds
mapped 83376 bytes (81 KB) for 2 cores
*** Operational MODE: threaded ***
WSGI app 0 (mountpoint='') ready in 0 seconds on interpreter 0x22e11b0 pid: 32755 (default app)
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
*** uWSGI is running in multiple interpreter mode ***
spawned uWSGI worker 1 (and the only) (pid: 32755, cores: 2)

35
wsgi.py

@ -0,0 +1,35 @@
"""
WSGI config for Zhijian_Project_WebService project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
"""
import os
import threading
from text_analysis.views import upload,getResult,zk_monitoring
t = threading.Thread(target=upload, name='upload')
t.daemon = True
t.start()
r = threading.Thread(target=getResult, name='getResult')
r.daemon = True
r.start()
# Start the zk watcher thread
z = threading.Thread(target=zk_monitoring, name='zk_monitoring')
z.daemon = True
z.start()
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
application = get_wsgi_application()
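# Note: task_queue and data_queue in text_analysis.views are process-local, so
# this threading setup assumes the single-worker uwsgi.ini above (processes = 1);
# with more workers, each process would spawn its own queues and zk watcher.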

30
wsgi.py_0228

@ -0,0 +1,30 @@
"""
WSGI config for Zhijian_Project_WebService project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
"""
import os
import threading
from text_analysis.views import upload,getResult
t = threading.Thread(target=upload, name='upload')
t.daemon = True
t.start()
r = threading.Thread(target=getResult, name='getResult')
r.daemon = True
r.start()
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
application = get_wsgi_application()