算法暴露接口(xhs、dy、ks、wx、hnw)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

336 lines
16 KiB

7 months ago
  1. # coding:utf-8
  2. import json
  3. import time
  4. import os, sys
  5. import argparse
  6. # 相对路径补充
  7. root_path = os.path.abspath(os.path.dirname(__file__)).split('api-py')[0] + "api-py"
  8. sys.path.append(root_path)
  9. from concurrent.futures._base import as_completed
  10. from concurrent.futures.thread import ThreadPoolExecutor
  11. from urllib.parse import urlparse, quote_plus, parse_qs
  12. from utils.MysqlData import MysqlPoolClient, CRAWLER_DB_CONF_KS
  13. from kuaishou.ks_http import download_q, BaseHeaders, BaseParam, download_pic, BaseURL
  14. from utils.Logger import MyLogger
  15. from loguru import logger
  16. from kuaishou.ks_make_trace import Generate_trajectory
  17. from utils.ImageHelper import recognize_gap
  18. import traceback
  19. class KSSlide(object):
  20. """
  21. """
  22. def __init__(self, count=5, is_proxy=False):
  23. self.is_proxy = is_proxy
  24. self.count = count
  25. self.did_url = "https://www.kuaishou.com/short-video/3xdhjtb9xs7xpaw?authorId=3xsdu49r65skedk&streamSource=profile&area=profilexxnull"
  26. self.api_url = "https://www.kuaishou.com/graphql"
  27. self.headers = BaseHeaders.HEADERS.value
  28. self.doc_headers = BaseHeaders.DOC_HEADERS.value
  29. self.pic_headers = BaseHeaders.PIC_HEADERS.value
  30. self.verify_headers = BaseHeaders.VERIFY_HEADERS.value
  31. self.did = ""
  32. self.captchaSession = ""
  33. self.captcha_refer = ""
  34. self.sql_list = []
  35. self.post_data = {
  36. "search": {
  37. "referer": "",
  38. "data": {
  39. "operationName": "visionSearchPhoto",
  40. "variables": {
  41. "keyword": "f1",
  42. "pcursor": "",
  43. "page": "search"
  44. },
  45. "query": "fragment photoContent on PhotoEntity {\n __typename\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n}\n\nfragment recoPhotoFragment on recoPhotoEntity {\n __typename\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n ...recoPhotoFragment\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n tags {\n type\n name\n __typename\n }\n __typename\n}\n\nquery visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n searchSessionId\n pcursor\n aladdinBanner {\n imgUrl\n link\n __typename\n }\n __typename\n }\n}\n"
  46. },
  47. },
  48. "video": {},
  49. "user": {
  50. "referer": "https://www.kuaishou.com/profile/3xsdu49r65skedk",
  51. "data": {
  52. "operationName": "visionProfilePhotoList",
  53. "variables": {
  54. "userId": "3xsdu49r65skedk",
  55. "pcursor": "",
  56. "page": "profile"
  57. },
  58. "query": "fragment photoContent on PhotoEntity {\n __typename\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n}\n\nfragment recoPhotoFragment on recoPhotoEntity {\n __typename\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n ...recoPhotoFragment\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n tags {\n type\n name\n __typename\n }\n __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
  59. }
  60. },
  61. "comment": {
  62. "referer": "https://www.kuaishou.com/short-video/3xdhjtb9xs7xpaw?authorId=3xsdu49r65skedk&streamSource=profile&area=profilexxnull",
  63. "data": {
  64. "operationName": "commentListQuery",
  65. "variables": {
  66. "photoId": "3xdhjtb9xs7xpaw",
  67. "pcursor": ""
  68. },
  69. "query": "query commentListQuery($photoId: String, $pcursor: String) {\n visionCommentList(photoId: $photoId, pcursor: $pcursor) {\n commentCount\n pcursor\n rootComments {\n commentId\n authorId\n authorName\n content\n headurl\n timestamp\n likedCount\n realLikedCount\n liked\n status\n authorLiked\n subCommentCount\n subCommentsPcursor\n subComments {\n commentId\n authorId\n authorName\n content\n headurl\n timestamp\n likedCount\n realLikedCount\n liked\n status\n authorLiked\n replyToUserName\n replyTo\n __typename\n }\n __typename\n }\n __typename\n }\n}\n"
  70. }
  71. }
  72. }
  73. pass
  74. def get_did(self):
  75. """
  76. did
  77. """
  78. response = download_q(self.did_url, self.doc_headers, {}, is_proxy=self.is_proxy)
  79. self.did = response.cookies["did"]
  80. logger.info(f"获得did -> {self.did}")
  81. return self.did
  82. def get_session(self, did, token="", type_="user"):
  83. """
  84. comment接口拿session
  85. """
  86. if not did:
  87. if not self.did:
  88. return
  89. self.did = did
  90. headers = self.headers.copy()
  91. headers["Referer"] = self.post_data[type_]["referer"]
  92. if token:
  93. headers["Identity-Verification-Token"] = token
  94. headers["Identity-verification-type"] = "2"
  95. data = json.dumps(self.post_data[type_]["data"], separators=(',', ':'))
  96. cookies = {
  97. "kpf": "PC_WEB",
  98. "kpn": "KUAISHOU_VISION",
  99. "clientid": "3",
  100. "did": self.did
  101. }
  102. response = download_q(self.api_url, headers, cookies, data=data, is_proxy=self.is_proxy)
  103. res = response.json()
  104. status = res["data"].get("captcha")
  105. if status: # 需要验证码 非 400002 暂时抛弃40002状态码
  106. captchaSession_url = res["data"]["captcha"]["url"]
  107. params = parse_qs(urlparse(captchaSession_url).query)
  108. elif res["data"].get("result"):
  109. logger.warning("验证码异常 !400002 由于评论不出验证码 暂时放行! ")
  110. captchaSession_url = res["data"]["url"]
  111. params = parse_qs(urlparse(captchaSession_url).query)
  112. else: # 有数据或者为空 不是验证码问题
  113. s = res.get("data", {}).get("visionProfilePhotoList", {}).get("result")
  114. if s and s == 1: # 这里只对user生效
  115. logger.info(f"did 有效; 携带 token {token}")
  116. else:
  117. logger.warning(f"did 无效 {res}; 携带 token {token}")
  118. return None, None
  119. self.captchaSession = params["captchaSession"][0]
  120. self.captcha_refer = captchaSession_url
  121. # logger.info(f"获得captchaSession 》》 {self.captchaSession}")
  122. return self.captcha_refer, self.captchaSession
  123. def get_config(self, did, Referer, captchaSession, is_save=False):
  124. """
  125. """
  126. headers = self.verify_headers.copy()
  127. headers["Referer"] = Referer
  128. timeStamp = str(int(time.time()))
  129. cookies = {
  130. "did": did
  131. }
  132. url = "https://captcha.zt.kuaishou.com/rest/zt/captcha/sliding/config"
  133. data = {
  134. "captchaSession": captchaSession
  135. }
  136. response = download_q(url, headers, cookies, data=data, is_proxy=self.is_proxy)
  137. data = response.json()
  138. # logger.info(f"获得config -> {data}")
  139. captchaSn = data.get("captchaSn")
  140. bgPicUrl = data.get("bgPicUrl") + f"?captchaSn={captchaSn}"
  141. cutPicUrl = data.get("cutPicUrl") + f"?captchaSn={captchaSn}"
  142. self.pic_headers["Referer"] = Referer
  143. bgContent = download_q(bgPicUrl, self.pic_headers, cookies).content
  144. fgcontent = download_q(cutPicUrl, self.pic_headers, cookies).content
  145. if is_save:
  146. bg_path = f"tmp/{timeStamp}_bg_pic.jpg"
  147. fg_path = f"tmp/{timeStamp}_fg_pic.jpg"
  148. download_pic(bg_path, bgContent)
  149. download_pic(fg_path, fgcontent)
  150. else:
  151. bg_path = bgContent
  152. fg_path = fgcontent
  153. disY = data.get("disY")
  154. verifyParam = self.build_param(bg_path, fg_path, captchaSn, disY)
  155. return verifyParam
  156. def build_param(self, bg, fg, captchaSn, y):
  157. """
  158. """
  159. distance = recognize_gap(bg, fg)
  160. relativeX = int(distance * 0.46) # 缩放
  161. trajectory = Generate_trajectory().get_slide_track(int(distance * 1.76)) # 1.76 1.764 暂时测定稳定
  162. logger.info(f"缩放距离为 -> {relativeX}")
  163. param = BaseParam.VERIFY_PARAM.value
  164. param["captchaSn"] = captchaSn
  165. param["relativeX"] = relativeX
  166. param["relativeY"] = int(y * 0.46) # config接口里的y 缩放 136 * 56/122 ()
  167. param["trajectory"] = trajectory
  168. # param["gpuInfo"] = "" ## TODO: 需要随机替换
  169. # param["captchaExtraParam"] = ""
  170. def get_plaintext(t: dict):
  171. concat_order = ["captchaSn", "bgDisWidth", "bgDisHeight", "cutDisWidth", "cutDisHeight",
  172. "relativeX", "relativeY", "trajectory", "gpuInfo", "captchaExtraParam"]
  173. return "&".join([k + "=" + quote_plus(str(t[k])).replace("+", "%20") for k in concat_order])
  174. info = get_plaintext(param)
  175. verifyParam = KSSlide.encrypt(info)
  176. return verifyParam
  177. @staticmethod
  178. def encrypt(info):
  179. """
  180. node-js
  181. """
  182. url = BaseURL.NodeURL.value
  183. data = {
  184. "info": info
  185. }
  186. response = download_q(url, {}, {}, data=data)
  187. # logger.info(f"node服务获取加密数据")
  188. return response.text
  189. def verify(self, verifyParam, did, refer):
  190. headers = self.verify_headers.copy()
  191. headers["Referer"] = refer
  192. headers["Content-Type"] = "application/json"
  193. cookies = {
  194. "did": did
  195. }
  196. url = "https://captcha.zt.kuaishou.com/rest/zt/captcha/sliding/kSecretApiVerify"
  197. data1 = {
  198. "verifyParam": verifyParam
  199. }
  200. data = json.dumps(data1, separators=(',', ':'))
  201. response = download_q(url, headers, cookies, data=data, is_proxy=self.is_proxy)
  202. logger.info(f"verify 结果 -> {response.json()}")
  203. res = response.json()
  204. captcha_token = res.get("captchaToken", None)
  205. # logger.info(f"获得captchaToken为: {captcha_token}")
  206. return captcha_token
  207. def run(self):
  208. """
  209. """
  210. try:
  211. did = self.get_did()
  212. referer, session = self.get_session(did)
  213. if session:
  214. verifyParam = self.get_config(did, referer, session)
  215. if verifyParam:
  216. token = self.verify(verifyParam, did, referer)
  217. if token:
  218. logger.success(f"验证码成功: -> {did}")
  219. sql_demo = "INSERT INTO `ksCookie`(`cookie`, `token`) VALUES('%s', '%s');" % (did, token)
  220. self.sql_list.append(sql_demo)
  221. # logger.info(f"sql_demo {sql_demo}")
  222. self.get_session(did=did, token=token, type_="user")
  223. else:
  224. logger.error("verify 失败")
  225. else:
  226. logger.error("node 生成 参数失败")
  227. else:
  228. logger.error("获得 session 失败")
  229. except Exception as e:
  230. logger.error(e)
  231. traceback.print_exc()
  232. # def write_file(l):
  233. # with open("webdid.txt", "w") as f:
  234. # f.write("\n".join(l))
  235. # f.close()
  236. # logger.info("文件保存成功")
  237. def insert_data(sql_list):
  238. """
  239. :param sql_list:
  240. :return:
  241. """
  242. client = MysqlPoolClient(CRAWLER_DB_CONF_KS)
  243. for sql in sql_list:
  244. try:
  245. logger.success(f"insert cookie -> {sql}")
  246. client.getOne(sql)
  247. except Exception as e:
  248. logger.error(f"insert cookie -> {sql}")
  249. def create_by_thread(slid, count):
  250. """
  251. :param slid:
  252. :param count:
  253. :return:
  254. """
  255. with ThreadPoolExecutor(max_workers=2) as t:
  256. obj_list = []
  257. for i in range(count * 2):
  258. obj = t.submit(slid.run)
  259. obj_list.append(obj)
  260. insert_data(slide.sql_list)
  261. logger.info(f"[sum] 并发任务 需要生成数量 {count}, 实际抓取数量 {count*2}, 实际生成数量 {len(slide.sql_list)}, 成功率 {len(slid.sql_list)/(count*2)}")
  262. def create_by_for(slid, count):
  263. """
  264. for循环执行
  265. :param slid:
  266. :param count:
  267. :return:
  268. """
  269. num = 100
  270. i = 0
  271. while num > i:
  272. if len(slid.sql_list) >= count: # 超出目标数量结束
  273. break
  274. slid.run()
  275. i += 1
  276. insert_data(slide.sql_list)
  277. logger.info(f"[sum] 循环任务 需要生成数量 {count}, 实际抓取数量 {i}, 实际生成数量 {len(slide.sql_list)}, 成功率 {len(slide.sql_list)/i}")
  278. if __name__ == '__main__':
  279. slide = KSSlide(is_proxy=True)
  280. log = MyLogger()
  281. parser = argparse.ArgumentParser(description='pass kuaishou cookie slider')
  282. parser.add_argument('-c', type=int, default=10, help="needed cookie count;default count=10;")
  283. parser.add_argument('-m', type=str, default="0", help="method: {0:for, 1:thread}; default method=0;")
  284. args = parser.parse_args()
  285. method = {
  286. "0": create_by_for,
  287. "1": create_by_thread
  288. }
  289. args_count = args.c
  290. args_method = args.m
  291. method[args_method](slide, args_count) # 执行函数