基于百度翻译的python爬虫示例

打印 上一主题 下一主题

主题 1007|帖子 1007|积分 3021

(今年java工作真难找啊,有广州java高级岗位招人的好心人麻烦推一下,拜谢。。)
花了一周时间,从零基础开始学习了python,学有所获之后,就总想爬些什么,否则感觉不得劲,所以花了一天时间整出了个百度翻译的爬虫示例,主要卡点在于找token、sign以及调试请求。代码有点乱,毕竟是demo,但是功能是实现了的。
  1. import requests
  2. import js2py
  3. import re
  4. from urllib.parse import urlencode
  5. url = "https://fanyi.baidu.com/#zh/en/"
  6. session  = requests.session()
  7. headers = {
  8.     'Content-Type': 'application/x-www-form-urlencoded',
  9.     'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
  10. }
  11. cookies = {
  12.     'BAIDUID': '624820D8D9163F370A491E7CA70C23D4:SL=0:NR=10:FG=1',
  13. }
  14. response = session.get(url,headers=headers,cookies=cookies)
  15. print(dict(response.cookies))
  16. with open('baidu.html', 'w') as f:
  17.     f.write(response.content.decode())
  18. token_pattern = r"token:\s*'([a-f0-9]+)'"
  19. token = re.search(token_pattern, response.content.decode()).group(1)
  20. gtk_pattern = "gtk:\s*'([^']+)'"
  21. gtk = re.search(gtk_pattern, response.content.decode()).group(1)
  22. print(token)
  23. print(gtk)
  24. # 获取sign
  25. context = js2py.EvalJs()
  26. public_js = ""
  27. with open('public.js', 'r') as f:
  28.     public_js += f.read()
  29. context.execute(public_js)
  30. context.wd = '好好学习,天天向上'
  31. context.token = token
  32. context.gtk = gtk
  33. sug_response = session.post("https://fanyi.baidu.com/sug", data={'kw': context.wd}, headers=headers)
  34. print(sug_response.json())
  35. context.execute("""
  36.      function n(r, o) {
  37.         for (var t = 0; t < o.length - 2; t += 3) {
  38.             var e = o.charAt(t + 2);
  39.             e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e),
  40.             e = "+" === o.charAt(t + 1) ? r >>> e : r << e,
  41.             r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e
  42.         }
  43.         return r
  44.     }
  45.      function a(r) {
  46.    
  47.         var a = r.length;
  48.         a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10))
  49.       
  50.         var l = void 0
  51.           , d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
  52.         l = gtk;
  53.         for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) {
  54.             var p = r.charCodeAt(F);
  55.             128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)),
  56.             c[v++] = p >> 18 | 240,
  57.             c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224,
  58.             c[v++] = p >> 6 & 63 | 128),
  59.             c[v++] = 63 & p | 128)
  60.         }
  61.         for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++)
  62.             w += c[D],
  63.             w = n(w, A);
  64.         return w = n(w, b),
  65.         w ^= s,
  66.         0 > w && (w = (2147483647 & w) + 2147483648),
  67.         w %= 1e6,
  68.         w.toString() + "." + (w ^ S)
  69.     }
  70.     var sign = a(wd)
  71. """)
  72. print(context.sign)
  73. url = 'https://fanyi.baidu.com/basetrans'
  74. data = {
  75.    "query": context.wd,
  76.    "from": "zh",
  77.    "to": "en",
  78.    "token": token,
  79.    "sign": context.sign
  80. }
  81. encoded_data = urlencode(data)
  82. print(cookies)
  83. print(encoded_data)
  84. headers = {
  85.     'Content-Type': 'application/x-www-form-urlencoded',
  86.     'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
  87. }
  88. # session请求会更改user-agent {'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
  89. print(session.headers)
  90. response = requests.post(url, headers = headers,cookies=cookies,data=data, verify=True)
  91. print(response.json())
  92. """
  93. wd=全家的执行结果:
  94. {}
  95. 3d7980a56760ca30e97aeeeda8e8fc6d
  96. 320305.131321201
  97. {'errno': 0, 'data': [{'k': '全家福', 'v': '(全家合影) a photograph of the whole family; (中餐菜名) ho'}, {'k': '全家团聚', 'v': '动. whole family gather'}], 'logid': 2318810217}
  98. 681757.951340
  99. {'BAIDUID': '624820D8D9163F370A491E7CA70C23D4:SL=0:NR=10:FG=1'}
  100. query=%E5%85%A8%E5%AE%B6&from=zh&to=en&token=3d7980a56760ca30e97aeeeda8e8fc6d&sign=681757.951340
  101. {'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
  102. {'errno': 0, 'from': 'zh', 'to': 'en', 'trans': [{'dst': 'whole family', 'prefixWrap': 0, 'result': [[0, 'whole family', ['0|6'], [], ['0|6'], ['0|12']]], 'src': '全家'}], 'dict': {'symbols': [{'word_symbol': 'quán jiā', 'parts': [{'part_name': '名', 'means': [{'text': 'the whole family', 'word_mean': 'the whole family'}]}]}], 'word_name': '全家', 'from': 'green', 'word_means': ['the whole family']}, 'keywords': []}
  103. """
复制代码
最新版本python3.13不支持js2py模块,所以我切换到了3.8版本


免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息请访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。

本帖子中包含更多资源

您需要 登录 才可以下载或查看,没有账号?立即注册

x
回复

使用道具 举报

0 个回复

倒序浏览

快速回复

您需要登录后才可以回帖 登录 or 立即注册

本版积分规则

美食家大橙子

论坛元老
这个人很懒什么都没写!
快速回复 返回顶部 返回列表