爬取b站评论

打印 上一主题 下一主题

主题 886|帖子 886|积分 2658

本博客旨在分享关于爬虫技术的学习和实践经验,仅供学习使用,请使用爬虫技术的用户自行承担相应的法律责任,务必在进行任何网络数据抓取操作之前,仔细审查相关法律法规,并取得相应的授权或同意。请确保你的行为符合道德和法律的双重标准,尊重知识产权和网站的服务协议,仅将此技术应用于正当、合法的学习和研究目的。

  
1.单视频评论

爬取某个视频的评论时,只需要对请求载荷的w_rid和wts进行加密,然后请求网络拿到数据,拿到的数据不需要解密。
这里的加密我直接扣js代码破解,比较简单,首先在comment_url.js里面写加密函数。这里有一个参数是从本地的localstorage里面拿到的,这里我直接写死了,但其实也没用,因为o和i都是常量。
  1. function lt(e) {
  2.     ct = "wbi_img_urls";
  3.     var t, r, n = function(e) {
  4.         var t;
  5.         if (e.useAssignKey)
  6.             return {
  7.                 imgKey: e.wbiImgKey,
  8.                 subKey: e.wbiSubKey
  9.             };
  10.         var r = (null === (t = function(e) {
  11.             try {
  12.                 return "https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png"
  13.             } catch (e) {
  14.                 return null
  15.             }
  16.         }(ct)) || void 0 === t ? void 0 : t.split("-")) || []
  17.           , n = r[0]
  18.           , o = r[1]
  19.           , i = n ? ft(n) : e.wbiImgKey
  20.           , a = o ? ft(o) : e.wbiSubKey;
  21.         return {
  22.             imgKey: i,
  23.             subKey: a
  24.         }
  25.     }(arguments.length > 1 && void 0 !== arguments[1] ? arguments[1] : {
  26.         wbiImgKey: "",
  27.         wbiSubKey: ""
  28.     }), o = n.imgKey, i = n.subKey;
  29.     // ,o = '7cd084941338484aae1ad9425b84077c', i = '4932caff0ff746eab6f01bf08b70ac45';
  30.     if (o && i) {
  31.         for (var a = (t = o + i,
  32.         r = [],
  33.         [46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11, 36, 20, 34, 44, 52].forEach((function(e) {
  34.             t.charAt(e) && r.push(t.charAt(e))
  35.         }
  36.         )),
  37.         r.join("").slice(0, 32)), u = Math.round(Date.now() / 1e3), s = Object.assign({}, e, {
  38.             wts: u
  39.         }), c = Object.keys(s).sort(), l = [], f = /[!'()*]/g, d = 0; d < c.length; d++) {
  40.             var p = c[d]
  41.               , h = s[p];
  42.             h && "string" == typeof h && (h = h.replace(f, "")),
  43.             null != h && l.push("".concat(encodeURIComponent(p), "=").concat(encodeURIComponent(h)))
  44.         }
  45.         var y = l.join("&");
  46.         return {
  47.             w_rid: at(y + a),
  48.             wts: u.toString()
  49.         }
  50.     }
  51.     return "sssss"
  52. }
  53. function ft(e) {
  54.     return e.substring(e.lastIndexOf("/") + 1, e.length).split(".")[0]
  55. }
  56. r = function() {
  57.     return e
  58. }
  59. EwordsToBytes=function(e) {
  60.     console.log(e)
  61.     for (var t = [], r = 0; r < 32 * e.length; r += 8)
  62.         t.push(e[r >>> 5] >>> 24 - r % 32 & 255);
  63.     return t
  64. }
  65. EbytesToWords=function(e) {
  66.     for (var t = [], r = 0, n = 0; r < e.length; r++,
  67.     n += 8)
  68.         t[n >>> 5] |= e[r] << 24 - n % 32;
  69.     return t
  70. }
  71. TstringToBytes=function(e) {
  72.     return NstringToBytes(unescape(encodeURIComponent(e)))
  73. },
  74. TbytesToString=function(e) {
  75.     return decodeURIComponent(escape(rt.bin.bytesToString(e)))
  76. }
  77. NstringToBytes= function(e) {
  78.     for (var t = [], r = 0; r < e.length; r++)
  79.         t.push(255 & e.charCodeAt(r));
  80.     return t
  81. },
  82. NbytesToString=function(e) {
  83.     for (var t = [], r = 0; r < e.length; r++)
  84.         t.push(String.fromCharCode(e[r]));
  85.     return t.join("")
  86. }
  87. function hFF(e, t, r, n, o, i, a) {
  88.     var u = e + (t & r | ~t & n) + (o >>> 0) + a;
  89.     return (u << i | u >>> 32 - i) + t
  90. }
  91. function yGG(e, t, r, n, o, i, a) {
  92.     var u = e + (t & n | r & ~n) + (o >>> 0) + a;
  93.     return (u << i | u >>> 32 - i) + t
  94. }
  95. function vHH(e, t, r, n, o, i, a) {
  96.     var u = e + (t ^ r ^ n) + (o >>> 0) + a;
  97.     return (u << i | u >>> 32 - i) + t
  98. }
  99. function bII(e, t, r, n, o, i, a) {
  100.     var u = e + (r ^ (t | ~n)) + (o >>> 0) + a;
  101.     return (u << i | u >>> 32 - i) + t
  102. }
  103. o = function o(i, a) {
  104.     i.constructor == String ? i = a && "binary" === a.encoding ? NstringToBytes(i) : TstringToBytes(i) : r(i) ? i = Array.prototype.slice.call(i, 0) : Array.isArray(i) || i.constructor === Uint8Array || (i = i.toString());
  105.     for (var u = EbytesToWords(i), s = 8 * i.length, c = 1732584193, l = -271733879, f = -1732584194, d = 271733878, p = 0; p < u.length; p++)
  106.         u[p] = 16711935 & (u[p] << 8 | u[p] >>> 24) | 4278255360 & (u[p] << 24 | u[p] >>> 8);
  107.     u[s >>> 5] |= 128 << s % 32,
  108.     u[14 + (s + 64 >>> 9 << 4)] = s;
  109.     var h = o._ff
  110.       , y = o._gg
  111.       , v = o._hh
  112.       , b = o._ii;
  113.     for (p = 0; p < u.length; p += 16) {
  114.         var m = c
  115.           , w = l
  116.           , g = f
  117.           , x = d;
  118.         c = hFF(c, l, f, d, u[p + 0], 7, -680876936),
  119.         d = hFF(d, c, l, f, u[p + 1], 12, -389564586),
  120.         f = hFF(f, d, c, l, u[p + 2], 17, 606105819),
  121.         l = hFF(l, f, d, c, u[p + 3], 22, -1044525330),
  122.         c = hFF(c, l, f, d, u[p + 4], 7, -176418897),
  123.         d = hFF(d, c, l, f, u[p + 5], 12, 1200080426),
  124.         f = hFF(f, d, c, l, u[p + 6], 17, -1473231341),
  125.         l = hFF(l, f, d, c, u[p + 7], 22, -45705983),
  126.         c = hFF(c, l, f, d, u[p + 8], 7, 1770035416),
  127.         d = hFF(d, c, l, f, u[p + 9], 12, -1958414417),
  128.         f = hFF(f, d, c, l, u[p + 10], 17, -42063),
  129.         l = hFF(l, f, d, c, u[p + 11], 22, -1990404162),
  130.         c = hFF(c, l, f, d, u[p + 12], 7, 1804603682),
  131.         d = hFF(d, c, l, f, u[p + 13], 12, -40341101),
  132.         f = hFF(f, d, c, l, u[p + 14], 17, -1502002290),
  133.         c = yGG(c, l = hFF(l, f, d, c, u[p + 15], 22, 1236535329), f, d, u[p + 1], 5, -165796510),
  134.         d = yGG(d, c, l, f, u[p + 6], 9, -1069501632),
  135.         f = yGG(f, d, c, l, u[p + 11], 14, 643717713),
  136.         l = yGG(l, f, d, c, u[p + 0], 20, -373897302),
  137.         c = yGG(c, l, f, d, u[p + 5], 5, -701558691),
  138.         d = yGG(d, c, l, f, u[p + 10], 9, 38016083),
  139.         f = yGG(f, d, c, l, u[p + 15], 14, -660478335),
  140.         l = yGG(l, f, d, c, u[p + 4], 20, -405537848),
  141.         c = yGG(c, l, f, d, u[p + 9], 5, 568446438),
  142.         d = yGG(d, c, l, f, u[p + 14], 9, -1019803690),
  143.         f = yGG(f, d, c, l, u[p + 3], 14, -187363961),
  144.         l = yGG(l, f, d, c, u[p + 8], 20, 1163531501),
  145.         c = yGG(c, l, f, d, u[p + 13], 5, -1444681467),
  146.         d = yGG(d, c, l, f, u[p + 2], 9, -51403784),
  147.         f = yGG(f, d, c, l, u[p + 7], 14, 1735328473),
  148.         c = vHH(c, l = yGG(l, f, d, c, u[p + 12], 20, -1926607734), f, d, u[p + 5], 4, -378558),
  149.         d = vHH(d, c, l, f, u[p + 8], 11, -2022574463),
  150.         f = vHH(f, d, c, l, u[p + 11], 16, 1839030562),
  151.         l = vHH(l, f, d, c, u[p + 14], 23, -35309556),
  152.         c = vHH(c, l, f, d, u[p + 1], 4, -1530992060),
  153.         d = vHH(d, c, l, f, u[p + 4], 11, 1272893353),
  154.         f = vHH(f, d, c, l, u[p + 7], 16, -155497632),
  155.         l = vHH(l, f, d, c, u[p + 10], 23, -1094730640),
  156.         c = vHH(c, l, f, d, u[p + 13], 4, 681279174),
  157.         d = vHH(d, c, l, f, u[p + 0], 11, -358537222),
  158.         f = vHH(f, d, c, l, u[p + 3], 16, -722521979),
  159.         l = vHH(l, f, d, c, u[p + 6], 23, 76029189),
  160.         c = vHH(c, l, f, d, u[p + 9], 4, -640364487),
  161.         d = vHH(d, c, l, f, u[p + 12], 11, -421815835),
  162.         f = vHH(f, d, c, l, u[p + 15], 16, 530742520),
  163.         c = bII(c, l = vHH(l, f, d, c, u[p + 2], 23, -995338651), f, d, u[p + 0], 6, -198630844),
  164.         d = bII(d, c, l, f, u[p + 7], 10, 1126891415),
  165.         f = bII(f, d, c, l, u[p + 14], 15, -1416354905),
  166.         l = bII(l, f, d, c, u[p + 5], 21, -57434055),
  167.         c = bII(c, l, f, d, u[p + 12], 6, 1700485571),
  168.         d = bII(d, c, l, f, u[p + 3], 10, -1894986606),
  169.         f = bII(f, d, c, l, u[p + 10], 15, -1051523),
  170.         l = bII(l, f, d, c, u[p + 1], 21, -2054922799),
  171.         c = bII(c, l, f, d, u[p + 8], 6, 1873313359),
  172.         d = bII(d, c, l, f, u[p + 15], 10, -30611744),
  173.         f = bII(f, d, c, l, u[p + 6], 15, -1560198380),
  174.         l = bII(l, f, d, c, u[p + 13], 21, 1309151649),
  175.         c = bII(c, l, f, d, u[p + 4], 6, -145523070),
  176.         d = bII(d, c, l, f, u[p + 11], 10, -1120210379),
  177.         f = bII(f, d, c, l, u[p + 2], 15, 718787259),
  178.         l = bII(l, f, d, c, u[p + 9], 21, -343485551),
  179.         c = c + m >>> 0,
  180.         l = l + w >>> 0,
  181.         f = f + g >>> 0,
  182.         d = d + x >>> 0
  183.     }
  184.     return endian([c, l, f, d])
  185. };
  186. rotl=function(e, t) {
  187.     return e << t | e >>> 32 - t
  188. },
  189. rotr=function(e, t) {
  190.     return e << 32 - t | e >>> t
  191. }
  192. function endian(e) {
  193.     if (e.constructor == Number)
  194.         return 16711935 & rotl(e, 8) | 4278255360 & rotl(e, 24);
  195.     for (var r = 0; r < e.length; r++)
  196.         e[r] = endian(e[r]);
  197.     return e
  198. }
  199. bytesToHex=function(e) {
  200.     for (var t = [], r = 0; r < e.length; r++)
  201.         t.push((e[r] >>> 4).toString(16)),
  202.         t.push((15 & e[r]).toString(16));
  203.     return t.join("")
  204. }
  205. var Qe= function(t, r) {
  206.     if (null == t)
  207.         throw new Error("Illegal argument " + t);
  208.     var i = EwordsToBytes(o(t, r));
  209.     return r && r.asBytes ? i : r && r.asString ? NbytesToString(i) : bytesToHex(i)
  210. }
  211. function Ze(e) {
  212.     return e && e.__esModule && Object.prototype.hasOwnProperty.call(e, "default") ? e.default : e
  213. }
  214. var at = Ze(Qe)
  215. // e={
  216. //     "oid": "1906333968",
  217. //     "type": 1,
  218. //     "mode": 3,
  219. //     "pagination_str": "{"offset":"{\\"type\\":1,\\"direction\\":1,\\"session_id\\":\\"1778169679258543\\",\\"data\\":{}}"}",
  220. //     "plat": 1,
  221. //     "web_location": 1315875
  222. // }
  223. e={
  224.     "oid": "1906333968",
  225.     "type": 1,
  226.     "mode": 3,
  227.     "pagination_str": "{"offset":""}",
  228.     "plat": 1,
  229.     "seek_rpid": "",
  230.     "web_location": 1315875
  231. }
  232. console.log(lt(e))
复制代码
然后在py里面调用js文件,获得加密后的载荷数据,然后请求,并对数据进行分析和保存,这里我存成了csv文件。这里需要注意的是参数有的是字符串有的是数字,所以要严格按照输出的格式,否则加密结果不一致,通不过验证;还有就是pagination_str的格式,一定要按照控制台的输出格式写,否则验证失败。评论是懒加载的,第一页拿到session_id后,后续请求时会带着session_id。
  1. import requests
  2. import urllib.parse
  3. import csv
  4. import execjs
  5. oid="1906333968"#必须是string类型啊!!!
  6. web_location= 1315875#必须是整数类型啊!!!
  7. session_id=""#第一页无session_id
  8. cookies = {
  9.     'buvid3': '6C16A34E-4B78-F350-03AA-71E6B21A703519906infoc',
  10.     'b_nut': '1726211919',
  11.     '_uuid': '828DDCCD-F3CD-3997-11077-1729B6881A6120884infoc',
  12.     'enable_web_push': 'DISABLE',
  13.     'buvid4': '3CB58DB4-B2F0-07C1-06FA-E452949C4A8942274-024082300-j2Owk+KrE1E0oCXj+7DzqA%3D%3D',
  14.     'header_theme_version': 'CLOSE',
  15.     'rpdid': "|(u|kkmlu~ll0J'u~kYkukl|m",
  16.     'fingerprint': '65fbd3ec7ea1fba4aa76eb96cb7f6249',
  17.     'buvid_fp_plain': 'undefined',
  18.     'buvid_fp': '65fbd3ec7ea1fba4aa76eb96cb7f6249',
  19.     'DedeUserID': '37611353',
  20.     'DedeUserID__ckMd5': 'af2f5320e5c29dea',
  21.     'home_feed_column': '5',
  22.     'browser_resolution': '2048-1023',
  23.     'bili_ticket': 'eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MzY0OTQzNTgsImlhdCI6MTczNjIzNTA5OCwicGx0IjotMX0.UA_DNnfYHwmuWf3mk3zAc45Ar6QrABl70LmFhjli-ms',
  24.     'bili_ticket_expires': '1736494298',
  25.     'SESSDATA': 'dcf4f3e6%2C1751950746%2C36331%2A12CjDsO7miIb_M9f1MQIIa7qIN5AucRW-WAnR_3eKJ6r4sPE3wgHTKNDZEFG6BeHrHqg4SVmd5YlUxTDM0NVdRQ3hHZHhNMFkyS0JQbjhvRWh2U0RTZElXVHFWSy1ZYkZVYzVHSlNhSWV4WUMxV0pRMHB1ZkV6TEFhd1RfaEZqVG90dUJvazNEUVV3IIEC',
  26.     'bili_jct': '91812d98065f2f1035dfb5271f1057b6',
  27.     'CURRENT_FNVAL': '4048',
  28.    
  29.     #TODO
  30.     'sid': '6rzu47nf',#8位
  31.     'b_lsid': 'EF10A7B92_1944D9191B0',#位
  32.     'bp_t_offset_37611353': '1020612344109072384',#位
  33.    
  34. }
  35. headers = {
  36.     'accept': '*/*',
  37.     'accept-language': 'zh-CN,zh;q=0.9',
  38.     'cache-control': 'no-cache',
  39.     # 'cookie': "buvid3=6C16A34E-4B78-F350-03AA-71E6B21A703519906infoc; b_nut=1726211919; _uuid=828DDCCD-F3CD-3997-11077-1729B6881A6120884infoc; enable_web_push=DISABLE; buvid4=3CB58DB4-B2F0-07C1-06FA-E452949C4A8942274-024082300-j2Owk+KrE1E0oCXj+7DzqA%3D%3D; header_theme_version=CLOSE; rpdid=|(u|kkmlu~ll0J'u~kYkukl|m; fingerprint=65fbd3ec7ea1fba4aa76eb96cb7f6249; buvid_fp_plain=undefined; buvid_fp=65fbd3ec7ea1fba4aa76eb96cb7f6249; DedeUserID=37611353; DedeUserID__ckMd5=af2f5320e5c29dea; home_feed_column=5; browser_resolution=2048-1023; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MzY0OTQzNTgsImlhdCI6MTczNjIzNTA5OCwicGx0IjotMX0.UA_DNnfYHwmuWf3mk3zAc45Ar6QrABl70LmFhjli-ms; bili_ticket_expires=1736494298; SESSDATA=dcf4f3e6%2C1751950746%2C36331%2A12CjDsO7miIb_M9f1MQIIa7qIN5AucRW-WAnR_3eKJ6r4sPE3wgHTKNDZEFG6BeHrHqg4SVmd5YlUxTDM0NVdRQ3hHZHhNMFkyS0JQbjhvRWh2U0RTZElXVHFWSy1ZYkZVYzVHSlNhSWV4WUMxV0pRMHB1ZkV6TEFhd1RfaEZqVG90dUJvazNEUVV3IIEC; bili_jct=91812d98065f2f1035dfb5271f1057b6; CURRENT_FNVAL=4048; sid=6rzu47nf; b_lsid=EF10A7B92_1944D9191B0; bp_t_offset_37611353=1020612344109072384",
  40.     'origin': 'https://www.bilibili.com',
  41.     'pragma': 'no-cache',
  42.     'priority': 'u=1, i',
  43.     'referer': 'https://www.bilibili.com/video/BV1xU411U7PW/?spm_id_from=333.1391.0.0&vd_source=fd84ddc58aead0485969c92933b61484',
  44.     'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
  45.     'sec-ch-ua-mobile': '?0',
  46.     'sec-ch-ua-platform': '"Windows"',
  47.     'sec-fetch-dest': 'empty',
  48.     'sec-fetch-mode': 'cors',
  49.     'sec-fetch-site': 'same-site',
  50.     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  51. }
  52. def save2csv(*args):
  53.     if len(args) < 11:
  54.         raise ValueError("参数错误.")
  55.     with open(f"{args[10]}_{args[11]}.csv", "a", newline='',encoding="utf-8") as f:
  56.         f_csv=csv.writer(f)#写入缓存
  57.         if f.tell() == 0:  # 检查文件是否为空,如果是,则先写入表头
  58.             headers_csv=['rpid','replay_count','message','like','avatar','sex','uname','oid','parent','is_end','all_count','name']
  59.             f_csv.writerow(headers_csv)
  60.         data = list(args)
  61.         f_csv.writerow(data)#写入一行
  62. def handle_content(list_comment,is_end,all_count,name):
  63.     for comment in list_comment:
  64.         rpid=comment["rpid"]#我的id
  65.         count=comment["count"]
  66.         replay_count=comment["rcount"]#回复数
  67.         message=comment["content"]["message"]#回复内容
  68.         like=comment["like"]#点赞数
  69.         avatar=comment["member"]["avatar"]#回复者头像
  70.         sex=comment["member"]["sex"]#回复者性别
  71.         uname=comment["member"]["uname"]#回复者昵称
  72.         oid=comment["oid"]#我以及我的回复者们共用id
  73.         parent=comment["parent"]#回复者id
  74.         if comment["replies"]:
  75.             replies=handle_content(comment["replies"],is_end,all_count,name)
  76.         # save2csv(rpid,replay_count,message,like,avatar,sex,uname,oid,parent,is_end,all_count,name)
  77.         print(rpid,replay_count,message,like,avatar,sex,uname,oid,parent)
  78. def handle_cursor(cursor):
  79.     is_end=cursor["is_end"]#是否最后一页
  80.     all_count=cursor["all_count"]#总评论数
  81.     name=cursor["name"]#热门评论
  82.     return is_end,all_count,name
  83. def get_params(session_id):#{"offset":"{"type":1,"direction":1,"session_id":"1778143604964054","data":{}}"}
  84.     pagination_str = "{"offset":"{\\"type\\":1,\\"direction\\":1,\\"session_id\\":\\""+str(session_id)+"\\",\\"data\\":{}}"}" if session_id else '{"offset":""}'
  85.     params={
  86.     "oid": oid,
  87.     "type": 1,
  88.     "mode": 3,
  89.     "pagination_str": pagination_str,
  90.     "plat": 1,
  91.     'seek_rpid': '',
  92.     "web_location": web_location
  93.     }
  94.     ctx=execjs.compile(open('./bili/comment_url.js','r',encoding='utf-8').read()).call('lt',params)
  95.     params.update({
  96.         'w_rid': ctx["w_rid"],
  97.         'wts': ctx["wts"]
  98.     })
  99.     return params
  100. if __name__=="__main__":
  101.     count=1
  102.     while True:
  103.         params=get_params(session_id)
  104.         print(params)
  105.         response = requests.get(
  106.         'https://api.bilibili.com/x/v2/reply/wbi/main',
  107.         cookies=cookies,
  108.         headers=headers,
  109.         params=params,
  110.     )
  111.         # print(response.text)
  112.         is_end,all_count,name=handle_cursor(response.json()["data"]["cursor"])
  113.         handle_content(response.json()["data"]["replies"],is_end,all_count,name)
  114.         print(f"第{count}页爬完了")
  115.         count+=1
  116.         if is_end==True:
  117.             print(f"爬取完成,一共有{all_count}条")
  118.             break
复制代码
2.单视频弹幕


免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。
回复

使用道具 举报

0 个回复

倒序浏览

快速回复

您需要登录后才可以回帖 登录 or 立即注册

本版积分规则

钜形不锈钢水箱

金牌会员
这个人很懒什么都没写!

标签云

快速回复 返回顶部 返回列表