本博客旨在分享爬虫技术的学习和实践经验,仅供学习使用。使用爬虫技术的用户须自行承担相应的法律责任;在进行任何网络数据抓取操作之前,务必仔细查阅相关法律法规,并取得相应的授权或同意。请确保你的行为符合道德和法律的双重标准,尊重知识产权和网站的服务协议,仅将此技术应用于正当、合法的学习和研究目的。
1.单视频评论
爬取某个视频的评论时,只需要对请求载荷的w_rid和wts进行加密,然后请求网络拿到数据,拿到的数据不需要解密。
这里的加密我直接扣js代码破解,比较简单,首先在comment_url.js里面写加密函数。这里有一个参数是从本地的localstorage里面拿到的,这里我直接写死了,但其实也没用,因为o和i都是常量。
/**
 * Builds the WBI signature fields (`w_rid`, `wts`) for a request payload.
 *
 * The image key and sub key are concatenated, shuffled through a fixed index
 * table and truncated to 32 characters. That mixin key is appended to the
 * sorted, URL-encoded query string (which includes the current `wts`
 * timestamp in seconds) and the result is hashed with `at`.
 *
 * @param {Object} e - Request parameters to sign. The object is not mutated.
 * @param {Object} [keyOptions] - Optional second positional argument
 *   `{useAssignKey, wbiImgKey, wbiSubKey}`. With `useAssignKey` set, the two
 *   keys are used as given; otherwise they are only fallbacks for a key that
 *   cannot be derived from the built-in image URLs.
 * @returns {{w_rid: string, wts: string}|string} The signature fields, or the
 *   placeholder string "sssss" when either key is empty.
 */
function lt(e) {
  // Order in which characters of imgKey + subKey are picked to form the mixin key.
  var MIXIN_KEY_ORDER = [46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11, 36, 20, 34, 44, 52];
  // Characters removed from string values before they are URL-encoded.
  var STRIPPED_CHARS = /[!'()*]/g;
  // Hard-coded "imgUrl-subUrl" pair; the key of each is the file name without extension.
  var WBI_IMG_URLS = "https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png";

  // `arguments` keeps the declared arity at 1, exactly like the original.
  var keyOptions = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {
    wbiImgKey: "",
    wbiSubKey: ""
  };

  var imgKey;
  var subKey;
  if (keyOptions.useAssignKey) {
    imgKey = keyOptions.wbiImgKey;
    subKey = keyOptions.wbiSubKey;
  } else {
    var urls = WBI_IMG_URLS.split("-");
    imgKey = urls[0] ? ft(urls[0]) : keyOptions.wbiImgKey;
    subKey = urls[1] ? ft(urls[1]) : keyOptions.wbiSubKey;
  }

  if (!imgKey || !subKey) {
    return "sssss";
  }

  // charAt() yields "" for out-of-range indexes, so short keys simply contribute fewer characters.
  var rawKey = imgKey + subKey;
  var mixinKey = MIXIN_KEY_ORDER.map(function(index) {
    return rawKey.charAt(index);
  }).join("").slice(0, 32);

  var wts = Math.round(Date.now() / 1e3);
  var signedParams = Object.assign({}, e, {
    wts: wts
  });

  var pairs = [];
  Object.keys(signedParams).sort().forEach(function(key) {
    var value = signedParams[key];
    if (value && "string" == typeof value) {
      value = value.replace(STRIPPED_CHARS, "");
    }
    // null/undefined parameters are left out of the signed query entirely.
    if (null != value) {
      pairs.push(encodeURIComponent(key) + "=" + encodeURIComponent(value));
    }
  });

  return {
    w_rid: at(pairs.join("&") + mixinKey),
    wts: wts.toString()
  };
}
/**
 * Extracts the key from an image URL: the last path segment, cut at its
 * first dot.
 *
 * @param {string} e - URL or path.
 * @returns {string} File name without anything from the first "." onwards.
 */
function ft(e) {
  var fileName = e.slice(e.lastIndexOf("/") + 1);
  var firstDot = fileName.indexOf(".");
  return firstDot === -1 ? fileName : fileName.slice(0, firstDot);
}
/**
 * Reports whether a value is a Buffer-like object, i.e. its constructor
 * exposes an `isBuffer` function that accepts the value.
 *
 * The previous stub ignored its argument and returned an unrelated global,
 * so every non-string input to the hash function was treated as a buffer.
 *
 * @param {*} e - Value to inspect.
 * @returns {boolean} True only for Buffer-like values.
 */
var r = function(e) {
  return null != e && null != e.constructor && "function" == typeof e.constructor.isBuffer && e.constructor.isBuffer(e);
};
/**
 * Splits 32-bit words into bytes, most significant byte first.
 *
 * @param {number[]} e - Array of 32-bit words.
 * @returns {number[]} Four bytes (0-255) per input word.
 */
var EwordsToBytes = function(e) {
  var bytes = [];
  for (var bit = 0; bit < 32 * e.length; bit += 8) {
    // bit >>> 5 selects the word, 24 - bit % 32 the shift for the current byte.
    bytes.push(e[bit >>> 5] >>> 24 - bit % 32 & 255);
  }
  return bytes;
};
/**
 * Packs bytes into 32-bit words, most significant byte first.
 *
 * @param {ArrayLike<number>} e - Byte values.
 * @returns {number[]} Signed 32-bit words; a trailing partial word is
 *   zero-padded on the right.
 */
var EbytesToWords = function(e) {
  var words = [];
  for (var index = 0, bit = 0; index < e.length; index++, bit += 8) {
    // An unset slot is undefined, which `|=` treats as 0.
    words[bit >>> 5] |= e[index] << 24 - bit % 32;
  }
  return words;
};
/**
 * Encodes a string as UTF-8 and returns the bytes.
 *
 * @param {string} e - Text to encode.
 * @returns {number[]} UTF-8 byte values.
 */
var TstringToBytes = function(e) {
  // unescape(encodeURIComponent(s)) turns each UTF-8 byte into one character.
  return NstringToBytes(unescape(encodeURIComponent(e)));
};

/**
 * Decodes UTF-8 bytes back into a string.
 *
 * @param {ArrayLike<number>} e - UTF-8 byte values.
 * @returns {string} Decoded text.
 */
var TbytesToString = function(e) {
  // Uses this file's NbytesToString; the former `rt.bin.bytesToString` is not defined here.
  return decodeURIComponent(escape(NbytesToString(e)));
};
/**
 * Converts a string to bytes, one per UTF-16 code unit, keeping only the low
 * 8 bits of each unit.
 *
 * @param {string} e - Input string.
 * @returns {number[]} Byte values (0-255).
 */
var NstringToBytes = function(e) {
  var bytes = [];
  for (var index = 0; index < e.length; index++) {
    bytes.push(255 & e.charCodeAt(index));
  }
  return bytes;
};

/**
 * Converts bytes to a string, one character per byte.
 *
 * @param {ArrayLike<number>} e - Byte values.
 * @returns {string} String whose char codes are the given bytes.
 */
var NbytesToString = function(e) {
  var chars = [];
  for (var index = 0; index < e.length; index++) {
    chars.push(String.fromCharCode(e[index]));
  }
  return chars.join("");
};
/**
 * Round-1 step of the hash: mixes `e` with (t AND r) OR (NOT t AND n), the
 * message word `o` and constant `a`, rotates left by `i` bits and adds `t`.
 *
 * @returns {number} Updated state value (not truncated to 32 bits).
 */
function hFF(e, t, r, n, o, i, a) {
  var selected = t & r | ~t & n;
  var sum = e + selected + (o >>> 0) + a;
  var rotated = sum << i | sum >>> 32 - i;
  return rotated + t;
}
/**
 * Round-2 step of the hash: mixes `e` with (t AND n) OR (r AND NOT n), the
 * message word `o` and constant `a`, rotates left by `i` bits and adds `t`.
 *
 * @returns {number} Updated state value (not truncated to 32 bits).
 */
function yGG(e, t, r, n, o, i, a) {
  var selected = t & n | r & ~n;
  var sum = e + selected + (o >>> 0) + a;
  var rotated = sum << i | sum >>> 32 - i;
  return rotated + t;
}
/**
 * Round-3 step of the hash: mixes `e` with t XOR r XOR n, the message word
 * `o` and constant `a`, rotates left by `i` bits and adds `t`.
 *
 * @returns {number} Updated state value (not truncated to 32 bits).
 */
function vHH(e, t, r, n, o, i, a) {
  var parity = t ^ r ^ n;
  var sum = e + parity + (o >>> 0) + a;
  var rotated = sum << i | sum >>> 32 - i;
  return rotated + t;
}
/**
 * Round-4 step of the hash: mixes `e` with r XOR (t OR NOT n), the message
 * word `o` and constant `a`, rotates left by `i` bits and adds `t`.
 *
 * @returns {number} Updated state value (not truncated to 32 bits).
 */
function bII(e, t, r, n, o, i, a) {
  var blended = r ^ (t | ~n);
  var sum = e + blended + (o >>> 0) + a;
  var rotated = sum << i | sum >>> 32 - i;
  return rotated + t;
}
/**
 * Core MD5 transform: hashes a message and returns the digest as four 32-bit
 * words (already byte-swapped by `endian`).
 *
 * @param {string|Array<number>|Uint8Array|*} i - Message. Strings are
 *   converted to bytes (UTF-8, or one byte per char when
 *   `a.encoding === "binary"`); values accepted by `r` are copied into a
 *   plain array; arrays and Uint8Array are used as-is; anything else is
 *   replaced by its `toString()` result.
 * @param {Object} [a] - Options; only `encoding` is read here.
 * @returns {number[]} The four digest words.
 */
var o = function o(i, a) {
  if (i.constructor == String) {
    i = a && "binary" === a.encoding ? NstringToBytes(i) : TstringToBytes(i);
  } else if (r(i)) {
    i = Array.prototype.slice.call(i, 0);
  } else if (!Array.isArray(i) && i.constructor !== Uint8Array) {
    i = i.toString();
  }

  var u = EbytesToWords(i);
  var s = 8 * i.length; // message length in bits
  // Initial chaining values.
  var c = 1732584193;
  var l = -271733879;
  var f = -1732584194;
  var d = 271733878;
  var p;

  // Swap the byte order of every message word.
  for (p = 0; p < u.length; p++)
    u[p] = 16711935 & (u[p] << 8 | u[p] >>> 24) | 4278255360 & (u[p] << 24 | u[p] >>> 8);

  // Padding: a single 1 bit after the message, then the bit length in the
  // second-to-last word of the final 16-word block.
  u[s >>> 5] |= 128 << s % 32;
  u[14 + (s + 64 >>> 9 << 4)] = s;

  for (p = 0; p < u.length; p += 16) {
    // Remember the state at the start of the block; it is added back at the end.
    var m = c
      , w = l
      , g = f
      , x = d;
    c = hFF(c, l, f, d, u[p + 0], 7, -680876936),
    d = hFF(d, c, l, f, u[p + 1], 12, -389564586),
    f = hFF(f, d, c, l, u[p + 2], 17, 606105819),
    l = hFF(l, f, d, c, u[p + 3], 22, -1044525330),
    c = hFF(c, l, f, d, u[p + 4], 7, -176418897),
    d = hFF(d, c, l, f, u[p + 5], 12, 1200080426),
    f = hFF(f, d, c, l, u[p + 6], 17, -1473231341),
    l = hFF(l, f, d, c, u[p + 7], 22, -45705983),
    c = hFF(c, l, f, d, u[p + 8], 7, 1770035416),
    d = hFF(d, c, l, f, u[p + 9], 12, -1958414417),
    f = hFF(f, d, c, l, u[p + 10], 17, -42063),
    l = hFF(l, f, d, c, u[p + 11], 22, -1990404162),
    c = hFF(c, l, f, d, u[p + 12], 7, 1804603682),
    d = hFF(d, c, l, f, u[p + 13], 12, -40341101),
    f = hFF(f, d, c, l, u[p + 14], 17, -1502002290),
    c = yGG(c, l = hFF(l, f, d, c, u[p + 15], 22, 1236535329), f, d, u[p + 1], 5, -165796510),
    d = yGG(d, c, l, f, u[p + 6], 9, -1069501632),
    f = yGG(f, d, c, l, u[p + 11], 14, 643717713),
    l = yGG(l, f, d, c, u[p + 0], 20, -373897302),
    c = yGG(c, l, f, d, u[p + 5], 5, -701558691),
    d = yGG(d, c, l, f, u[p + 10], 9, 38016083),
    f = yGG(f, d, c, l, u[p + 15], 14, -660478335),
    l = yGG(l, f, d, c, u[p + 4], 20, -405537848),
    c = yGG(c, l, f, d, u[p + 9], 5, 568446438),
    d = yGG(d, c, l, f, u[p + 14], 9, -1019803690),
    f = yGG(f, d, c, l, u[p + 3], 14, -187363961),
    l = yGG(l, f, d, c, u[p + 8], 20, 1163531501),
    c = yGG(c, l, f, d, u[p + 13], 5, -1444681467),
    d = yGG(d, c, l, f, u[p + 2], 9, -51403784),
    f = yGG(f, d, c, l, u[p + 7], 14, 1735328473),
    c = vHH(c, l = yGG(l, f, d, c, u[p + 12], 20, -1926607734), f, d, u[p + 5], 4, -378558),
    d = vHH(d, c, l, f, u[p + 8], 11, -2022574463),
    f = vHH(f, d, c, l, u[p + 11], 16, 1839030562),
    l = vHH(l, f, d, c, u[p + 14], 23, -35309556),
    c = vHH(c, l, f, d, u[p + 1], 4, -1530992060),
    d = vHH(d, c, l, f, u[p + 4], 11, 1272893353),
    f = vHH(f, d, c, l, u[p + 7], 16, -155497632),
    l = vHH(l, f, d, c, u[p + 10], 23, -1094730640),
    c = vHH(c, l, f, d, u[p + 13], 4, 681279174),
    d = vHH(d, c, l, f, u[p + 0], 11, -358537222),
    f = vHH(f, d, c, l, u[p + 3], 16, -722521979),
    l = vHH(l, f, d, c, u[p + 6], 23, 76029189),
    c = vHH(c, l, f, d, u[p + 9], 4, -640364487),
    d = vHH(d, c, l, f, u[p + 12], 11, -421815835),
    f = vHH(f, d, c, l, u[p + 15], 16, 530742520),
    c = bII(c, l = vHH(l, f, d, c, u[p + 2], 23, -995338651), f, d, u[p + 0], 6, -198630844),
    d = bII(d, c, l, f, u[p + 7], 10, 1126891415),
    f = bII(f, d, c, l, u[p + 14], 15, -1416354905),
    l = bII(l, f, d, c, u[p + 5], 21, -57434055),
    c = bII(c, l, f, d, u[p + 12], 6, 1700485571),
    d = bII(d, c, l, f, u[p + 3], 10, -1894986606),
    f = bII(f, d, c, l, u[p + 10], 15, -1051523),
    l = bII(l, f, d, c, u[p + 1], 21, -2054922799),
    c = bII(c, l, f, d, u[p + 8], 6, 1873313359),
    d = bII(d, c, l, f, u[p + 15], 10, -30611744),
    f = bII(f, d, c, l, u[p + 6], 15, -1560198380),
    l = bII(l, f, d, c, u[p + 13], 21, 1309151649),
    c = bII(c, l, f, d, u[p + 4], 6, -145523070),
    d = bII(d, c, l, f, u[p + 11], 10, -1120210379),
    f = bII(f, d, c, l, u[p + 2], 15, 718787259),
    l = bII(l, f, d, c, u[p + 9], 21, -343485551),
    c = c + m >>> 0,
    l = l + w >>> 0,
    f = f + g >>> 0,
    d = d + x >>> 0;
  }
  return endian([c, l, f, d]);
};
/**
 * Rotates a 32-bit integer left by `t` bits.
 *
 * @param {number} e - Value to rotate.
 * @param {number} t - Bit count.
 * @returns {number} Rotated value as a signed 32-bit integer.
 */
var rotl = function(e, t) {
  return e << t | e >>> 32 - t;
};

/**
 * Rotates a 32-bit integer right by `t` bits.
 *
 * @param {number} e - Value to rotate.
 * @param {number} t - Bit count.
 * @returns {number} Rotated value as a signed 32-bit integer.
 */
var rotr = function(e, t) {
  return e << 32 - t | e >>> t;
};
/**
 * Reverses the byte order of a 32-bit word, or of every word in an array.
 *
 * @param {number|number[]} e - A single word, or an array that is converted
 *   in place.
 * @returns {number|number[]} The swapped word, or the same array instance.
 */
function endian(e) {
  if (e.constructor != Number) {
    for (var index = 0; index < e.length; index++) {
      e[index] = endian(e[index]);
    }
    return e;
  }
  var lowAndThirdBytes = 16711935 & rotl(e, 8);
  var secondAndTopBytes = 4278255360 & rotl(e, 24);
  return lowAndThirdBytes | secondAndTopBytes;
}
/**
 * Formats bytes as a lowercase hexadecimal string, two digits per byte.
 *
 * @param {ArrayLike<number>} e - Byte values (0-255).
 * @returns {string} Hex string of length `2 * e.length`.
 */
var bytesToHex = function(e) {
  var digits = [];
  for (var index = 0; index < e.length; index++) {
    digits.push((e[index] >>> 4).toString(16));
    digits.push((15 & e[index]).toString(16));
  }
  return digits.join("");
};
/**
 * Public hash entry point: digests `t` with `o` and returns the result in the
 * format selected by the options.
 *
 * @param {*} t - Message to hash; must not be null or undefined.
 * @param {Object} [r] - Options, also forwarded to `o`. `asBytes` returns the
 *   byte array, `asString` a string with one character per byte; otherwise a
 *   hex string is returned.
 * @returns {number[]|string} The digest in the requested format.
 * @throws {Error} When `t` is null or undefined.
 */
var Qe = function(t, r) {
  if (t == null) {
    throw new Error("Illegal argument " + t);
  }
  var digestBytes = EwordsToBytes(o(t, r));
  if (r && r.asBytes) {
    return digestBytes;
  }
  if (r && r.asString) {
    return NbytesToString(digestBytes);
  }
  return bytesToHex(digestBytes);
};
/**
 * ES-module interop helper: returns `e.default` when `e` is flagged with
 * `__esModule` and has its own `default` property; otherwise returns `e`.
 *
 * @param {*} e - Module object or plain value.
 * @returns {*} The default export, or `e` itself.
 */
function Ze(e) {
  var hasOwnDefault = e && e.__esModule && Object.prototype.hasOwnProperty.call(e, "default");
  return hasOwnDefault ? e.default : e;
}
// Hash function used by lt(): Qe passed through the module-interop helper Ze.
var at = Ze(Qe);

// Sample payload for the first page of comments (empty offset).
// On later pages `pagination_str` carries the session id as a nested JSON
// string, for example:
//   '{"offset":"{\\"type\\":1,\\"direction\\":1,\\"session_id\\":\\"1778169679258543\\",\\"data\\":{}}"}'
// The value contains double quotes, so the literal must use single quotes.
var e = {
  "oid": "1906333968",
  "type": 1,
  "mode": 3,
  "pagination_str": '{"offset":""}',
  "plat": 1,
  "seek_rpid": "",
  "web_location": 1315875
};

// Demo output when the file is run directly.
console.log(lt(e));
然后在py里面调用js文件,获得加密后的载荷数据,然后请求,并对数据进行分析和保存,这里我存成了csv文件。这里需要留意的是参数有的是字符串有的是数字,所以要严格按照输出的格式传参,否则加密结果不一致,通不过验证;还有就是pagination_str的格式,一定要按照控制台的输出格式写,否则验证失败。评论是懒加载的,第一页拿到session_id后,后续请求都要带着这个session_id。
import csv
import json
import os
import urllib.parse

import execjs
import requests
# Target video and request context.
# `oid` must stay a string and `web_location` an int: the signer hashes the
# values exactly as passed, so a different type yields a different signature.
oid = "1906333968"
web_location = 1315875
session_id = ""  # the first page is requested without a session id

# Browser cookies that are not tied to a login.
cookies = {
    'buvid3': '6C16A34E-4B78-F350-03AA-71E6B21A703519906infoc',
    'b_nut': '1726211919',
    '_uuid': '828DDCCD-F3CD-3997-11077-1729B6881A6120884infoc',
    'enable_web_push': 'DISABLE',
    'buvid4': '3CB58DB4-B2F0-07C1-06FA-E452949C4A8942274-024082300-j2Owk+KrE1E0oCXj+7DzqA%3D%3D',
    'header_theme_version': 'CLOSE',
    'rpdid': "|(u|kkmlu~ll0J'u~kYkukl|m",
    'fingerprint': '65fbd3ec7ea1fba4aa76eb96cb7f6249',
    'buvid_fp_plain': 'undefined',
    'buvid_fp': '65fbd3ec7ea1fba4aa76eb96cb7f6249',
    'DedeUserID': '37611353',
    'DedeUserID__ckMd5': 'af2f5320e5c29dea',
    'home_feed_column': '5',
    'browser_resolution': '2048-1023',
    'bili_ticket_expires': '1736494298',
    'CURRENT_FNVAL': '4048',

    # TODO: confirm how these short-lived values are generated.
    'sid': '6rzu47nf',  # 8 characters
    'b_lsid': 'EF10A7B92_1944D9191B0',
    'bp_t_offset_37611353': '1020612344109072384',
}

# Login secrets must not be committed or published; they are read from the
# environment and only sent when present.
_SECRET_COOKIE_ENV = {
    'bili_ticket': 'BILI_TICKET',
    'SESSDATA': 'BILI_SESSDATA',
    'bili_jct': 'BILI_JCT',
}
for _cookie_name, _env_name in _SECRET_COOKIE_ENV.items():
    _secret_value = os.environ.get(_env_name)
    if _secret_value:
        cookies[_cookie_name] = _secret_value

# Request headers copied from a browser session; cookies are passed separately.
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'origin': 'https://www.bilibili.com',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://www.bilibili.com/video/BV1xU411U7PW/?spm_id_from=333.1391.0.0&vd_source=fd84ddc58aead0485969c92933b61484',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
}
def save2csv(*args):
    """Append one comment record to ``{all_count}_{name}.csv``.

    Expects the twelve values named in the header row, in that order; the
    file name is built from the last two (``all_count`` and ``name``). The
    header row is written first when the file is still empty.

    Raises:
        ValueError: if fewer than twelve values are supplied.
    """
    # args[11] is read below, so twelve values are required (the old check
    # allowed eleven and then failed with IndexError).
    if len(args) < 12:
        raise ValueError("参数错误.")
    with open(f"{args[10]}_{args[11]}.csv", "a", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        if f.tell() == 0:  # empty file: emit the header first
            writer.writerow(['rpid', 'replay_count', 'message', 'like', 'avatar', 'sex',
                             'uname', 'oid', 'parent', 'is_end', 'all_count', 'name'])
        writer.writerow(list(args))
def handle_content(list_comment, is_end, all_count, name):
    """Print every comment in ``list_comment``, nested replies first.

    Args:
        list_comment: list of comment dicts from the response, or ``None``
            when the response carries no comments.
        is_end, all_count, name: paging info from the cursor; only passed on
            to nested calls here. ``save2csv`` takes them as its last three
            columns if the records should be persisted instead of printed.
    """
    for comment in list_comment or []:
        rpid = comment["rpid"]  # id of this comment
        replay_count = comment["rcount"]  # number of replies
        message = comment["content"]["message"]  # comment text
        like = comment["like"]  # number of likes
        avatar = comment["member"]["avatar"]  # author's avatar
        sex = comment["member"]["sex"]  # author's gender
        uname = comment["member"]["uname"]  # author's nickname
        oid = comment["oid"]  # id shared by the comment and its replies
        parent = comment["parent"]  # id of the comment being replied to

        replies = comment.get("replies")
        if replies:
            handle_content(replies, is_end, all_count, name)
        print(rpid, replay_count, message, like, avatar, sex, uname, oid, parent)
def handle_cursor(cursor):
    """Pull the paging summary out of a response ``cursor`` dict.

    Returns:
        tuple: ``(is_end, all_count, name)``, i.e. whether this is the last
        page, the total number of comments, and the list label (e.g. hot
        comments).

    Raises:
        KeyError: if any of the three keys is missing.
    """
    wanted = ("is_end", "all_count", "name")
    is_end, all_count, name = (cursor[key] for key in wanted)
    return is_end, all_count, name
def get_params(session_id):
    """Build the signed query parameters for one page of comments.

    Args:
        session_id: session id for the page being requested; falsy for the
            first page.

    Returns:
        dict: request parameters including the ``w_rid`` and ``wts`` values
        produced by the JS function ``lt``.
    """
    # Page 1 sends an empty offset; later pages send the offset as a JSON
    # string nested inside the outer JSON, e.g.
    # {"offset":"{\"type\":1,\"direction\":1,\"session_id\":\"...\",\"data\":{}}"}
    # Compact separators keep the text free of spaces, which matters because
    # the signature is computed over this exact string.
    if session_id:
        offset = json.dumps(
            {"type": 1, "direction": 1, "session_id": str(session_id), "data": {}},
            separators=(",", ":"),
        )
    else:
        offset = ""
    pagination_str = json.dumps({"offset": offset}, separators=(",", ":"))

    params = {
        "oid": oid,
        "type": 1,
        "mode": 3,
        "pagination_str": pagination_str,
        "plat": 1,
        'seek_rpid': '',
        "web_location": web_location
    }

    # Close the script file explicitly instead of leaking the handle.
    with open('./bili/comment_url.js', 'r', encoding='utf-8') as js_file:
        signer = execjs.compile(js_file.read())
    ctx = signer.call('lt', params)

    params.update({
        'w_rid': ctx["w_rid"],
        'wts': ctx["wts"]
    })
    return params
if __name__ == "__main__":
    # Fetch comment pages until the cursor reports the last one.
    count = 1
    while True:
        params = get_params(session_id)
        print(params)
        response = requests.get(
            'https://api.bilibili.com/x/v2/reply/wbi/main',
            cookies=cookies,
            headers=headers,
            params=params,
            timeout=10,  # do not hang forever on a stalled connection
        )
        response.raise_for_status()
        data = response.json()["data"]  # parse the body once
        cursor = data["cursor"]

        is_end, all_count, name = handle_cursor(cursor)
        handle_content(data["replies"], is_end, all_count, name)
        print(f"第{count}页爬完了")
        count += 1
        if is_end:
            print(f"爬取完成,一共有{all_count}条")
            break

        # Carry the session id over to the next request; without it every
        # iteration would fetch page 1 again and the loop would never end.
        # NOTE(review): the field names below are assumed from the offset
        # format built in get_params. Confirm against a live response.
        next_session_id = cursor.get("session_id")
        if not next_session_id:
            next_offset = (cursor.get("pagination_reply") or {}).get("next_offset")
            if next_offset:
                try:
                    next_session_id = json.loads(next_offset).get("session_id")
                except (ValueError, AttributeError):
                    next_session_id = None
        if not next_session_id:
            print("未获取到下一页的session_id,停止爬取")
            break
        session_id = next_session_id
2.单视频弹幕
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息请访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。