总结一下爬虫学习中的实例实战
1.金山翻译
# -*- coding: utf-8 -*-
# Example 1: interactive client for the Kingsoft (iciba) translation API.
import requests

# Translation endpoint, stripped of the extra query parameters seen in the
# browser's network tab, e.g.:
# https://ifanyi.iciba.com/index.php?c=trans&m=fy&client=6&auth_user=key_web_new_fanyi&sign=9X%2BHAviAKqteMMuVvr%2B0X9RriqVIAJSQ%2BxmfU0q7dIE%3D
url = 'https://ifanyi.iciba.com/index.php?c=trans'

# Request headers: the Referer/Host pair mimics the official web client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Referer': 'https://www.iciba.com/',
    'Host': 'ifanyi.iciba.com'
}

while True:
    # Read the text to translate from the user.
    content = input('请输入您想翻译的内容(输入"exit"结束程序):')
    if content.lower() == 'exit':
        break
    # Form parameters: auto-detect both source and target language.
    post_data = {
        'from': 'auto',
        'to': 'auto',
        'q': content,
    }
    res = requests.post(url, headers=headers, data=post_data)
    # BUG FIX: the response body is JSON. The original decoded it and ran it
    # through eval(), which executes arbitrary code from the server (and also
    # chokes on JSON literals like true/false/null). Parse it properly.
    print(res.json()['out'])
复制代码 2.GitHub 模拟登录
- # -*- coding: utf-8 -*-
- import re
- # 1.获取并模拟登录操作 2.保存登录会话信息 3.验证是否登录成功
- import requests
- from requests import Session
def do_auth_token(session: Session):
    """Fetch the GitHub login page and return its CSRF ``authenticity_token``.

    Exits the process if the login page cannot be retrieved.
    """
    # FIX: dropped the original `global response` — nothing else reads that
    # global, so it only leaked an implementation detail into module scope.
    response = session.get('https://github.com/login')
    if response.status_code != 200:
        print("请求失败,请稍后再试!")
        exit(0)
    login_html = response.content.decode()
    # The hidden form field carries the CSRF token required by POST /session.
    auth_token = re.findall(r'name="authenticity_token" value="(.*?)"', login_html)[0]
    return auth_token
def do_auth_login(session: Session):
    """Submit the GitHub login form to the /session endpoint.

    Relies on the module-level ``auth_token`` produced by ``do_auth_token``
    (read as a global — the functions are order-dependent).
    NOTE(review): credentials are hard-coded below; they should come from
    environment variables or a config file in real use.
    """
    post_data = {
        "commit": "Sign in",
        "authenticity_token": auth_token,
        "login": "2834438515@qq.com",
        "password": "991016csq",  # login password; per the author, not the real one
        "webauthn-conditional": "undefined",
        "javascript-support": "true",
        "webauthn-support": "supported",
        "webauthn-iuvpaa-support": "unsupported",
        "return_to": "https://github.com/login"
    }
    # Session cookies set by this request persist the login state.
    response = session.post(url='https://github.com/session', data=post_data)
    if response.status_code != 200:
        print("请求失败,请检查参数!")
    else:
        print("请求session 成功!")
def do_login_status(session: Session):
    """Verify the login by inspecting the profile page's <title>.

    On the personal home page the title does not end with "GitHub"; if the
    optional second capture group is empty, we treat the login as successful.
    """
    response = session.get('https://github.com/csqting')
    page_bytes = response.content
    matches = re.findall(r'<title>(.+?)(GitHub)?</title>', page_bytes.decode('utf-8'))
    # No <title> at all counts as success, same as an empty "GitHub" group.
    suffix = matches[0][1] if matches else ""
    if suffix == "":
        print("登录成功!")
    else:
        print("登录失败!")
    # Keep a copy of the fetched page for offline inspection.
    with open("github_profile.html", "wb") as f:
        f.write(page_bytes)
if __name__ == '__main__':
    # A Session carries cookies across requests, persisting the login state.
    gh_session = requests.session()
    gh_session.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    # Step 1: pull the CSRF token off the login page.
    # (Kept as a module-level name: do_auth_login reads it as a global.)
    auth_token = do_auth_token(gh_session)
    # Step 2: submit the login form; the session stores the auth cookies.
    do_auth_login(gh_session)
    # Step 3: confirm the login actually worked.
    do_login_status(gh_session)
复制代码 3.百度贴吧爬取
- # -*- coding: utf-8 -*-
- import requests
- from lxml import etree
- # url
- # headers
- # 发送请求获取响应
- # 从响应中提取数据
- # 判断结束
class Tieba(object):
    """Crawler for a Baidu Tieba forum: walks the thread list page by page.

    Flow: send request -> parse titles/links + next-page URL -> repeat until
    no next-page link exists.
    """

    def __init__(self, name):
        # Landing page of the forum for the given keyword.
        self.url = "https://tieba.baidu.com/f?kw={}".format(name)
        print(self.url)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
            # An ancient IE UA can be swapped in to get a simpler page:
            # "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T132461)"
        }

    def get_data(self, url):
        """GET a listing page; returns raw bytes (also dumped to temp.html for debugging)."""
        response = requests.get(url, headers=self.headers)
        with open("temp.html", "wb") as f:
            f.write(response.content)
        return response.content

    def parse_data(self, data):
        """Extract {'title','link'} dicts and the next-page URL from page bytes."""
        # Tieba hides the real markup inside HTML comments; strip the comment
        # markers so lxml can see the thread list.
        data = data.decode().replace("<!--", "").replace("-->", "")
        html = etree.HTML(data)
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        data_list = []
        for el in el_list:
            temp = {}
            temp['title'] = el.xpath('./text()')[0]
            temp['link'] = 'https://tieba.baidu.com' + el.xpath('./@href')[0]
            data_list.append(temp)
        # FIX: catch only IndexError (empty xpath result on the last page)
        # instead of a bare except that would also swallow real bugs.
        try:
            next_url = 'https:' + html.xpath('//a[contains(text(),"下一页>")]/@href')[0]
        except IndexError:
            next_url = None
        return data_list, next_url

    def save_data(self, data_list):
        """Output hook; currently just prints each record."""
        for data in data_list:
            print(data)

    def run(self):
        """Crawl from the first page until no next-page link is found."""
        next_url = self.url
        while True:
            data = self.get_data(next_url)
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
            # FIX: identity comparison with None per PEP 8 (was `== None`).
            if next_url is None:
                break
if __name__ == '__main__':
    # Crawl the "美食天下" (food) forum end to end.
    crawler = Tieba("美食天下")
    crawler.run()
复制代码 4.斗鱼直播
- # -*- coding: utf-8 -*-
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- import time
class Douyu(object):
    """Scrape the Douyu "all rooms" directory with Selenium.

    The page lazy-loads rooms on scroll, so run() keeps scrolling to the
    bottom until the number of rooms in the DOM stops growing.
    """

    def __init__(self):
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()
        # Implicit wait: element lookups retry for up to 10 s before failing.
        self.driver.implicitly_wait(10)

    def parse_data(self):
        """Return one dict per room currently present in the DOM."""
        room_list = self.driver.find_elements(By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
        print(len(room_list))
        data_list = []
        for room in room_list:
            temp = {}
            # Only the cover image is collected. Other fields could be added:
            # temp['title'] = room.find_element(By.XPATH, './div[2]/div[1]/a').text
            # temp['type'] = room.find_element(By.XPATH, './div[2]/div[2]/span/a').text
            # temp['owner'] = room.find_element(By.XPATH, './div[1]/div/a/div/div[2]/div/div[1]/div').text
            # temp['num'] = room.find_element(By.XPATH, './div[1]/div/a/div/div[2]/div/div[2]/span').text
            temp['picture'] = room.find_element(By.XPATH, './div[1]/picture/source[1]').get_attribute('srcset')
            data_list.append(temp)
        return data_list

    def run(self):
        self.driver.get(self.url)
        last_count = 0  # room count seen on the previous pass
        total_rooms = 0
        while True:
            # Scroll to the bottom to trigger lazy loading, then let it settle.
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(2)
            rooms = self.parse_data()
            # BUG FIX: parse_data re-reads *all* rooms in the DOM each pass.
            # The original added len(rooms) to a running total every pass, so
            # the total kept growing and the "no new data" check could only
            # trigger on a completely empty page. Compare the page's current
            # room count against the previous pass instead.
            total_rooms = len(rooms)
            print(f"Total rooms : {total_rooms}")
            if total_rooms == last_count:
                print("No more new data to load.")
                break
            last_count = total_rooms
        print(f"Final total rooms fetched: {total_rooms}")
        self.driver.quit()  # shut the browser down
复制代码 5.黑马贴吧
- import requests
- import re
def fetch_page(url):
    """Scrape one forum listing page.

    Prints title / post time / author for every thread on the page and
    returns the absolute URL of the next page, or None when there is no
    next page (or the request failed).
    """
    from urllib.parse import urljoin  # local import: used only here

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    # Guard clause: bail out early on a failed request.
    if response.status_code != 200:
        print("访问失败", response.status_code)
        return None
    # Thread titles.
    titles = re.findall(r'class="s xst">([^<]+)</a>', response.text)
    # (author, post time) pairs from the detail spans.
    details = re.findall(
        r'<span style="margin-left: 0;">([^<]+)</span></a><span style="margin-left: 5px;">@ ([^<]+)</span>',
        response.text)
    authors = [detail[0] for detail in details]
    dates = [detail[1] for detail in details]
    for title, date, author in zip(titles, dates, authors):
        print(f"文章标题: {title}")
        print(f"发布时间: {date}")
        print(f"文章作者: {author}")
        print('-' * 40)
    # Next-page link; re.search returns only the first occurrence.
    next_page_link = re.search(r'<a href="([^"]+)" class="nxt">下一页</a>', response.text)
    if next_page_link:
        # BUG FIX: the captured href can be relative (e.g. "forum-425-2.html").
        # Resolve it against the current page so the caller always receives a
        # complete, fetchable URL — as the original comment claimed.
        return urljoin(url, next_page_link.group(1))
    return None
# First page of the board; follow "下一页" links until exhausted.
current_url = 'https://bbs.itheima.com/forum-425-1.html'
while current_url:
    print(f"正在爬取: {current_url}")
    # fetch_page returns the next page's URL, or None on the last page,
    # so assigning its result directly drives the loop to completion.
    current_url = fetch_page(current_url)
复制代码 6.网易云
# -*- coding: utf-8 -*-
# Example 6: download songs from the NetEase Cloud Music hot list.
# (Run document.charset in the browser console to check the page encoding.)
import requests
import time
import re
import os

# Download directory; os bridges to the operating system (dirs/files).
filename = 'musics\\'
if not os.path.exists(filename):
    os.makedirs(filename)

url = 'https://music.163.com/discover/toplist?id=3778678'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
time.sleep(5)

# re.findall returns every match of the pattern as a list of tuples.
# r-prefix = raw string (backslashes are literal);
# (\d+)  captures the numeric song id;
# (.*?)  lazily captures the title up to the closing </a>.
html_data = re.findall(r'<li><a href="/song\?id=(\d+)">(.*?)</a>', response.text)
for num_id, title in html_data:
    # The "outer" endpoint redirects to the actual mp3 file.
    music_download = f'https://music.163.com/song/media/outer/url?id={num_id}.mp3'
    music_content = requests.get(music_download, headers=headers)
    # BUG FIX: song titles may contain characters that are illegal in Windows
    # file names (\ / : * ? " < > |), which made open() raise OSError.
    # Replace them, and build the path from the directory variable above.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    with open(os.path.join(filename, safe_title + '.mp3'), 'wb') as f:
        f.write(music_content.content)
    print(num_id, title)
复制代码 7.微博热榜
- # # -*- coding: utf-8 -*-
- # import time
- # from lxml import etree
- # import requests
- #
- # url = 'https://m.weibo.cn/p/106003type=25&t=3&disable_hot=1&filter_type=realtimehot'
- # headers = {
- # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'}
- # response = requests.get(url, headers=headers)
- # time.sleep(3)
- # print(response.text)
- # html = etree.HTML(response.text)
- # el_list = html.xpath('//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[1]/text()')
- # print(len(el_list))
# Example 7 (Selenium version): the mobile hot-search page is rendered by
# JavaScript, so a real browser is needed to see the list (the plain
# requests+lxml attempt above returned no elements).
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://m.weibo.cn/p/106003type=25&t=3&disable_hot=1&filter_type=realtimehot'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(3)  # crude wait for the JS-rendered list to appear

# span[1] holds each topic's title, span[2] its heat score.
el_list = driver.find_elements(By.XPATH,'//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[1]')
el_list1 = driver.find_elements(By.XPATH,'//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[2]')

# FIX: enumerate(start=1) replaces the manual i = 1 / i += 1 counter;
# the dead commented-out save_out scaffolding has been removed.
for i, (title, hot) in enumerate(zip(el_list, el_list1), start=1):
    print(f"{i}")
    print(f"文章标题: {title.text}")
    print(f"热度: {hot.text}")
    print('-' * 40)

driver.quit()
复制代码 8.驾校自动答题
# -*- coding: utf-8 -*-
# Example 8: auto-answer the driving-theory mock exam.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.jsyks.com/kmy-mnks'
driver = webdriver.Chrome()
driver.get(url)

# Plan: 1) read each question's answer key from its "k" attribute,
#       2) translate the key to the on-page option text,
#       3) scroll to the matching option and click it.
time.sleep(3)
el_list = driver.find_elements(By.XPATH, '/html/body/div[4]/div[1]/div[1]/ul/li')

# 'R' marks a true ("正确") answer, 'E' a false ("错误") one; multiple-choice
# keys (A-D) pass through unchanged.
k_values = [li.get_attribute('k') for li in el_list]
key_map = {'R': '正确', 'E': '错误'}
replaced_list = [key_map.get(x, x) for x in k_values]

for index, li in enumerate(el_list):
    answer = replaced_list[index]
    if answer in ('正确', '错误'):
        # True/false question: match the option containing the answer text.
        option = li.find_element(By.XPATH, f".//b[contains(text(),'{answer}')]")
    else:
        # Multiple choice: match the option whose text starts with A/B/C/D.
        option = li.find_element(By.XPATH, f".//b[starts-with(normalize-space(text()), '{answer}')]")
    # Bring the option into view, then click via JS to avoid overlay issues.
    driver.execute_script('arguments[0].scrollIntoView();', option)
    driver.execute_script("arguments[0].click();", option)
复制代码 后期学习路线:继续在实战中总结反爬本领,学习反调试,以及之后完整的爬虫项目学习。
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息请访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。 |