Bilibili 会员购 (Ticketing) Scraper Project
Highlights
- Logs progress and errors with the logging module
- Fetches pages concurrently with coroutines (asyncio + aiohttp), greatly speeding up scraping
- Catches exceptions and retries failed requests with exponential backoff (a standalone sketch of this pattern follows the list)
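The core of the project is the combination named in the last two bullets: a semaphore that caps how many requests are in flight, plus a per-URL retry loop with exponential backoff. A minimal, standalone sketch of that pattern, using placeholder URLs rather than the real 会员购 endpoints, looks like this:

```python
import asyncio
import logging

import aiohttp

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s')

CONCURRENCY = 4   # at most 4 requests in flight at once
RETRY_LIMIT = 3   # per-URL attempts before giving up


async def fetch_json(session, semaphore, url):
    async with semaphore:
        for attempt in range(RETRY_LIMIT):
            try:
                async with session.get(url) as response:
                    return await response.json()
            except aiohttp.ClientError:
                logging.warning('attempt %d failed for %s', attempt + 1, url, exc_info=True)
                if attempt < RETRY_LIMIT - 1:
                    await asyncio.sleep(2 ** attempt)   # back off: 1s, then 2s
        return None   # every attempt failed


async def main():
    # Placeholder URLs, not the real 会员购 endpoints
    urls = [f'https://example.com/api?page={page}' for page in range(1, 6)]
    semaphore = asyncio.Semaphore(CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch_json(session, semaphore, url) for url in urls))
    logging.info('%d of %d pages fetched', sum(r is not None for r in results), len(urls))


if __name__ == '__main__':
    asyncio.run(main())
```

The full script below applies the same pattern, with the semaphore and session held on a spider class.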
Source code
```python
import asyncio
import csv
import logging
import time

import aiohttp
import requests
from aiohttp import ContentTypeError

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s')


# Parse one page of results into flat records
def parse_data(data):
    if data:
        for meeting in data:
            yield {
                'project_id': meeting['project_id'],
                'project_name': meeting['project_name'],
                'start_time': meeting['start_time'],
                'venue_name': meeting['venue_name'],
                'price_low': meeting['price_low'] / 100,    # API prices are in fen; convert to yuan
                'price_high': meeting['price_high'] / 100,
            }


# Append one record to this city's CSV file
def save_file(city_info, city_id):
    if city_info:
        with open(f'{city_id}.csv', 'a+', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([city_info['project_id'], city_info['project_name'],
                             city_info['start_time'], city_info['venue_name'],
                             city_info['price_low'], city_info['price_high']])


class Myspider(object):
    types_list = ['演出', '展览', '本地生活']   # performances, exhibitions, local life
    cities_id_list = []
    failed_urls = []
    CONCURRENCY = 4
    RETRY_LIMIT = 3

    def __init__(self):
        self.session = None
        self.semaphore = asyncio.Semaphore(Myspider.CONCURRENCY)

    # Fetch the city IDs and store them as a class attribute
    @staticmethod
    def set_cities_id():
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
        cities_data = requests.get('https://show.bilibili.com/api/ticket/city/list?channel=4',
                                   headers=headers).json()['data']
        developed_cities_id = [city['id'] for city in cities_data['list']]
        developing_cities_id = [city['id'] for part in cities_data['more'] for city in part['list']]
        Myspider.cities_id_list = developed_cities_id + developing_cities_id

    # Scrape a single page, retrying with exponential backoff on failure
    async def get_every_page_info(self, url):
        async with self.semaphore:
            logging.info(f'scraping {url}')
            for attempt in range(Myspider.RETRY_LIMIT):
                try:
                    async with self.session.get(url) as response:
                        data = await response.json()
                        return data['data']['result']
                except ContentTypeError:
                    # The body was not JSON (e.g. an error page); log and retry
                    logging.error(f'error occurred when scraping {url}', exc_info=True)
                except aiohttp.ServerDisconnectedError:
                    # Must be caught before ClientError, since it is a ClientError subclass
                    logging.error(f'server disconnected: {url}', exc_info=True)
                    if attempt < Myspider.RETRY_LIMIT - 1:
                        await asyncio.sleep(2 ** attempt)
                except aiohttp.ClientError as e:
                    logging.error(f'ClientError on {url}: {e}', exc_info=True)
                    if attempt < Myspider.RETRY_LIMIT - 1:
                        await asyncio.sleep(2 ** attempt)   # exponential backoff
            Myspider.failed_urls.append(url)
            return None   # all retry attempts failed

    # Get the page count for this category in this city
    async def get_max_page(self, url):
        async with self.session.get(url) as response:
            data = await response.json()
            return data['data']['numPages']

    # Main coroutine: build the task list, then scrape with CONCURRENCY workers
    async def main(self):
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
        # One shared session; headers (and, if needed, proxy or cookie settings) go in here
        async with aiohttp.ClientSession(headers=headers) as session:
            self.session = session
            for p_type in Myspider.types_list:
                for city_id in Myspider.cities_id_list:
                    begin_url = 'https://show.bilibili.com/api/ticket/project/listV2?version=134&page=1&pagesize=16&area={}&filter=&platform=web&p_type={}'.format(
                        city_id, p_type)
                    max_page = await self.get_max_page(begin_url)
                    # One task per page of this category/city
                    scrapy_tasks = [self.get_every_page_info(
                        'https://show.bilibili.com/api/ticket/project/listV2?version=134&page={}&pagesize=16&area={}&filter=&platform=web&p_type={}'.format(
                            page, city_id, p_type)) for page in range(1, max_page + 1)]
                    # Run the tasks concurrently and collect their results
                    scrapy_results = await asyncio.gather(*scrapy_tasks)
                    # Parse and save each record
                    for result in scrapy_results:
                        for city_info in parse_data(result):
                            print(city_info)
                            save_file(city_info, city_id)
        # No explicit close needed: the async with block closes the session


if __name__ == '__main__':
    start_time = time.time()
    # Fetch the city IDs and populate Myspider.cities_id_list
    Myspider.set_cities_id()
    spider = Myspider()
    # asyncio.run creates the event loop and drives main() to completion
    asyncio.run(spider.main())
    end_time = time.time()
    logging.info(f'total_time: {end_time - start_time}')
```
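One loose end worth noting: URLs that exhaust their retries are collected in Myspider.failed_urls but never revisited. A possible follow-up pass, a sketch that assumes the definitions above are in scope and is not part of the original script, could look like this:

```python
# Sketch: give the URLs that exhausted their retries one more pass.
# Assumes Myspider, spider, aiohttp, asyncio, and logging from the script above.
async def retry_failed(spider):
    if not Myspider.failed_urls:
        return
    urls, Myspider.failed_urls = Myspider.failed_urls, []   # reset before re-queuing
    async with aiohttp.ClientSession() as session:
        spider.session = session
        # Page results are ignored here; they would still need parsing and saving
        await asyncio.gather(*(spider.get_every_page_info(url) for url in urls))
    # Anything that fails again lands back in Myspider.failed_urls
    logging.info(f'retried {len(urls)} urls, {len(Myspider.failed_urls)} still failing')
```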