一个爬虫
这是我第一次接触爬虫,写的第一个爬虫实例。
https://movie.douban.com/top250
模块
- import requests #用于发送请求
- import re #使用正则表达式,用于匹配处理文本
- import os #用于创建文件夹
- from lxml import etree #这里我使用了Xpath表达式用于数据解析,我觉得这个模块比BeautifulSoup好用,强烈推荐
- from fake_useragent import UserAgent #反爬虫,随机获取浏览器 UA 信息
代码
- import requests
- import re
- import os
- from lxml import etree
- from fake_useragent import UserAgent
class doubanSpider(object):
    """Scrape the Douban Top 250 movie list into a text file.

    Each movie is collected into a dict with the keys ``id``, ``cover``,
    ``name``, ``director``, ``role``, ``plot``, ``star`` and ``comment``.
    The dict is appended as one ``str(dict)`` line to
    ``./db/douban/douban.txt`` and echoed to stdout.
    """

    # The list is fetched with ?start=0, 25, 50, ...; 250 entries at 25 per
    # page means ten pages, so offsets past 225 are never requested.
    PAGE_SIZE = 25
    TOTAL_MOVIES = 250
    # Seconds to wait for a page before requests gives up.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Ensure the output directory exists and open the output file for appending."""
        os.makedirs('db/douban', exist_ok=True)
        self.f = open('./db/douban/douban.txt', 'a', encoding='utf-8')

    @staticmethod
    def _first(nodes, default=''):
        """Return the first XPath result, or *default* when nothing matched."""
        return nodes[0] if nodes else default

    @staticmethod
    def _split_credits(info):
        """Split a credits line into ``(director, role)``.

        *info* is the raw credits text, e.g.
        ``"导演: A\\xa0\\xa0\\xa0主演: B / C"``. The two parts are separated
        either by a run of (non-breaking) spaces or directly by the
        ``主演:`` label. When no cast part is present, *role* is ``''``.
        """
        text = info.replace("\n", "").replace("\xa0", " ").strip()
        # Drop the leading label first so its own trailing spaces can never
        # be mistaken for the director/cast separator.
        text = re.sub(r'^导演:\s*', '', text)
        separator = re.search(r'\s{2,}|(?=主演:)', text)
        if separator is None:
            return text, ''
        director = text[:separator.start()].strip()
        role = re.sub(r'^主演:\s*', '', text[separator.end():].strip())
        return director, role

    def _parse_movie(self, each):
        """Build the movie dict for one ``<li>`` element of the list page."""
        first = self._first
        movie = {}
        movie['id'] = first(each.xpath('.//div/div[1]/em/text()'))
        movie['cover'] = first(each.xpath('.//div/div[1]/a/img/@src'))
        # The title is spread over several <span> nodes; glue them together.
        movie['name'] = ''.join(each.xpath('.//div/div[2]/div[1]/a/span/text()'))
        info = first(each.xpath('.//div/div[2]/div[2]/p[1]/text()[1]')).strip()
        movie['director'], movie['role'] = self._split_credits(info)
        plot = first(each.xpath('.//div/div[2]/div[2]/p[1]/text()[2]')).strip()
        movie['plot'] = plot.replace("\xa0", "")
        movie['star'] = first(each.xpath('.//div/div[2]/div[2]/div/span[2]/text()'))
        # Not every entry has a one-line quote, so this one is often empty.
        movie['comment'] = first(each.xpath('.//div/div[2]/div[2]/p[2]/span/text()'))
        return movie

    def start(self):
        """Fetch every list page and write/print each movie found on it.

        A page that does not answer with HTTP 200 is reported and skipped.
        Network errors raised by ``requests`` propagate to the caller.
        """
        user_agents = UserAgent()
        for offset in range(0, self.TOTAL_MOVIES, self.PAGE_SIZE):
            # A fresh random User-Agent per page, as a light anti-blocking measure.
            headers = {
                'User-Agent': user_agents.random
            }
            url = 'https://movie.douban.com/top250?start=' + str(offset)
            r = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            if r.status_code != 200:
                print('skipped %s (HTTP %s)' % (url, r.status_code))
                continue
            html = etree.HTML(r.text)
            for each in html.xpath('//*[@id="content"]/div/div[1]/ol/li'):
                movie = self._parse_movie(each)
                self.f.write(str(movie) + '\n')
                print(movie)

    def run(self):
        """Run the crawl and always close the output file, even on failure."""
        try:
            self.start()
        finally:
            self.f.close()
if __name__ == '__main__':
    # Script entry point: build the spider and run the crawl.
    doubanSpider().run()
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!