import requests
import time
from bs4 import BeautifulSoup

# Scrape all 10 pages (25 movies each) of the Douban Top-250 list and collect
# "rank--title--score--url" strings for every movie.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Cookie': 'bid=uF9ux7mK05s; __utmc=30149280; push_noty_num=0; push_doumail_num=0; __gads=ID=a70f4f7daef34c82-22d73b4c7cdc002a:T=1679378544:RT=1679378544:S=ALNI_MbvrIegly9aVzyyg1KUfkj_U8zZNg; __gpi=UID=00000bdc91487ff0:T=1679378544:RT=1679378544:S=ALNI_Mbw-cobhGE0mj6qOVNYu0PLUW6srg; __yadk_uid=vRYo5bQXwbmKrd1enLtfPwbMCx8RdWjm; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1679384454%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_ses.100001.8cb4=*; ll="118281"; __utma=30149280.1339882974.1679378543.1679378543.1679384467.2; __utmz=30149280.1679384467.2.2.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1679384467; dbcl2="268997119:qj6AwkV9IWg"; ck=sfDP; _pk_id.100001.8cb4=2aa0c079c9e490d3.1679379069.2.1679384535.1679379069.',
}

# Accumulate results across ALL pages (the original reset this list on every
# page iteration, discarding earlier pages' results).
names_url = []
for page in range(10):
    url = "https://movie.douban.com/top250?start=" + str(page * 25)
    print(url)
    data = requests.get(url, headers=headers).content
    soup = BeautifulSoup(data, 'lxml')
    # Each select() returns a LIST of matching elements, one per movie.
    name = soup.select('#content > div > div.article > ol > li > div > div.info > div.hd > a > span:nth-child(1)')
    rank = soup.select("#content > div > div.article > ol > li > div > div.pic > em")
    pingfen = soup.select('#content > div > div.article > ol > li > div > div.info > div.bd > div > span.rating_num')
    web = soup.select('#content > div > div.article > ol > li > div > div.info > div.hd > a')
    for i in range(len(name)):
        # BUG FIX: the original called .get_text() on the whole result lists
        # (rank.get_text(), name.get_text(), ...) and subscripted the list with
        # a string (web['href']), which raises at runtime. Each value must be
        # taken from the i-th element. The inner loop also shadowed the outer
        # loop's index variable; renamed the outer one to `page`.
        temp = (rank[i].get_text() + "--" + name[i].get_text()
                + "--" + pingfen[i].get_text() + "--" + web[i]['href'])
        print(temp)
        names_url.append(temp)
print(names_url)
import scrapy

from ..items import DouBanprojectItem


class DoubanprojectSpider(scrapy.Spider):
    """Crawl the Douban Top-250 list pages and follow each movie's detail page,
    yielding a DouBanprojectItem per movie."""

    name = "DouBanProject"
    # BUG FIX: was ["movie.douban.cn"] — every request in this spider targets
    # movie.douban.com, so the offsite middleware filtered out all follow-up
    # requests. The domain must match the site actually crawled.
    allowed_domains = ["movie.douban.com"]
    pre_urls = ["http://movie.douban.com/"]
    Second_urls = ["https://movie.douban.com/subject/1292052/"]
    index = 25  # offset of the NEXT list page; the list pages 25 movies per request
    pre_url = 'https://movie.douban.com/top250?start='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        # NOTE(review): the original Cookie literal was corrupted by forum image
        # markup ("![]()"); only this prefix is recoverable. Replace with a
        # fresh logged-in cookie before running.
        'Cookie': 'bid=dJvnLS9LSZw; dbcl2="268997517"',
    }

    def start_requests(self):
        """Kick off the crawl at the first list page (start=0)."""
        new_url = self.pre_url + str(0)
        yield scrapy.Request(url=new_url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        """Parse one Top-250 list page: follow each movie's detail link, then
        schedule the next list page until offset 250 is reached."""
        for element in response.xpath('//*[ @ id = "content"]/div/div[1]/ol/li'):
            no = element.xpath('./div/div[1]/em/text()').get()
            name = element.xpath('./div/div[2]/div[1]/a/span[1]/text()').get()
            grade = element.xpath('./div/div[2]/div[2]/div/span[2]/text()').get()
            new_url = element.xpath('./div/div[2]/div[1]/a/@href').get()
            yield scrapy.Request(url=new_url, headers=self.headers, callback=self.movie)
        if self.index < 250:
            # Paginate: e.g. https://movie.douban.com/top250?start=25
            new_url = self.pre_url + str(self.index)
            self.index += 25
            yield scrapy.Request(url=new_url, headers=self.headers, callback=self.parse)

    def movie(self, response):
        """Parse a movie detail page into a DouBanprojectItem.

        Field names are pinyin: daoyan=director, biaoju=screenwriter,
        zhuyan=lead actor, leixing=genre, shangyintime=release date,
        pianchang=runtime, pingfenRshu=number of raters,
        juqingjiejia=plot summary.
        """
        daoyan = response.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').get()
        biaoju = response.xpath('//*[@id="info"]/span[2]/span[2]/a/text()').get()
        zhuyan = response.xpath('//*[@id="info"]/span[3]/span[2]/span/a/text()').get()
        # NOTE(review): this positional xpath grabs the first bare <span> text
        # under #info — verify it really selects the genre on live pages.
        leixing = response.xpath('//*[@id="info"]/span/text()').get()
        shangyintime = response.xpath('//*[@id="info"]/span[10]/text()').get()
        pianchang = response.xpath('//*[@id="info"]/span[13]/text()').get()
        pingfenRshu = response.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/text()').get()
        juqingjiejia = response.xpath(
            '//*[@id="link-report-intra"]/span[1]/span/text()').get()
        print(daoyan, biaoju, zhuyan, leixing, shangyintime, pianchang,
              pingfenRshu, juqingjiejia)
        item = DouBanprojectItem()
        item['daoyan'] = daoyan
        item['biaoju'] = biaoju
        item['zhuyan'] = zhuyan
        item['shangyintime'] = shangyintime
        # BUG FIX: 'pianchang' was assigned twice in the original; the duplicate
        # line is removed. 'leixing' is scraped but never stored — left as-is
        # because the item's declared fields are defined elsewhere (items.py).
        item['pianchang'] = pianchang
        item['pingfenRshu'] = pingfenRshu
        item['juqingjiejia'] = juqingjiejia
        yield item
欢迎光临 ToB企服应用市场:ToB评测及商务社交产业平台 (https://dis.qidao123.com/) | Powered by Discuz! X3.4 |