Below are some web-scraping tips for newcomers.
1. Learning XPath
- XPath is the most widely used and the easiest to work with, though it is relatively heavy on performance.
a. Basic usage of XPath
- from lxml import etree
- # Load the data we want to parse
- f = open("test.html", mode="r", encoding="utf-8")
- pageSource = f.read()
- # Parse the data; returns an Element object
- et = etree.HTML(pageSource)
- # XPath syntax
- result = et.xpath('/html')  # /html selects the root node
- result = et.xpath('/html/body')  # each / in the path descends one level of HTML nodes
- result = et.xpath('/html/body/span')
- result = et.xpath('/html/body/span/text()')  # get the text inside the span
- # text() extracts the text content of a tag
- result = et.xpath('/html/body/*/li/a/text()')
- # * is a wildcard matching any node
- result = et.xpath('/html/body/*/li/a/@href')  # @ selects an attribute
- result = et.xpath("//li/a/@href")  # // matches at any depth in the document
- result = et.xpath('//div[@class="job"]/text()')  # [@attr="value"] filters by attribute
- print(result)
- # With a loop
- result = et.xpath('/html/body/ul/li')
- for item in result:
-     href = item.xpath("./a/@href")[0]  # ./ is relative to the current element
-     text = item.xpath("./a/text()")[0]
-     print(text, href)
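For reference, the snippet above assumes a local test.html. Here is a minimal sketch of one that matches the selectors used (the exact structure is my assumption, not part of the original post):
- # Hypothetical test.html for the demo above; writing it out makes the example run end to end
- sample = """<html><body>
- <span>hello</span>
- <ul>
- <li><a href="http://example.com/1">link1</a></li>
- <li><a href="http://example.com/2">link2</a></li>
- </ul>
- <div class="job">engineer</div>
- </body></html>"""
- with open("test.html", mode="w", encoding="utf-8") as fw:
-     fw.write(sample)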
b. An XPath example
- import requests
- from lxml import etree
- url = "xxx"
- headers = {
-     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
- }
- try:
-     response = requests.get(url=url, headers=headers)
-     # Use the encoding detected from the response body
-     response.encoding = response.apparent_encoding
-     print(response.text)
-     et = etree.HTML(response.text)
-     # XPath matching
-     results = et.xpath('//*[@id="J-lemma-main-wrapper"]/div[2]/div/div[1]/div/*/*/*/dd/span/text()')
-     print(results)
- except requests.RequestException:
-     print("error")
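One hardening tip of my own (not in the original): add a timeout and let requests raise on HTTP error codes, so failures are not silently swallowed by the except branch:
- response = requests.get(url=url, headers=headers, timeout=10)  # fail fast instead of hanging
- response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx status codes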
2. Learning regular expressions
a. Basic usage
- import re
- result = re.search(r'\d+', '今天我有100块,买了两个蛋糕')  # returns a Match object
- print(result.group())
- result = re.finditer(r'\d+', '今天我有100块,买了两个蛋糕')  # returns an iterator of Match objects
- print(result)
- for item in result:
-     print(item.group())
- # Precompile the pattern
- obj = re.compile(r'\d+')
- result = obj.findall('今天我有100块,买了两个蛋糕')
- print(result)
- match = "divorce 分歧;diverse 各色各样;diversify 使多样化;divert 转入;divest 脱去;divorce 分开; divulge 泄露"
- pattern = ".*? (.*?);"
- match_1 = re.search(pattern, match)
- print(match_1.group())
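To pull out only the captured part instead of the whole match, use the numbered groups, or findall, which returns just the groups; a short sketch building on the pattern above:
- # group(0) is the whole match; group(1), group(2), ... are the capture groups
- m = re.search(r"(\w+) (.*?);", "divorce 分歧;diverse 各色各样;")
- if m:
-     print(m.group(1))  # 'divorce'
-     print(m.group(2))  # '分歧'
- # findall returns tuples of the captured groups
- print(re.findall(r"(\w+) (.*?);", "divorce 分歧;diverse 各色各样;"))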
b. Extracting province data from 12306 into JSON
- import requests
- import re
- url = "https://kyfw.12306.cn/otn/resources/js/framework/favorite_name.js"
- headers = {
-     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
- }
- response = requests.get(url=url, headers=headers)
- response.encoding = response.apparent_encoding
- parr = re.compile(r"@.*?\|(.*?)\|(.*?)\|")
- images = re.findall(parr, response.text)
- len1 = len(images)
- print(response.text)
- json_file = open('data.json', 'w+', encoding='UTF-8', newline='')
- json_file.write("{")
- json_file.write('\n')
- for image in images:
-     name = image[0]
-     code = image[1]
-     json_file.write(f'"{name}":"{code}"')
-     if name != images[len1 - 1][0]:  # no comma after the last entry
-         json_file.write(",")
-     json_file.write("\n")
- json_file.write('\n')
- json_file.write("}")
- json_file.close()
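Hand-writing JSON like this is fragile (quoting, escaping, the trailing comma). A sketch of the same output using the standard json module, which handles all of that:
- import json
- data = {name: code for name, code in images}  # images comes from the findall above
- with open('data.json', 'w', encoding='utf-8') as jf:
-     json.dump(data, jf, ensure_ascii=False, indent=2)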
3. Learning BeautifulSoup (bs4)
3.1 Basics
- from urllib.request import urlopen
- from bs4 import BeautifulSoup
- html = urlopen('xxx.html')
- bs = BeautifulSoup(html, 'html.parser')
- # data = bs.select('li a[href="news/index.html"]')
- # data = bs.select('title')
- # 1. Find by tag name
- data = bs.select('title')[0].get_text()
- print('By tag name: ' + data)
- print('----------' * 30)
- # 2. Find by class name
- data1 = bs.select('.infoPath a')[0].get_text()
- print('By class name: ' + data1)
- print('----------' * 30)
- # 3. Find by id
- data2 = bs.select('#header_login_user a')[0].get_text()
- print('By id: ' + data2)
- print('----------' * 30)
- # 4. Combined selectors
- data3 = bs.select('.readAreaBox.content h1')[0].get_text()
- print('By combined selector: ' + data3)
- print('----------' * 30)
- # 5. Find by attribute
- data3 = bs.select('div a[href="/man"]')[0].get_text()
- print('By attribute: ' + data3)
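The same lookups can also be written with find()/find_all() instead of CSS selectors; a rough equivalence sketch (the class name infoPath is taken from the selector above, everything else follows the same page assumptions):
- title = bs.find('title').get_text()          # by tag name
- crumb = bs.find(class_='infoPath')           # by class name
- login = bs.find(id='header_login_user')      # by id
- man_link = bs.find('a', href='/man')         # by attribute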
3.2 A basic example
- import requests
- from bs4 import BeautifulSoup
- class Content:
-     def __init__(self, url, title, body):
-         self.url = url
-         self.title = title
-         self.body = body
- def getPage(url):
-     html = requests.get(url)
-     return BeautifulSoup(html.content, 'html.parser')
- def scrapeNew(url):  # novel site
-     bs = getPage(url)
-     title = bs.find('h1').text
-     body = bs.select('#readArea > div.readAreaBox.content > div.p')[0].text
-     return Content(url, title, body)
- def scrapeNew1(url):  # story site
-     bs = getPage(url)
-     title = bs.find('h1').text
-     body = bs.find('div', class_='article_content').text
-     return Content(url, title, body)
- def scrapeNew2(url):  # Sina news
-     bs = getPage(url)
-     title = bs.find('h1').text
-     print(title)
-     body = bs.find('div', class_='article').text
-     return Content(url, title, body)
- url = 'xxxx'
- content = scrapeNew(url)
- print('Title:{}'.format(content.title))
- print('URL:{}'.format(content.url))
- print('body:{}'.format(content.body))
- print("-" * 130)
- url = 'xxxx'
- content = scrapeNew1(url)
- print('Title:{}'.format(content.title))
- print('URL:{}'.format(content.url))
- print('body:{}'.format(content.body))
- print("-" * 130)
- url = 'xxxx'
- content = scrapeNew2(url)
- print('Title:{}'.format(content.title))
- print('URL:{}'.format(content.url))
- print('body:{}'.format(content.body))
3.3 A basic example
- import requests
- from bs4 import BeautifulSoup
- class Content:
-     def __init__(self, topic, url, title, body):
-         self.topic = topic
-         self.url = url
-         self.title = title
-         self.body = body
-     def print(self):
-         print("Article found for keyword: {}".format(self.topic))
-         print("Site: {}".format(self.url))
-         print("Title: {}".format(self.title))
-         print("Body: {}".format(self.body))
- class Website:
-     def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
-         self.name = name                    # site name
-         self.url = url                      # site root URL
-         self.searchUrl = searchUrl          # search page for a keyword
-         self.resultListing = resultListing  # container holding each result, i.e. the parent tag of each result link, e.g. div
-         self.resultUrl = resultUrl          # finer selector locating the exact URL, e.g. div.result h3 a
-         self.absoluteUrl = absoluteUrl      # boolean: are result links absolute or relative?
-         self.titleTag = titleTag            # title tag on the article page
-         self.bodyTag = bodyTag              # body tag on the article page
- headers = {
-     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61"
- }
- class Crawler:
-     def getPage(self, url):
-         try:
-             req = requests.get(url, headers=headers)
-         except requests.exceptions.RequestException:
-             return None
-         return BeautifulSoup(req.content, 'html.parser')
-     def safeGet(self, pageObj, selector):
-         childObj = pageObj.select(selector)
-         if len(childObj) > 0:
-             return childObj[0].get_text()
-         return ""
-     def search(self, topic, site):
-         bs = self.getPage(site.searchUrl + topic + '&type=web')
-         searchResults = bs.select(site.resultListing)
-         for result in searchResults:
-             url = result.select(site.resultUrl)[0].attrs['lanmu1']
-             print(url)
-             if site.absoluteUrl:
-                 bs = self.getPage(url)
-             else:
-                 bs = self.getPage(site.url + url)
-             if bs is None:
-                 print("Bad page")
-                 return
-             title = self.safeGet(bs, site.titleTag)
-             body = self.safeGet(bs, site.bodyTag)
-             if title != "" and body != "":
-                 content = Content(topic, url, title, body)
-                 content.print()
- crawler = Crawler()
- sitedata = [['中国新闻网', 'https://news.cctv.com', 'https://search.cctv.com/search.php?qtext=', 'div .tright', 'h3.tit span', True, 'h1', '#content_area']]
- sites = []
- for row in sitedata:
-     site_obj = Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7])
-     sites.append(site_obj)
- topics = ['双十一']  # Double Eleven
- for topic in topics:
-     print('GETTING INFO ABOUT: ' + topic)
-     for targetSite in sites:
-         crawler.search(topic, targetSite)
3.4 A basic example
- import requests
- from bs4 import BeautifulSoup
- import re
- class Website:
-     def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
-         self.name = name
-         self.url = url
-         self.targetPattern = targetPattern
-         self.absoluteUrl = absoluteUrl
-         self.titleTag = titleTag
-         self.bodyTag = bodyTag
- class Content:
-     def __init__(self, url, title, body):
-         self.url = url
-         self.title = title
-         self.body = body
-     def print(self):
-         print("URL: {}".format(self.url))
-         print("TITLE: {}".format(self.title))
-         print("BODY: {}".format(self.body))
- class Crawler:
-     def __init__(self, site):
-         self.site = site
-         self.visited = []
-     def getPage(self, url):
-         try:
-             req = requests.get(url)
-         except requests.exceptions.RequestException:
-             return None
-         return BeautifulSoup(req.content, 'html.parser')
-     def safeGet(self, pageObj, selector):
-         selectedElems = pageObj.select(selector)
-         if selectedElems is not None and len(selectedElems) > 0:
-             return '\n'.join([elem.get_text() for elem in selectedElems])
-         return ''
-     def parser(self, url):
-         bs = self.getPage('http://zs.lywhxy.com' + url)
-         if bs is not None:
-             title = self.safeGet(bs, self.site.titleTag)
-             print(title)
-             body = self.safeGet(bs, self.site.bodyTag)
-             if title != '' and body != '':
-                 content = Content(url, title, body)
-                 content.print()
-     def crawl(self):  # collect the links found on the site's home page
-         bs = self.getPage(self.site.url)
-         targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
-         for targetPage in targetPages:
-             targetPage = targetPage.attrs['href']
-             if targetPage not in self.visited:
-                 self.visited.append(targetPage)
-                 if not self.site.absoluteUrl:
-                     self.parser(targetPage)
- # reuters = Website('brook.double', 'http://www.lywhy.com', '^(/Life/)', False, 'h4 span', 'dd.fr')
- # reuters = Website('学院概况', 'http://www.lywhxy.com', '^(/about/)', False, 'h4 span', 'h1')
- reuters = Website('Reuters', 'http://zs.lywhxy.com', '/.*?/', False, 'h1', 'p')
- crawler = Crawler(reuters)
- crawler.crawl()
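One design note: parser() above hardcodes the site prefix when joining relative links. urllib.parse.urljoin handles this more robustly; a small sketch of my own, not from the original:
- from urllib.parse import urljoin
- # urljoin resolves relative hrefs against the site root and leaves absolute URLs untouched
- full = urljoin('http://zs.lywhxy.com', '/about/')                  # -> 'http://zs.lywhxy.com/about/'
- full2 = urljoin('http://zs.lywhxy.com', 'http://other.example/x')  # absolute URL stays as-is
- print(full, full2)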
3.5 A basic example
- import requests
- from bs4 import BeautifulSoup
- class Content:
-     # Common base class for all article pages
-     def __init__(self, topic, url, title, body):
-         self.topic = topic
-         self.url = url
-         self.title = title
-         self.body = body
-     def print(self):  # print helper that controls the output format, for easier reading
-         print("Article found for keyword: {}".format(self.topic))
-         print("URL: {}".format(self.url))
-         print("Title: {}".format(self.title))
-         print("Body: {}".format(self.body))
- class Website:
-     def __init__(self, name, url, titleTag, bodyTag):
-         self.name = name
-         self.url = url
-         self.titleTag = titleTag
-         self.bodyTag = bodyTag
- class Crawler:
-     def getPage(self, url):
-         try:
-             html = requests.get(url)
-         except requests.exceptions.RequestException:
-             return None
-         return BeautifulSoup(html.content, 'html.parser')
-     def safeGet(self, pageObj, selector):
-         # pageObj is a BeautifulSoup object
-         selectedElems = pageObj.select(selector)
-         if selectedElems is not None and len(selectedElems) > 0:
-             return '\n'.join([elem.get_text() for elem in selectedElems])
-         return ''
-     def parse(self, site_obj, url):
-         """
-         Fetch a bs object containing the target data via getPage(), parse out the
-         title and body with safeGet(), and store them in a Content object when both are non-empty.
-         """
-         bs = self.getPage(url)
-         if bs is not None:
-             title = self.safeGet(bs, site_obj.titleTag)
-             print(title)
-             body = self.safeGet(bs, site_obj.bodyTag)
-             if title != '' and body != '':
-                 content = Content(site_obj.name, url, title, body)
-                 content.print()  # the wrapped print()
- # Store the name, url, and CSS selectors of each target page in a nested list:
- siteData = [
-     # ['丽江文化旅游学院', 'https://lywhxy.com', 'h1', 'p'],
-     ['央视新闻', 'https://news.cctv.com', '.title_area h1', '.content_area p'],
-     ['免费小说网', 'https://www.17k.com/chapter/3328785/44207503.html', 'div.readAreaBox.content h1', 'div.readAreaBox.content > div.p'],
-     ['故事大全', 'https://www.qigushi.com/tonghuagushi/1067.html', 'h1', '.article_content'],
-     ['新浪新闻', 'https://news.sina.com.cn/gov/xlxw/2023-10-25/doc-imzshqvs2406187.shtml', 'h1', '.article'],
-     ['青年文摘', 'https://blog.csdn.net/csdnnews/article/details/134025189?spm=1000.2115.3001.5927', 'h1', '.wznr_r'],
-     ['领导留言板', '', 'title', 'body > div.main > div.layout.rm_txt.cf > div.col.col-1.fl > div.rm_txt_con.cf']
- ]
- # Instantiate the above info into Website objects:
- websites = []
- for site in siteData:
-     site_obj = Website(site[0], site[1], site[2], site[3])
-     websites.append(site_obj)
- crawler = Crawler()
- crawler.parse(websites[0], 'https://news.cctv.com/2023/10/24/ARTIFCTydl9njIwEibuO0C2j231024.shtml?spm=C94212.P4YnMod9m2uD.ENPMkWvfnaiV.4')
- crawler.parse(websites[1], 'https://www.17k.com/chapter/3328785/44207503.html')
- crawler.parse(websites[2], 'https://www.qigushi.com/tonghuagushi/1067.html')
- crawler.parse(websites[3], 'https://news.sina.com.cn/gov/xlxw/2023-10-25/doc-imzshqvs2406187.shtml')
- crawler.parse(websites[4], 'http://www.cyp.com.cn/?viewnews-4481.html')
- crawler.parse(websites[5], 'http://henan.people.com.cn/n2/2023/1020/c351638-40610144.html')
4. Techniques for extracting data
4.1 Extracting data via tags
- import time
- import requests
- import random
- from bs4 import BeautifulSoup
- import re
- url = 'https://www.baidu.com/'
- head = {
-     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
- }
- pages = set()
- random.seed(time.time())
- html = requests.get(url, headers=head)
- html.encoding = 'utf-8'
- bs = BeautifulSoup(html.text, "html.parser")
- # Collect all internal links on the page
- includeUrl = 'baidu.com'
- links = bs.find_all('a', href=re.compile('^(.*' + includeUrl + ')'))
- for link in links:
-     print(link.attrs['href'])
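A variant I find useful (my own sketch, not from the original): split links into internal and external by checking whether the href's host belongs to the site's domain, which also catches relative links that the regex above misses:
- from urllib.parse import urlparse
- def is_internal(href, domain='baidu.com'):
-     # Treat relative links and links whose host ends with the domain as internal
-     netloc = urlparse(href).netloc
-     return netloc == '' or netloc.endswith(domain)
- internal = [a['href'] for a in bs.find_all('a', href=True) if is_internal(a['href'])]
- external = [a['href'] for a in bs.find_all('a', href=True) if not is_internal(a['href'])]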
4.2 Extracting data via lambda expressions
- import httpx
- from bs4 import BeautifulSoup
- url = 'xxx'  # placeholder target page
- header = {
-     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
- }
- html = httpx.get(url, headers=header, verify=False)
- bs = BeautifulSoup(html.text, 'html.parser')
- # try:
- #     bodys = bs.select('.detail_content p')
- #     for body in bodys:
- #         print(body.get_text())
- # except:
- #     pass
- # biaoge = bs.select('.wenben>table>tbody > tr')
- # for bg in biaoge:
- #     print(bg.get_text().replace('\n', '**'))
- # neirong = bs.select('#mycontent>div>span')
- # for nr in neirong:
- #     print(nr.text)
- # txt = bs.find_all(lambda tag: tag.name == 'p')
- # for i in txt:
- #     print(i.text)
- # txt1 = bs.find_all(lambda tag: tag.name == 'tbody')
- # for i in txt1:
- #     print(i.text)
- # txt2 = bs.find_all(lambda tag: len(tag.attrs) == 1 and list(tag.attrs)[0] == 'title')
- # for i in txt2:
- #     print(i.text)
- # txt3 = bs.find_all(lambda tag: tag.name == 'span' and tag.has_attr('class') and tag['class'][0] == 'title')
- # for i in txt3:
- #     print(i.text)
- # txt4 = bs.find_all(lambda tag: tag.name == 'span' and not tag.has_attr('class'))
- # for i in txt4:
- #     print(i.text)
- txt5 = bs.find_all(lambda tag: tag.name == 'span')
- print(len(txt5))
- for i in txt5:
-     print(i.text)
- biaoge = bs.find_all(lambda tag: tag.name == 'tr')
- for bg in biaoge:
-     print(bg.get_text().replace('\n', '**'))
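For context, find_all() accepts any callable that takes a tag and returns True or False, so arbitrary conditions can be combined in one filter. A small sketch of my own along those lines:
- # Any callable works as a filter: match <div> tags that have an id but no class
- divs = bs.find_all(lambda tag: tag.name == 'div' and tag.has_attr('id') and not tag.has_attr('class'))
- for d in divs:
-     print(d.get('id'), d.get_text(strip=True)[:40])  # id plus the first 40 chars of text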