Notes on Learning Web Scraping

Below are a few web scraping tips for beginners.

1. Learning XPath

XPath is the most widely used and arguably the easiest selector syntax to work with, though it is comparatively heavy on performance.

a. Basic XPath usage

from lxml import html

# lxml.etree accessed via lxml.html, a common workaround that keeps IDEs happy
etree = html.etree

# Load the data we want to parse
f = open("test.html", mode="r", encoding="utf-8")
pageSource = f.read()

# Parse the data; an Element object is returned
et = etree.HTML(pageSource)

# XPath syntax
result = et.xpath('/html')                       # /html is the root node
result = et.xpath('/html/body')                  # each / in the expression descends one level
result = et.xpath('/html/body/span')
result = et.xpath('/html/body/span/text()')      # text() extracts the text inside the span
result = et.xpath('/html/body/*/li/a/text()')    # * is a wildcard matching any tag
result = et.xpath('/html/body/*/li/a/@href')     # @ selects an attribute
result = et.xpath('//li/a/@href')                # // matches at any position in the document
result = et.xpath('//div[@class="job"]/text()')  # [@attr="value"] restricts by attribute
print(result)

# With a loop
result = et.xpath('/html/body/ul/li')
for item in result:
    href = item.xpath('./a/@href')[0]   # ./ means relative to the current element
    text = item.xpath('./a/text()')[0]
    print(text, href)
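
The snippet above expects a test.html file on disk. Here is a minimal self-contained sketch (the HTML string is made up) that exercises the same expressions:

from lxml import etree

page = """
<html><body>
  <ul>
    <li><a href="/a">first</a></li>
    <li><a href="/b">second</a></li>
  </ul>
  <div class="job">developer</div>
</body></html>
"""
et = etree.HTML(page)
print(et.xpath('//li/a/text()'))               # ['first', 'second']
print(et.xpath('//li/a/@href'))                # ['/a', '/b']
print(et.xpath('//div[@class="job"]/text()'))  # ['developer']
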
b. An XPath example

import requests
from lxml import etree

url = "xxx"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}
try:
    response = requests.get(url=url, headers=headers)
    # Use the encoding detected from the response body
    response.encoding = response.apparent_encoding
    print(response.text)
    et = etree.HTML(response.text)
    # XPath matching
    results = et.xpath('//*[@id="J-lemma-main-wrapper"]/div[2]/div/div[1]/div/*/*/*/dd/span/text()')
    print(results)
except Exception:
    print("error")
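
The selector above was copied from the browser's dev tools and leans on a long chain of positional div steps, so it breaks as soon as the page layout shifts. Where the page offers a stable attribute, a shorter relative expression is usually more robust; a sketch with a hypothetical class name:

# "item-value" is a made-up class name, shown only to illustrate the pattern
results = et.xpath('//dd[contains(@class, "item-value")]/span/text()')
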
2. Learning regular expressions

a. Basic usage

import re

result = re.search(r'\d+', '今天我有100块,买了两个蛋糕')   # returns a Match object
print(result.group())

result = re.finditer(r'\d+', '今天我有100块,买了两个蛋糕')  # returns an iterator of Match objects
print(result)
for item in result:
    print(item.group())

# Precompile the pattern
obj = re.compile(r'\d+')
result = obj.findall('今天我有100块,买了两个蛋糕')
print(result)

text = "divorce 分歧;diverse 各色各样;diversify 使多样化;divert 转入;divest 脱去;divorce 分开; divulge 泄露"
pattern = r".*? (.*?);"   # the page this came from used &nbsp; entities; in plain text a space is matched instead
match_1 = re.search(pattern, text)
print(match_1.group())
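
A capture group, as used above, pulls out just part of a match; naming the groups makes longer patterns easier to read. A small sketch:

import re

m = re.search(r'(?P<word>[a-z]+) (?P<meaning>[^;]+);', 'divert 转入;divest 脱去;')
print(m.group())           # whole match: 'divert 转入;'
print(m.group('word'))     # 'divert'
print(m.group('meaning'))  # '转入'
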
b. Extracting 12306 provinces into JSON format

import requests
import re

url = "https://kyfw.12306.cn/otn/resources/js/framework/favorite_name.js"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}
response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding
parr = re.compile(r"@.*?\|(.*?)\|(.*?)\|")
images = re.findall(parr, response.text)
len1 = len(images)
print(response.text)

json_file = open('data.json', 'w+', encoding='UTF-8', newline='')
json_file.write("{")
json_file.write('\n')
for image in images:
    name = image[0]
    id = image[1]
    json_file.write(f'"{name}":"{id}"')
    if name != images[len1 - 1][0]:
        json_file.write(",")
    json_file.write("\n")
json_file.write('\n')
json_file.write("}")
json_file.close()
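
Writing the braces and commas by hand is easy to get wrong (note the trailing-comma check above). A minimal alternative sketch using the standard json module, assuming the same images list of (name, id) tuples produced by re.findall above:

import json

# assumes `images` is the list of (name, id) tuples from the regex above
data = {name: code for name, code in images}
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
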
3. Learning BeautifulSoup

3.1 Basics

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('xxx.html')
bs = BeautifulSoup(html, 'html.parser')
# data = bs.select('li a[href="news/index.html"]')
# data = bs.select('title')

# 1. Look up by tag name
data = bs.select('title')[0].get_text()
print('By tag name: ' + data)
print('----------' * 30)

# 2. Look up by class name
data1 = bs.select('.infoPath a')[0].get_text()
print('By class name: ' + data1)
print('----------' * 30)

# 3. Look up by id
data2 = bs.select('#header_login_user a')[0].get_text()
print('By id: ' + data2)
print('----------' * 30)

# 4. Combined selectors
data3 = bs.select('.readAreaBox.content h1')[0].get_text()
print('By combined selector: ' + data3)
print('----------' * 30)

# 5. Attribute selectors
data3 = bs.select('div a[href="/man"]')[0].get_text()
print('By attribute selector: ' + data3)
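
select() takes CSS selectors; the same lookups can also be written with find()/find_all(). A small sketch of rough equivalents, assuming the same bs object as above (a descendant selector such as '.infoPath a' becomes two chained lookups):

title = bs.find('title').get_text()           # by tag name
crumb = bs.find(class_='infoPath').find('a')  # by class name, then the <a> inside it
login = bs.find(id='header_login_user')       # by id
link = bs.find('a', href='/man')              # by attribute value
print(title, crumb.get_text(), login.get_text(), link.get_text())
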
3.2 A basic example: one scrape function per site

import requests
from bs4 import BeautifulSoup

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

def getPage(url):
    html = requests.get(url)
    return BeautifulSoup(html.content, 'html.parser')

def scrapeNew(url):   # novel site
    bs = getPage(url)
    title = bs.find('h1').text
    body = bs.select('#readArea > div.readAreaBox.content > div.p')[0].text
    return Content(url, title, body)

def scrapeNew1(url):  # story site
    bs = getPage(url)
    title = bs.find('h1').text
    body = bs.find('div', class_='article_content').text
    return Content(url, title, body)

def scrapeNew2(url):  # Sina news
    bs = getPage(url)
    title = bs.find('h1').text
    print(title)
    body = bs.find('div', class_='article').text
    return Content(url, title, body)

url = 'xxxx'
content = scrapeNew(url)
print('Title:{}'.format(content.title))
print('URL:{}'.format(content.url))
print('body:{}'.format(content.body))
print("-" * 130)

url = 'xxxx'
content = scrapeNew1(url)
print('Title:{}'.format(content.title))
print('URL:{}'.format(content.url))
print('body:{}'.format(content.body))
print("-" * 130)

url = 'xxxx'
content = scrapeNew2(url)
print('Title:{}'.format(content.title))
print('URL:{}'.format(content.url))
print('body:{}'.format(content.body))

3.3 A basic example: searching sites by keyword

import requests
from bs4 import BeautifulSoup

class Content:
    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        print("Article found for keyword: {}".format(self.topic))
        print("Site: {}".format(self.url))
        print("Title: {}".format(self.title))
        print("Body: {}".format(self.body))

class Website:
    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name                    # site name
        self.url = url                      # site root URL
        self.searchUrl = searchUrl          # URL of the keyword search page
        self.resultListing = resultListing  # box holding each result, i.e. the parent tag of each result link, e.g. a div
        self.resultUrl = resultUrl          # finer-grained selector that locates the result URL, e.g. div.result h3 a
        self.absoluteUrl = absoluteUrl      # boolean: are result links absolute or relative?
        self.titleTag = titleTag            # title tag of the linked article
        self.bodyTag = bodyTag              # body tag of the linked article

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61"
}

class Crawler:
    def getPage(self, url):
        try:
            req = requests.get(url, headers=headers)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.content, 'html.parser')

    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if len(childObj) > 0:
            return childObj[0].get_text()
        return ""

    def search(self, topic, site):
        bs = self.getPage(site.searchUrl + topic + '&type=web')
        searchResults = bs.select(site.resultListing)
        for result in searchResults:
            # on this search page the target URL sits in a custom attribute
            url = result.select(site.resultUrl)[0].attrs['lanmu1']
            print(url)
            if site.absoluteUrl:
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print("Bad page")
                return
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != "" and body != "":
                content = Content(topic, url, title, body)
                content.print()

crawler = Crawler()
sitedata = [['中国新闻网', 'https://news.cctv.com', 'https://search.cctv.com/search.php?qtext=', 'div .tright', 'h3.tit span', True, 'h1', '#content_area']]
sites = []
for row in sitedata:
    site_obj = Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7])
    sites.append(site_obj)

topics = ['双十一']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

3.4 A basic example: crawling a site through its links

import requests
from bs4 import BeautifulSoup
import re

class Website:
    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.targetPattern = targetPattern
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY: {}".format(self.body))

class Crawler:
    def __init__(self, site):
        self.site = site
        self.visited = []

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.content, 'html.parser')

    def safeGet(self, pageObj, selector):
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parser(self, url):
        bs = self.getPage('http://zs.lywhxy.com' + url)
        print(bs)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            print(title)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):  # collect the page links found on the site's start page
        bs = self.getPage(self.site.url)
        targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    self.parser(targetPage)

# reuters = Website('学院概况', 'http://www.lywhxy.com', '^(/about/)', False, 'h4 span', 'h1')
reuters = Website('Reuters', 'http://zs.lywhxy.com', '/.*?/', False, 'h1', 'p')
crawler = Crawler(reuters)
crawler.crawl()
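
parser() above glues the site root onto a relative href by string concatenation, which goes wrong when an href is already absolute. The standard library's urljoin handles both cases; a minimal sketch:

from urllib.parse import urljoin

base = 'http://zs.lywhxy.com'
print(urljoin(base, '/about/intro.html'))            # http://zs.lywhxy.com/about/intro.html
print(urljoin(base, 'http://other.example/x.html'))  # absolute hrefs are left untouched
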
3.5 A basic example: scraping multiple sites with per-site selectors

import requests
from bs4 import BeautifulSoup

class Content:
    # Common base class for all article pages
    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.url = url
        self.title = title
        self.body = body

    def print(self):  # print helper, so the output format stays in one place
        print("Article found for keyword: {}".format(self.topic))
        print("URL: {}".format(self.url))
        print("Title: {}".format(self.title))
        print("Body: {}".format(self.body))

class Website:
    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

class Crawler:
    def getPage(self, url):
        try:
            html = requests.get(url)
            # html.encoding = "UTF-8"
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(html.content, 'html.parser')

    def safeGet(self, pageObj, selector):
        # pageObj is a BeautifulSoup object
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, site_obj, url):
        """
        Call getPage() to get a bs object for the target page, use safeGet() to pull the
        title and body out of it, and store them in a Content object when both are non-empty.
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site_obj.titleTag)
            print(title)
            body = self.safeGet(bs, site_obj.bodyTag)
            if title != '' and body != '':
                content = Content(site_obj.name, url, title, body)
                content.print()  # use the wrapped print()

# Store the name, url and CSS selectors of each target page in a nested list:
siteData = [
    # ['丽江文化旅游学院', 'https://lywhxy.com', 'h1', 'p'],
    ['央视新闻', 'https://news.cctv.com', '.title_area h1', '.content_area p'],
    ['免费小说网', 'https://www.17k.com/chapter/3328785/44207503.html', 'div.readAreaBox.content h1', 'div.readAreaBox.content > div.p'],
    ['故事大全', 'https://www.qigushi.com/tonghuagushi/1067.html', 'h1', '.article_content'],
    ['新浪新闻', 'https://news.sina.com.cn/gov/xlxw/2023-10-25/doc-imzshqvs2406187.shtml', 'h1', '.article'],
    ['青年文摘', 'https://blog.csdn.net/csdnnews/article/details/134025189?spm=1000.2115.3001.5927', 'h1', '.wznr_r'],
    ['领导留言板', '', 'title', 'body > div.main > div.layout.rm_txt.cf > div.col.col-1.fl > div.rm_txt_con.cf']
]

# Instantiate the rows as Website objects:
websites = []
for site in siteData:
    site_obj = Website(site[0], site[1], site[2], site[3])
    websites.append(site_obj)

crawler = Crawler()
crawler.parse(websites[0], 'https://news.cctv.com/2023/10/24/ARTIFCTydl9njIwEibuO0C2j231024.shtml?spm=C94212.P4YnMod9m2uD.ENPMkWvfnaiV.4')
crawler.parse(websites[1], 'https://www.17k.com/chapter/3328785/44207503.html')
crawler.parse(websites[2], 'https://www.qigushi.com/tonghuagushi/1067.html')
crawler.parse(websites[3], 'https://news.sina.com.cn/gov/xlxw/2023-10-25/doc-imzshqvs2406187.shtml')
crawler.parse(websites[4], 'http://www.cyp.com.cn/?viewnews-4481.html')
crawler.parse(websites[5], 'http://henan.people.com.cn/n2/2023/1020/c351638-40610144.html')

4. Tips for extracting data

4.1 Fetching data by tag

import time
import requests
import random
from bs4 import BeautifulSoup
import re

url = 'https://www.baidu.com/'
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
pages = set()
random.seed(time.time())

html = requests.get(url, headers=head)
html.encoding = 'utf-8'
bs = BeautifulSoup(html.text, "html.parser")

# Collect all internal links on the page
includeUrl = 'baidu.com'
links = bs.find_all('a', href=re.compile('^(.*' + includeUrl + ')'))
for link in links:
    if link.attrs['href'] not in pages:   # use the set to skip duplicates
        pages.add(link.attrs['href'])
        print(link.attrs['href'])
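
Matching internal links with a single regex misses relative hrefs such as /more/index.html. A small sketch, assuming the same bs, url and includeUrl as above, that normalises every href with urllib.parse before deciding whether it is internal:

from urllib.parse import urljoin, urlparse

internal, external = set(), set()
for a in bs.find_all('a', href=True):
    full = urljoin(url, a['href'])            # resolve relative hrefs against the page URL
    if includeUrl in urlparse(full).netloc:   # compare only the host part
        internal.add(full)
    else:
        external.add(full)
print(len(internal), 'internal /', len(external), 'external')
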
4.2 Extracting data with lambda expressions

import httpx
from bs4 import BeautifulSoup

# the original snippet assumed url and header were defined earlier; placeholders shown here
url = 'xxx'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

html = httpx.get(url, headers=header, verify=False)
bs = BeautifulSoup(html.text, 'html.parser')

# Earlier attempts kept for reference:
# bodys = bs.select('.detail_content p')
# biaoge = bs.select('.wenben>table>tbody > tr')
# neirong = bs.select('#mycontent>div>span')
# txt = bs.find_all(lambda tag: tag.name == 'p')
# txt1 = bs.find_all(lambda tag: tag.name == 'tbody')
# txt2 = bs.find_all(lambda tag: tag.name == 'span' and tag.has_attr('class') and tag['class'][0] == 'title')
# txt4 = bs.find_all(lambda tag: tag.name == 'span' and not tag.has_attr('class'))

# find_all() also accepts a function: each tag is passed in and kept when the function returns True
txt5 = bs.find_all(lambda tag: tag.name == 'span')
print(len(txt5))
for i in txt5:
    print(i.text)

biaoge = bs.find_all(lambda tag: tag.name == 'tr')
for bg in biaoge:
    bg = bg.get_text()
    bg = bg.replace('\n', '**')   # flatten each table row onto one line
    print(bg)
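
Because the snippet above depends on a live page, here is a tiny self-contained sketch of the same idea: the lambda passed to find_all() receives each Tag and keeps those for which it returns True.

from bs4 import BeautifulSoup

demo = BeautifulSoup(
    '<div><span class="title">Hello</span><span>world</span><p>text</p></div>',
    'html.parser')

# spans that carry a class attribute
with_class = demo.find_all(lambda tag: tag.name == 'span' and tag.has_attr('class'))
# spans without any class attribute
without_class = demo.find_all(lambda tag: tag.name == 'span' and not tag.has_attr('class'))
print([t.text for t in with_class], [t.text for t in without_class])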