1. Page
2. Analysis
The navigation bar offers several categories. Inspecting the network requests with F12 shows that each category corresponds to a value of the `type` parameter on the list API.
Clicking through from the list to a document shows that each document file corresponds to an id, which is passed along from the list page. So the flow is: take the id from the list response, build the detail URL from it, call the detail API to get the file path (`path`) of the document, then download and parse the file. A minimal sketch of this round trip follows.
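Below is a minimal sketch of that list → detail round trip, using the same endpoints as the complete script in section 4; cookies and custom headers are omitted here for brevity, though the live site may reject requests without them.

```python
import requests

requests.packages.urllib3.disable_warnings()

# One page of the list API; 'dfxfg' selects the local-regulations category
list_resp = requests.get(
    'https://flk.npc.gov.cn/api/',
    params={'type': 'dfxfg', 'searchType': 'title;vague',
            'sortTr': 'f_bbrq_s;desc', 'sort': 'true',
            'page': '1', 'size': '10'},
    verify=False,
).json()

for item in list_resp['result']['data']:
    # The list entry's id is what the detail API expects
    detail = requests.post('https://flk.npc.gov.cn/api/detail',
                           data={'id': item['id']}, verify=False).json()
    # The detail response carries the file path; prepend the file host to download
    path = detail['result']['body'][0]['path']
    print('https://wb.flk.npc.gov.cn' + path)
```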
3. Preparation
The downloaded document files are saved to a local folder; all other fields are stored in a MySQL database.
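The original post doesn't show the table definition, so here is a hypothetical sketch inferred from the column names the insert statements in section 4 use; `spider_law` stands in for the placeholder table name (表名), and the column types are assumptions.

```python
import pymysql as mysql

# Hypothetical schema inferred from the insert columns used in the full script
DDL = """
create table if not exists spider_law (
    id           bigint primary key,   -- snowflake id, not auto-increment
    title        varchar(512),
    source       varchar(64),          -- fixed string: 国家法律法规数据库
    source_href  varchar(512),         -- detail-page URL
    public_time  varchar(32),          -- publish date
    expiry_time  varchar(32),          -- effective date
    content      mediumtext,           -- <p>-wrapped HTML text
    content_text mediumtext,           -- plain text
    create_time  varchar(32),
    province     varchar(32),
    attachment   varchar(256)          -- relative path of the downloaded file
) default charset = utf8mb4
"""

con = mysql.connect(host="127.0.0.1", port=3306, user="root",
                    passwd="root", db="guojia_spider", charset="utf8")
with con.cursor() as cur:
    cur.execute(DDL)
con.commit()
con.close()
```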
4. Complete code
```python
import uuid
import requests
import sys
import pymysql as mysql
import datetime
from docx import Document
import time

# Scrape the site's JSON API
# Suppress the InsecureRequestWarning triggered by verify=False
requests.packages.urllib3.disable_warnings()
# Translation map for unstorable characters: everything outside the
# Basic Multilingual Plane is mapped to U+FFFD
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

con = mysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root",
                    db="guojia_spider", charset="utf8")
source = "国家法律法规数据库"
cookies = {
    'wzws_sessionid': 'gWU4ZGIxYYJmYWI4NWaAMTM5LjIxNC4zMi4yMjGgZtAiVw==',
    'Hm_lvt_54434aa6770b6d9fef104d146430b53b': '1722493539,1724209491,1724916312',
    'HMACCOUNT': '4FF444F068B3087E',
    'Hm_lpvt_54434aa6770b6d9fef104d146430b53b': '1724979519',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Connection': 'close',
}
# List-request parameters
params = {
    'type': 'dfxfg',
    'searchType': 'title;vague',
    'sortTr': 'f_bbrq_s;desc',
    'gbrqStart': '',
    'gbrqEnd': '',
    'sxrqStart': '',
    'sxrqEnd': '',
    'sort': 'true',
    'page': '1',
    'size': '10',
    '_': '1724980047619',
}
# Detail-request parameters (the id is overwritten per record in the main loop)
data = {
    'id': 'ZmY4MDgxODE4ZDczNmFjMTAxOGRjZmFlOTU2MTJlYWU%3D',
}
# Insert one record into the database
def inputdb(id, title, source_href, ddate, date2, content_label, content_nolabel, attachment, province):
    global con, source
    public_time = ddate
    create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Deduplicate on title so re-runs don't store the same law twice
    # (表名 is the author's placeholder for the actual table name)
    cursor1 = con.cursor()
    cursor1.execute("select * from 表名 where title = %s", (title,))
    results = cursor1.fetchall()
    cursor1.close()
    if len(results) > 0:
        print('The data already exists---')
        return
    # Build the column list dynamically: public_time, expiry_time and province
    # are only written when non-empty. Parameterized queries avoid the quoting
    # problems of interpolating values into the SQL string.
    columns = ['id', 'title', 'source', 'source_href', 'content', 'content_text',
               'create_time', 'attachment']
    values = [id, title, source, source_href, content_label, content_nolabel,
              create_time, attachment]
    if public_time:
        columns.append('public_time')
        values.append(public_time)
    if date2:
        columns.append('expiry_time')
        values.append(date2)
    if province:
        columns.append('province')
        values.append(province)
    sql2 = "insert into 表名(%s) values(%s)" % (
        ','.join(columns), ','.join(['%s'] * len(values)))
    cursor2 = con.cursor()
    cursor2.execute(sql2, values)
    con.commit()
    cursor2.close()
# Snowflake-style id generator:
# 41-bit timestamp | 10-bit machine id | 12-bit sequence
class Snowflake:
    def __init__(self, machine_id):
        self.machine_id = machine_id
        self.sequence = 0
        self.last_timestamp = -1

    def generate_id(self):
        timestamp = int(time.time() * 1000)
        if timestamp < self.last_timestamp:
            raise Exception("Clock moved backwards")
        if timestamp == self.last_timestamp:
            # Same millisecond: bump the 12-bit sequence; if it overflows,
            # spin until the next millisecond
            self.sequence = (self.sequence + 1) & 4095
            if self.sequence == 0:
                timestamp = self.wait_next_millis(self.last_timestamp)
        else:
            self.sequence = 0
        self.last_timestamp = timestamp
        return ((timestamp - 1288834974657) << 22) | (self.machine_id << 12) | self.sequence

    def wait_next_millis(self, last_timestamp):
        timestamp = int(time.time() * 1000)
        while timestamp <= last_timestamp:
            timestamp = int(time.time() * 1000)
        return timestamp
if __name__ == '__main__':
    # Local regulations (地方性法规), pages 1-2229
    params['type'] = 'dfxfg'
    # Judicial interpretations (司法解释)
    # params['type'] = 'sfjs'
    # Laws (法律)
    # params['type'] = 'fl'
    # Administrative regulations (行政法规)
    # params['type'] = 'xzfg'
    # The page range depends on the chosen category
    for i in range(1, 2230):
        num = 1
        params['page'] = i
        response = requests.get(
            url='https://flk.npc.gov.cn/api/',
            headers=headers,
            params=params,
            cookies=cookies,
            verify=False,
        )
        if response.status_code == 200:
            response = response.json()
            snowflake = Snowflake(1)
            for j in response['result']['data']:
                print('---start running---')
                data['id'] = j['id']
                title = j['title']
                date = j['publish']   # publish date
                date2 = j['expiry']   # effective date
                if j['type'] == '地方性法规':
                    # office looks like "XX省人民代表大会…"; the part before
                    # the first "人" is the province name
                    province = str(j['office']).split('人')[0]
                else:
                    province = ''
                uurl = 'https://flk.npc.gov.cn/api/detail'
                new_data = requests.post(url=uurl, data=data, headers=headers, cookies=cookies)
                # Report gateway timeouts instead of crashing
                if new_data.status_code == 504:
                    print("Error 504: Gateway Timeout")
                if new_data.status_code == 200:
                    new_data = new_data.json()
                    download_url = 'https://wb.flk.npc.gov.cn' + new_data['result']['body'][0]['path']
                    text_p, text, attachment_url = "", "", ""
                    # If the first attachment is a PDF, fall back to the second
                    # one (usually the .docx version)
                    if download_url.endswith('.pdf'):
                        download_url = 'https://wb.flk.npc.gov.cn' + new_data['result']['body'][1]['path']
                    if download_url.endswith('.docx'):
                        name = uuid.uuid4().hex
                        attachment_url = '/spiderFiles/' + name + '.' + download_url.split('.')[-1]
                        # Download the file
                        content = requests.get(url=download_url, headers=headers).content
                        local_path = 'D:\\spiderFiles\\' + name + '.' + download_url.split('.')[-1]
                        with open(local_path, mode='wb') as f:
                            f.write(content)
                        # Parse the .docx: plain text plus a <p>-wrapped HTML version
                        doc = Document(local_path)
                        for para in doc.paragraphs:
                            text += para.text
                            if para.text:
                                text_p += "<p>" + para.text + "</p>"
                    else:
                        print('---check the file---')
                    href = 'https://flk.npc.gov.cn/detail2.html?' + data['id']
                    # The id column is not auto-increment; generate one with the
                    # snowflake generator
                    id = snowflake.generate_id()
                    inputdb(id, title, href, date, date2, text_p, text, attachment_url, province)
                    time.sleep(1)
                    print('The', i, 'page-the', num, 'data has been downloaded!!!')
                    num += 1
    print('The data has been downloaded and is up-to-date---')
    con.close()
```
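One loose end: `non_bmp_map` is defined near the top of the script but never applied. It looks intended for `str.translate`, replacing characters beyond the Basic Multilingual Plane (which MySQL's 3-byte `utf8` charset cannot store) before insertion; a small example of that presumed usage:

```python
import sys

non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

# Characters above U+FFFF (e.g. emoji) become U+FFFD, so a utf8mb3
# column accepts the string
title = "某条例 \U0001F600"
print(title.translate(non_bmp_map))  # 某条例 �
```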