Crawling civil-law case listings (民法案例) from anli.lawtime.cn
from bs4 import BeautifulSoup
import time
import re                 # unused below, kept from the original script
import datetime           # unused below, kept from the original script
import requests
import csv
import pandas as pd       # unused below, kept from the original script

# NOTE: these request headers (cookie included) were copied from a JD.com
# browser session and have nothing to do with the target site anli.lawtime.cn;
# in practice only the user-agent matters here. They are kept as written in
# the original script.
headers = {
    'authority': 'item.jd.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 FS',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'referer': 'https://search.jd.com/',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6',
    'cookie': '__jdu=16196590092911498836650; shshshfpa=a828a369-a728-e182-048e-b5072447be6e-1619659011; shshshfpb=tOCI1z%20PGPaR3CEfad3X5Hg%3D%3D; pinId=ra4DldyvyoY_wKdMfg-NzQ; pin=jd_vbaqeJhuCzNB; unick=jd_vbaqeJhuCzNB; _tp=nTy%2BrzzrxqSaQB0srQMkNg%3D%3D; _pst=jd_vbaqeJhuCzNB; cn=124; ipLocation=%u6cb3%u5317; areaId=5; ipLoc-djd=5-248-2990-51290; __jdv=76161171|baidu|-|organic|not set|1624541667990; PCSYCityID=CN_130000_130100_130104; user-key=b71f1c66-2098-455c-a8d6-18761218a06f; TrackID=1GQVBVkoDl_4bfjMShj3z5E2QZTbfh56Eyq2y7QsR13GdfYydXHBeQpKI2oGOZnJNbiyyJqt5Qr69BhDridyaSEtCAvnCqeOxoZA1-uw_7L7c5mYqqvF-HNhckGxBUD_2; shshshfp=4e3578329e37ee251bff8051f7e59ea5; __jda=122270672.16196590092911498836650.1619659009.1624285397.1624541668.33; __jdc=122270672; shshshsID=204213ee80eea633cacd5e34507f5018_29_1624545894631; __jdb=122270672.61.16196590092911498836650|33.1624541668; 3AB9D23F7A4B3C9B=Z5VP7UVC6XSUTN77RWVU3MCMKNU3B47D7TIAVQK2GSP2JKY22AOOH4FHZMIUEZFND67AAM6RO3OUTVSPKHBUXGEYHI',
    'if-modified-since': 'Thu, 24 Jun 2021 14:44:50 GMT',
}


def pa_menu():
    """Scrape the civil-law category menu and write it to data/menu.csv."""
    url = "https://anli.lawtime.cn/minfa/"
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response, 'html.parser')

    i = 10001      # running node id for the category rows
    lists = []
    # the currently highlighted category link carries an extra "active" class
    for item in soup.find_all(class_="db list-nav-a subItem active"):
        name = item.string[:-2]   # drop the last two characters of the link text
        print(name)
        lists.append([i, name, "民法"])
        i = i + 1
    # the remaining category links
    for item in soup.find_all(class_="db list-nav-a subItem"):
        name = item.string[:-2]
        lists.append([i, name, "民法"])
        i = i + 1

    with open("data/menu.csv", "w", encoding="utf-8", newline="") as f:
        k = csv.writer(f, dialect="excel")
        k.writerow(['index:ID', 'type', ':LABEL'])
        for row in lists:
            k.writerow(row)


def pa_legalCase(url, law_type, a, x):
    """Scrape up to 49 listing pages of one category and write data/<a>.csv.

    url      -- listing URL of the category, e.g. https://anli.lawtime.cn/mfhunyin/
    law_type -- human-readable law category, written to the :LABEL column
    a        -- category slug, used as the output file name
    x        -- starting node id for this category
    """
    lists = []
    for i in range(1, 50):
        surl = url + "list_" + str(i)
        print(surl)
        response = requests.get(surl, headers=headers).text
        soup = BeautifulSoup(response, 'html.parser')
        for item in soup.find_all(class_="list-main-h1 nowrap"):
            link = item.get('href')
            title = item.string
            print(title)
            lists.append([x, title, link, law_type])
            x = x + 1

    with open(f"data/{a}.csv", "w", encoding="utf-8", newline="") as f:
        k = csv.writer(f, dialect="excel")
        k.writerow(['index:ID', 'title', 'link', ':LABEL'])
        for row in lists:
            k.writerow(row)


if __name__ == "__main__":
    start_time = time.time()
    print("爬虫开始时间%s" % start_time)

    pa_menu()

    # category slug -> law type written to the :LABEL column
    law_types = {
        "hunyin": "婚姻法",
        "laodong": "劳动法",
        "baoxian": "保险法",
        "msssf": "民事诉讼法",
        "jicheng": "继承法",
        "ywjyf": "义务教育法",
        "ldhtf": "劳动合同法",
        "guanggao": "广告法",
        "xfzqyf": "消费者权益法",
        "jiaoyu": "教育法",
        "huanjing": "环境法",
        "qita": "其他",
    }

    x = 20001
    for a, law_type in law_types.items():
        url = "https://anli.lawtime.cn/mf" + a + "/"
        pa_legalCase(url, law_type, a, x)
        x = x + 10000

    end_time = time.time()
    print("共耗时%s" % (end_time - start_time))
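The script fires up to 49 requests per category back-to-back, with no delay and no error handling, so a single transient failure raises and kills the whole run. Below is a minimal sketch of a politer fetch helper that pa_menu and pa_legalCase could call instead of requests.get directly; the helper name fetch_html, the retry count, and the delay value are assumptions for illustration, not part of the original script.

import time
import requests

def fetch_html(url, headers, retries=3, delay=1.0):
    """Fetch a page politely: a small delay between requests, a few retries on failure."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            time.sleep(delay)                      # throttle so the site is not hammered
            return resp.text
        except requests.RequestException as exc:
            print(f"request failed ({attempt + 1}/{retries}): {url} -> {exc}")
            time.sleep(delay * (attempt + 1))      # back off a little more each retry
    return None                                    # caller should skip pages that never loaded

Callers would then need to skip a page when fetch_html returns None instead of passing the result straight to BeautifulSoup.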
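pandas is imported but never used in the script; one plausible use is a quick sanity check of the CSV files after a run. The sketch below works under that assumption: the file names and the title/link columns are the ones the script writes, while the specific checks are purely illustrative.

import pandas as pd

# data/menu.csv holds the civil-law category rows written by pa_menu;
# data/hunyin.csv (and the other per-slug files) hold the case titles from pa_legalCase
menu = pd.read_csv("data/menu.csv")
cases = pd.read_csv("data/hunyin.csv")

print(len(menu), "categories,", len(cases), "marriage-law cases")
print(cases[cases["title"].isna()])              # rows whose title could not be parsed
print(cases["link"].duplicated().sum(), "duplicate links")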