民事案例爬取2

爬取网页:法帮网 http://www.fabang.com/

代码如下(数据库连接部分请自行修改):

 

import pymysql
import requests
import csv
from bs4 import BeautifulSoup
# Scrape civil-law case write-ups from the fabang.com listing pages and store
# each one (date, title, body text) as a row in the MySQL table `case2`.

# Listing pages are numbered; the crawl covers FIRST_PAGE..LAST_PAGE inclusive.
FIRST_PAGE = 28
LAST_PAGE = 300
# Seconds to wait for the site before giving up on a request.
REQUEST_TIMEOUT = 10
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}


def build_page_url(pnum: int) -> str:
    """Return the URL of listing page number *pnum*."""
    return 'http://www.fabang.com/falvanli/minfaanli/list_1909_' + str(pnum) + '.html'


def compose_paper(paragraphs):
    """Build the stored body text from the <p> texts of a case page.

    Args:
        paragraphs: list of paragraph strings in document order.

    Returns:
        Paragraphs 1..3 (0-based) joined together, or None when the page has
        fewer than three paragraphs or paragraph 1 or 2 is empty. A missing
        paragraph 3 is tolerated and simply contributes nothing.
    """
    if len(paragraphs) < 3 or paragraphs[1] == '' or paragraphs[2] == '':
        return None
    return ''.join(paragraphs[1:4])


def fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup."""
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # The pages are decoded as GBK. Setting the encoding explicitly replaces the
    # encode("iso-8859-1").decode("gbk") round trip, which only works when
    # requests happened to guess ISO-8859-1 for the response.
    response.encoding = 'gbk'
    return BeautifulSoup(response.text, 'html.parser')


def save_case(conn, news_time, news_title, paper):
    """Insert one case into `case2`, committing on success.

    The values are passed as query parameters so that quotes inside the
    scraped text cannot break or alter the SQL statement.
    """
    cur = conn.cursor()
    try:
        cur.execute("insert into case2 values(%s,%s,%s);",
                    (news_time, news_title, paper))
    except pymysql.MySQLError as e:
        # Discard the failed statement so the shared connection stays usable.
        conn.rollback()
        print("插入数据失败:", e)
    else:
        conn.commit()
        print("插入数据成功;")
    finally:
        cur.close()


def scrape_case(conn, info):
    """Process one <dl> entry of a listing page: fetch its case page and store it.

    Entries or case pages that lack the expected elements are skipped.
    """
    name_cell = info.find('dd', class_='name')
    link = name_cell.find('a') if name_cell is not None else None
    date_cells = info.find_all('dd', class_='date')
    if link is None or len(date_cells) < 2:
        return
    news_title = link.get('title')
    news_url = link.get('href')
    if news_title is None or news_url is None:
        return
    # The second "date" cell is the one read as the case's time stamp.
    news_time = date_cells[1].text
    print(news_title)
    print(news_url)
    print(news_time)

    content = fetch_soup(news_url).find('div', class_='content')
    if content is None:
        return
    paper = compose_paper([p.text for p in content.find_all('p')])
    if paper is None:
        return
    print(news_title,"---",news_time,"----",paper)
    save_case(conn, news_time, news_title, paper)


def scrape_page(conn, pnum):
    """Scrape every case linked from listing page *pnum*."""
    container = fetch_soup(build_page_url(pnum)).find('div', class_='liall')
    if container is None:
        return
    # The running index is only used for the progress message.
    for a, info in enumerate(container.find_all('dl'), start=1):
        print('正在爬取第{}条案例'.format(a))
        scrape_case(conn, info)
        print(pnum,"")


def main():
    """Crawl all listing pages, reusing a single database connection."""
    # NOTE: credentials are hard-coded; adjust them for your own database.
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='lin0613',  # password
        db='cus',  # database name
        charset='utf8',
    )
    try:
        for pnum in range(FIRST_PAGE, LAST_PAGE + 1):
            scrape_page(conn, pnum)
    finally:
        # Always release the connection, even if a request or parse step fails.
        conn.close()


if __name__ == "__main__":
    main()

 

posted @ 2022-06-01 18:49  往心。  阅读(94)  评论(0)  编辑  收藏  举报