Python Web Scraping Notes (1)
Preface:
I have a bit more free time lately, so I am using it to learn web scraping. I touched on scraping in the second semester of my sophomore year, but I never got proficient and did not really understand how data is stored and structured in Python, so I am starting over from the beginning.
Hands-on practice:
I followed the 尚硅谷 scraping video course and learned how to use requests, XPath, BeautifulSoup, and related tools. For the hands-on part I redid the scraping assignments from my sophomore year, which deepened my understanding.
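Before the full tasks, here is a minimal toy sketch of my own (not from the course) showing the three tools side by side on the same page: requests fetches the HTML, BeautifulSoup extracts with a CSS selector, and lxml's etree extracts the same element with an XPath expression. The URL is just a stand-in for illustration.

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Fetch a simple page (example.com is only a placeholder URL)
html = requests.get("https://example.com").text

# BeautifulSoup: CSS-selector style extraction
soup = BeautifulSoup(html, 'lxml')
print(soup.select('h1')[0].get_text())

# lxml + XPath: the same element selected with an XPath expression
tree = etree.HTML(html)
print(tree.xpath('//h1/text()')[0])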
1. Scraping top-conference papers
import urllib.request
import requests
import pymysql
from bs4 import BeautifulSoup
from lxml import etree

url = "https://openaccess.thecvf.com/WACV2021"
url1 = "https://openaccess.thecvf.com"

response = urllib.request.urlopen(url)
# Get the page source
content = response.read().decode("utf-8")
# Create the soup object
soup = BeautifulSoup(content, 'lxml')
list_info = soup.find('dl')
list_url = list_info.find_all('dt')   # returns a list
list_title = soup.select('.ptitle a')

# Collect the detail-page link of every paper title
title_url = []
for title in list_title:
    title_url.append(url1 + title.get('href'))

# All links are now in one list; loop over them and scrape each paper page
for two_url in title_url:
    resp = requests.get(two_url)
    body = etree.HTML(resp.text)
    down_title = body.xpath('//*[@id="papertitle"]/text()')[0].strip()     # paper title
    down_author = body.xpath('//*[@id="authors"]/b/i/text()')[0].strip()   # authors
    down_abstract = body.xpath('//*[@id="abstract"]/text()')[0].strip()    # abstract
    down_pdf = url1 + body.xpath('//*[@id="content"]/dl/dd/a/@href')[0].strip()  # PDF link
    # Escape single quotes so the abstract does not break the SQL string below
    down_abstract = down_abstract.replace("'", "\\'")

    print("Start scraping " + two_url)
    print(down_author)

    db = pymysql.connect(host="localhost", user="root", password="156132",
                         database="cloud1", charset="utf8mb4")
    cursor = db.cursor()
    sql = "insert into lw(title,author,abstract,pdf) values ('" + str(down_title) + "','" + str(
        down_author) + "','" + down_abstract + "','" + str(down_pdf) + "')"
    try:
        cursor.execute(sql)
        print("Insert succeeded")
        db.commit()
    except pymysql.Error as e:
        print("Insert failed: " + str(e))
        db.rollback()
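The INSERT above is built by string concatenation, which is why the abstract has to be escaped by hand and which is also open to SQL injection. A safer pattern is to let pymysql fill in the values itself with %s placeholders. The sketch below assumes the same lw table and connection settings as the script above; the sample row is made up for illustration.

import pymysql

# Connection parameters copied from the script above; the sample row is made up.
db = pymysql.connect(host="localhost", user="root", password="156132",
                     database="cloud1", charset="utf8mb4")
cursor = db.cursor()

row = ("Some Paper Title", "A. Author", "An abstract with 'quotes' in it.",
       "https://openaccess.thecvf.com/some_paper.pdf")

# %s placeholders let the driver escape quotes itself, so the manual
# replace("'", "\\'") on the abstract is no longer needed.
sql = "insert into lw(title, author, abstract, pdf) values (%s, %s, %s, %s)"
try:
    cursor.execute(sql, row)
    db.commit()
except pymysql.Error as e:
    print("insert failed: " + str(e))
    db.rollback()
finally:
    db.close()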
2. Scraping epidemic data
import json
import requests

url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
response = requests.get(url)
# Parse the response: the json module turns the JSON string into a Python dict
data = json.loads(response.content.decode())
# The 'data' field is itself a JSON string, so it has to be parsed a second time
data_str = data['data']
json_data = json.loads(data_str)

# Get the children list under areaTree and the update time
list_data = json_data['areaTree'][0]['children']
lastUpdateTime = json_data['lastUpdateTime']

# Two nested loops: provinces first, then the cities inside each province
for areaTree in list_data:
    # Province name and province-level totals
    province_name = areaTree['name']
    confirm_total = areaTree['total']['confirm']
    dead_total = areaTree['total']['dead']
    suspect_total = areaTree['total']['suspect']
    heal_total = areaTree['total']['heal']
    # City level
    for city_info in areaTree['children']:
        city_name = city_info['name']
        confirm = city_info['total']['confirm']
        dead = city_info['total']['dead']
        heal = city_info['total']['heal']
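The loops above parse the per-province and per-city numbers but never store them anywhere. As one possible next step (my own addition, not part of the original script, and assuming the interface still returns the same fields), the rows could be written to a CSV file with the standard csv module:

import csv
import json
import requests

url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
# The 'data' field is a JSON string inside the JSON response, so parse twice
json_data = json.loads(json.loads(requests.get(url).content.decode())['data'])

with open('epidemic.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['province', 'city', 'confirm', 'dead', 'heal', 'lastUpdateTime'])
    for province in json_data['areaTree'][0]['children']:
        for city in province['children']:
            writer.writerow([province['name'], city['name'],
                             city['total']['confirm'], city['total']['dead'],
                             city['total']['heal'], json_data['lastUpdateTime']])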
3. Scraping school information
import urllib.request
import pymysql
from bs4 import BeautifulSoup


def schoolinfo(url):
    response = urllib.request.urlopen(url)
    # Get the page source
    content = response.read().decode('utf-8')
    # Create the bs4 object
    soup = BeautifulSoup(content, 'lxml')
    # Grab every table that holds school information
    info_list = soup.find_all('table', {'class': 'ch-table'})
    return info_list


def handinfo(info_list):
    for tr_info in info_list:
        list1 = tr_info.find_all('tr')
        # Clean the rows: keep only the first three td cells of rows that have data
        stu_info = []
        for td_info in list1:
            list2 = td_info.find_all('td')
            if len(list2):
                stu_info.append(list2[0:3])
            else:
                continue
        for info in stu_info:
            school_name = info[0].get_text().strip()
            school_shengfen = info[1].get_text()
            school_belong = info[2].get_text()
            print(school_belong)
        return stu_info


def insertintoDB(stu_info):
    db = pymysql.connect(host="localhost", user="root", password="156132",
                         database="upload", charset="utf8mb4")
    cursor = db.cursor()
    for info in stu_info:
        school_shengfen = info[1].get_text()
        school_belong = info[2].get_text()
        school = info[0].get_text().split()
        school_name = ','.join(str(i) for i in school)
        sql = "insert into stu_info(stu_name,stu_province,stu_belong) values ('" + str(school_name) + "','" + str(
            school_shengfen) + "','" + str(school_belong) + "')"
        try:
            cursor.execute(sql)
            db.commit()
        except pymysql.Error as e:
            print("Insert failed: " + str(e))
            db.rollback()


# The list page shows 20 schools per page, so start = index * 20 walks through the pages
index = 0
while index < 44:
    info_list = schoolinfo("https://yz.chsi.com.cn/sch/?start=" + str(index * 20))
    stu_info = handinfo(info_list)
    print('Scraping finished')
    print('Start inserting into the database')
    insertintoDB(stu_info)
    print('Done')
    print(index)
    index += 1
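insertintoDB opens a new MySQL connection for every page of results and runs one INSERT per row. A variant I would consider (my own restructuring, assuming the same stu_info table and credentials as above; the sample rows are made up) is to open a single connection up front and insert each page in one batch with executemany, which also parameterizes the values:

import pymysql

def insert_rows(cursor, rows):
    # rows is a list of (name, province, belong) tuples already extracted from a page
    sql = "insert into stu_info(stu_name, stu_province, stu_belong) values (%s, %s, %s)"
    cursor.executemany(sql, rows)

db = pymysql.connect(host="localhost", user="root", password="156132",
                     database="upload", charset="utf8mb4")
cursor = db.cursor()
try:
    sample_rows = [("Sample University", "Beijing", "Ministry of Education")]  # made-up example rows
    insert_rows(cursor, sample_rows)
    db.commit()
except pymysql.Error as e:
    print("insert failed: " + str(e))
    db.rollback()
finally:
    db.close()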