Python Web Scraping Notes (1)
Preface:
I have a bit more free time lately, so I am using it to learn web scraping. I touched on scraping in the second semester of my sophomore year, but I never got proficient and did not really understand how data is stored and structured in Python, so I am starting over from the beginning.
Hands-on practice:
I followed the 尚硅谷 scraping video course and learned how to use requests, XPath, BeautifulSoup, and related tools. For the hands-on part I redid the scraping assignments from my sophomore year, which deepened my understanding.
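Before the full tasks, here is a minimal toy sketch of my own (not from the course) showing the three tools side by side on the same page: requests fetches the HTML, BeautifulSoup extracts with a CSS selector, and lxml's etree extracts the same element with an XPath expression. The URL is just a stand-in for illustration.

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Fetch a simple page (example.com is only a placeholder URL)
html = requests.get("https://example.com").text

# BeautifulSoup: CSS-selector style extraction
soup = BeautifulSoup(html, 'lxml')
print(soup.select('h1')[0].get_text())

# lxml + XPath: the same element selected with an XPath expression
tree = etree.HTML(html)
print(tree.xpath('//h1/text()')[0])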
1. Scraping top-conference papers
import urllib.request
import requests
import pymysql
from bs4 import BeautifulSoup
from lxml import etree

url = "https://openaccess.thecvf.com/WACV2021"
url1 = "https://openaccess.thecvf.com"

response = urllib.request.urlopen(url)
# Get the page source
content = response.read().decode("utf-8")
# Create the soup object
soup = BeautifulSoup(content, 'lxml')
list_info = soup.find('dl')
list_url = list_info.find_all('dt')   # returns a list
list_title = soup.select('.ptitle a')

# Collect the detail-page link of every paper title
title_url = []
for title in list_title:
    title_url.append(url1 + title.get('href'))

# All links are now in one list; loop over them and scrape each paper page
for two_url in title_url:
    resp = requests.get(two_url)
    body = etree.HTML(resp.text)
    down_title = body.xpath('//*[@id="papertitle"]/text()')[0].strip()     # paper title
    down_author = body.xpath('//*[@id="authors"]/b/i/text()')[0].strip()   # authors
    down_abstract = body.xpath('//*[@id="abstract"]/text()')[0].strip()    # abstract
    down_pdf = url1 + body.xpath('//*[@id="content"]/dl/dd/a/@href')[0].strip()  # PDF link
    # Escape single quotes so the abstract does not break the SQL string below
    down_abstract = down_abstract.replace("'", "\\'")

    print("Start scraping " + two_url)
    print(down_author)

    db = pymysql.connect(host="localhost", user="root", password="156132",
                         database="cloud1", charset="utf8mb4")
    cursor = db.cursor()
    sql = "insert into lw(title,author,abstract,pdf) values ('" + str(down_title) + "','" + str(
        down_author) + "','" + down_abstract + "','" + str(down_pdf) + "')"
    try:
        cursor.execute(sql)
        print("Insert succeeded")
        db.commit()
    except pymysql.Error as e:
        print("Insert failed: " + str(e))
        db.rollback()
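The INSERT above is built by string concatenation, which is why the abstract has to be escaped by hand and which is also open to SQL injection. A safer pattern is to let pymysql fill in the values itself with %s placeholders. The sketch below assumes the same lw table and connection settings as the script above; the sample row is made up for illustration.

import pymysql

# Connection parameters copied from the script above; the sample row is made up.
db = pymysql.connect(host="localhost", user="root", password="156132",
                     database="cloud1", charset="utf8mb4")
cursor = db.cursor()

row = ("Some Paper Title", "A. Author", "An abstract with 'quotes' in it.",
       "https://openaccess.thecvf.com/some_paper.pdf")

# %s placeholders let the driver escape quotes itself, so the manual
# replace("'", "\\'") on the abstract is no longer needed.
sql = "insert into lw(title, author, abstract, pdf) values (%s, %s, %s, %s)"
try:
    cursor.execute(sql, row)
    db.commit()
except pymysql.Error as e:
    print("insert failed: " + str(e))
    db.rollback()
finally:
    db.close()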
2. Scraping epidemic data
import json
import requests

url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
response = requests.get(url)
# Parse the response: the json module turns the JSON string into a Python dict
data = json.loads(response.content.decode())
# The 'data' field is itself a JSON string, so it has to be parsed a second time
data_str = data['data']
json_data = json.loads(data_str)

# Get the children list under areaTree and the update time
list_data = json_data['areaTree'][0]['children']
lastUpdateTime = json_data['lastUpdateTime']

# Two nested loops: provinces first, then the cities inside each province
for areaTree in list_data:
    # Province name and province-level totals
    province_name = areaTree['name']
    confirm_total = areaTree['total']['confirm']
    dead_total = areaTree['total']['dead']
    suspect_total = areaTree['total']['suspect']
    heal_total = areaTree['total']['heal']
    # City level
    for city_info in areaTree['children']:
        city_name = city_info['name']
        confirm = city_info['total']['confirm']
        dead = city_info['total']['dead']
        heal = city_info['total']['heal']
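The loops above parse the per-province and per-city numbers but never store them anywhere. As one possible next step (my own addition, not part of the original script, and assuming the interface still returns the same fields), the rows could be written to a CSV file with the standard csv module:

import csv
import json
import requests

url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
# The 'data' field is a JSON string inside the JSON response, so parse twice
json_data = json.loads(json.loads(requests.get(url).content.decode())['data'])

with open('epidemic.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['province', 'city', 'confirm', 'dead', 'heal', 'lastUpdateTime'])
    for province in json_data['areaTree'][0]['children']:
        for city in province['children']:
            writer.writerow([province['name'], city['name'],
                             city['total']['confirm'], city['total']['dead'],
                             city['total']['heal'], json_data['lastUpdateTime']])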
3. Scraping school information
import urllib.request
import pymysql
from bs4 import BeautifulSoup


def schoolinfo(url):
    response = urllib.request.urlopen(url)
    # Get the page source
    content = response.read().decode('utf-8')
    # Create the bs4 object
    soup = BeautifulSoup(content, 'lxml')
    # Grab every table that holds school information
    info_list = soup.find_all('table', {'class': 'ch-table'})
    return info_list


def handinfo(info_list):
    for tr_info in info_list:
        list1 = tr_info.find_all('tr')
        # Clean the rows: keep only the first three td cells of rows that have data
        stu_info = []
        for td_info in list1:
            list2 = td_info.find_all('td')
            if len(list2):
                stu_info.append(list2[0:3])
            else:
                continue
        for info in stu_info:
            school_name = info[0].get_text().strip()
            school_shengfen = info[1].get_text()
            school_belong = info[2].get_text()
            print(school_belong)
        return stu_info


def insertintoDB(stu_info):
    db = pymysql.connect(host="localhost", user="root", password="156132",
                         database="upload", charset="utf8mb4")
    cursor = db.cursor()
    for info in stu_info:
        school_shengfen = info[1].get_text()
        school_belong = info[2].get_text()
        school = info[0].get_text().split()
        school_name = ','.join(str(i) for i in school)
        sql = "insert into stu_info(stu_name,stu_province,stu_belong) values ('" + str(school_name) + "','" + str(
            school_shengfen) + "','" + str(school_belong) + "')"
        try:
            cursor.execute(sql)
            db.commit()
        except pymysql.Error as e:
            print("Insert failed: " + str(e))
            db.rollback()


# The list page shows 20 schools per page, so start = index * 20 walks through the pages
index = 0
while index < 44:
    info_list = schoolinfo("https://yz.chsi.com.cn/sch/?start=" + str(index * 20))
    stu_info = handinfo(info_list)
    print('Scraping finished')
    print('Start inserting into the database')
    insertintoDB(stu_info)
    print('Done')
    print(index)
    index += 1
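insertintoDB opens a new MySQL connection for every page of results and runs one INSERT per row. A variant I would consider (my own restructuring, assuming the same stu_info table and credentials as above; the sample rows are made up) is to open a single connection up front and insert each page in one batch with executemany, which also parameterizes the values:

import pymysql

def insert_rows(cursor, rows):
    # rows is a list of (name, province, belong) tuples already extracted from a page
    sql = "insert into stu_info(stu_name, stu_province, stu_belong) values (%s, %s, %s)"
    cursor.executemany(sql, rows)

db = pymysql.connect(host="localhost", user="root", password="156132",
                     database="upload", charset="utf8mb4")
cursor = db.cursor()
try:
    sample_rows = [("Sample University", "Beijing", "Ministry of Education")]  # made-up example rows
    insert_rows(cursor, sample_rows)
    db.commit()
except pymysql.Error as e:
    print("insert failed: " + str(e))
    db.rollback()
finally:
    db.close()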