Assignment 1
Scraping university ranking information
import requests
import pandas as pd
from lxml import etree
url='http://www.shanghairanking.cn/rankings/bcur/2020'
headers = {
'cookie':'buvid3=D9871DF6-0182-41A1-90E9-E29CEBB7486B148819infoc; LIVE_BUVID=AUTO9716390622274942; i-wanna-go-back=-1; dy_spec_agreed=1; buvid_fp_plain=undefined; buvid4=E4F99F79-D38D-7772-1504-36D850DFC56126916-022012117-lmSCu3UJG4vA%2FTj4GqHC7g%3D%3D; b_nut=100; rpdid=0z9Zw2XN99|DXLF9xln|3nv|3w1OV9i5; CURRENT_BLACKGAP=0; header_theme_version=CLOSE; nostalgia_conf=-1; hit-new-style-dyn=1; CURRENT_PID=c743ef00-ca3c-11ed-9fc5-d1323b0c18bf; FEED_LIVE_VERSION=V8; hit-dyn-v2=1; CURRENT_FNVAL=4048; _uuid=F642C575-6692-97F8-D3A6-B4D3AC4A5CBC53115infoc; fingerprint=891cbadcde040e541b9d28e4fbfe5478; buvid_fp=891cbadcde040e541b9d28e4fbfe5478; home_feed_column=5; browser_resolution=1536-715; bp_video_offset_145023955=839673983345360951; CURRENT_QUALITY=64; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQ2MDQ0MzcsImlhdCI6MTY5NDM0NTIzNywicGx0IjotMX0.pE66B5Gk8NaWPsEvwVGfeiBo4TxGcKmNgERMmnRSgbw; bili_ticket_expires=1694604437; SESSDATA=bd0c1774%2C1709897330%2C6e20b%2A92CjDXcZPHBQ_MJwl2s9cHqTAqf3_FPSjL5CCtvSHv8uRT40kJNyATrAWWiba_aAHLJdgSVmgya0pEN3VEOHdReHNrYlBBWHdNanFGZWdNUl8wTWxBTGxWZ01BUTFXSVdFYXc3eWZvclNNVjhaMUQ4dVJaSEhjYU14dVE2M3g2TFdrM1l2SFRFdWhRIIEC; bili_jct=49ac6574d21f7ddc5deef86b99513d32; DedeUserID=1071519264; DedeUserID__ckMd5=0cc7d57275e1b217; bp_video_offset_1071519264=839690596279910433; b_lsid=6102FE8F3_18A7F2B9259; innersign=0; sid=7dutxo2r; PVID=3',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
} # headers that disguise the request as a normal browser visit
res = requests.get(url=url, headers=headers)
html = etree.HTML(res.content) # parse the HTML source returned for the URL
lis = html.xpath('//*[@id="content-box"]/div[2]/table/tbody/tr')
result = pd.DataFrame()
i = 1
for tr in lis:
    name = tr.xpath('./td[2]/div/div[2]/div[1]/div/div/a/text()')[0].replace('\n ', '')  # extract each field and normalize its formatting
    area = tr.xpath('./td[3]/text()')[0].replace('\n ', '')
    type1 = tr.xpath('./td[4]/text()')[0].replace('\n ', '')
    grade = tr.xpath('./td[5]/text()')[0].replace('\n ', '')
    df = pd.DataFrame({'大学名字': [name], '地区': [area], '学校类型': [type1], '总分': [grade]}, index=[i])  # collect the extracted fields into a one-row table
    i = i + 1
    result = pd.concat([result, df])
    if i == 14:  # keep only the first 13 rows
        break
print(result)
Reflections
Through this experiment I learned some basics of the requests and lxml libraries and how to use functions such as requests.get, xpath, and DataFrame; BeautifulSoup's find_all, which I also learned, could parse the same table. I benefited a great deal from it.
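Since the reflection mentions BeautifulSoup and find_all, a minimal sketch of that alternative parse is given below. The td positions are assumptions carried over from the XPath expressions above, and the name cell may hold both the Chinese and English school names, so treat this as a sketch rather than a verified implementation.
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://www.shanghairanking.cn/rankings/bcur/2020'
headers = {'User-Agent': 'Mozilla/5.0'}  # same disguise idea as above, shortened here
soup = BeautifulSoup(requests.get(url=url, headers=headers).content, 'lxml')
rows = []
for tr in soup.find_all('tr')[1:14]:  # first 13 data rows, skipping the header row
    tds = tr.find_all('td')
    if len(tds) < 5:
        continue
    rows.append({'大学名字': tds[1].get_text(strip=True),  # may join Chinese and English names
                 '地区': tds[2].get_text(strip=True),
                 '学校类型': tds[3].get_text(strip=True),
                 '总分': tds[4].get_text(strip=True)})
print(pd.DataFrame(rows))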
Assignment 2
Scraping product names and prices
from selenium import webdriver
import time
import re
all_name = []
all_price = []
browser = webdriver.Chrome()  # create the driver object
browser.get('https://www.guangshop.com/?r=/l&kw=%25E4%25B9%25A6%25E5%258C%2585&origin_id=&sort=0')
time.sleep(3)
# refresh the browser
browser.refresh()
# maximize the browser window
browser.maximize_window()
ab = browser.page_source
# use regular expressions to extract product names and prices from the page source
name = re.findall(r'<span data-v-f62188ba="">[\u4e00-\u9fa50-9a-zA-Z【】\-!]*包[\u4e00-\u9fa50-9a-zA-Z【】\-!]*</span>', ab)
price = re.findall(r'<span data-v-f62188ba="" class="price">\d*\.*\d*', ab)
for item in name:
    item = item.replace('<span data-v-f62188ba="">', '')  # strip the surrounding tag markup
    item = item.replace('</span>', '')
    all_name.append(item)
for item in price:
    item = item.replace('<span data-v-f62188ba="" class="price">', '')  # strip the leading tag markup
    all_price.append(item)
for i in range(min(60, len(all_name), len(all_price))):  # print up to 60 products
    print(all_name[i] + ' ' + all_price[i])
Reflections
Through this experiment I learned some basics of the selenium and re libraries and how to use browser.page_source together with re.findall. It also made me realize how strong Taobao's anti-scraping measures are, which is why I chose to scrape a little-known small site instead.
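Matching raw HTML with regular expressions is fragile: the data-v-f62188ba attribute used in the patterns above changes whenever the site rebuilds its front end. A minimal sketch of the same extraction using Selenium's own element lookup follows; the CSS selectors are assumptions based on the spans matched above (span.price comes from the price regex, span.title is purely hypothetical) and are not verified against the live page.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
browser = webdriver.Chrome()
browser.get('https://www.guangshop.com/?r=/l&kw=%25E4%25B9%25A6%25E5%258C%2585&origin_id=&sort=0')
time.sleep(3)
prices = browser.find_elements(By.CSS_SELECTOR, 'span.price')  # class taken from the price regex above
names = browser.find_elements(By.CSS_SELECTOR, 'span.title')   # hypothetical class name for the product title
for name_el, price_el in zip(names, prices):
    print(name_el.text, price_el.text)
browser.quit()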
Assignment 3
Scraping all JPEG and JPG files from a given web page
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from urllib import request
import os

os.makedirs('./picture', exist_ok=True)  # make sure the output directory exists
browser = webdriver.Chrome()  # create the driver object
browser.get('https://xcb.fzu.edu.cn/info/1071/4481.htm')
time.sleep(3)
# refresh the browser
browser.refresh()
# maximize the browser window
browser.maximize_window()
html = browser.page_source
soup = BeautifulSoup(html, "lxml")
lis = soup.find_all("p", {"class": "vsbcontent_img"})  # paragraphs that wrap the article images
i = 1
for p in lis:
    picture = p.find("img")["src"]
    picture = "https://xcb.fzu.edu.cn/" + picture
    print(picture)
    request.urlretrieve(picture, f'./picture/{i}.jpg')  # save the image locally
    i = i + 1
Reflections
Through this experiment I learned how to use the request.urlretrieve function and how to use webdriver to scrape dynamically rendered pages.
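The loop above saves every image found in a vsbcontent_img paragraph as .jpg regardless of its real format. A minimal sketch that keeps only JPEG/JPG files, as the assignment title asks, and resolves relative src values with urljoin is given below; it assumes the same soup object built in the code above.
import os
from urllib import request
from urllib.parse import urljoin
page_url = 'https://xcb.fzu.edu.cn/info/1071/4481.htm'
os.makedirs('picture', exist_ok=True)  # make sure the output directory exists
count = 0
for img in soup.find_all('img'):
    src = img.get('src', '')
    if not src.lower().endswith(('.jpg', '.jpeg')):
        continue  # keep only JPEG/JPG files, as the assignment requires
    count += 1
    full_url = urljoin(page_url, src)  # handles both relative and absolute src values
    request.urlretrieve(full_url, os.path.join('picture', f'{count}{os.path.splitext(src)[1]}'))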