Assignment 1

Scraping university ranking information

import requests
import pandas as pd
from lxml import etree

url='http://www.shanghairanking.cn/rankings/bcur/2020'
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }          # disguise the request as a normal browser (no cookie is needed for this site)
res = requests.get(url=url, headers=headers)
html = etree.HTML(res.content)                # parse the page source into an element tree
lis = html.xpath('//*[@id="content-box"]/div[2]/table/tbody/tr')
result = pd.DataFrame()
i = 1
for tr in lis:
    name = tr.xpath('./td[2]/div/div[2]/div[1]/div/div/a/text()')[0].strip()   # extract each field and strip surrounding whitespace
    area = tr.xpath('./td[3]/text()')[0].strip()
    type1 = tr.xpath('./td[4]/text()')[0].strip()
    grade = tr.xpath('./td[5]/text()')[0].strip()
    df = pd.DataFrame({'大学名字': [name], '地区': [area], '学校类型': [type1], '总分': [grade]}, index=[i])     # collect the extracted fields into a one-row table
    i = i+1
    result = pd.concat([result, df])
    if i == 14:        # keep only the first 13 rows
        break
print(result)

Reflections

Through this experiment I learned some basics of the requests, lxml, and pandas libraries, and how to use functions such as requests.get, xpath, and DataFrame; it benefited me greatly.
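One pandas note on the loop above: concatenating one-row DataFrames inside the loop works, but a more common idiom is to collect plain dicts and build the DataFrame once at the end. A minimal sketch with made-up placeholder rows standing in for the extracted values:

import pandas as pd

# Made-up placeholder rows; a real run would append one dict per <tr> element.
rows = [
    {'大学名字': 'University A', '地区': 'Region A', '学校类型': 'Comprehensive', '总分': '100.0'},
    {'大学名字': 'University B', '地区': 'Region B', '学校类型': 'Science & Engineering', '总分': '99.0'},
]

# Build the DataFrame in a single step instead of calling pd.concat per row.
result = pd.DataFrame(rows, index=range(1, len(rows) + 1))
print(result)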

Assignment 2

Scraping product names and prices

from selenium import webdriver
import time
import re

all_name = []
all_prince = []
browser = webdriver.Chrome()	# create the webdriver object
browser.get('https://www.guangshop.com/?r=/l&kw=%25E4%25B9%25A6%25E5%258C%2585&origin_id=&sort=0')

time.sleep(3)
# refresh the browser
browser.refresh()

# maximize the browser window
browser.maximize_window()

ab = browser.page_source        # grab the fully rendered page source

name = re.findall(r'<span data-v-f62188ba="">[\u4e00-\u9fa50-9a-zA-Z【】\-!]*包[\u4e00-\u9fa50-9a-zA-Z【】\-!]*</span>', ab)
prince = re.findall(r'<span data-v-f62188ba="" class="price">\d*\.*\d*', ab)               # extract product names and prices with regular expressions
for item in name:
    item = item.replace('<span data-v-f62188ba="">','')         # strip the surrounding tag text from the match
    item = item.replace('</span>', '')
    all_name.append(item)
for item in prince:
    item = item.replace('<span data-v-f62188ba="" class="price">','')      # strip the surrounding tag text from the match
    all_prince.append(item)
for i in range(min(60, len(all_name), len(all_prince))):           # print up to 60 products
    print(all_name[i]+'     '+all_prince[i])

Reflections

Through this experiment I learned some of the selenium and re libraries, in particular how to pull the rendered page source from webdriver and extract fields with re.findall. It also made me appreciate how strong Taobao's anti-scraping measures are, which is why I chose to scrape a lesser-known small site instead.
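One note on re.findall: putting a capture group in the pattern makes findall return only the captured text, so the replace() clean-up above is no longer needed. A minimal sketch against an invented HTML snippet shaped like the spans matched above:

import re

# Invented sample markup for illustration only.
html = '<span data-v-f62188ba="">双肩书包</span><span data-v-f62188ba="" class="price">59.9</span>'

# The parentheses capture just the text inside the tags.
names = re.findall(r'<span data-v-f62188ba="">([^<]*)</span>', html)
prices = re.findall(r'<span data-v-f62188ba="" class="price">(\d+\.?\d*)', html)

print(names)   # ['双肩书包']
print(prices)  # ['59.9']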

Assignment 3

Scraping all JPEG/JPG images from a given web page

from selenium import webdriver
import time
from bs4 import BeautifulSoup
from urllib import request
import os

browser = webdriver.Chrome()	# create the webdriver object
browser.get('https://xcb.fzu.edu.cn/info/1071/4481.htm')
time.sleep(3)
# refresh the browser
browser.refresh()

# maximize the browser window
browser.maximize_window()

html = browser.page_source
soup = BeautifulSoup(html,"lxml")
lis = soup.find_all("p",{"class":"vsbcontent_img"})
os.makedirs('./picture', exist_ok=True)     # make sure the output folder exists
i = 1
for url in lis:
    picture = url.find("img")["src"]                        # relative image path from the <img> tag
    picture = "https://xcb.fzu.edu.cn/" + picture           # prepend the site root to get an absolute URL
    print(picture)
    request.urlretrieve(picture, f'./picture/{i}.jpg')      # save the image
    i = i + 1

Reflections

Through this experiment I learned how to use the request.urlretrieve function and how to use webdriver to scrape a dynamically rendered page.
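A side note on building the image URL: concatenating a hard-coded prefix can produce doubled slashes or break on relative src values, and urllib.parse.urljoin handles both cases. A minimal sketch with made-up placeholder src values:

from urllib.parse import urljoin

page_url = 'https://xcb.fzu.edu.cn/info/1071/4481.htm'

# Placeholder src values, shaped like what an <img> tag might provide.
for src in ['/__local/example.jpg', 'images/example.jpg']:
    print(urljoin(page_url, src))
# https://xcb.fzu.edu.cn/__local/example.jpg
# https://xcb.fzu.edu.cn/info/1071/images/example.jpg

The resulting absolute URL can then be passed to request.urlretrieve exactly as in the script above.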

posted on 2023-09-21 16:52 by 熏风雪奈