Assignment ①
Requirements: Use the requests and BeautifulSoup libraries to crawl the given URL (http://www.shanghairanking.cn/rankings/bcur/2020) and print the university ranking information to the screen.
Output:
排名 学校名称 省市 学校类型 总分
1 清华大学 北京 综合 852.5
2......
Experiment
import requests
from bs4 import BeautifulSoup
import bs4

uinfo = []
url = "https://www.shanghairanking.cn/rankings/bcur/2020"
res = requests.get(url)
res.encoding = 'utf-8'   # the page is served as UTF-8; 'text/html' is a MIME type, not an encoding
html = res.text
soup = BeautifulSoup(html, "lxml")
# Each <tr> under <tbody> is one university; the school name sits in the first <a> tag
for tr in soup.find('tbody').children:
    if isinstance(tr, bs4.element.Tag):
        a = tr('a')
        tds = tr('td')
        uinfo.append([tds[0].text.strip(), a[0].string.strip(), tds[2].text.strip(),
                      tds[3].text.strip(), tds[4].text.strip()])
tplt = "{0:^10}\t{1:^10}\t{2:^12}\t{3:^12}\t{4:^10}"
print(tplt.format("排名", "学校名称", "省市", "学校类型", "总分"))
for i in range(30):
    print(tplt.format(uinfo[i][0], uinfo[i][1], uinfo[i][2], uinfo[i][3], uinfo[i][4]))
Reflections
By using the requests library to send HTTP requests and the BeautifulSoup library to parse the returned page, we can extract and process web data quite conveniently. Crawling like this makes it quick to obtain the data we need, such as the university ranking information here.
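As a small complement to the code above, here is a minimal sketch of a reusable fetch-and-parse helper with basic error handling and automatic encoding detection; the function name fetch_soup and the timeout value are my own choices rather than part of the assignment.
import requests
from bs4 import BeautifulSoup

def fetch_soup(url, timeout=10):
    # Request the page, fail loudly on HTTP errors, and let requests guess the encoding
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    return BeautifulSoup(res.text, "lxml")

soup = fetch_soup("https://www.shanghairanking.cn/rankings/bcur/2020")
print(soup.title.string)   # quick sanity check that the page was fetched and parsed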
Assignment ②
Experiment
Code 1: Scraping Dangdang
import urllib.request
import urllib.parse
import re
import requests
from bs4 import BeautifulSoup

print("输入页数:")
page = input()
url = 'http://search.dangdang.com/?key=%CA%E9%B0%FC&act=input&'
data = {
    'page_index': page,
}
# Append the page index to the query string
data = urllib.parse.urlencode(data)
url = url + data
response = requests.get(url=url)
response.encoding = 'gb2312'   # Dangdang serves GB2312, not UTF-8
content = response.text
# Extract the prices with a regular expression
price_regex = r'<span class="price_n">¥([\d.]+)</span>'
prices = re.findall(price_regex, content)
soup = BeautifulSoup(content, 'lxml')
# The block below is an alternative using BeautifulSoup selectors, kept commented out
"""
name = soup.select('p[class="name"] > a[title]')
name_list = soup.select('span[class="price_n"]')
for i in range(60):
    print(str(i) + "," + name[i].text + "," + name_list[i].text)
"""
# Extract the book titles from the re-serialized soup
names = re.findall(r'<p class="name" name="title">\s*<a[^>]*\stitle="(.*?)"', str(soup))
i = 1
for price, name in zip(prices, names):
    print(str(i) + "\t" + price + "\t" + name)
    i = i + 1
Code 2: Scraping JD
import requests
import urllib.parse
import re
from bs4 import BeautifulSoup

page = int(input())
url = 'https://search.jd.com/Search?keyword=%E4%B9%A6%E5%8C%85&qrst=1&wq=%E4%B9%A6%E5%8C%85&stock=1&pvid=92cd471d7ad04c26bf539da0881c3cd9&isList=0&'
data = {
    'page': page * 2 + 1,   # JD's page parameter advances by 2 per visible page (1, 3, 5, ...)
}
# JD requires a logged-in session, so the Cookie goes into the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'Cookie': '__jdu=2112503369; areaId=16; shshshfpa=0c30a2b4-09e7-b60c-4448-ae7b8574600a-1695036268; shshshfpx=0c30a2b4-09e7-b60c-4448-ae7b8574600a-1695036268; qrsc=3; ipLoc-djd=16-1303-1305-48927; PCSYCityID=CN_350000_350100_0; _pst=wdwtnarAuJBXCr; unick=jd_dtTACdThpYFH; pin=wdwtnarAuJBXCr; _tp=TaWizw3MaoP0cB58GohhqA%3D%3D; jsavif=1; jsavif=1; mba_muid=2112503369; wlfstk_smdl=trs6339sab9ij45l0437dzim9lv3jv7d; unpl=JF8EALBnNSttUU9UUh8HSEUQTQ0BWw8JHh4DPDcGXVxZHFJSHQUdFBB7XlVdXxRKHx9sYxRXXVNJUw4aBysSEXteXVdZDEsWC2tXVgQFDQ8VXURJQlZAFDNVCV9dSRZRZjJWBFtdT1xWSAYYRRMfDlAKDlhCR1FpMjVkXlh7VAQrARsSE09cVlxaAHsWM2hXNWRdWUJUBRwyGiIRex8AAlgLQxEAaioGVF1bT1UHGQUTIhF7Xg; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_851f54bf17ad4c0d81ba2800f7f76771|1695648964846; 3AB9D23F7A4B3CSS=jdd03V5JDX6UC4BR3NEA5XLU2HQROHPM52HRBZNV22U5IZRMRJWQVMXYVLEV5I2XWSL4L3APOBM3UDO56LBT2N4KWV4OQUMAAAAMKZSG6AQQAAAAACEO2K65HY6JN34X; avif=1; xapieid=jdd03V5JDX6UC4BR3NEA5XLU2HQROHPM52HRBZNV22U5IZRMRJWQVMXYVLEV5I2XWSL4L3APOBM3UDO56LBT2N4KWV4OQUMAAAAMKZSG6AQQAAAAACEO2K65HY6JN34X; __jda=122270672.2112503369.1695036258.1695100654.1695648774.4; __jdc=122270672; rkv=1.0; logintype=wx; npin=wdwtnarAuJBXCr; thor=98724EA9770521E7D047F1C8BDE7E1D74EF976383E31E33A9CAC8746295B69FF518970B7B48B09917A2F8BC5B189AA9E28E73F120D01A0E84D2AA4A20A67D990B9073861C069608331660B49A976AB8AE34C09BD2EADAEFC159C5F91083F210BE718AD28A4848BA70E3C4E500F46CBB89ED519F3E0B511B08B792A1FC5175E56D1B9CD190039AEB15D7D0344A917DD30; flash=2__v93K6Oip4T3b_KW6ELzfC3r3Xge1qCzNXlPWGel6oP4wt1Krqa5YfVAYOhUlVw0k4AI5wMRC7AX_Bc68atLeKaVIKr-7jI_1erfrIqA40j*; pinId=RhQmtBrUJper1O3taCkAYQ; __jdb=122270672.10.2112503369|4.1695648774; shshshsID=2fd8a8e2242d0e0c553d8bfbdd0e4f0f_5_1695649019376; shshshfpb=AAtSzjsyKEjCitAnntgxESK57hXRgChaVA2JoVAAAAAA; 3AB9D23F7A4B3C9B=V5JDX6UC4BR3NEA5XLU2HQROHPM52HRBZNV22U5IZRMRJWQVMXYVLEV5I2XWSL4L3APOBM3UDO56LBT2N4KWV4OQUM'}
data = urllib.parse.urlencode(data)
url = url + data
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
content = response.text
# soup = BeautifulSoup(content, 'lxml')
# name = soup.select('div[class="p-name p-name-type-2"] em')
# price = soup.select('div[class="p-price"] i')
# Extract prices and names with regular expressions
price_regex = r'<div class="p-price">\s*<strong[^>]*>\s*<em>(¥)<\/em><i[^>]*>([\d.]+)<\/i>\s*<\/strong>\s*<\/div>'
name_regex = r'<div class="p-name p-name-type-2">\s*<a[^>]*>\s*<em>(.*?)<\/em>'
price_matches = re.findall(price_regex, content)
name_matches = re.findall(name_regex, content)
prices = [match[1] for match in price_matches]
names = [re.sub(r'<.*?>', '', match) for match in name_matches]   # strip the highlight tags inside titles
i = 1
for price, name in zip(prices, names):
    print(str(i) + "\t" + price + "\t" + name)
    i = i + 1
Reflections
The requests library is quite similar to urllib.request, but requests is more convenient to use, a bit like the difference between an automatic and a manual transmission.
JD only shows search results to logged-in users, so the Cookie has to be included in the request headers.
Scraping Dangdang with the re library is a bit more troublesome, because the raw response is not directly usable: the encoding has to be switched to GB2312 first, and the book titles are extracted from the re-serialized str(soup) rather than from the raw text.
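A minimal sketch of the two points above, using a requests.Session; the Cookie value here is a placeholder to be replaced with a real logged-in cookie, and apparent_encoding is used instead of hard-coding GB2312, both of which are my own assumptions rather than part of the original code.
import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0',
    'Cookie': 'paste_a_real_logged_in_cookie_here',   # placeholder, needed for sites like JD
})

res = session.get('http://search.dangdang.com/?key=%CA%E9%B0%FC&act=input&page_index=1')
res.encoding = res.apparent_encoding   # let requests guess the encoding instead of hard-coding gb2312
print(res.encoding, len(res.text))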
Assignment ③
Requirements: Crawl all JPEG and JPG files from a given page (https://xcb.fzu.edu.cn/info/1071/4481.htm) or a page of your own choice.
Output: Save all JPEG and JPG files from the chosen page into a single folder.
Experiment
Downloading the images to a folder
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import os

url = 'https://xcb.fzu.edu.cn/info/1071/4481.htm'
request = urllib.request.Request(url=url)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
soup = BeautifulSoup(content, 'lxml')
# The article images all sit inside <p class="vsbcontent_img"><img src="..."></p>
name = soup.select('p[class="vsbcontent_img"] img')
path = "picture"
os.makedirs(path, exist_ok=True)   # don't crash if the folder already exists
page = 0
for i in name:
    # src is a relative path, so prepend the site root
    img_url = 'https://xcb.fzu.edu.cn' + str(i.get("src"))
    page = page + 1
    urllib.request.urlretrieve(img_url, os.path.join(path, str(page) + '.jpeg'))
Displaying the images directly
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
import os

url = 'https://xcb.fzu.edu.cn/info/1071/4481.htm'
request = urllib.request.Request(url=url)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
soup = BeautifulSoup(content, 'lxml')
name = soup.select('p[class="vsbcontent_img"] img')
image_urls = []
for i in name:
    img_url = 'https://xcb.fzu.edu.cn' + str(i.get("src"))
    image_urls.append(img_url)
# Size of each image on the canvas
image_width = 200
image_height = 200
# Compute the canvas width and height
num_images = len(image_urls)
canvas_width = image_width * num_images
canvas_height = image_height
# Create a blank canvas
canvas = Image.new('RGB', (canvas_width, canvas_height))
# Paste each image onto the canvas
x_offset = 0
for image_url in image_urls:
    response = requests.get(image_url)
    image_data = response.content
    image = Image.open(BytesIO(image_data))
    image = image.resize((image_width, image_height))
    canvas.paste(image, (x_offset, 0))
    x_offset += image_width
# Show the stitched canvas
canvas.show()
Reflections
Possibly because of the website itself, downloading the images was very slow. At first I had put page = 0 inside the for loop, so every image was saved under the same name and only one file came out.
That is also why I chose to stitch the images together and show them on one canvas, which makes the result clearer to inspect.
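A small sketch of the counter pitfall mentioned above: initializing the counter outside the loop, or simply using enumerate, gives every downloaded file a unique name. The URL list and folder name below are placeholders, not the real ones from the assignment.
import urllib.request
import os

image_urls = ['https://xcb.fzu.edu.cn/example1.jpg', 'https://xcb.fzu.edu.cn/example2.jpg']   # placeholder URLs
path = "picture"
os.makedirs(path, exist_ok=True)
# enumerate yields a distinct index for each image, so the filenames never collide
for page, img_url in enumerate(image_urls, start=1):
    urllib.request.urlretrieve(img_url, os.path.join(path, str(page) + '.jpeg'))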