数据采集与融合技术实践作业一

作业1：大学排名数据爬取

作业代码和图片

主要代码

 import urllib.request
from bs4 import BeautifulSoup
import re  # 导入正则表达式模块
 
# 指定要爬取的URL
url = 'http://www.shanghairanking.cn/rankings/bcur/2020'
 
# 发送请求获取网页内容
response = urllib.request.urlopen(url)
html_content = response.read().decode('utf-8')
 
# 使用BeautifulSoup解析网页内容
soup = BeautifulSoup(html_content, 'html.parser')
 
# 找到存放排名信息的表格（假设表格在页面中唯一或具有特定的class属性）
table = soup.find('table')
 
# 提取表格中的所有行
rows = table.find_all('tr')
 
# 打印表头
print(f"{'排名':<5}{'学校名称':<10}{'省市':<10}{'学校类型':<10}{'总分':<10}")
 
# 遍历每一行，提取并打印所需信息
for row in rows[1:]:  # 跳过表头行
    cols = row.find_all('td')
    if len(cols) >= 5:  # 确保每行至少有5列数据
        rank = cols[0].get_text(strip=True)
 
        # 使用正则表达式只保留中文字符，去掉英文部分
        university_name = re.findall(r'[\u4e00-\u9fa5]+', cols[1].get_text(strip=True))[0]
        province_city = cols[2].get_text(strip=True)
        university_type = cols[3].get_text(strip=True)
        total_score = cols[4].get_text(strip=True)
 
        # 格式化输出
        print(f"{rank:<5}{university_name:<10}{province_city:<10}{university_type:<10}{total_score:<10}")

运行结果

作业心得

通过此次作业，我深入理解了如何使用requests库发送HTTP请求，并使用BeautifulSoup解析HTML文档。数据提取的过程让我体会到了网络爬虫的强大与灵活性。同时，我也认识到在爬取数据时要遵循网站的规则，以避免对网站造成负担。

作业2：商城爬虫

作业代码和图片

主要代码

 import urllib.request
from bs4 import BeautifulSoup
import re
 
BASE_URL = "http://search.dangdang.com/?key=%E4%B9%A6%E5%8C%85&act=input"
 
def get_html(url):
    #获取指定url的HTML内容
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/123"
    }
    request = urllib.request.Request(url, headers=head)
    try:
        response = urllib.request.urlopen(request)
        return response.read().decode('gbk')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return None
 
def get_data_from_html(html):
    #从HTML内容中提取商品名称和价格
    soup = BeautifulSoup(html, "html.parser")
    names = []
    prices = []
    for items in soup.find_all('p', attrs={"class": "name", "name": "title"}):
        for name in items.find_all('a'):
            title = name['title']
            names.append(title)
    for item in soup.find_all('span', attrs={"class": "price_n"}):
        price = item.string
        prices.append(price)
    return names, prices
 
def print_goods_info(names, prices):
    #打印商品名称和价格信息
    print("序号\t\t\t", "价格\t\t\t", "商品名\t\t")
    for i, (name, price) in enumerate(zip(names, prices)):
        # 去除多余的空白字符
        name = re.sub(r'\s+', ' ', name)
        # 提取数字和小数点
        price = re.findall(r'[\d.]+', price)
        if price:
            print(f"{i + 1}\t\t\t {price[0]}\t\t\t{name}")
 
def main():
    names = []
    prices = []
    for i in range(1, 3):  # 爬取1-2页的内容
        url = BASE_URL + "&page_index=" + str(i)
        html = get_html(url)
        if html:
            page_names, page_prices = get_data_from_html(html)
            names.extend(page_names)
            prices.extend(page_prices)
 
    print_goods_info(names, prices)
 
 
if __name__ == "__main__":
    main()

运行结果

作业心得

在这次作业中，我学习了如何使用re库进行正则表达式匹配，从网页中提取特定的数据。同时，我也意识到需要根据具体网页结构进行调整，灵活性是编写爬虫的关键。

作业3：爬取网页JPEG和JPG格式文件

作业代码和图片

主要代码

 import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
 
# 要爬取的URL
url = 'https://news.fzu.edu.cn/yxfd.htm'
 
# 发送请求获取网页内容
response = requests.get(url)
html_content = response.content
 
# 使用BeautifulSoup解析网页内容
soup = BeautifulSoup(html_content, 'html.parser')
 
# 创建存储图片的文件夹
folder_name = 'downloaded_images'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
 
# 查找所有的图片链接（JPEG 和 JPG）
img_tags = soup.find_all('img')
 
# 遍历所有图片标签，下载JPEG和JPG格式的图片
for img in img_tags:
    img_url = img.get('src')
 
    # 确保img_url不为空
    if img_url:
        # 将相对链接转换为绝对链接
        img_url = urljoin(url, img_url)
 
        # 只下载JPEG和JPG格式的图片
        if img_url.lower().endswith(('.jpg', '.jpeg')):
            # 获取图片的名称
            img_name = os.path.basename(img_url)
 
            # 下载图片并保存到本地
            img_data = requests.get(img_url).content
            img_path = os.path.join(folder_name, img_name)
 
            with open(img_path, 'wb') as handler:
                handler.write(img_data)
 
            print(f"已下载图片: {img_name}")
 
print("所有图片下载完成！")

运行结果

作业心得

在这次作业中，我学会了使用Python爬取网页上的图片。通过BeautifulSoup解析网页，快速提取所有<img>标签的链接。使用urljoin将相对链接转换为绝对链接，确保下载的图片有效。

posted @ 2024-10-19 16:33 旺旺cc冰阅读(33) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

公告

昵称：旺旺cc冰
园龄： 6个月
粉丝： 0
关注： 2

+加关注

2025年3月

日

一

二

三

四

五

六

nchenxi

数据采集与融合技术实践作业一

作业1：大学排名数据爬取

作业代码和图片

作业心得

作业2：商城爬虫

作业代码和图片

作业心得

作业3：爬取网页JPEG和JPG格式文件

作业代码和图片

作业心得

公告

搜索

常用链接

随笔档案

阅读排行榜

推荐排行榜

	import urllib.request
	from bs4 import BeautifulSoup
	import re # 导入正则表达式模块

	# 指定要爬取的URL
	url = 'http://www.shanghairanking.cn/rankings/bcur/2020'

	# 发送请求获取网页内容
	response = urllib.request.urlopen(url)
	html_content = response.read().decode('utf-8')

	# 使用BeautifulSoup解析网页内容
	soup = BeautifulSoup(html_content, 'html.parser')

	# 找到存放排名信息的表格（假设表格在页面中唯一或具有特定的class属性）
	table = soup.find('table')

	# 提取表格中的所有行
	rows = table.find_all('tr')

	# 打印表头
	print(f"{'排名':<5}{'学校名称':<10}{'省市':<10}{'学校类型':<10}{'总分':<10}")

	# 遍历每一行，提取并打印所需信息
	for row in rows[1:]: # 跳过表头行
	cols = row.find_all('td')
	if len(cols) >= 5: # 确保每行至少有5列数据
	rank = cols[0].get_text(strip=True)

	# 使用正则表达式只保留中文字符，去掉英文部分
	university_name = re.findall(r'[\u4e00-\u9fa5]+', cols[1].get_text(strip=True))[0]
	province_city = cols[2].get_text(strip=True)
	university_type = cols[3].get_text(strip=True)
	total_score = cols[4].get_text(strip=True)

	# 格式化输出
	print(f"{rank:<5}{university_name:<10}{province_city:<10}{university_type:<10}{total_score:<10}")

	import urllib.request
	from bs4 import BeautifulSoup
	import re

	BASE_URL = "http://search.dangdang.com/?key=%E4%B9%A6%E5%8C%85&act=input"

	def get_html(url):
	#获取指定url的HTML内容
	head = {
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/123"
	}
	request = urllib.request.Request(url, headers=head)
	try:
	response = urllib.request.urlopen(request)
	return response.read().decode('gbk')
	except urllib.error.URLError as e:
	if hasattr(e, "code"):
	print(e.code)
	if hasattr(e, "reason"):
	print(e.reason)
	return None

	def get_data_from_html(html):
	#从HTML内容中提取商品名称和价格
	soup = BeautifulSoup(html, "html.parser")
	names = []
	prices = []
	for items in soup.find_all('p', attrs={"class": "name", "name": "title"}):
	for name in items.find_all('a'):
	title = name['title']
	names.append(title)
	for item in soup.find_all('span', attrs={"class": "price_n"}):
	price = item.string
	prices.append(price)
	return names, prices

	def print_goods_info(names, prices):
	#打印商品名称和价格信息
	print("序号\t\t\t", "价格\t\t\t", "商品名\t\t")
	for i, (name, price) in enumerate(zip(names, prices)):
	# 去除多余的空白字符
	name = re.sub(r'\s+', ' ', name)
	# 提取数字和小数点
	price = re.findall(r'[\d.]+', price)
	if price:
	print(f"{i + 1}\t\t\t {price[0]}\t\t\t{name}")

	def main():
	names = []
	prices = []
	for i in range(1, 3): # 爬取1-2页的内容
	url = BASE_URL + "&page_index=" + str(i)
	html = get_html(url)
	if html:
	page_names, page_prices = get_data_from_html(html)
	names.extend(page_names)
	prices.extend(page_prices)

	print_goods_info(names, prices)


	if __name__ == "__main__":
	main()

	import os
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin

	# 要爬取的URL
	url = 'https://news.fzu.edu.cn/yxfd.htm'

	# 发送请求获取网页内容
	response = requests.get(url)
	html_content = response.content

	# 使用BeautifulSoup解析网页内容
	soup = BeautifulSoup(html_content, 'html.parser')

	# 创建存储图片的文件夹
	folder_name = 'downloaded_images'
	if not os.path.exists(folder_name):
	os.makedirs(folder_name)

	# 查找所有的图片链接（JPEG 和 JPG）
	img_tags = soup.find_all('img')

	# 遍历所有图片标签，下载JPEG和JPG格式的图片
	for img in img_tags:
	img_url = img.get('src')

	# 确保img_url不为空
	if img_url:
	# 将相对链接转换为绝对链接
	img_url = urljoin(url, img_url)

	# 只下载JPEG和JPG格式的图片
	if img_url.lower().endswith(('.jpg', '.jpeg')):
	# 获取图片的名称
	img_name = os.path.basename(img_url)

	# 下载图片并保存到本地
	img_data = requests.get(img_url).content
	img_path = os.path.join(folder_name, img_name)

	with open(img_path, 'wb') as handler:
	handler.write(img_data)

	print(f"已下载图片: {img_name}")

	print("所有图片下载完成！")