Data Collection and Fusion Technology — Assignment 1

Task ①:

Results:

Code:

import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
import re

# Target URL
url = "http://www.shanghairanking.cn/rankings/bcur/2020"

# Send an HTTP request to fetch the page content
response = requests.get(url)
response.encoding = 'utf-8'  # set the correct character encoding

# Parse the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the table that holds the ranking information
table = soup.find('table', {'class': 'rk-table'})

# Extract all rows from the table
rows = table.find_all('tr')

# Initialize the data list
data = []

# Extract the ranking information
for row in rows[1:]:  # skip the header row
    cols = row.find_all('td')
    if len(cols) >= 5:  # make sure the row has enough columns
        rank = cols[0].text.strip()
        university = cols[1].text.strip()

        # Use a regular expression to pull out the Chinese name
        chinese_name = re.search(r'[\u4e00-\u9fff]+', university)
        if chinese_name:
            university = chinese_name.group()

        province_city = cols[2].text.strip()
        school_type = cols[3].text.strip()
        total_score = cols[4].text.strip()
        data.append([rank, university, province_city, school_type, total_score])

# Print the table
headers = ["排名", "学校名称", "省市", "学校类型", "总分"]
print(tabulate(data, headers=headers, tablefmt="grid"))

Output:

Reflections:

I learned how to send HTTP requests with the requests library, how to parse HTML content with BeautifulSoup, and how to extract the desired information by locating HTML tags and attributes.
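
As a small side note on that tag-and-attribute lookup, the same rows can also be selected with CSS selectors via soup.select(). The sketch below is only an illustrative alternative, not part of the submitted code, and assumes the page still serves a <table class="rk-table"> with one <tr> per university:

import requests
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
response = requests.get(url, timeout=10)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# 'table.rk-table tbody tr' matches every data row of the ranking table (assumed structure)
for row in soup.select('table.rk-table tbody tr')[:5]:
    cells = [td.get_text(strip=True) for td in row.select('td')]
    if len(cells) >= 5:
        print(cells[0], cells[1], cells[2], cells[3], cells[4])

Either style works; select() tends to be shorter when the class names are stable.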

Task ②:

Results:

Code:

import requests
import re
import random
import time

def get_headers():
    # Browser-like request headers (User-Agent plus a logged-in JD cookie) so the search page is served normally
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
        'cookie': '__jdu=17182719663141198906466; shshshfpa=937e652f-a425-de5a-c233-6df524418ad2-1718271970; shshshfpx=937e652f-a425-de5a-c233-6df524418ad2-1718271970; TrackID=1YNbU9FIZd00BWHMDtajlJDz4quPpa1lrPhRvcSiVgltmvXLoAIyPaOb8zXS3RQnjdU2jxdQLGi2-UWCHqdyW0fiLoKP_NXdJtYIm1qvAa0qcquV3fsvHQTEAfQYapPHW; thor=FC0DA1BB1CB59DA1B77A0DF3C3FA4632065CAC1BDA37A6C8D73ADD6607C01200908EAF9AD5A06D6065A64185A5CDB9FEC4A2F7A21D2533BB3DB822DEDDD01B6880B6830B4831088884FFFD1893245574E49EDB536C13A74F76E645224DA275C42453B4225496F2B1F428F61C6FA0767B3CAFB8781E4E009736A1BC05D3945645DB66F3D6BA8037DFADBC787B03BCCF2C24EAEA453E41FB8D0E004DDD8316D751; light_key=AASBKE7rOxgWQziEhC_QY6ya6gDvFukmW0NqWdW4cfuCVgpBy7NJVLYpr8DgG10YJc9-oZDb; pinId=c__mwLI5rc1OFi8nFqZWvw; pin=jd_PFdPlGuLMlzn; unick=jd_48giwwpv953xn4; _tp=UzQYIRktqRpEFW1%2BilHcTw%3D%3D; _pst=jd_PFdPlGuLMlzn; unpl=JF8EALNnNSttUExUUR4EG0USTQ5SWwgASx8AZzMEBFkITF0EHFUbQRl7XlVdWBRKHx9sZBRUVVNJUg4eAysSEXteU11bD00VB2xXVgQFDQ8WUUtBSUt-S1tXV1QOSh4AbGYDZG1bS2QFGjIbFBBCXlJeVQ9MFQNqZwVcVFBKUwAcBSsTIExtZG5bC0MRC19mNVVtGh8IDBIBGxcWBl1SXlQLTRcLaGAHVFhYS1wMEwMcFxdMbVVuXg; cn=0; jsavif=1; __jda=143920055.17182719663141198906466.1718271966.1727856159.1728974438.6; __jdc=143920055; __jdv=143920055|direct|-|none|-|1728974438017; 3AB9D23F7A4B3CSS=jdd032K7FJM4XCTA2C44ZZ753Y6AA5EDECIJU522H6UVHKKPWGHA5R4MXY27Z6AUWAQKUM57FZQBNIGN2T3I2NRVGIZ6PNUAAAAMSR3UEAGAAAAAADEJW5LRWMNY52MX; _gia_d=1; flash=3_ZlDdVjQe6D225JYOtB9b0pzG_EB3m2kmv2YDaca6mmIIgKGJk_rZcZCIl7nLLYhUtOe1exVlTzAP47tlC9tpGO3nWPJJ6bphrVcGm0yZ0A7Mag5WZsOqCg2Yog-f_rIfXJduGE-odM6jNYJkj0egbHbV1m0IX9vSUJSgBJZUZ_wwg4pQ_3cA; areaId=16; ipLoc-djd=16-1317-0-0; __jdb=143920055.2.17182719663141198906466|6.1728974438; shshshfpb=BApXSCsjgjfdAiMs_VmIw7nD6BIKfhzsIBlpmHl9q9xJ1MjnM8oC2; 3AB9D23F7A4B3C9B=2K7FJM4XCTA2C44ZZ753Y6AA5EDECIJU522H6UVHKKPWGHA5R4MXY27Z6AUWAQKUM57FZQBNIGN2T3I2NRVGIZ6PNU'
    }

def fetch_page(keyword, page):
    """Fetch one page of JD search results for the given keyword and return its HTML."""
    encoded_keyword = requests.utils.quote(keyword)
    url = f'https://search.jd.com/Search?keyword={encoded_keyword}&page={page}'
    headers = get_headers()
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return None

def parse_products(html_content):
    """Extract (product name, price) pairs from the search-result HTML with regular expressions."""
    product_name_pattern = re.compile(r'<div class="p-name.*?">.*?<a.*?>\s*<em>(.*?)</em>', re.S)
    price_pattern = re.compile(r'<div class="p-price">.*?<i.*?>([\d\.]+)</i>', re.S)

    product_names = product_name_pattern.findall(html_content)
    prices = price_pattern.findall(html_content)

    # Strip any nested tags (e.g. keyword highlighting) left inside the product names
    clean_product_names = [re.sub(r'<.*?>', '', name).strip() for name in product_names]

    return list(zip(clean_product_names, prices))

def main():
    keyword = '书包'
    page = 1
    html_content = fetch_page(keyword, page)
    if html_content:
        products = parse_products(html_content)
        # Print a simple aligned table: index, price, product name
        print(f"{'序号':<5} {'价格':<10} {'商品名'}")
        for index, (name, price) in enumerate(products, 1):
            print(f"{index:<5} {price:<10} {name}")

if __name__ == "__main__":
    main()

Output:

Reflections:

I became familiar with the basic usage of the requests and re libraries, learned how to analyze and parse HTML content, and got some practice with real-world issues such as anti-crawling mechanisms.
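
Most of the anti-crawling handling in this task comes down to sending browser-like headers (see get_headers above) and not firing requests too quickly. The helper below is a hypothetical sketch of such pacing — fetch_pages_politely, its retry count, and the delay values are invented for illustration; it simply reuses the fetch_page function defined above:

import random
import time

def fetch_pages_politely(keyword, pages, max_retries=3):
    """Hypothetical helper: fetch several search pages with random pauses and simple retries."""
    results = {}
    for page in pages:
        for attempt in range(1, max_retries + 1):
            html = fetch_page(keyword, page)  # reuse the function defined above
            if html:
                results[page] = html
                break
            # back off a little longer after each failed attempt
            time.sleep(attempt * 2)
        # pause 1–3 seconds between pages so the traffic looks less like a bot
        time.sleep(random.uniform(1, 3))
    return results

# Example: pages = fetch_pages_politely('书包', [1, 2, 3])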

Task ③:

Results:

Code:

import re
import requests
import os
import urllib.parse

def download_jpeg_images(target_url, request_headers, save_folder='images'):
    """
    Download every JPEG/JPG image found on the given page and save it to a local folder.

    Args:
        target_url (str): URL of the page to crawl.
        request_headers (dict): HTTP request headers.
        save_folder (str): Folder to save the images in; defaults to 'images'.
    """
    # Send an HTTP GET request to fetch the page content
    html_response = requests.get(url=target_url, headers=request_headers)
    html_content = html_response.text

    # Use a regular expression to find every image link ending in .jpg or .jpeg
    image_urls = re.findall(r'<img[^>]+src=["\'](.*?\.(?:jpg|jpeg))["\']', html_content, re.IGNORECASE)

    # Create the output directory if it does not exist yet
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Download every image link found
    for img_url in image_urls:
        # Resolve relative image links against the page URL
        full_img_url = urllib.parse.urljoin(target_url, img_url)

        # Derive the file name from the image URL
        img_filename = os.path.basename(img_url)

        # Build the local save path
        img_save_path = os.path.join(save_folder, img_filename)

        # Fetch the image content and write it to disk
        img_response = requests.get(full_img_url, headers=request_headers)
        with open(img_save_path, 'wb') as img_file:
            img_file.write(img_response.content)

if __name__ == "__main__":
    # Page to crawl
    url_to_scrape = 'https://news.fzu.edu.cn/yxfd.htm'

    # HTTP request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
    }

    # Download the images
    download_jpeg_images(url_to_scrape, headers)

Output:

Reflections: I consolidated the basic usage of libraries such as requests and BeautifulSoup, and learned how to analyze HTML content, extract image links, and handle exceptions and errors.
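
Since the reflection mentions BeautifulSoup and error handling while the code for this task relies on regular expressions alone, here is a minimal sketch, assuming the same page layout, of collecting the .jpg/.jpeg links with BeautifulSoup and wrapping each download in a try/except so one broken image does not abort the whole run. The function download_images_with_bs is illustrative only and was not part of the submission:

import os
import urllib.parse

import requests
from bs4 import BeautifulSoup

def download_images_with_bs(target_url, request_headers, save_folder='images'):
    """Illustrative variant: collect .jpg/.jpeg links with BeautifulSoup and skip failed downloads."""
    response = requests.get(target_url, headers=request_headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    os.makedirs(save_folder, exist_ok=True)

    # Iterate over every <img> tag that has a src attribute
    for img in soup.find_all('img', src=True):
        src = img['src']
        if not src.lower().endswith(('.jpg', '.jpeg')):
            continue
        full_url = urllib.parse.urljoin(target_url, src)
        save_path = os.path.join(save_folder, os.path.basename(src))
        try:
            img_response = requests.get(full_url, headers=request_headers, timeout=10)
            img_response.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(img_response.content)
        except requests.RequestException as e:
            # Skip any image that fails to download instead of aborting the loop
            print(f"Failed to download {full_url}: {e}")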
