2023 Data Collection and Fusion Technology Practice: Assignment 1

Assignment ①:

  • Requirements:

Use the requests and BeautifulSoup libraries to crawl the given URL (http://www.shanghairanking.cn/rankings/bcur/2020) and print the university ranking information to the screen.
Expected output:
排名 学校名称 省市 学校类型 总分
1 清华大学 北京 综合 852.5
2......

  • Full code:

import requests
import bs4
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}

def get_info(url):
    # Fetch the page and use the detected encoding so Chinese text decodes correctly
    wb_data = requests.get(url, headers=headers)
    wb_data.encoding = wb_data.apparent_encoding
    soup = bs4.BeautifulSoup(wb_data.text, 'html.parser')
    trs = soup.select('tbody > tr')
    ranks = []
    names = []
    leixings = []
    zongfens = []

    for tr in trs:
        # Each row's text, split on newlines, exposes the fields at fixed positions
        fields = tr.text.split('\n')
        ranks.append(fields[1].replace(' ', ''))
        names.append(fields[2].replace(' ', ''))
        leixings.append(fields[7].replace(' ', ''))
        zongfens.append(fields[9].replace(' ', ''))

    ranking = {'排名': ranks, '学校名称': names, '类型': leixings, '总分': zongfens}
    print(pd.DataFrame(ranking))


if __name__ == '__main__':
    url = 'https://www.shanghairanking.cn/rankings/bcur/2020'
    get_info(url)
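
The required output also includes a 省市 (province/city) column, which the split-by-newline approach above does not capture. A minimal sketch of a row parser that reads the td cells directly, assuming the ranking table keeps rank, name, province, type, and score in its first five columns (an assumption about the page layout, not verified here):

def parse_row(tr):
    """Return (rank, name, province, type, score) from one <tr>.
    Assumes the first five <td> cells hold these fields in order."""
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    # The name cell may also contain the English name and badges,
    # so tds[1] can need further cleaning.
    return tds[0], tds[1], tds[2], tds[3], tds[4]

With such a helper, get_info could collect the province into its own list and add a '省市' column to the DataFrame.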
  • Results:

  • Reflections:

I studied and practiced crawling web pages with the requests and BeautifulSoup libraries, which deepened my understanding of Python libraries and gave me a better grasp of basic crawling techniques.

Assignment ②:

  • Requirements:

Use the requests and re libraries to design a targeted price-comparison crawler for an online store of your choice; search the store with the keyword "书包" (backpack) and crawl the product names and prices.
Expected output:
序号 价格 商品名
1 65.00 xxx
2......

  • Full code:

import requests
import re
from bs4 import BeautifulSoup as bs

goods=[]
def get_jd_product(keyword):
    url = 'https://search.jd.com/Search'
    params = {
        'keyword': keyword,
        'enc': 'utf-8',
        'wq': keyword,
    }
    
    headers = {
        'Cookie': "__jdv=122270672|direct|-|none|-|1695284270414; __jdc=122270672; __jda=122270672.16952842704131365791558.1695284270.1695284270.1695284270.1; mba_muid=16952842704131365791558; wlfstk_smdl=8rj8s7sjkya99loxzwcm058z4uue4ipc; __jdu=16952842704131365791558; _pst=jd_ZJKFwFYkEHnY; logintype=wx; unick=jd_ZJKFwFYkEHnY; pin=jd_ZJKFwFYkEHnY; npin=jd_ZJKFwFYkEHnY; thor=06DCA42AA099C9037B3035F8F3E340FC27A9EB22973AECA12FDFA86FE589F233B08DF824663D55AD6CDA6965335EA0C7279697650E546600856E47A4A60FCCC4462DAEF626C634E48F3E952E46376B896CEC6EA883296DB38AB173D3BFE2A2A8C71BF19CDFCA133F673B36747179C8C8894289DFB7A604623E0083D8489F8AE982B55369D5C493A81C916F38FE776921D2DFD914A6CC51E380BC153DEF9B6797; flash=2_uX4iolMcf7h8yepIK-ndLD6ujl_lbjGaI-cI65sO1nXqgHUy3P7b8f7XVN3MKGCjcJ8hbNCBCbwyb2fV6qlO8MnN3Kotrl9A6Adatv-_itP*; _tp=TBYdxMDxRWMpO/H4Ccpg1w==; pinId=rf0AaCMZJ9S83NzewvszDQ; jsavif=1; jsavif=1; 3AB9D23F7A4B3CSS=jdd03BTX7QZMPLQIV3EEVNZR37O7G5LEYVXDEVY24SIVZHGDD4SDF6YSGQCYBDSX2SAPXDL3V3XRYUHPAGHME4SVRUH2XEIAAAAMKW3IWEBQAAAAAC6ZOPMSO75A2C4X; _gia_d=1; xapieid=jdd03BTX7QZMPLQIV3EEVNZR37O7G5LEYVXDEVY24SIVZHGDD4SDF6YSGQCYBDSX2SAPXDL3V3XRYUHPAGHME4SVRUH2XEIAAAAMKW3IWEBQAAAAAC6ZOPMSO75A2C4X; shshshfpa=754ff8be-5aeb-2ac4-e574-673c432850e2-1695284290; shshshfpx=754ff8be-5aeb-2ac4-e574-673c432850e2-1695284290; areaId=16; ipLoc-djd=16-1317-0-0; __jdb=122270672.4.16952842704131365791558|1.1695284270; shshshsID=0c28e22cb7c616b1f2cae4afa5697c5a_2_1695284335332; shshshfpb=AAhsT0raKEk_4vlrrKsTldGc8QyhQ4haVKEKQQQAAAAA; qrsc=2; 3AB9D23F7A4B3C9B=BTX7QZMPLQIV3EEVNZR37O7G5LEYVXDEVY24SIVZHGDD4SDF6YSGQCYBDSX2SAPXDL3V3XRYUHPAGHME4SVRUH2XEI",  # session cookie copied from a logged-in browser
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()  # raise an exception if the request failed
    soup = bs(response.text, 'lxml')
    try:
        global goods
        # The product list lives in <div id="J_goodsList">, one <li class="gl-item"> per item
        tags = soup.find('div', attrs={"id": "J_goodsList"})
        tags = tags.find_all('li', attrs={'class': "gl-item"})
        for t in tags:
            name = t.find('div', attrs={'class': "p-name p-name-type-2"}).find('em').text.replace('\n', ' ')
            price = float(t.find('div', attrs={'class': 'p-price'}).find('i').text)
            goods.append([name, price])
    except Exception as e:
        print(e)

def get_jd_product2(keyword):
    # Same extraction as above, but for the second page of search results
    url = 'https://search.jd.com/Search?keyword=' + str(keyword) + '&enc=utf-8' + '&page=2'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()  # raise an exception if the request failed
    soup = bs(response.text, 'lxml')
    try:
        global goods
        tags = soup.find('div', attrs={"id": "J_goodsList"})
        tags = tags.find_all('li', attrs={'class': "gl-item"})
        for t in tags:
            name = t.find('div', attrs={'class': "p-name p-name-type-2"}).find('em').text.replace('\n', ' ')
            price = float(t.find('div', attrs={'class': 'p-price'}).find('i').text)
            goods.append([name, price])
    except Exception as e:
        print(e)
# Crawl the first two result pages for the keyword, then print all goods sorted by price
keyword = '书包'
get_jd_product(keyword)
get_jd_product2(keyword)

goods = sorted(goods, key=lambda a: a[1])
count = 1
for item in goods:
    print(count, '>>', item[0], ' ', item[1], '¥')
    count += 1
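
A note on the requirement: the assignment asks for requests plus re, while the extraction above relies on BeautifulSoup. A minimal regex-only sketch for the same search page is given below; the two patterns are assumptions about JD's J_goodsList markup (and JD may also require a logged-in Cookie header or render prices with JavaScript), so treat it as a starting point rather than a drop-in replacement.

import re
import requests

def get_products_with_re(keyword):
    """Regex-based extraction of (price, name) pairs from one JD search page."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    resp = requests.get('https://search.jd.com/Search',
                        params={'keyword': keyword, 'enc': 'utf-8'},
                        headers=headers)
    resp.raise_for_status()
    html = resp.text
    # Assumed markup: <div class="p-price">...<i>65.00</i> for the price and
    # <div class="p-name ..."><a ...><em>product name</em> for the name
    prices = re.findall(r'<div class="p-price">.*?<i[^>]*>([\d.]+)</i>', html, re.S)
    names = re.findall(r'<div class="p-name[^"]*">\s*<a[^>]*>\s*<em>(.*?)</em>', html, re.S)
    # Strip any nested tags (e.g. keyword highlighting) left inside the names
    names = [re.sub(r'<[^>]+>', '', n).strip() for n in names]
    # zip pairs items positionally, so this only holds if both patterns
    # matched the same products in the same order
    return list(zip(prices, names))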
  • Results:


  • Reflections:

I learned how to design a crawler with the requests and re libraries and practiced crawling product information from JD.com. I originally tried Taobao, but my account was blocked after crawling too many times; even so, I picked up a lot of useful techniques.
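
Getting blocked is usually a request-frequency problem. Below is a minimal sketch of spacing requests out with a fixed delay and a reused Session; the 2-second delay and the helper name polite_get are illustrative choices of mine, not anything the sites document.

import time
import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def polite_get(url, delay=2.0, **kwargs):
    """Fetch a URL with the shared session, then pause before the next request."""
    resp = session.get(url, **kwargs)
    time.sleep(delay)  # assumed 2-second gap between consecutive requests
    return resp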

Assignment ③:

  • Requirements:

Crawl all JPEG and JPG image files from a given page (https://xcb.fzu.edu.cn/info/1071/4481.htm) or a page of your choice.
Output: save all JPEG and JPG files from the chosen page into a single folder.

  • Full code:

# -*- coding: utf-8 -*-
import requests
import re
import os
class GetImage(object):
    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        # Save images into an imgs/ folder next to this script
        self.dir_path = os.path.dirname(os.path.abspath(__file__))
        self.path = self.dir_path + '/imgs'
        # Create the output directory if it does not exist
        if not os.path.exists(self.path):
            os.makedirs(self.path)


    def download(self, url):
        try:
            res = requests.get(url, headers=self.headers)
            return res
        except Exception as e:
            print(url + ' download failed: ' + str(e))


    def parse(self, res):
        content = res.content.decode()
        # Match the src attribute of every <img> tag
        img_list = re.findall(r'<img.*?src="(.*?)"', content, re.S)
        # Keep only JPEG/JPG files, as the assignment requires
        img_list = [url for url in img_list if url.lower().endswith(('.jpg', '.jpeg'))]
        # The page uses site-relative paths, so prepend the site root
        img_list = ['https://xcb.fzu.edu.cn' + url for url in img_list]
        return img_list

    def save(self, res_img, file_name):
        if res_img:
            # Replace characters that are illegal in file names, then write the bytes
            file_name = file_name.replace('?', '_')
            with open(file_name, 'wb') as f:
                f.write(res_img.content)
            print(file_name + ' saved')

    def run(self):
        # Download the page
        res = self.download(self.url)
        # Parse out the image URLs
        url_list = self.parse(res)
        # Download and save each image
        for url in url_list:
            res_img = self.download(url)
            name = url.strip().split('/').pop()
            file_name = self.path + '/' + name
            self.save(res_img, file_name)

if __name__ == '__main__':
    url_list = ['https://xcb.fzu.edu.cn/info/1071/4481.htm']
    for url in url_list:
        crawler = GetImage(url)
        crawler.run()
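
The parse step above prepends the site root to every src value, which only works while all images on the page use site-relative paths. A minimal alternative sketch using urllib.parse.urljoin from the standard library (the example src values are made up for illustration):

from urllib.parse import urljoin

def to_absolute(page_url, src_list):
    # urljoin resolves relative paths against the page URL and leaves
    # already-absolute http/https URLs untouched
    return [urljoin(page_url, src) for src in src_list]

# Example with hypothetical src values:
# to_absolute('https://xcb.fzu.edu.cn/info/1071/4481.htm',
#             ['/__local/pic1.jpg', 'https://example.com/pic2.jpeg'])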
  • Results:

  • Reflections:

Through this exercise I learned how to crawl images from a web page into a folder on my computer, which deepened my understanding of web scraping.

posted @ 2023-09-22 14:12  Tao-_-tao