2023 Data Collection and Fusion Technology Practice — Assignment 2


Experiment 2.1

code
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

class WeatherDB:
    def openDB(self):
        # open (or create) the SQLite database and make sure the weathers table exists
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),"
                                "wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except Exception:
            # the table already exists: clear out the old rows instead
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()


    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            # let UnicodeDammit guess the encoding (utf-8 or gbk) and decode the page
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            # each <li> under ul.t.clearfix holds one day of the 7-day forecast
            lis = soup.select("ul[class='t clearfix'] li")

            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)
    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京"])
print("completed")

Run result screenshot: (image)
Reflection: This experiment reproduced the weather-forecast crawler, which could be completed by following the slides the instructor provided. It gave me a deeper understanding of the custom database class, and showed that organizing the crawler into functions and classes makes it much easier to call and modify.
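
As a small follow-up, here is a hedged usage sketch that reuses the classes above to crawl all four cities defined in cityCode and then read the stored rows back. The read-back step is my own addition; note that reopening via WeatherDB.openDB would wipe the table (its except branch deletes existing rows), so a plain sqlite3 connection is used instead.

import sqlite3

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])   # all four cities defined in cityCode

# Read the stored forecasts back with a plain connection; WeatherDB.openDB
# would clear the table, so it is not reused here.
con = sqlite3.connect("weathers.db")
for row in con.execute("select wCity, wDate, wWeather, wTemp from weathers"):
    print("%-8s%-16s%-24s%-12s" % row)
con.close()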

Experiment 2.2

code
import pandas as pd
import requests

def get_Html(page):
    # Eastmoney stock-list API (JSONP); pn is the page number, pz the page size
    url = ("https://9.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112405185584716637714_1696659395930&pn="+str(page)+
           "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,"
           "m:1+t:23,m:0+t:81+s:2048&fields=f12,f14,f2,f3,f4,f5,f6,f7,f15,f16,f17,f18&_=1696659396068")
    html = requests.get(url, timeout=30)
    html.raise_for_status()
    html.encoding = html.apparent_encoding
    return html.text

def get_messege(record):
    # pull the wanted fields (f12 = code, f14 = name, f2 = latest price, ...) out of one record dict
    a = []
    for i in [12, 14, 2, 3, 4, 5, 6, 7, 15, 16, 17, 18]:
        x = 'f' + str(i)
        a.append(record[x])
    return a

html = get_Html(1)

# the response is JSONP; keep only the contents of the "diff":[ ... ] record list
answer = html.split('[')[1].split(']')[0]
# print(answer)
answer = answer.split('}')
data = []
for i in answer:
    if i != ',' and i != ' ':
        data.append(i.strip(' ').strip(',') + '}')
    else:
        continue
data.pop(-1)  # drop the empty trailing fragment left over from the split
Info = []
for i in data:
    messege = eval(i)  # str -> dict
    Info.append(get_messege(messege))
columns = {1: "代码", 2: "名称", 3: "最新价格", 4: "涨跌额", 5: "涨跌幅", 6: "成交量",
           7: "成交额", 8: "振幅",9: "最高", 10: "最低",11: "今开", 12: "昨收"}
df = pd.DataFrame(Info, columns=columns.values())
df.to_excel("./infomations.xlsx", index=False)
print("已成功保存至当前文件夹中!")
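
As a design note: instead of slicing the string and running eval, the JSONP wrapper can be stripped and the body handed to the json module. A minimal sketch, assuming the captured response keeps the records under data.diff (which the bracket-slicing above also relies on):

import json

def parse_jsonp(text):
    # strip the jQuery...(...) callback wrapper and parse the remaining JSON body
    body = text[text.find('(') + 1:text.rfind(')')]
    payload = json.loads(body)
    # assumption: the stock records sit in data["diff"], as seen when capturing the request
    return payload["data"]["diff"]

records = parse_jsonp(get_Html(1))
Info = [get_messege(r) for r in records]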

Demo GIF: (image)

Run result screenshot: (image)
Reflection: In this experiment I used packet capture to find the URL the stock list is loaded from and analysed the values returned by the API. The request parameters can then be adjusted to match the data that is actually needed, which makes the crawl far more convenient and efficient!
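
For example, since pn is the page number and pz the page size in the request above, several pages can be fetched in one run. A hedged sketch building on the parse_jsonp helper from the previous note (the three-page limit and the output file name are arbitrary, just for illustration):

# Fetch the first three pages (20 stocks each) and stack them into one table.
all_rows = []
for page in range(1, 4):                      # pages 1..3, picked arbitrarily
    for record in parse_jsonp(get_Html(page)):
        all_rows.append(get_messege(record))

df_pages = pd.DataFrame(all_rows, columns=columns.values())
df_pages.to_excel("./infomations_pages.xlsx", index=False)   # example output file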

Experiment 2.3

code
import pandas as pd
import re
import requests

def get_Html():
    # payload.js is the static data file the ranking page loads; it embeds the 2021 ranking data
    url = 'https://www.shanghairanking.cn/_nuxt/static/1696822285/rankings/bcur/2021/payload.js'
    html = requests.get(url, timeout=30)
    html.raise_for_status()
    html.encoding = html.apparent_encoding
    return html.text

html = get_Html()

# regex out the Chinese university names and their total scores from payload.js
univNameCn = re.findall('univNameCn:"(.*?)"', html)
score = re.findall('score:(.*?),', html)
# print(univNameCn)

data = []
for i in range(16):            # only the first 16 entries
    row = []
    row.append(i + 1)          # rank
    row.append(univNameCn[i])  # university name
    row.append(score[i])       # total score
    data.append(row)

columns = {1: "排名", 2: "学校名称", 3: "学校总分"}
df = pd.DataFrame(data, columns=columns.values())
df.to_excel("./univRank.xlsx", index=False)
print("已成功保存至当前文件夹中!")

Demo GIF: (image)

Run result screenshot: (image)
Reflection: As in Experiment 2.2, I analysed the requests the site sends, identified the API (here the payload.js data file) that supplies the data, and extracted the required information from it. This was another good demonstration of the benefits of crawling data by capturing the underlying requests!
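
In the same spirit, a hedged extension: instead of hard-coding the first 16 rows, every extracted name can be paired with its score and exported in one go. This assumes the two regex result lists line up one-to-one in ranking order, which matches how the loop above indexes them.

# Export every (name, score) pair found in payload.js, not just the first 16.
rows = []
for rank, (name, sc) in enumerate(zip(univNameCn, score), start=1):
    rows.append([rank, name, sc])

df_all = pd.DataFrame(rows, columns=["排名", "学校名称", "学校总分"])
df_all.to_excel("./univRank_all.xlsx", index=False)   # example output file
print("共导出 %d 条记录" % len(rows))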
