2023 Data Collection and Fusion Technology Practice: Assignment 2

Assignment ①:

Code:

from bs4 import BeautifulSoup, UnicodeDammit  # type:ignore
import urllib.request
import sqlite3


class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("""create table weathers (wCity \
varchar(16),wDate rchar(16),wWeather varchar(64),\
wTemp varchar(32),constraint pk_weather \
primary key (wCity,wDate))""")
        except Exception:
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("""insert into weathers \
(wCity,wDate,wWeather,wTemp) values
(?,?,?,?)""", (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        try:
            self.cursor.execute("select * from weathers")
            rows = self.cursor.fetchall()
            print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
            for row in rows:
                print(
                    "%-16s%-16s%-32s%-16s" %
                    (row[0], row[1], row[2], row[3])
                )
        except Exception as err:
            print(err)


class WeatherForecast:
    def __init__(self):
        self.headers = {
                "User-Agent": """Mozilla/5.0 (X11; Linux x86_64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 \
Edg/112.0.1722.54"""
        }
        self.cityCode = {
            "北京": "101010100",
            "上海": "101020100",
            "广州": "101280101",
            "深圳": "101280601",
            "福州": "101230101",
            "厦门": "101230201",
            "莆田": "101230401",
            "三明": "101230801",
            "泉州": "101230501",
        }

    def forecastCity(self, city):
        if city not in self.cityCode:
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + \
            ".shtml"

        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            print("序号\t地区\t日期\t天气信息\t温度")
            num = 0
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + \
                        "/" + li.select('p[class="tem"] i')[0].text
                    num += 1
                    print(num, city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        self.db.closeDB()


ws = WeatherForecast()
ws.process(["福州", "厦门", "莆田", "三明", "泉州"])
print("完成")

Output:

Actually the code still has room for improvement: first obtain the city-name-to-code mapping from a JS file, then pass it into the scraping function. The figure shows city.js captured from the main page, which contains the mapping between the numeric codes and the city names.
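For reference, this is the shape that get_city_code below assumes for city.js; the exact nesting and any field names besides AREAID are an assumption inferred from the regex:

# Illustrative shape of city_data after json.loads (not the verbatim file):
# {"福建": {"福州": {"AREAID": "101230101", ...}, "厦门": {...}, ...}, ...}
# str(...) of this dict then contains "'福州': {'AREAID': '101230101'",
# which is exactly what the regex in get_city_code captures.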

So, after the improvement, a function can be added that obtains the correspondence between city names and IDs:

The function works as follows:
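A minimal usage sketch (the expected IDs come from the hard-coded mapping in the first version):

print(get_city_code("福州"))  # expected: 101230101
print(get_city_code("厦门"))  # expected: 101230201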

Moreover, in the original code, scraping today's weather raised an error: for the current day the page only shows the live temperature in the <i> tag, with no high/low range in a <span>, so indexing the span fails. The scraping logic was therefore changed to:

The final code is as follows:

from bs4 import BeautifulSoup, UnicodeDammit  # type:ignore
import urllib.request
import sqlite3
import re
import requests
import json


def get_city_code(city):
    js_url = 'https://j.i8tq.com/weather2020/search/city.js'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    req = requests.get(js_url, headers=headers)
    data = req.text
    # keep only the JSON object between "var city_data = " and the trailing ";"
    data = data.split('var city_data = ', 1)[1]
    data = data.split(';', 1)[0]
    # parse, then search the dict's string form for this city's AREAID
    data = str(json.loads(data))
    pattern = f"'{city}': {{'AREAID': '(.*?)'"
    res = re.findall(pattern, data)
    return res[0]

class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("""create table weathers (wCity \
varchar(16),wDate rchar(16),wWeather varchar(64),\
wTemp varchar(32),constraint pk_weather \
primary key (wCity,wDate))""")
        except Exception:
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("""insert into weathers \
(wCity,wDate,wWeather,wTemp) values
(?,?,?,?)""", (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        try:
            self.cursor.execute("select * from weathers")
            rows = self.cursor.fetchall()
            print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
            for row in rows:
                print(
                    "%-16s%-16s%-32s%-16s" %
                    (row[0], row[1], row[2], row[3])
                )
        except Exception as err:
            print(err)


class WeatherForecast:
    def __init__(self):
        self.headers = {
                "User-Agent": """Mozilla/5.0 (X11; Linux x86_64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 \
Edg/112.0.1722.54"""
        }

    def forecastCity(self, city):
        citycode = get_city_code(city)
        url = "http://www.weather.com.cn/weather/" + citycode + ".shtml"

        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            print("序号\t地区\t日期\t天气信息\t温度")
            n = 0
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    if n > 0:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    else:  # the current day only shows the live temperature, with no high/low range
                        temp = li.select('p[class="tem"] i')[0].text
                        n = n + 1
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        self.db.closeDB()


ws = WeatherForecast()
n = input("请输入城市个数:")
cities = []
for i in range(int(n)):
    city = input("请输入城市名:")
    cities.append(city)
ws.process(cities)
print("完成")

Reflections:

Through this exercise I learned how to use sqlite, and how changing the numeric code in the URL maps to a different city.
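For example, the mapping is visible directly in the page URLs (codes taken from the cityCode table above):

# http://www.weather.com.cn/weather/101010100.shtml  -> 北京
# http://www.weather.com.cn/weather/101230101.shtml  -> 福州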

Assignment ②:

Code:

import json
import sqlite3
import requests

iid = 1
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
con = sqlite3.connect('stock.db')
cur = con.cursor()
cur.execute('create table if not exists stock(id varchar(10),code varchar(10),name varchar(10),price varchar(10),change varchar(10),change_rate varchar(10),volume varchar(10),turnover varchar(10),amplitude varchar(10),high varchar(10),low varchar(10),open varchar(10),close varchar(10),primary key(id))')
print("id 代码 名称 最新价格 涨跌额 涨跌幅 成交量 成交额 振幅 最高 最低 今开 昨收")
for j in range(1, 11):
    url = f'http://41.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406279131482570521_1696658826817&pn={j}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696658826818'
    req = requests.get(url, headers=headers)
    # strip the JSONP callback wrapper "jQuery...(...);" to get the JSON payload
    json_data = req.text.split("(", 1)[1].rsplit(")", 1)[0]
    data_json = json.loads(json_data)
    for i in data_json['data']['diff']:
        try:
            # f12 code, f14 name, f2 price, f4 change, f3 change rate, f5 volume,
            # f6 turnover, f7 amplitude, f15 high, f16 low, f17 open, f18 previous close
            cur.execute('insert into stock values(?,?,?,?,?,?,?,?,?,?,?,?,?)', (iid, i['f12'], i['f14'], i['f2'], i['f4'], i['f3'], i['f5'], i['f6'], i['f7'], i['f15'], i['f16'], i['f17'], i['f18']))
            print(iid, i['f12'], i['f14'], i['f2'], i['f4'], i['f3'], i['f5'], i['f6'], i['f7'], i['f15'], i['f16'], i['f17'], i['f18'])
            iid = iid + 1
        except Exception as err:
            print(err)

con.commit()
con.close()

The output after scraping 10 pages:

The results are saved in the db:
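A quick sanity check of the saved rows (a minimal sketch; the table and columns follow the create statement above):

import sqlite3

con = sqlite3.connect('stock.db')
cur = con.cursor()
# print the first five stored rows to confirm the inserts landed
for row in cur.execute('select * from stock limit 5'):
    print(row)
con.close()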

Reflections

This time I learned how to fetch the JS file, extract the data it returns, and then mine it; my understanding of the GET and POST flow is now much deeper.
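The JSONP stripping this refers to boils down to the following (a sketch with a made-up payload):

raw = 'jQuery112406279131482570521_1696658826817({"data": {"diff": []}});'
# everything between the first "(" and the last ")" is the JSON body
payload = raw.split("(", 1)[1].rsplit(")", 1)[0]
print(payload)  # {"data": {"diff": []}}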

Assignment ③:

Code:

import re
import sqlite3
import requests

url = 'https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js'
# payload.js wraps the rankings in a function call; keep only the returned object literal
html = requests.get(url).text.split('return', 1)[1].split('}("",', 1)[0]
names = re.findall(r'univNameCn:"(.*?)"', html)
scores = re.findall(r'score:([\d.]+)', html)
iid = 1
con = sqlite3.connect('university.db')
cur = con.cursor()
cur.execute('create table if not exists university(id varchar(10),name varchar(10),score varchar(10),primary key(id))')
try:
    for i in range(len(names)):
        print(iid, names[i], scores[i])
        cur.execute('insert into university values(?,?,?)', (iid, names[i], scores[i]))
        iid += 1
except Exception as err:
    print(err)
con.commit()
con.close()

GIF of the scraping process:

Output:

The results are saved in university.db:
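As with the stock database, a minimal sketch to verify university.db:

import sqlite3

con = sqlite3.connect('university.db')
cur = con.cursor()
# row count plus a peek at the top of the ranking
print(cur.execute('select count(*) from university').fetchone()[0], 'rows')
for row in cur.execute('select * from university limit 3'):
    print(row)
con.close()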

Reflections

This assignment gave me a much deeper understanding of how to fetch the returned JS file and extract data from it; compared with the previous assignments, my grasp is noticeably firmer.
