2023 Data Collection and Fusion Technology: Assignment 2

Task 1

  • Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather site (http://www.weather.com.cn) and save it in a database.

  • Output: link to the Gitee folder

Code
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

class WeatherDB:
    # small wrapper around a local SQLite database that stores the forecasts
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except Exception:
            # the table already exists, so just clear out the old rows
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            # let UnicodeDammit guess the encoding (utf-8 or gbk) before parsing
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            # each <li> under ul.t.clearfix holds one day of the 7-day forecast
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")



Result:

(screenshot of the program output)

Reflections: this task was mainly about reproducing existing code, so the difficulty was manageable.
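
As a side note, the show() method defined in WeatherDB is never actually called. A minimal verification sketch (not part of the submitted code) for reading back what process() stored could query weathers.db directly; it goes through sqlite3 rather than WeatherDB, because openDB() clears the table whenever it already exists:

import sqlite3

# read back the rows that process() stored in weathers.db
con = sqlite3.connect("weathers.db")
print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
for row in con.execute("select wCity, wDate, wWeather, wTemp from weathers"):
    print("%-16s%-16s%-32s%-16s" % row)
con.close()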

Task 2

  • Requirement: use the requests and BeautifulSoup library methods to crawl stock-related information from a target site and store it in a database.

  • Output: link to the Gitee folder

Code
import requests
import re
import pandas as pd
import json

# fetch the page with a GET request and pull the "diff" list out of the JSONP response
def getHtml():
    headers = {'user-agent': 'Mozilla/5.0',
               'Cookie': 'qgqp_b_id=80e1f7e68c6aefe4294725f141d23ff9; st_si=11903563436628; st_asi=delete; HAList=ty-0-301558-N%u4E09%u6001; st_pvi=46750311593906; st_sp=2023-10-07%2014%3A07%3A47; st_inirUrl=https%3A%2F%2Fwww.eastmoney.com%2F; st_sn=7; st_psi=20231007141506909-113200301321-4240583129'
               }
    url = "http://31.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124021913227827931325_1696660512561&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696660512562"
    r = requests.get(url, headers=headers)
    pat = r'"diff":\[(.*?)\]'
    data = re.compile(pat, re.S).findall(r.text)
    return data

def main():
    data = getHtml()
    # the captured fragment is a comma-separated run of JSON objects;
    # eval turns it into a tuple of dicts, and list() makes it a list
    data = list(eval(data[0]))

    result = [f"{d['f14']}  {d['f2']} {d['f4']} {d['f5']} {d['f7']} {d['f15']} {d['f16']} {d['f17']} {d['f18']}" for d in data]
    # also keep a DataFrame copy with named columns
    columns = ["名称", "报价", "涨跌幅", "成交量", "振幅", "最高", "最低", "今开", "昨收"]
    df = pd.DataFrame([[d['f14'], d['f2'], d['f4'], d['f5'], d['f7'], d['f15'], d['f16'], d['f17'], d['f18']] for d in data],
                      columns=columns)

    print("序号 名称 报价 涨跌幅 成交量 振幅 最高 最低 今开 昨收")
    for cnt, line in enumerate(result):
        print(str(cnt) + "   " + line)
#102102103liyishui
main()
Result:

(screenshot of the program output)

Reflections: I ran into quite a few problems. 1. The tutorial is fairly old, so the URL in its code no longer works; you have to capture packets yourself, and finding the right request took quite a while. 2. The way the data comes back has also changed, so the tutorial's regular expression no longer works and has to be replaced with '"diff":\[(.*?)\]'. 3. After getting the data (in JSON form) I did not know how to extract it at first; it turns out wrapping it in a list is enough:
data=list(eval(data[0]))
result = [f"{d['f14']} {d['f2']} {d['f4']} {d['f5']} {d['f7']} {d['f15']} {d['f16']} {d['f17']} {d['f18']}" for d in data]
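
Following up on points 2 and 3: since this is a JSONP-style interface, the eval step can be avoided by treating the extracted fragment as JSON (dropping the cb= callback parameter may even make the response plain JSON). The sketch below is only an illustration under assumptions, not the code I submitted: it reuses the url and headers from the script above, assumes the same fXX fields, and adds the database storage the task asks for, with a table name (stocks) and column layout of my own choosing.

import re
import json
import sqlite3
import requests

def get_stock_list(url, headers):
    # fetch the JSONP response and parse the "diff" list with json instead of eval
    text = requests.get(url, headers=headers).text
    fragment = re.search(r'"diff":\[(.*?)\]', text, re.S).group(1)
    return json.loads("[" + fragment + "]")  # a list of dicts such as {'f14': ..., 'f2': ...}

def save_stocks(stocks, db_path="stocks.db"):
    # store the scraped rows in SQLite; the table and column names are my own choice
    con = sqlite3.connect(db_path)
    con.execute("create table if not exists stocks "
                "(名称 text, 报价 real, 涨跌幅 real, 成交量 real, 振幅 real, "
                "最高 real, 最低 real, 今开 real, 昨收 real)")
    rows = [(d['f14'], d['f2'], d['f4'], d['f5'], d['f7'],
             d['f15'], d['f16'], d['f17'], d['f18']) for d in stocks]
    con.executemany("insert into stocks values (?,?,?,?,?,?,?,?,?)", rows)
    con.commit()
    con.close()

With the url and headers from getHtml() above, this would be used as save_stocks(get_stock_list(url, headers)).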

Task 3

Analysis:

(screenshot of the page inspection)

All of the university data turns out to live in payload.js, so the idea is to pull that text out. Code 1:

Code 1: fetching the text
import requests
import js2py

url = r'http://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js'
r = requests.get(url, timeout=20)
if r.status_code == 200:
    r.encoding = 'utf-8'
    content = r.text
# strip the __NUXT_JSONP__("/rankings/bcur/2021", ...) wrapper and the trailing ");"
json_html = content[len('__NUXT_JSONP__("/rankings/bcur/2021", '):-2]
js = js2py.EvalJs()
# console.log prints the evaluated JS object to stdout; execute() itself returns None
data = js.execute("console.log(" + json_html + ")")
#102102103liyishui
print(data)


Save that output as a txt file and run regular-expression matching over it. Code 2:

Code 2: extracting the fields from the text with regular expressions
import re

from prettytable import PrettyTable

with open("data.txt", "r", encoding="utf-8") as f:
    text = f.read()
    List = []
    for i in range(600):
        List.append([])  # one slot per university; fields are appended one by one below


def liyishui_find(s):
    # find every occurrence of key s and pull out the quoted value that follows it
    positions = [m.start() for m in re.finditer(s, text)]
    cnt = 0
    for pos in positions:
        d = text[pos + len(s):pos + len(s) + 30]
        m = re.search("': '(.*?)',", d)
        if m:
            cnt += 1
            List[cnt].append(m.group(1))
    return cnt


my_str = ["univNameCn", "province", "univCategory"]
for i in range(len(my_str)):
    print(my_str[i] + "count=" + str(liyishui_find(my_str[i])))

# ===== score: the values are numbers, so grab the text between "': " and the next comma
positions = [m.start() for m in re.finditer("score", text)]
cnt = 0
for pos in positions:
    d = text[pos + len("score"):pos + len("score") + 9]
    start = d.find("': ")
    end = d.find(",")
    score = d[start + 3:end]
    cnt += 1
    List[cnt].append(score)
print("score_count= " + str(cnt))

table = PrettyTable(['排名','学校名称_102102103liyishui','省市','学校类型','总分'])
cnt=0
for school in List:
    if len(school):
        cnt+=1
        table.add_row([cnt,school[0], school[1], school[2], school[3]])
print(table)

Result:

(screenshots of the program output)

Reflections: this site looks simple but is actually quite challenging. I had a bad feeling as soon as I noticed that the URL does not change when you flip pages (which means the traditional pagination approach is out). Inspecting with F12 shows that the site's data is deployed dynamically by its accompanying JavaScript: every element lives in payload.js, and the accompanying script extracts, executes, and renders it.

So the idea is to work on payload.js directly. Once the payload.js file has been crawled, it cannot simply be parsed as JSON, because the file is mixed with JavaScript code. Moreover, the keys in those key-value pairs are not wrapped in quotation marks, so Python treats them as variables. Neither the json library nor eval can parse it directly; both raise errors. The solution the teacher suggested for this is my Code 1: use a third-party library to interpret the JavaScript and print the result.
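
To make the problem concrete, here is a tiny hedged illustration (the object literal is invented for demonstration, not taken from the real payload): json.loads rejects unquoted keys, while js2py evaluates the fragment as real JavaScript.

import json
import js2py

fragment = '{univNameCn: "清华大学", score: a}'  # invented sample: unquoted keys, a JS variable as a value

try:
    json.loads(fragment)
except json.JSONDecodeError as err:
    print("json fails:", err)  # JSON requires keys to be double-quoted strings

ctx = js2py.EvalJs()
ctx.execute('var a = 100; var obj = ' + fragment + ';')
print(ctx.obj.univNameCn, ctx.obj.score)  # js2py resolves the unquoted keys and the variable a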

The biggest problem after obtaining data was not knowing what to do with it: data is of type NoneType, so it cannot be handled as a string, and I searched everywhere for a suitable API in this library without finding one. In the end it occurred to me that I could simply copy the printed output, save it locally as a txt file, and process that instead. After that it was all regular expressions: my approach is to locate every occurrence of the three strings ["univNameCn", "province", "univCategory"], take the 30 or so characters that follow each one as a substring, and run a regex match on that. The scores always came up a few matches short, so I brute-forced it: find every occurrence of score and grab the substring after it that starts with ':' and ends with ','.
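
In hindsight, an alternative to copying the console output by hand might be to have js2py hand the evaluated object back instead of printing it: execute() returns None, but eval_js() returns a wrapper object that, as far as I can tell, can be converted to Python structures with to_dict(). The sketch below is only a guess at how that would look; in particular, the nesting path ('data', index 0, 'univData') is an assumption about the payload layout and would need to be checked against the real file.

import requests
import js2py

url = 'http://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js'
content = requests.get(url, timeout=20).text

# strip the __NUXT_JSONP__("/rankings/bcur/2021", ...) wrapper, as in Code 1
js_expr = content[len('__NUXT_JSONP__("/rankings/bcur/2021", '):-2]

obj = js2py.eval_js('(' + js_expr + ')')  # evaluate the expression and return the JS object
payload = obj.to_dict()                   # wrapper object -> nested Python dicts/lists

# 'data'/[0]/'univData' is an assumed path; adjust it to the actual structure
for u in payload['data'][0]['univData']:
    print(u['univNameCn'], u['province'], u['univCategory'], u['score'])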

Finally, the data just needs to be assembled and printed.

Summary

Packed with useful takeaways~ My understanding of regular expressions has deepened, and I also picked up a new crawling idea: packet capture!

Gitee link: My_Gitee
