2023 Data Collection and Fusion Technology: Assignment 2
Task 1
- Requirement: Scrape the 7-day weather forecast for a given set of cities from the China Weather site (http://www.weather.com.cn) and save it to a database.
- Output: Gitee folder link
Code
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except Exception:
            # Table already exists: clear out the old rows instead
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            # Detect the page encoding (utf-8 or gbk) before parsing
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            # Each <li> under ul.t.clearfix holds one day of the 7-day forecast
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")
Reflections: This task was mainly a matter of reproducing sample code, so the difficulty was manageable.
Task 2
- Requirement: Use the requests and BeautifulSoup libraries to scrape stock information from a target site and store it in a database.
- Output: Gitee folder link
Code
import requests
import re
import pandas as pd
import json

# Fetch the quote list from the Eastmoney API (found by capturing the request in F12)
# and pull the contents of the "diff" array out of the JSONP response.
def getHtml():
    headers = {'user-agent': 'Mozilla/5.0',
               'Cookie': 'qgqp_b_id=80e1f7e68c6aefe4294725f141d23ff9; st_si=11903563436628; st_asi=delete; HAList=ty-0-301558-N%u4E09%u6001; st_pvi=46750311593906; st_sp=2023-10-07%2014%3A07%3A47; st_inirUrl=https%3A%2F%2Fwww.eastmoney.com%2F; st_sn=7; st_psi=20231007141506909-113200301321-4240583129'
               }
    url = "http://31.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124021913227827931325_1696660512561&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696660512562"
    r = requests.get(url, headers=headers)
    pat = r'"diff":\[(.*?)\]'
    data = re.compile(pat, re.S).findall(r.text)
    return data

def main():
    data = getHtml()
    # The captured string is a comma-separated list of JS object literals
    data = list(eval(data[0]))
    result = [f"{d['f14']} {d['f2']} {d['f4']} {d['f5']} {d['f7']} {d['f15']} {d['f16']} {d['f17']} {d['f18']}" for d in data]
    df = pd.DataFrame(result)
    columns = {1: "名称", 2: "报价", 3: "涨跌幅", 4: "成交量", 5: "振幅", 6: "最高", 7: "最低", 8: "今开", 9: "昨收"}
    print("序号 名称 报价 涨跌幅 成交量 振幅 最高 最低 今开 昨收")
    df.rename(columns=columns, inplace=True)
    cnt = 0
    for i in range(len(result)):
        print(str(cnt) + " " + result[i])
        cnt += 1
    print(result)

# 102102103liyishui
main()
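The requirement also asks for the data to be stored in a database, which the script above does not do yet. Below is a minimal sketch of how the parsed quote dicts could be written to SQLite; the table name stocks and the column names are my own choice and not part of the original code.

import sqlite3

# Hypothetical storage step: write the parsed quote dicts into SQLite.
# Field meanings follow the mapping printed above (f14 name, f2 price, f4 change,
# f5 volume, f7 amplitude, f15 high, f16 low, f17 open, f18 previous close).
def save_to_db(records):
    con = sqlite3.connect("stocks.db")
    cur = con.cursor()
    cur.execute("create table if not exists stocks "
                "(name text, price text, change text, volume text, amplitude text, "
                "high text, low text, open text, prev_close text)")
    for d in records:
        cur.execute("insert into stocks values (?,?,?,?,?,?,?,?,?)",
                    (d['f14'], d['f2'], d['f4'], d['f5'], d['f7'],
                     d['f15'], d['f16'], d['f17'], d['f18']))
    con.commit()
    con.close()

# Usage inside main(), right after data = list(eval(data[0])):
# save_to_db(data)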
Reflections: I ran into quite a few problems. 1. The tutorial is fairly old, so the URL in its code no longer works; I had to capture the request myself, and finding the right one took a long time. 2. The way the data is delivered has also changed, so the tutorial's regular expression no longer matches and has to be replaced with '"diff":\[(.*?)\]'. 3. After getting the data (JSON), I did not know at first how to extract it; it turns out converting it with list() and a comprehension is enough:
data=list(eval(data[0]))
result = [f"{d['f14']} {d['f2']} {d['f4']} {d['f5']} {d['f7']} {d['f15']} {d['f16']} {d['f17']} {d['f18']}" for d in data]
Task 3
- Requirement: Scrape the information of every university on the 2021 main Chinese university ranking (https://www.shanghairanking.cn/rankings/bcur/2021), store it in a database, and record the browser F12 debugging/analysis process as a GIF for the blog.
- Output: Gitee folder link
Analysis:
All of the university data turns out to live in payload.js, so the first step is to pull that text out; see Code 1.
Code 1: fetching the text
import requests
import js2py

url = r'http://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js'
r = requests.get(url, timeout=20)
if r.status_code == 200:
    r.encoding = 'utf-8'
    content = r.text
    # Strip the __NUXT_JSONP__("/rankings/bcur/2021", ...) wrapper so that only
    # the inner JavaScript expression is left
    json_html = content[len('__NUXT_JSONP__("/rankings/bcur/2021", '):-2]
    js = js2py.EvalJs()
    # console.log prints the evaluated object to stdout; execute() itself returns None
    data = js.execute("console.log(" + json_html + ")")
    # 102102103liyishui
    print(data)
Save the printed output as a txt file and match it with regular expressions; see Code 2.
Code 2: extracting fields from the text with regular expressions
import re
from prettytable import PrettyTable

with open("data.txt", "r", encoding="utf-8") as f:
    text = f.read()

# List[i] collects the i-th school's fields in order: name, province, category, score
List = []
for i in range(600):
    List.append([])

# Find every occurrence of the key s and pull out the quoted value that follows it
def liyishui_find(s):
    positions = [m.start() for m in re.finditer(s, text)]
    cnt = 0
    for pos in positions:
        d = text[pos + len(s):pos + len(s) + 30]
        m = re.search("': '(.*?)',", d)
        if m:
            cnt += 1
            List[cnt].append(m.group(1))
    return cnt

my_str = ["univNameCn", "province", "univCategory"]
for i in range(len(my_str)):
    print(my_str[i] + " count=" + str(liyishui_find(my_str[i])))

# ===== score: numeric values, so they need a separate, cruder extraction
positions = [m.start() for m in re.finditer("score", text)]
cnt = 0
for pos in positions:
    d = text[pos + len("score"):pos + len("score") + 9]
    start = d.find("': ")
    end = d.find(",")
    score = d[start + 2:end]
    cnt += 1
    List[cnt].append(score)
print("score_count= " + str(cnt))

table = PrettyTable(['排名', '学校名称_102102103liyishui', '省市', '学校类型', '总分'])
cnt = 0
for school in List:
    if len(school):
        cnt += 1
        table.add_row([cnt, school[0], school[1], school[2], school[3]])
print(table)
Result:
Reflections: This site looks simple but is actually quite tricky to scrape. I got a bad feeling the moment I noticed that the URL does not change when flipping pages (which rules out the traditional pagination approach). Inspecting with F12 showed that the site's data is deployed dynamically by its bundled JavaScript: everything lives in payload.js, which an accompanying script fetches, executes and renders.
So the plan is to work on payload.js directly. Once the payload.js file has been downloaded, it cannot be parsed as JSON, because JavaScript code is mixed into it. On top of that, the keys in its key-value pairs are not quoted, so Python treats them as variable names. Neither the json library nor eval can parse it directly; both raise errors. The teacher's suggested fix for this is my Code 1: use a third-party library (js2py) to interpret the JavaScript and print the result.
The biggest problem after obtaining data was not knowing how to process it: its type is NoneType (js.execute() does not return the logged object), so it cannot be handled as a string, and I searched everywhere online for a suitable API of this library without luck. In the end I hit on the idea of simply copying the printed output and saving it locally as a txt file. From there it was all regular expressions: my approach is to locate every occurrence of the three strings "univNameCn", "province" and "univCategory", take the short stretch of text right after each occurrence, and run a regex match on that substring. The score pattern kept coming up a few matches short, so I fell back to brute force: for every occurrence of score, grab the substring that starts with ':' and ends with ','.
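For reference, an alternative I did not end up using (a sketch only, assuming the file keeps its standard __NUXT_JSONP__(path, data) wrapper): define __NUXT_JSONP__ inside the js2py context so it hands the data object back, serialize it there with JSON.stringify, and parse the string in Python, which would sidestep the NoneType problem entirely.

import requests
import js2py
import json

url = 'http://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js'
content = requests.get(url, timeout=20).text

ctx = js2py.EvalJs()
# Provide our own __NUXT_JSONP__ that simply stores the data argument,
# then run the downloaded payload.js inside the same context.
ctx.execute("var captured = null;"
            "function __NUXT_JSONP__(path, data){ captured = data; }")
ctx.execute(content)
ctx.execute("var out = JSON.stringify(captured);")
payload = json.loads(ctx.out)  # plain Python dict now
# The rows containing univNameCn / province / univCategory / score sit somewhere
# under payload["data"]; the exact nesting is left to inspection.
print(list(payload.keys()))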
Finally, all that remained was to combine the extracted fields and print them.
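As with Task 2, the requirement asks for database storage as well. A minimal sketch of writing the collected rows to SQLite, assuming the List built in Code 2, where each non-empty entry is [name, province, category, score]:

import sqlite3

# Hypothetical storage step for the Code 2 results.
def save_rankings(rows):
    con = sqlite3.connect("rankings.db")
    cur = con.cursor()
    cur.execute("create table if not exists rankings "
                "(seq integer, name text, province text, category text, score text)")
    seq = 0
    for school in rows:
        if len(school) >= 4:
            seq += 1
            cur.execute("insert into rankings values (?,?,?,?,?)",
                        (seq, school[0], school[1], school[2], school[3]))
    con.commit()
    con.close()

# Usage at the end of Code 2:
# save_rankings(List)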
Summary
Packed with useful takeaways: my understanding of regular expressions has deepened, and I picked up a new angle for scraping along the way: request capture!