Data Collection and Fusion — Second Personal Assignment
Problem 1
- Approach:
  - Step 1: read and understand the page source.
  - Step 2: pull out the data we need with CSS selectors.
  - Step 3: store the data in a database.
- Results:
- Code:
from bs4 import BeautifulSoup
import urllib.request
import sqlite3


class WeatherDB():
    '''
    openDB : create a database
    closeDB : close database
    insert : insert record into database
    show : present information in database
    '''
    def openDB(self):
        self.connect = sqlite3.connect("weather.db")
        self.cursor = self.connect.cursor()
        try:
            self.connect.execute("create table weathers (wCity VARCHAR(16),wDate VARCHAR(16),wWeather VARCHAR(64),wTemp VARCHAR(32),CONSTRAINT ds_weather PRIMARY KEY (wCity,wDate))")
        except Exception:
            # table already exists: clear it so each run starts fresh
            self.cursor.execute("DELETE FROM weathers")

    def closeDB(self):
        self.connect.commit()
        self.connect.close()

    def insert(self, city, date, weather, temp):
        try:
            self.connect.execute("insert into weathers (wCity,wDate,wWeather,wTemp) VALUES(?,?,?,?)", (city, date, weather, temp))
        except Exception as e:
            print(e)

    def show(self):
        self.cursor.execute("select * from weathers")
        records = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for record in records:
            print("%-16s%-16s%-32s%-16s" % (record[0], record[1], record[2], record[3]))


class WeatherForecast():
    '''
    forecastCity : forecast weekly weather of given cities
    process : action
    '''
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.63"}
        self.citycode = {"东山": "101230608", "厦门": "101230201", "福州": "101230101"}

    def forecastCity(self, city):
        if city not in self.citycode.keys():
            print(city + " code does not exist")
            return
        url = "http://www.weather.com.cn/weather/" + self.citycode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            data = data.decode()
            soup = BeautifulSoup(data, "html.parser")
            li_list = soup.select("ul[class='t clearfix'] li")
            print("城市 日期 天气 温度")
            # select the weather fields we need from each <li>
            for li in li_list:
                try:
                    date = li.select("h1")[0].text
                    weather = li.select("p[class='wea']")[0].text
                    temp = li.select("p[class='tem'] span")[0].text + "/" + li.select("p[class='tem'] i")[0].text
                    print(city + " " + date + " " + weather + " " + temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as ee:
                    print(ee)
        except Exception as e:
            print(e)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["东山", "厦门", "福州"])
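To double-check what actually landed in the database, here is a minimal sketch that just reads the table back (assuming the script above has already been run in the same directory):

```python
import sqlite3

# Assumes weather.db and the weathers table were created by the script above.
con = sqlite3.connect("weather.db")
for row in con.execute("select wCity, wDate, wWeather, wTemp from weathers"):
    print(row)
con.close()
```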
- Reflections

The main gain from this assignment was using CSS selectors to pick the data we need out of a web page and then store it in a database; at the very least I now understand how this kind of scraping skeleton is written~
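To make that takeaway concrete, here is a tiny self-contained illustration of the same kind of select calls; the HTML fragment is made up for the example and is not taken from weather.com.cn:

```python
from bs4 import BeautifulSoup

# A made-up fragment shaped like the forecast list parsed above.
html = """
<ul class="t clearfix">
  <li><h1>29日</h1><p class="wea">多云</p><p class="tem"><span>25</span>/<i>18℃</i></p></li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")
for li in soup.select("ul[class='t clearfix'] li"):
    # same pattern as above: date from <h1>, weather from p.wea, temperature from p.tem
    date = li.select("h1")[0].text
    weather = li.select("p[class='wea']")[0].text
    temp = li.select("p[class='tem'] span")[0].text + "/" + li.select("p[class='tem'] i")[0].text
    print(date, weather, temp)
```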
Problem 2
- Approach:

My first reaction was to scrape the data with bs, but it turns out the data is actually loaded dynamically with JS, so I quickly opened the Network panel, found the JS request behind the information I needed, and discovered that this fellow holds exactly the data we want.
(Do enough scraping and you'll notice the data we want usually sits inside a jQuery callback, hehe~)

Now that we know which JS request it is, we fetch it with requests. After decoding we get the JSON content inside the JS, but it arrives as a string; once it is turned into a dict we can conveniently pull out the relevant fields~

Paging is also very simple: click the next-page button and check in the Network panel what changes in the request URL from one page to the next. It turns out only the pn parameter changes, so modifying pn is all it takes to turn pages.

Since we scraped quite a lot of data, when I'm not storing it in a database I like to save it in the fairly universal CSV format, which is easy to open and inspect at a glance. The code and results follow (4 pages, 80 records in total, so not all of them are shown)~
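Before the full script, a quick sketch of this extract-and-parse step. Instead of splitting the string on "}," and calling eval, the jQuery callback wrapper can be stripped and the payload handed to json.loads. The URL reuses the clist request from the script below with a trimmed field list, so treat the response layout (data/diff) as an assumption about the API rather than a guaranteed interface:

```python
import json
import re
import requests

headers = {"User-Agent": "Mozilla/5.0"}
# Same clist endpoint as in the script below, shortened field list; cb names the jQuery callback.
url = ("http://31.push2.eastmoney.com/api/qt/clist/get?cb=jQuery_cb&pn=1&pz=20&po=1&np=1"
       "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3"
       "&fs=m:128+t:3,m:128+t:4,m:128+t:1,m:128+t:2&fields=f2,f3,f12,f14")

text = requests.get(url, headers=headers).text
# The body looks like jQuery_cb({...}); keep only the JSON between the parentheses.
payload = re.search(r"\((\{.*\})\)", text, re.S).group(1)
data = json.loads(payload)
for item in data["data"]["diff"]:   # each item is already a dict, no eval needed
    print(item["f12"], item["f14"], item["f2"], item["f3"])
```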
- Code:
import requests
import re
import pandas as pd

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.63"}

def get_Info_And_Save(url, offset):
    all_data = pd.DataFrame(columns=["序号", "代码", "名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额", "涨幅"])
    try:
        html = requests.get(url, headers=headers).content.decode(encoding="utf-8")
        res = re.findall(re.compile(r"\[(.+)\]"), html)[0]
        res = res.split("},")  # splitting on "}," strips the closing brace from every item but the last
        for idx, info in enumerate(res):
            if idx != len(res) - 1:
                info = info + "}"  # restore the brace to make a complete dict literal
            info = eval(info)  # turn the string into a dict
            id = idx + 1 + offset
            code = info["f12"]
            name = info["f14"]
            newest_price = info["f2"]
            up_down_extent = info["f3"]
            up_down_value = info["f4"]
            deal_volume = info["f5"]
            deal_value = info["f6"]
            freq = info["f7"]
            data = pd.DataFrame([id, code, name, newest_price, up_down_extent, up_down_value, deal_volume, deal_value, freq])
            new_data = pd.DataFrame(data.values.T, columns=None)
            new_data.columns = ["序号", "代码", "名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额", "涨幅"]
            all_data = pd.concat([all_data, new_data], ignore_index=True)
        return all_data
    except Exception as e:
        print(e)

def go_To_NextPage():
    stock_data = pd.DataFrame(columns=["序号", "代码", "名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额", "涨幅"])
    for idx in range(1, 5):
        # comparing pages shows that only the "pn" parameter of the request URL changes
        url = "http://31.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240033772650735816256_1601427948453&pn={}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:128+t:3,m:128+t:4,m:128+t:1,m:128+t:2&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f23,f24,f25,f26,f22,f33,f11,f62,f128,f136,f115,f152&_=1601427948454".format(idx)
        info = get_Info_And_Save(url, 20 * (idx - 1))
        stock_data = pd.concat([stock_data, info], ignore_index=True)
    stock_data.to_csv("stock_info.csv", index=False, encoding="utf-8-sig")

go_To_NextPage()
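A quick way to sanity-check the output is to read the CSV back with pandas (assuming go_To_NextPage() above has already produced stock_info.csv):

```python
import pandas as pd

df = pd.read_csv("stock_info.csv")
print(df.shape)   # 4 pages of 20 records and 9 columns -> expected (80, 9)
print(df.head())
```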
- Reflections

Problems 2 and 3 are essentially the same: when a page loads its data dynamically via JS, you have to parse the corresponding JS response to get at the data.
Problem 3
- Approach:

  With the approach from Problem 2 in hand, this one falls into place quickly as well: again we need to find the JS request that carries the specific information. There is one extra step, though: the codes must end with the last three digits of my student ID (with any three leading digits), so we first use the search page to collect all stock codes ending with those three digits, and then reuse the earlier approach with a bit of trimming and patching.

- Find the JSON we need on the search page and extract the corresponding stock codes.

- Find the corresponding JS request, feed in the stock codes we obtained, and scrape away.

- Results

- Code
import re
import requests

# search_url = "http://so.eastmoney.com/web/s?keyword=129"
url = "http://push2.eastmoney.com/api/qt/ulist.np/get?pn=1&pz=50&po=1&np=1&ut=fa5fd1943c7b386f172d6893dbfba10b&fltt=2&invt=2&fid=f62&secids=1.600129,1.603129,0.002129&fields=f12,f4,f5,f13,f186,f148,f187,f217,f14,f2,f3,f62,f184,f66,f69,f72,f75,f78,f81,f84,f87,f204,f205,f124&rt=52534396&cb=jQuery1124043095589175823057_1601447227478&_=1601447227499"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.63"}

def get_Stock_Code(url):
    # get the stock codes ending with 129
    code_list = []
    html = requests.get(url, headers=headers).content.decode(encoding="utf-8")
    res = re.findall(re.compile(r'"diff":\[(.+)\]'), html)[0].split("},")
    for idx, info in enumerate(res):
        if idx != len(res) - 1:
            info = info + "}"  # restore the closing brace lost in the split
        info = eval(info)
        code = info["f12"]
        code_list.append(code)
    return code_list

def get_Info(code):
    print("代码 名称 今开 最高 涨停 换手(%) 成交额 ")
    url = "http://push2.eastmoney.com/api/qt/stock/get?ut=fa5fd1943c7b386f172d6893dbfba10b&invt=2&fltt=2&fields=f43,f57,f58,f169,f170,f46,f44,f51,f168,f47,f164,f163,f116,f60,f45,f52,f50,f48,f167,f117,f71,f161,f49,f530,f135,f136,f137,f138,f139,f141,f142,f144,f145,f147,f148,f140,f143,f146,f149,f55,f62,f162,f92,f173,f104,f105,f84,f85,f183,f184,f185,f186,f187,f188,f189,f190,f191,f192,f107,f111,f86,f177,f78,f110,f262,f263,f264,f267,f268,f250,f251,f252,f253,f254,f255,f256,f257,f258,f266,f269,f270,f271,f273,f274,f275,f127,f199,f128,f193,f196,f194,f195,f197,f80,f280,f281,f282,f284,f285,f286,f287,f292&secid=1." + str(code) + "&cb=jQuery11240959380450062036_1601458369843&_=1601458369844"
    html = requests.get(url, headers=headers).content.decode(encoding="utf-8")
    res = re.findall(re.compile(r'"data":\{(.*?)\}\}\)'), html)[0]
    res = "{" + res + "}"  # construct a complete dict literal
    res = eval(res)
    code = res["f57"]
    name = res["f58"]
    opening = res["f44"]
    highest = res["f46"]
    up_to_stop = res["f51"]
    change = res["f168"]
    volume = res["f47"]
    print(str(code) + " " + str(name) + " " + str(opening) + " " + str(highest) + " " + str(up_to_stop) + " " + str(change) + " " + str(volume))

if __name__ == "__main__":
    code_list = get_Stock_Code(url)[:2]
    for code in code_list:
        get_Info(code)
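One fragile point worth flagging: get_Info hard-codes the "secid=1." (Shanghai) prefix, while the search URL above already lists 002129 under "0." (Shenzhen). A small hypothetical helper, sketched only from the secids visible in that URL, could pick the prefix from the code itself:

```python
def to_secid(code):
    # Hypothetical helper, not part of the original script: codes starting with 6
    # appear under market "1." in the URL above, the others under "0.".
    code = str(code)
    market = "1" if code.startswith("6") else "0"
    return market + "." + code

print(to_secid("600129"))  # -> 1.600129
print(to_secid("002129"))  # -> 0.002129
```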
Summary

This assignment was not very hard. It mainly covered scraping data from a page and storing it in a database, and, when bs4 cannot get the data out of the HTML source (that is, when the page data is loaded dynamically via JS), finding the required data in the JS response, turning it into JSON, and extracting the relevant fields. The open question is how stable this kind of code is: once the site is redesigned, the code will very likely break. That is especially true for Problem 3, which may well fail to run with other stock codes.