第二次作业
作业①
1)WeatherForecast
要求:在中国气象网(http://www.weather.com.cn)爬取给定城市集的7日天气预报,并保存在数据库中。
代码:
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    """Small wrapper around the SQLite table that stores scraped forecasts.

    The ``weathers`` table has four text columns (``wCity``, ``wDate``,
    ``wWeather``, ``wTemp``) and a composite primary key on
    ``(wCity, wDate)``.
    """

    def openDB(self, path='weathers.db'):
        """Open the database and leave an empty ``weathers`` table ready for use.

        Args:
            path: SQLite database to open. Defaults to ``weathers.db`` in the
                current working directory; ``':memory:'`` also works.
        """
        self.con = sqlite3.connect(path)
        self.cursor = self.con.cursor()
        # Create the table on the first run only; a failure here is a real
        # error and is no longer mistaken for "table already exists".
        self.cursor.execute(
            'create table if not exists weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key(wCity,wDate))')
        # Every run starts from an empty table, as before.
        self.cursor.execute('delete from weathers')

    def closeDB(self):
        """Commit pending changes and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Store one forecast row.

        Database errors (for example a duplicate ``(city, date)`` key) are
        printed instead of raised so that one bad row does not stop a run.
        """
        try:
            self.cursor.execute('insert into weathers (wCity,wDate,wWeather,wTemp)values(?,?,?,?)',
                                (city, date, weather, temp))
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every stored row as a left-aligned, fixed-width table."""
        self.cursor.execute('select * from weathers')
        rows = self.cursor.fetchall()
        print('%-16s%-16s%-32s%-16s' % ('city', 'date', 'weather', 'temp'))
        for row in rows:
            print('%-16s%-16s%-32s%-16s' % (row[0], row[1], row[2], row[3]))
class WeatherForecast:
    """Scrape the forecast list of known cities from weather.com.cn into WeatherDB."""

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)Gecko/2019100821 Minefield/3.0.2pre'}
        # City name -> city code used to build the forecast page URL.
        self.cityCode = {'北京': '101010100', '上海': '101020100', '广州': '101280101', '深圳': '101280601'}

    def forecastCity(self, city):
        """Download, print and store the forecast entries of one city.

        Unknown cities are reported and skipped. Download and parse errors
        are printed rather than raised, so the remaining cities are still
        processed. Requires ``self.db`` to be an opened WeatherDB (set up by
        ``process``).
        """
        if city not in self.cityCode:
            print(city + 'code cannot be found')
            return
        url = 'http://www.weather.com.cn/weather/' + self.cityCode[city] + '.shtml'
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # The context manager closes the connection; the timeout keeps a
            # stalled server from hanging the whole run.
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = resp.read()
            # Both encodings are candidates for decoding the page. The third
            # positional argument of UnicodeDammit is smart_quotes_to, not an
            # encoding, so 'gbk' belongs inside the list.
            dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
            soup = BeautifulSoup(dammit.unicode_markup, 'lxml')
            # One <li> per forecast day.
            for li in soup.select("ul[class='t clearfix'] li"):
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    high = li.select('p[class="tem"] span')
                    low = li.select('p[class="tem"] i')[0].text
                    # NOTE(review): the <span> with the high temperature can
                    # be absent (presumably for the current day once the
                    # daytime period is over - confirm); keep the entry with
                    # the low temperature only instead of dropping the day.
                    temp = high[0].text + '/' + low if high else low
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # A malformed entry must not stop the remaining days.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Scrape every city in ``cities``, print the stored table, then close the DB."""
        self.db = WeatherDB()
        self.db.openDB()
        try:
            for city in cities:
                self.forecastCity(city)
            self.db.show()
        finally:
            # Commit and close even when scraping or printing fails.
            self.db.closeDB()
# Script entry: scrape the four configured cities, then report completion.
target_cities = ['北京', '上海', '广州', '深圳']
ws = WeatherForecast()
ws.process(target_cities)
print('completed')
运行结果部分截图
2)心得体会:
这次的代码就是按照书上敲的,遇到不懂的地方也通过百度解决了。这次主要是加强了BeautifulSoup的使用,以及对sqlite数据库有了初步了解。
作业②
1)
要求:用requests和BeautifulSoup库方法定向爬取股票相关信息。
候选网站:东方财富网https://www.eastmoney.com/
新浪股票http://finance.sina.com.cn/stock/
技巧:在谷歌浏览器中进入F12调试模式进行抓包,查找股票列表加载使用的url,并分析api返回的值,并根据所要求的参数可适当更改api的请求参数。根据URL可观察请求的参数f1、f2可获取不同的数值,根据情况可删减请求的参数。
思路:
通过数据的url对比,可以发现同一板块的数据页数对应的是pn参数
而同一页数不同板块的数据对应的是fid和fs参数
参考链接:https://zhuanlan.zhihu.com/p/50099084
代码:
import requests
import re
import math
# Fetch one page of a board's stock list with a GET request and extract its data.
def getHtml(cmd, page):
    """Request one page of a stock board and return its raw records.

    Args:
        cmd: Query-string fragment selecting the board (``fid=...&fs=...``).
        page: 1-based page number, sent as the ``pn`` parameter.

    Returns:
        A tuple ``(data, all_page)``: ``data`` is the text found between the
        brackets of the response's ``"diff":[...]`` array, and ``all_page`` is
        the response's ``"total"`` count divided by the page size of 20
        (``pz=20``), rounded up.

    Raises:
        ValueError: If the response has no ``"diff"`` array or ``"total"`` count.
    """
    url = "http://68.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409784442493077996_1601810442107&pn=" + str(
        page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&" + cmd + "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Raw strings keep the regex escapes intact without invalid-escape warnings.
    diff = re.search(r'"diff":\[(.*?)\]', r.text, re.S)
    total = re.search(r'"total":(\d+)', r.text)
    if diff is None or total is None:
        raise ValueError("unexpected response for page " + str(page) + ": no diff/total found")
    # int() instead of eval(): never evaluate text received from the network.
    all_page = math.ceil(int(total.group(1)) / 20)
    return diff.group(1), all_page
# Print every stock contained in one page of a board.
def getOnePageStock(cmd, page):
    """Fetch page ``page`` of the board selected by ``cmd`` and print one row per stock.

    Uses the module-level row template ``tplt`` and advances the module-level
    running row number ``p`` once per printed stock.
    """
    global p
    payload, _ = getHtml(cmd, page)

    def value_at(parts, index):
        # Each part looks like ``key:value``; keep the value only.
        return parts[index].split(":")[1]

    # Positions of the printed columns inside one record, in output order.
    columns = (11, 13, 1, 2, 3, 4, 5, 6, 14, 15, 16, 17)
    # Records are separated by "},"; quotes and braces are stripped before
    # a record is split into its ``key:value`` parts.
    for record in payload.split("},"):
        p += 1
        parts = re.sub('["{}]', '', record).split(",")
        row = [value_at(parts, index) for index in columns]
        print(tplt.format(p, *row, chr(12288)))
# Board name -> query-string fragment (fid / fs parameters) that selects it.
cmd = {
    "沪深A股": "fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23",
    "上证A股": "fid=f3&fs=m:1+t:2,m:1+t:23",
    "深证A股": "fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80",
    "新股": "fid=f26&fs=m:0+f:8,m:1+f:8",
    "中小板": "fid=f3&fs=m:0+t:13",
    "创业板": "fid=f3&fs=m:0+t:80"
}
# Row template read by getOnePageStock. Argument 13 is the fill character of
# the name column: chr(12288), the full-width space.
tplt = "{0:^13}{1:^13}{2:{13}^13}{3:^13}{4:^13}{5:^13}{6:^13}{7:^13}{8:^13}{9:^13}{10:^13}{11:^13}{12:^13}"
for board in cmd:
    print(board)
    print("{0:^11}{1:^11}{2:{13}^12}{3:^12}{4:^12}{5:^12}{6:^10}{7:^10}{8:^12}{9:^12}{10:^12}{11:^12}{12:^12}".format(
        "序号", "股票代码", "股票名称", "最新报价", "涨跌幅", "涨跌额", "成交量", "成交额", "振幅", "最高", "最低", "今开", "昨收", chr(12288)))
    # Running row number, restarted for every board (advanced by getOnePageStock).
    p = 0
    # The first request is only used to learn how many pages the board has.
    _, all_page = getHtml(cmd[board], 1)
    # Start at page 1: the previous loop incremented the page number before
    # its first use, so the first page of every board was never printed.
    for page in range(1, all_page + 1):
        getOnePageStock(cmd[board], page)
运行结果部分截图:
2)心得体会:
这次实验是抓取js动态加载的网页,在获取数据方面花费了很长时间,再有就是板块的页数问题(最后还是借鉴了同学的做法),总的来说还是有很大收获的。
作业③
1)
要求:根据自选3位数+学号后3位选取股票,获取并打印股票信息。抓包方法同作业②。
候选网站:东方财富网https://www.eastmoney.com/
新浪股票http://finance.sina.com.cn/stock/
代码:
import requests
import re
import math
# Fetch one page of a board's stock list with a GET request and extract its data.
def getHtml(cmd, page):
    """Request one page of a stock board and return its raw records.

    Args:
        cmd: Query-string fragment selecting the board (``fid=...&fs=...``).
        page: 1-based page number, sent as the ``pn`` parameter.

    Returns:
        A tuple ``(data, all_page)``: ``data`` is the text found between the
        brackets of the response's ``"diff":[...]`` array, and ``all_page`` is
        the response's ``"total"`` count divided by the page size of 20
        (``pz=20``), rounded up.

    Raises:
        ValueError: If the response has no ``"diff"`` array or ``"total"`` count.
    """
    url = "http://68.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409784442493077996_1601810442107&pn=" + str(
        page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&" + cmd + "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Raw strings keep the regex escapes intact without invalid-escape warnings.
    diff = re.search(r'"diff":\[(.*?)\]', r.text, re.S)
    total = re.search(r'"total":(\d+)', r.text)
    if diff is None or total is None:
        raise ValueError("unexpected response for page " + str(page) + ": no diff/total found")
    # int() instead of eval(): never evaluate text received from the network.
    all_page = math.ceil(int(total.group(1)) / 20)
    return diff.group(1), all_page
# Search one page of a board for a stock code and print the stock when found.
def getOnePageStock(cmd, page, target="002105"):
    """Look for stock ``target`` on one page of a board.

    When the stock is found, a header line and the stock's code, name, open,
    high and low values are printed with the module-level template ``tplt``,
    and the module-level flag ``p`` is set to 1.

    Args:
        cmd: Query-string fragment selecting the board.
        page: 1-based page number to search.
        target: Stock code to look for; defaults to the code originally
            hard-coded here.

    Returns:
        True when the stock was found on this page, otherwise False.
    """
    global p
    data, _ = getHtml(cmd, page)
    for record in data.split("},"):
        # Strip quotes and braces, then split into ``key:value`` parts.
        stocks = re.sub('["{}]', '', record).split(",")
        # An empty page or a truncated record has too few parts to index.
        if len(stocks) < 17:
            continue
        if stocks[11].split(":")[1] == target:
            print(tplt.format("股票代码号", "股票名称", "今日开", "今日最高", "今日最低", chr(12288)))
            print(tplt.format(stocks[11].split(":")[1], stocks[13].split(":")[1], stocks[16].split(":")[1],
                              stocks[14].split(":")[1],
                              stocks[15].split(":")[1], chr(12288)))
            # Flag read by the calling loop so that it can stop searching.
            p = 1
            return True
    return False
# Board name -> query-string fragment (fid / fs parameters) that selects it.
cmd = {
    "沪深A股": "fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23",
    "上证A股": "fid=f3&fs=m:1+t:2,m:1+t:23",
    "深证A股": "fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80",
    "新股": "fid=f26&fs=m:0+f:8,m:1+f:8",
    "中小板": "fid=f3&fs=m:0+t:13",
    "创业板": "fid=f3&fs=m:0+t:80"
}
# Row template read by getOnePageStock. Argument 5 is the fill character of
# the name column: chr(12288), the full-width space.
tplt = "{0:^8}\t{1:{5}^8}\t{2:^8}\t{3:^8}\t{4:^8}"
# Found flag: getOnePageStock sets it to 1 once the stock has been printed.
p = 0
for board in cmd:
    # The first request is only used to learn how many pages the board has.
    _, all_page = getHtml(cmd[board], 1)
    # Start at page 1: the previous loop incremented the page number before
    # its first use, so the first page of every board was never searched.
    for page in range(1, all_page + 1):
        getOnePageStock(cmd[board], page)
        if p == 1:
            # Found: no need to download the remaining pages.
            break
    if p == 1:
        # Found: no need to search the remaining boards.
        break
if p == 0:
    print("没找到对应的股票代码")
运行结果:
2)心得体会:
我好像只是在第二题的基础上加了一个if判断,感觉没有完全理解题目的意思……