第二次作业
作业1
1.天气实验代码
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class weatherDB:
    """Simple sqlite3-backed store for scraped weather forecasts.

    Table: weathers(wcity, wdate, wweather, wtemp) with a composite
    primary key (wcity, wdate), so each city/date pair is stored once.
    """

    def openDB(self):
        """Open (or create) weather.db and start from an empty table."""
        self.con = sqlite3.connect("weather.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wcity varchar(16),wdate varchar(16),wweather varchar(64),wtemp varchar(32),constraint pk_weather primary key(wcity,wdate))")
        except sqlite3.Error:
            # Table already exists from a previous run: clear the old rows
            # instead of failing.  (Narrowed from a bare `except:` that
            # would also have hidden unrelated errors.)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row; a failed insert (e.g. duplicate
        city+date) is reported, not raised — scraping continues."""
        try:
            self.cursor.execute(
                "insert into weathers (wcity,wdate,wweather,wtemp) values(?,?,?,?)",
                (city, date, weather, temp))
        except sqlite3.Error:
            print("err")

    def show(self):
        """Print every stored row as an aligned text table."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))
class weatherforecast():
    """Scrapes multi-day forecasts from weather.com.cn for a fixed set
    of cities and stores them through a weatherDB instance."""

    def __init__(self):
        # Masquerade as a desktop browser; station codes for the four
        # supported cities are recorded up front.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"}
        # City name -> weather.com.cn station code.
        self.citycode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastcity(self, city):
        """Fetch, print, and store the forecast for one known city.

        Unknown cities (typos, unsupported names) are reported and skipped.
        """
        if city not in self.citycode:
            print(city + "code not found")
            return
        url = "http://www.weather.com.cn/weather/" + self.citycode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req).read()
            # The page may be served as utf-8 or gbk; let UnicodeDammit decide.
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, 'html.parser')
            # Each <li> under ul.t.clearfix is one day's forecast block.
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date_ = li.select('h1')[0].text
                    weather_ = li.select('p[class="wea"]')[0].text
                    temp_ = li.select('p[class="tem"] span')[0].text + '℃/' + li.select("p[class='tem'] i")[0].text
                    print(city, date_, weather_, temp_)
                    self.db.insert(city, date_, weather_, temp_)
                except (IndexError, AttributeError):
                    # A day block is missing an expected tag; skip just
                    # that day.  (Narrowed from a bare `except:`.)
                    print('err1')
        except Exception:
            # Network or page-level failure: best-effort, move on to the
            # next city.  (Narrowed from a bare `except:`.)
            print('err2')

    def precess(self, cities):
        """Scrape every city in `cities` into a fresh weatherDB, then
        dump the table and close the database."""
        self.db = weatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastcity(city)
        self.db.show()
        self.db.closeDB()
# Entry point: scrape and store forecasts for the four preset cities,
# dump the resulting table, then report completion.
ws = weatherforecast()
ws.precess(["北京", '上海', '广州', '深圳'])
print('completed')
结果:
2.心得体会
在天气预报的代码中,比较难理解的是两个类的定义和各个函数之间的配合;各参数的作用在自己琢磨之后也就弄清楚了。
作业2
1.股票爬取
import requests
from bs4 import BeautifulSoup
import re
def getHtmlText(url):
    """Fetch `url` and return its body decoded as utf-8, or "" on failure.

    Best-effort by design: the caller treats an empty string as "no data"
    and keeps going.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
            'Cookie': 'qgqp_b_id=54fe349b4e3056799d45a271cb903df3; st_si=24637404931419; st_pvi=32580036674154; st_sp=2019-11-12%2016%3A29%3A38; st_inirUrl=; st_sn=1; st_psi=2019111216485270-113200301321-3411409195; st_asi=delete'
            }
    try:
        r = requests.get(url, timeout=30, headers=head)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:`: still best-effort, but no longer
        # swallows unrelated errors such as KeyboardInterrupt.
        return ""
# ---- Stock list (eastmoney JSONP API) + per-stock deal data (stcn) ----
recordfile = 'Data.txt'
# JSONP endpoint returning the A-share stock list; the f12 field of each
# entry is the 6-digit stock code.
url = 'http://51.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112408349318807687469_1574045112932&pn=1&pz=20&po=1&np=2&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1574045112933'
head = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
}
Codelist = []
# First row of the output table: the column headers.
DealData = [['股票代码', '今开', '最高', '最低', '昨收', '成交量', '成交额', '总市值', '流通市值', '振幅', '换手率', '市净率', '市盈率', ]]
r = requests.get(url, timeout=30, headers=head)
r.raise_for_status()
r.encoding = 'utf-8'
html = r.text
soup = str(BeautifulSoup(html, "html.parser"))
# Pull every `"f12":"XXXXXX"`-style fragment, then extract the 6-digit code.
regex = re.compile(r'.f12...\d{6}.')
numpattern = re.compile(r'\d{6}')  # hoisted: compiled once, not per match
for listpattern in regex.findall(soup):
    Codelist.append(numpattern.findall(listpattern)[0])
total = len(Codelist)
CodeList = Codelist[:50]  # only fetch detail pages for the first 50 codes
finished = 0  # was `int(0)` — plain literal is idiomatic
# Regex for the numeric fields inside each sj_r_N div: number+CJK unit,
# percentage, or a plain decimal.  Hoisted out of the per-stock loop.
dealPattern = re.compile(r'\d+.\d+[\u4e00-\u9fa5]|\d+.+.%|\d+.\d+')
for code in CodeList:
    finished = finished + 1
    finishedco = (finished / total) * 100
    print("total : {0} finished : {1} completion : {2}%".format(total, finished, finishedco))
    dealDataList = []
    dataUrl = 'http://info.stcn.com/dc/stock/index.jsp?stockcode=' + code
    dataHtml = getHtmlText(dataUrl)
    soup = BeautifulSoup(dataHtml, "html.parser")
    dealDataList.append(code)
    # The detail page lays its numbers out in three div.sj_r_1..3 columns.
    for i in range(1, 4):
        classStr = 'sj_r_' + str(i)
        divdata = soup.find_all('div', {'class': classStr})
        if len(divdata) == 0:
            dealDataList.append('该股票暂时没有交易数据!')
            break
        dealData = str(divdata[0])
        listdeal = dealPattern.findall(dealData)
        for j in range(0, 4):
            dealDataList.append(listdeal[j])
    DealData.append(dealDataList)
# Append tab-separated rows; `with` guarantees the file is closed even if
# a write fails (the original used manual open/close).
with open(recordfile, 'a+') as file:
    for i in range(len(DealData)):
        s = str(DealData[i]).replace('[', '').replace(']', '')
        if i == 0:
            # Header row uses a space before the tab, matching the original.
            s = s.replace("'", '').replace(',', ' \t') + '\n'
        else:
            s = s.replace("'", '').replace(',', '\t') + '\n'
        file.write(s)
print(len(DealData))
结果:
2.心得体会
因为是第一次爬取实时更新的数据,这道题花费的时间最多。这次作业的大部分代码借鉴了CSDN上面的相关代码,再加上自己的修改,最后才得出结果。
作业3
1.自选股票代码
import requests
# Fetch 东方航空 (600115) daily k-line data from the eastmoney JSONP API
# and print the latest day's open/high/low values.
url = 'http://46.push2his.eastmoney.com/api/qt/stock/kline/get?cb=jQuery112406437068490950477_1602146854442&secid=1.600115&ut=fa5fd1943c7b386f172d6893dbfba10b&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5%2Cf6&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58%2Cf59%2Cf60%2Cf61&klt=101&fqt=0&end=20500101&lmt=120&_=1602146854482'
head = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
r = requests.get(url, timeout=30, headers=head)
r.raise_for_status()
r.encoding = 'utf-8'
html = r.text
# The body is JSONP; the newest day's comma-separated k-line record sits
# just before the closing wrapper, so cut from the last '"' (offset -10
# skips the trailing ")" junk) and split on commas.
msg = html[html.rindex('"', 0, -10):]
result = msg.split(",")
# NOTE(review): indices 2/3/4 are assumed to be open/high/low per the
# fields2=f51..f61 request parameter — confirm against the API docs.
print("股票代码号 股票名称 今日开 今日最高 今日最低")
print("600115 " + "东方航空 " + result[2] + " " + result[3] + " " + result[4])
结果:
2.心得体会
这次是请教霖哥才写出来的结果。用rindex直接从字符串末尾定位到相关的数据后,再按逗号拆分成列表就能直接输出了。