数据采集与融合技术实践作业二

102102141 周嘉辉

作业①

完成代码:

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
import time

class WeatherDB:
    """Small helper around the ``weathers`` table stored in ``weathers.db``."""

    def openDB(self):
        """Open ``weathers.db`` and make sure ``weathers`` exists and is empty.

        The table is created when it is missing; any rows already in it are
        deleted so that a run only stores what it inserts itself.

        Raises:
            sqlite3.Error: if the database cannot be opened or prepared.
        """
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        # "if not exists" covers the table-already-there case explicitly, so
        # any other failure surfaces instead of being mistaken for it.
        self.cursor.execute("create table if not exists weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending changes and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row.

        Database errors (for example a duplicate ``(wCity, wDate)`` primary
        key) are printed and the row is skipped; nothing is raised.
        """
        try:
            self.cursor.execute(
                "insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                (city, date, weather, temp),
            )
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every row of ``weathers`` as fixed-width columns."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

class WeatherForecast:
    """Scrape forecasts from www.weather.com.cn and store them via WeatherDB."""

    def __init__(self):
        # Request headers sent with every page fetch.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
        }
        # City name -> code that is spliced into the forecast page URL.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601", "福州": "101230101"}

    def forecastCity(self, city):
        """Fetch the forecast page of *city*, then print and store its entries.

        Only the first three ``<li>`` entries of the forecast list are used.
        An unknown city is reported and skipped. Any error while fetching or
        parsing is printed rather than raised, so one bad city or entry does
        not stop the run.

        ``self.db`` must have been set (``process`` does this) for rows to be
        stored.
        """
        if city not in self.cityCode:
            print(city+" code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(req) as resp:
                data = resp.read()
            # Let UnicodeDammit pick between the two candidate encodings.
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis[:3]:
                # Handled per entry so that one malformed <li> (a missing
                # tag makes the [0] lookup fail) does not drop the others.
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    # NOTE(review): presumably <span> is the high and <i> the
                    # low temperature — confirm against the page markup.
                    temp = li.select('p[class="tem"] span')[0].text+"/"+li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Open the database, scrape every city in *cities*, then close it."""
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        self.db.closeDB()

# Scrape the five listed cities and store their forecasts through WeatherDB.
ws=WeatherForecast()
ws.process(["北京","上海","广州","深圳","福州"])
# NOTE(review): the ID printed here (102102143) differs from the one in the
# report header (102102141) — confirm which one is correct.
print("复现天气爬取102102143周嘉辉")

# Pause for 100 seconds before the script ends.
# NOTE(review): presumably keeps the console window open so the output can be
# captured — confirm this is still wanted.
time.sleep(100)

结果:

心得体会:本题复用了老师提供的示例代码,在理解代码的基础上增加了福州的天气爬取;主要收获是掌握了使用 sqlite3 对数据库进行读写的方法。

作业②

  • 用requests和自选提取信息方法定向爬取股票相关信息,并存储在数据库中。
import requests
import json
import sqlite3

class MoneyDB:
    """Small helper around the ``moneys`` table stored in ``moneys.db``."""

    # Keys read from each record, in the column order that follows the
    # sequence-number column of the insert statement.
    _FIELDS = ('f12', 'f14', 'f2', 'f3', 'f4', 'f5', 'f6', 'f15', 'f16', 'f17', 'f18')

    def openDB(self):
        """Open ``moneys.db`` and make sure ``moneys`` exists and is empty.

        The table is created when it is missing; any rows already in it are
        deleted so that a run only stores what it inserts itself.

        Raises:
            sqlite3.Error: if the database cannot be opened or prepared.
        """
        self.con = sqlite3.connect("moneys.db")
        self.cursor = self.con.cursor()
        # "if not exists" covers the table-already-there case explicitly, so
        # any other failure surfaces instead of being mistaken for it.
        self.cursor.execute("create table if not exists moneys (序号 varchar(64),代码 varchar(64),名称 varchar(64),报价 varchar(64),涨跌幅 varchar(64),涨跌额 varchar(64),成交量 varchar(64),成交额 varchar(64),最高 varchar(64),最低 varchar(64),今开 varchar(64),昨收 varchar(64))")
        self.cursor.execute("delete from moneys")

    def closeDB(self):
        """Commit pending changes and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, count, data):
        """Insert one record.

        Args:
            count: sequence number stored in the first column.
            data: mapping that provides the keys listed in ``_FIELDS``.

        A database error or a missing key is printed (followed by the marker
        ``999``) and the record is skipped; nothing is raised.
        """
        try:
            self.cursor.execute(
                "insert into moneys (序号,代码,名称,报价,涨跌幅,涨跌额,成交量,成交额,最高,最低,今开,昨收) values (?,?,?,?,?,?,?,?,?,?,?,?)",
                (count, *(data[key] for key in self._FIELDS)),
            )
        except (sqlite3.Error, KeyError) as err:
            print(err, 999)


# JSONP endpoint for the stock list; the "cb" query parameter names the
# callback that wraps the JSON body in the response.
url = "http://14.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124015338467305145265_1696661176957&pn=1&pz=50&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=&fs=b:MK0010&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f11,f62,f128,f136,f115,f152&_=1696661176958"
db = MoneyDB()
db.openDB()

# Without a timeout a stalled server would block the script indefinitely.
response = requests.get(url=url, timeout=30)
response.encoding="utf-8"  # set the encoding explicitly, otherwise the text is garbled
paper = response.text

# Unwrap "callback( ... )": keep what lies between the first "(" and the last
# ")". This does not depend on what follows the closing parenthesis. If no
# wrapper is found, the text is parsed as-is.
start = paper.find('(')
end = paper.rfind(')')
payload = json.loads(paper[start + 1:end] if 0 <= start < end else paper)

# A missing or null "data"/"diff" is treated as "no records" rather than
# crashing on the subscript.
d = (payload.get('data') or {}).get('diff') or []
print(d)
print('序号\t代码\t名称\t\t报价\t涨跌幅\t涨跌额\t成交量\t\t成交额\t\t最高\t最低\t今开\t昨收')

# Keys printed for each record, in the order of the header line above.
fields = ('f12', 'f14', 'f2', 'f3', 'f4', 'f5', 'f6', 'f15', 'f16', 'f17', 'f18')
for count, i in enumerate(d, start=1):
    # One tab-separated line per record, starting with its sequence number.
    print(count, *(i[key] for key in fields), sep='\t')
    db.insert(count, i)
db.closeDB()

结果:

心得体会:插入数据后一定要记得调用 closeDB()(其中会执行 commit 提交事务),否则插入的数据不会真正保存到数据库文件中。

作业③


import requests

import re

import sqlite3

def isfloat(str):
    """Return True if ``float()`` accepts *str*, otherwise False.

    The parameter keeps its original name (which shadows the builtin ``str``
    inside this function) so that existing keyword callers keep working.
    """
    try:
        float(str)
    # Only the errors float() raises for unusable input are treated as "not a
    # float"; anything else (e.g. KeyboardInterrupt) propagates.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

class rankDB:
    """Small helper around the ``ranks`` table stored in ``ranking.db``."""

    def openDB(self):
        """Open ``ranking.db`` and make sure ``ranks`` exists and is empty.

        The table is created when it is missing; any rows already in it are
        deleted so that a run only stores what it inserts itself.

        Raises:
            sqlite3.Error: if the database cannot be opened or prepared.
        """
        self.con = sqlite3.connect("ranking.db")
        self.cursor = self.con.cursor()
        # "if not exists" covers the table-already-there case explicitly, so
        # any other failure surfaces instead of being mistaken for it.
        self.cursor.execute("create table if not exists ranks (name varchar(16),scores varchar(16))")
        self.cursor.execute("delete from ranks")

    def closeDB(self):
        """Commit pending changes and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, n, s):
        """Insert one ``(name, scores)`` row.

        Database errors are printed and the row is skipped; nothing is raised.
        """
        try:
            self.cursor.execute("insert into ranks (name,scores) values (?,?)", (n, s))
        except sqlite3.Error as err:
            print(err)

# Without a timeout a stalled server would block the script indefinitely.
resp = requests.get('https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js', timeout=30)

res = resp.text

db = rankDB()
db.openDB()

# Raw text found between "univNameCn:" and ",univNameEn:" in the payload.
names = re.findall("univNameCn:(.*?),univNameEn:",res)

# Raw text found between "score:" and ",ranking" in the payload.
scores = re.findall("score:(.*?),ranking",res)

# zip() pairs the two lists and stops at the shorter one, so a mismatch in the
# number of matches cannot raise IndexError part-way through the loop.
for ranking, (name, score) in enumerate(zip(names, scores), start=1):
    # A captured score that is not a number is stored as an empty string.
    # NOTE(review): presumably such captures are variable names from the
    # minified script rather than literal values — confirm.
    if not isfloat(score):
        score = ''
    print(ranking, name, score)
    db.insert(name, score)
db.closeDB()

结果:

心得体会:这次作业是在课堂上完成的。它虽然看起来和之前的作业很像,但实际上区别很大:数据并不在页面的 HTML 中,而是需要用正则表达式从 payload.js 文件里提取。完成过程中参考了学长的代码,也加深了我对正则表达式的理解。