2020年寒假学习进度第十三天

河北省疫情数据爬取                                                                                    

  今天主要完成了河北省疫情数据的爬取,数据从2月8日开始(附 Python 详细代码)。

from lxml import etree
import re
import requests        #导入requests包

import SQL as SQL
 #url = 'http://www.hebwst.gov.cn/index.do?id=397505&templet=content&cid=45'
 #url ='http://www.hebwst.gov.cn/index.do?id=397291&templet=content&cid=45'
 #url='http://www.hebwst.gov.cn/index.do?id=395538&templet=content&cid=326'


hrefs = []


def ULS():
    """Fill the module-level ``hrefs`` list with bulletin page URLs.

    Requests search-result pages 0-5, prints each page URL, and gathers the
    ``href`` values found under ``td.sy_new_list`` links.  Every link is
    printed, prefixed with the site root, and appended to ``hrefs`` when it
    matches ``&cid=45`` and is not the one explicitly excluded page.
    Returns nothing; the result is left in ``hrefs``.
    """
    site_root = 'http://www.hebwst.gov.cn/'
    search_prefix = 'http://www.hebwst.gov.cn/index.do?templet=search_list&searchType=1&searchText=河北省新型冠状病毒感染的肺炎疫情情况&type=search&cid=0&page='
    # NOTE(review): the reason this single bulletin is excluded is not
    # visible in this file - confirm before removing the exception.
    excluded = 'http://www.hebwst.gov.cn/index.do?id=395795&templet=content&cid=45'

    # One list of hrefs per result page (kept nested so the debug print
    # shows the links grouped by page).
    links_by_page = []
    for page_no in range(6):
        page_url = search_prefix + str(page_no)
        print(page_url)
        response = requests.get(page_url)
        document = etree.HTML(response.text)
        links_by_page.append(document.xpath('//td[@class=\'sy_new_list\']/a//@href'))
    print(links_by_page)

    for page_links in links_by_page:
        for relative in page_links:
            print(relative)
            absolute = site_root + relative
            if not re.match(r'.*?&cid=45', absolute):
                continue
            if absolute == excluded:
                continue
            hrefs.append(absolute)
    print(hrefs)


# Headings that open the per-city breakdowns inside the cumulative paragraph,
# listed in the order info() looks for them.
_CITY_SECTION_MARKERS = ("确诊病例中", "死亡病例中", "重症病例中", "出院病例中")


def _split_sections(body, markers):
    """Map each marker found in *body* to the text that follows it.

    Markers are searched in the given order, each one only in the text after
    the previously found marker.  A section ends where the next found marker
    starts; the last section runs to the end of *body*.  Markers that do not
    occur are left out of the returned dict.
    """
    sections = {}
    current = None
    rest = body
    for marker in markers:
        head, found, tail = rest.partition(marker)
        if not found:
            continue
        if current is not None:
            sections[current] = head
        current = marker
        rest = tail
    if current is not None:
        sections[current] = rest
    return sections


def _city_counts(fragment):
    """Return the (city, count) string pairs found in *fragment*."""
    return re.findall(r"[,、](.+?市)(.+?例)", fragment)


def info(url):
    """Scrape one bulletin page and store its figures through ``SQL``.

    The page is downloaded, every text node below a ``<p>`` element is
    collected, and the first three nodes are searched with regular
    expressions: ``text[0]`` for the newly reported figures, ``text[1]`` for
    the cumulative figures and the per-city breakdowns, ``text[2]`` for the
    close-contact figures.  All intermediate results are printed.
    (Assumes the bulletin keeps each of these in its own text node - verify
    against the live page layout.)

    One row is written with ``SQL.insert_province``, one row per city of the
    cumulative confirmed breakdown with ``SQL.insert_city``, and the
    remaining per-city figures are written with ``SQL.update_db``.

    Args:
        url: Address of the bulletin page.

    Raises:
        IndexError: If the page yields fewer than three text nodes or no
            date could be found in the first one.
    """
    print(url)
    strhtml = requests.get(url)  # fetch the page with a GET request
    tree = etree.HTML(strhtml.text)
    text = tree.xpath('//p//text()')
    # Drop ideographic spaces (U+3000) from the first text node.
    text[0] = re.sub(r'\u3000', '', text[0])
    print(text)

    # --- newly reported figures (first text node) ---
    date = re.findall(r"(.+?日)", text[0])
    print("时间", date)
    xin_que_num = re.findall(r"新增确诊病例(.+?例)", text[0])
    # The per-city list of new cases sits between the first and the second
    # "其中".  partition() yields an empty string when "其中" is missing, so
    # a bulletin without that list gives an empty result instead of an
    # IndexError.
    after_first = text[0].partition("其中")[2]
    new_case_part = after_first.partition("其中")[0]
    xin_shi_num = re.findall(r"[,,、](.+?市)(.+?例)", new_case_part)

    xin_chu_num = re.findall(r"新增治愈出院病例(.+?例)", text[0])
    xin_yi_num = re.findall(r"新增疑似病例(.+?例)", text[0])
    print("新增确诊病例", xin_que_num)
    print("详细新增确诊病例\n", xin_shi_num)
    print("新增治愈出院病例", xin_chu_num)
    print("新增疑似病例\n", xin_yi_num)

    # --- cumulative figures (second text node) ---
    que_num = re.findall(r"累计报告确诊病例(.+?例)", text[1])
    si_num = re.findall(r"例,其中死亡(.+?例)", text[1])
    zhong_num = re.findall(r",现有重症病例(.+?例)", text[1])
    yu_num = re.findall(r",累计治愈出院(.+?例)", text[1])

    print("累计确诊病例", que_num)
    print("死亡病例", si_num)
    print("重症病例", zhong_num)
    print("出院病例", yu_num)

    # --- per-city breakdowns of the cumulative figures ---
    que_xi_num = []
    si_xi_num = []
    zhong_xi_num = []
    chu_xi_num = []

    # Nothing is parsed unless the confirmed-case heading is present.  A
    # later heading that is missing no longer aborts the parse (the old
    # nested version read an unassigned variable when "死亡病例中" was
    # absent and dropped the death list when "重症病例中" was absent); its
    # list simply stays empty and the preceding section runs on to the next
    # heading that does exist.
    if "确诊病例中" in text[1]:
        sections = _split_sections(text[1], _CITY_SECTION_MARKERS)
        que_xi_num = _city_counts(sections.get("确诊病例中", ""))
        si_xi_num = _city_counts(sections.get("死亡病例中", ""))
        zhong_xi_num = _city_counts(sections.get("重症病例中", ""))
        chu_xi_num = _city_counts(sections.get("出院病例中", ""))

    print("详细确诊病例", que_xi_num)
    print("详细死亡病例", si_xi_num)
    print("详细重症病例", zhong_xi_num)
    print("详细出院病例", chu_xi_num)

    # --- suspected cases ---
    yisi_num = re.findall(r"疑似病例(.+?例)", text[1])
    print("疑似病例", yisi_num)

    # --- close contacts / released from observation / still observed ---
    miqie_num = re.findall(r"密切接触者(.+?人)", text[2])
    jie_num = re.findall(r"解除隔离医学观察(.+?人)", text[2])
    guan_num = re.findall(r"现有(.+?人)", text[2])

    print("密切接触者", miqie_num)
    print("接触医学观察", jie_num)
    print("现有医学观察人数", guan_num)

    # --- persistence ---
    day = date[0]
    SQL.insert_province(
        day,
        "".join(xin_que_num),
        "".join(xin_chu_num),
        "".join(xin_yi_num),
        "".join(que_num),
        "".join(si_num),
        "".join(zhong_num),
        "".join(yu_num),
        "".join(yisi_num),
        "".join(miqie_num),
        "".join(jie_num),
        "".join(guan_num),
        url,
    )

    # The cumulative confirmed breakdown creates the per-city rows ...
    for City, Num in que_xi_num:
        print("CITY:", City)
        print("num:", Num)
        SQL.insert_city(day, City, Num, url)

    # ... and the other breakdowns are written into those rows column by
    # column, in the same order as before.
    column_updates = (
        ("New_Confirmed_num", xin_shi_num),
        ("Dead_num", si_xi_num),
        ("Zhong_num", zhong_xi_num),
        ("Cured_num", chu_xi_num),
    )
    for column, pairs in column_updates:
        for City, Num in pairs:
            print("CITY:", City)
            print("num:", Num)
            SQL.update_db(City, column, Num)


if __name__ == '__main__':
    # Full crawl, currently switched off: collect the bulletin links, empty
    # both tables, then scrape every collected page.
    # ULS()
    # SQL.delete_db("hebei_info")
    # SQL.delete_db("hebei_city_info")
    # for url in hrefs:
    #     info(url)

    # Only this one bulletin is scraped when the script is run.
    single_bulletin = 'http://wsjkw.hebei.gov.cn/content/content_14/398316.jhtml'
    info(single_bulletin)

  

import pymysql

# Settings of the MySQL connection shared by every helper in this module:
# local server, root account with an empty password, schema "yiqing".
_CONNECT_SETTINGS = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'passwd': '',
    'db': 'yiqing',
}

# Opened once at import time and never closed by the helpers below.
db = pymysql.connect(**_CONNECT_SETTINGS)
def select_db():
    """Return all rows of the ``blog_info`` table.

    Runs ``select * from blog_info`` on the module-level connection ``db``
    and returns whatever the cursor's ``fetchall()`` yields.  The connection
    itself is left open for the other helpers.
    """
    sql = "select * from blog_info"
    # The cursor is used as a context manager so it is closed once the rows
    # have been fetched instead of being left open.
    with db.cursor() as cur:
        cur.execute(sql)
        return cur.fetchall()

def delete_db(table):
    """Delete every row of *table*.

    The statement is committed on success.  Any exception raised while
    executing or committing is printed and the transaction is rolled back;
    it is not re-raised.

    Args:
        table: Name of the table to empty.  It is pasted into the statement
            as-is (identifiers cannot be bound as query parameters), so it
            must come from trusted code, never from scraped or user input.
    """
    sql_delete = "delete from "+table+" "
    # Context manager closes the cursor on the way out, success or failure.
    with db.cursor() as cur:
        try:
            cur.execute(sql_delete)
            db.commit()
        except Exception as e:
            print("操作异常:%s" % str(e))
            # Undo the partial work of the failed statement.
            db.rollback()

def update_db(city, valuename, value):
    """Set column *valuename* to *value* in ``hebei_city_info`` rows of *city*.

    The statement matches on the ``city`` column only.  It is committed on
    success; any exception is printed and the transaction rolled back, it is
    not re-raised.

    Args:
        city: Value compared against the ``city`` column.
        valuename: Name of the column to set.  It is pasted into the
            statement as-is (identifiers cannot be bound as parameters), so
            it must be a trusted name and must not contain ``%``.
        value: New value for that column.
    """
    # ``value`` and ``city`` come from scraped pages, so they are bound as
    # parameters and escaped by the driver rather than concatenated into the
    # statement, where a stray quote would break or alter the query.
    sql_update = "update hebei_city_info set " + valuename + "=%s where city=%s"
    # Context manager closes the cursor on the way out.
    with db.cursor() as cur:
        try:
            cur.execute(sql_update, (value, city))
            db.commit()
        except Exception as e:
            print("错误信息:%s" % str(e))
            # Undo the partial work of the failed statement.
            db.rollback()

def insert_city(Date, City, Confirmed_num, Url):
    """Insert one row into ``hebei_city_info``.

    The arguments and the final statement are printed for debugging.  The
    insert is committed on success; any exception is printed and the
    transaction rolled back, it is not re-raised.

    Args:
        Date: Value for the ``Date`` column.
        City: Value for the ``City`` column.
        Confirmed_num: Value for the ``Confirmed_num`` column.
        Url: Value for the ``Url`` column.
    """
    print(Confirmed_num)
    print(Date)
    print(City)
    print(Url)
    # All four values originate from scraped pages; they are bound as
    # parameters so the driver escapes them instead of being concatenated
    # into the statement.
    sql_insert = "insert into hebei_city_info (Date, City,Confirmed_num,Url) values(%s,%s,%s,%s)"
    params = (Date, City, Confirmed_num, Url)
    # Context manager closes the cursor on the way out.
    with db.cursor() as cur:
        # mogrify() renders the statement exactly as it will be sent.
        print(cur.mogrify(sql_insert, params))
        try:
            cur.execute(sql_insert, params)
            db.commit()
        except Exception as e:
            print("错误信息:%s" % str(e))
            # Undo the partial work of the failed statement.
            db.rollback()

def insert_province(Date, New_Confirmed_num, New_Cured_num,New_Yisi_num,Confirmed_num,Dead_num,Zhong_num,Cured_num,Yisi_num,Miqie_num,None_Guan_num,Guan_num,Url):
    """Insert one row into ``hebei_info``.

    Each argument is stored in the column of the same name.  The final
    statement and a debug marker are printed.  The insert is committed on
    success; any exception is printed and the transaction rolled back, it
    is not re-raised.
    """
    params = (
        Date,
        New_Confirmed_num,
        New_Cured_num,
        New_Yisi_num,
        Confirmed_num,
        Dead_num,
        Zhong_num,
        Cured_num,
        Yisi_num,
        Miqie_num,
        None_Guan_num,
        Guan_num,
        Url,
    )
    # The values originate from scraped pages; they are bound as parameters
    # (one %s placeholder each) so the driver escapes them instead of being
    # concatenated into the statement.
    placeholders = ",".join(["%s"] * len(params))
    sql_insert = "insert into hebei_info (Date, New_Confirmed_num, New_Cured_num,New_Yisi_num,Confirmed_num,Dead_num,Zhong_num,Cured_num,Yisi_num,Miqie_num,None_Guan_num,Guan_num,Url) values(" + placeholders + ")"
    # Context manager closes the cursor on the way out.
    with db.cursor() as cur:
        # mogrify() renders the statement exactly as it will be sent.
        print(cur.mogrify(sql_insert, params))
        # Debug marker kept from the original output.
        print("AAAA")
        try:
            cur.execute(sql_insert, params)
            db.commit()
        except Exception as e:
            print("错误信息:%s" % str(e))
            # Undo the partial work of the failed statement.
            db.rollback()
if __name__ == "__main__":
    # Manual smoke test of the read helper: print the third column of every
    # row, then the first column of the first row.
    values = select_db()
    for value in values:
        print(value[2])

    # The rows fetched above are reused instead of querying a second time,
    # and an empty table no longer raises IndexError.
    if values:
        a = values[0][0]
        print("查询结果:%s" %str(a))

    # The write helpers (delete_db, update_db, insert_city, insert_province)
    # are not exercised here; they all require arguments.

  数据库中已爬取数据的详情:

 

 

 

posted @ 2020-02-13 16:52  生活依旧  阅读(107)  评论(0)  编辑  收藏  举报