个人作业——CVPR顶会论文爬取

main.py

# Scrape and store the BibTeX entries from a single conference listing page.
def getInfo(url):
     """Fetch one openaccess.thecvf.com conference page, parse every BibTeX
     snippet on it, and insert papers not already present into the MySQL
     `paper` table.

     Parameters
     ----------
     url : str
          Full URL of a conference listing page,
          e.g. 'https://openaccess.thecvf.com/WACV2021'.
     """
     # NOTE(review): the original file never imports its dependencies;
     # import locally so this function is self-contained.
     import requests
     import pymysql
     from bs4 import BeautifulSoup

     header = {
          'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36'
     }
     html = requests.get(url=url, headers=header).text
     soup = BeautifulSoup(html, 'lxml')
     dl = soup.find('dl')
     print(dl.find('dt'))
     entries = dl.find_all(class_='bibref pre-white-space')

     # SECURITY: credentials were hard-coded in the original source (and are
     # therefore public) -- rotate this password and move the settings to a
     # config file / environment variables. Values kept to preserve behavior.
     # One connection for the whole page instead of one per entry.
     conn = pymysql.connect(host='39.106.103.180', port=3306, user='root',
                            password='E6B3628525e4', database='user')
     cursor = conn.cursor()
     try:
          for entry in entries:
               print('-' * 106)
               print(entry.text)
               fields = _parse_bibtex(entry.text)
               print(fields)
               if len(fields) < 8:
                    # Malformed/unexpected entry: not enough fields to insert.
                    continue
               # Parameterized queries: the original concatenated raw strings
               # into SQL (injection-prone, broke on quotes and needed a
               # hand-rolled ' -> '' escape that corrupted the stored text).
               cursor.execute("select * from paper where title=%s",
                              (fields[3],))
               # Insert only when the title is not already present exactly once
               # (same rowcount test as the original code).
               if cursor.rowcount != 1:
                    cursor.execute(
                         "insert into paper(author,title,booktitle,month,year,pages)"
                         " values(%s,%s,%s,%s,%s,%s)",
                         (fields[2], fields[3], fields[4], fields[5],
                          fields[6], fields[7]))
               conn.commit()
     finally:
          cursor.close()
          conn.close()


def _parse_bibtex(raw):
     """Crude text-munging of one '@InProceedings{...}' snippet into a list.

     Replicates the original replace-chain exactly (same order: 'title' must
     be erased before 'book' so that 'booktitle' collapses away), then splits
     on the ',,' separators it introduced. Index layout expected by the
     caller: [2]=author, [3]=title, [4]=booktitle, [5]=month, [6]=year,
     [7]=pages.
     """
     text = raw.strip('@InProceedings{,}')
     text = text.replace('=', '')
     text = text.replace('{', '')
     text = text.replace('}', ',')
     text = text.replace('author', ',')
     text = text.replace('title', '')
     text = text.replace('book', '')
     text = text.replace('month', '')
     text = text.replace('year', '')
     text = text.replace('pages', '')
     text = text.replace('\n', '')
     text = text.replace('    ', '')
     text = ',,' + text + ','
     return [part.strip(' ') for part in text.split(',,')]

# --- entry point: walk the conference menu page and scrape every listing ---
# NOTE(review): the original file was missing all of its imports; added here.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://openaccess.thecvf.com/menu'
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36'
}
html = requests.get(url=url, headers=headers).text
print(html)
soup = BeautifulSoup(html, 'lxml')
dds = soup.find_all('dd')
for dd in dds:
    print('-' * 118)
    # dd.text is the conference title; each <a> underneath is one listing
    # (main conference / workshops) whose href points at its page.
    for d in dd.find_all('a'):
        # BUG FIX: the original used str.strip('menu') / strip('/') /
        # strip('.py ') to build the URL. strip() removes a *character set*
        # from both ends, not a prefix/suffix, so e.g. a page name ending in
        # 'p' or 'y' would be over-stripped. urljoin resolves the href
        # correctly, and the '.py' suffix is cut only when actually present.
        link = urljoin(url, d['href'])
        if link.endswith('.py'):
            link = link[:-len('.py')]
        print(link)
        getInfo(link)

注意：数据库 paper 表的 author、title、booktitle 等字段需设置足够的长度（如 VARCHAR 长度或 TEXT 类型），否则过长的作者列表或论文标题在插入时会被截断或直接报错。

posted @ 2021-06-10 19:04  天岁  阅读(81)  评论(0编辑  收藏  举报