顶会热词统计1
Python 爬虫
main.py
"""Scrape CVPR 2020 paper metadata from the CVF open-access page into MySQL.

The script downloads one listing page, extracts every paper entry with a
regular expression and inserts one row per paper into table ``db1`` of the
``crewler`` database.
"""
import os
import re

import pymysql
import requests
from bs4 import BeautifulSoup  # noqa: F401 -- unused in this script; import kept as-is

# Listing page to scrape and the date value stored alongside every row.
URL = 'https://openaccess.thecvf.com/CVPR2020?day=2020-06-18'
PAPER_DATE = 20200618

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# One match per paper entry; re.S lets ``.*?`` span the line breaks that
# separate the title link, the pdf link and the bibtex fields.
PAPER_PATTERN = re.compile(
    r'<dt class="ptitle"><br>.*?.html">(?P<name>.*?)</a></dt>.*?'
    r'\[<a href="(?P<pdf>.*?)">pdf</a>].*?'
    r'author = {(?P<author>.*?)},<br>.*?'
    r'title = {(?P<title>.*?)},<br>.*?'
    r'booktitle = {(?P<booktitle>.*?)},<br>',
    re.S,
)

INSERT_SQL = (
    'INSERT INTO db1(`name`, pdf, author, title, booktitle, `date`) '
    'values(%s,%s,%s,%s,%s,%s)'
)


def fetch_html(url):
    """Return the body of *url* as text.

    Raises:
        requests.RequestException: on connection failure, timeout or a
            non-2xx HTTP status.
    """
    with requests.get(url, timeout=REQUEST_TIMEOUT) as response:
        # Fail loudly on 4xx/5xx rather than parsing an error page into 0 rows.
        response.raise_for_status()
        return response.text


def connect():
    """Open the MySQL connection used for storing the scraped rows."""
    # NOTE(review): a credential lives in source control. It can now be
    # overridden through CREWLER_DB_PASSWORD; the literal stays as the default
    # only so existing setups keep working -- consider rotating and removing it.
    password = os.environ.get('CREWLER_DB_PASSWORD', '15132949306@')
    return pymysql.connect(
        host='localhost',
        user='root',
        password=password,
        database='crewler',
        charset='utf8',
        port=3306,
    )


def store_papers(conn, matches):
    """Insert one row per regex match, committing after each successful insert.

    A row that the database rejects is reported on stdout and skipped so that
    one bad entry does not abort the remaining inserts.
    """
    with conn.cursor() as cursor:
        for match in matches:
            row = (
                match.group('name'),
                match.group('pdf'),
                match.group('author'),
                match.group('title'),
                match.group('booktitle'),
                PAPER_DATE,
            )
            try:
                cursor.execute(INSERT_SQL, row)
                conn.commit()
            except pymysql.MySQLError as exc:
                print(exc)


def main():
    """Download the listing page and store every paper found on it."""
    html = fetch_html(URL)
    conn = connect()
    try:
        store_papers(conn, PAPER_PATTERN.finditer(html))
    finally:
        # Always release the connection, even if an insert loop error escapes.
        conn.close()
    print('over!!!')


if __name__ == '__main__':
    main()
数据库结构