python爬取网页

复制代码
import requests
from bs4 import BeautifulSoup
import re
import pymysql

# Conference day to scrape. It drives both the request URL and the value
# stored in the `date` column, so the two can never drift apart.
CONFERENCE_DAY = '2020-06-18'
BASE_URL = 'https://openaccess.thecvf.com/CVPR2020'
DEFAULT_DATE = int(CONFERENCE_DAY.replace('-', ''))  # e.g. 20200618

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

# One match per paper entry on the listing page. re.S lets `.*?` span the
# line breaks between the title, the pdf link and the BibTeX fields.
PAPER_PATTERN = re.compile(
    r'<dt class="ptitle"><br>.*?\.html">(?P<name>.*?)</a></dt>.*?'
    r'\[<a href="(?P<pdf>.*?)">pdf</a>\].*?'
    r'author = \{(?P<author>.*?)\},<br>.*?'
    r'title = \{(?P<title>.*?)\},<br>.*?'
    r'booktitle = \{(?P<booktitle>.*?)\},<br>',
    re.S,
)

INSERT_SQL = 'INSERT INTO test1(`name`, pdf, author, title, booktitle, `date`) values(%s,%s,%s,%s,%s,%s)'


def fetch_html(day=CONFERENCE_DAY, timeout=REQUEST_TIMEOUT):
    """Download the CVPR 2020 listing page for *day* and return its HTML text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(BASE_URL + '?day=' + day, timeout=timeout)
    try:
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        return response.text
    finally:
        response.close()


def parse_papers(html, date=DEFAULT_DATE):
    """Extract paper records from the listing-page HTML.

    Args:
        html: Page source as text.
        date: Value stored in the last column of every returned row.

    Returns:
        A list of ``(name, pdf, author, title, booktitle, date)`` tuples in
        page order; empty when nothing matches.
    """
    return [
        (
            match.group('name'),
            match.group('pdf'),
            match.group('author'),
            match.group('title'),
            match.group('booktitle'),
            date,
        )
        for match in PAPER_PATTERN.finditer(html)
    ]


def save_papers(rows):
    """Insert *rows* into the ``test1`` table, committing after each row.

    A row that fails to insert is reported and skipped so that the remaining
    rows are still stored. The cursor and connection are always released,
    even when an unexpected error propagates.
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or environment variables before sharing this script.
    conn = pymysql.connect(host='localhost', user='root', password='123456', database='py', charset='utf8', port=3306)
    try:
        with conn.cursor() as cursor:
            for row in rows:
                try:
                    cursor.execute(INSERT_SQL, row)
                    conn.commit()
                except pymysql.MySQLError as e:
                    # Best effort: drop the failed statement and carry on.
                    conn.rollback()
                    print(e)
    finally:
        conn.close()


def main():
    """Fetch the listing page, parse it, and store every paper found."""
    html = fetch_html()
    save_papers(parse_papers(html))
    print('over!!!')


if __name__ == '__main__':
    main()
复制代码

 

posted @ zrswheart  阅读(56)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
点击右上角即可分享
微信分享提示