个人作业2-6.4-Python爬取顶会信息
1、个人作业2
数据爬取阶段
import requests
from lxml import etree
import pymysql
def _open_db():
    """Open a fresh MySQL connection with the settings this script uses."""
    return pymysql.connect(host="127.0.0.1", user="root", password="lin0613",
                           database="users")


def getdata(url):
    """Crawl one CVPR open-access listing page and store every paper on it.

    For each paper linked from the listing page, the paper's detail page is
    fetched and its title, authors, abstract and PDF link are extracted.
    Every title word accepted by ``checkword`` is written through
    ``save_keywords``; the paper itself is written through ``save``.
    Detail pages missing a title, abstract or author node are skipped
    instead of aborting the whole crawl with an ``IndexError``.

    Args:
        url: URL of the listing page (one conference day).

    Returns:
        None. The collected titles, links, authors, abstracts and PDF links
        are printed once the page has been processed.
    """
    # Fetch the listing page.
    page_text = requests.get(url).text
    parser = etree.HTMLParser(encoding="utf-8")
    tree = etree.HTML(page_text, parser=parser)

    # Links to the individual paper pages.
    hrefs = tree.xpath('//dt[@class="ptitle"]/a/@href')
    print(len(hrefs))

    # Per-paper information accumulated for the summary printout.
    titles = []
    pdfs = []
    abstracts = []
    authors = []
    keywords = []
    for href in hrefs:
        href = "https://openaccess.thecvf.com/" + href
        page_text = requests.get(href).text
        tree_link = etree.HTML(page_text, parser=parser)

        # Extract everything first so nothing is written for a broken page.
        title = tree_link.xpath('/html/body/div/dl/dd/div[@id="papertitle"]/text()')
        pdf = tree_link.xpath('/html/body/div/dl/dd/a[contains(text(),"pdf")]/@href')
        abstract = tree_link.xpath('/html/body/div/dl/dd/div[@id="abstract"]/text()')
        author = tree_link.xpath('/html/body/div/dl/dd/div/b/i/text()')
        if not (title and abstract and author):
            print("skip (missing fields): " + href)
            continue

        title[0] = title[0].strip()
        # `titles` keeps the stripped title including colons; the colon-free
        # form below is what is split into keywords and stored.
        titles += title
        title[0] = title[0].replace(":", "")

        keyword = ""
        for word in title[0].split():
            if checkword(word):
                # The connection handed to save_keywords() is closed there,
                # so each call needs its own connection.
                save_keywords(_open_db(), word)
                keyword += word + " "
        keywords.append(keyword)

        # The PDF link is only collected for printing; tolerate its absence.
        if pdf:
            pdf[0] = pdf[0].replace("../../", "https://openaccess.thecvf.com/")
        pdfs += pdf

        abstract[0] = abstract[0].strip()
        abstracts += abstract
        authors += author

        # Open the connection only now, so a failed fetch or parse above
        # cannot leave an unused connection behind; save() closes it.
        save(_open_db(), title[0], author[0], abstract[0], href, keyword)

    print(titles)
    print(hrefs)
    print(authors)
    print(abstracts)
    print(pdfs)
def save(db, title, author, abstract, link, keyword):
    """Insert one paper into the ``papers`` table, then close the connection.

    The values are passed to the driver as query parameters rather than
    being interpolated into the SQL text, so quotes inside a title or
    abstract can no longer break the statement.

    Args:
        db: Open database connection; it is always closed before returning.
        title: Paper title.
        author: Author string.
        abstract: Abstract text.
        link: URL of the paper's page.
        keyword: Keywords derived from the title.
    """
    cursor = db.cursor()
    sql = ("INSERT INTO papers(title, authors, abstract_text, original_link, keywords) "
           "VALUES (%s, %s, %s, %s, %s)")
    try:
        cursor.execute(sql, (title, author, abstract, link, keyword))
        print("true")
        db.commit()
    except Exception:
        print("error wenzhang")
        # Undo the partial transaction on failure.
        db.rollback()
    finally:
        # Release the connection even if the rollback itself fails.
        db.close()
def save_keywords(db, keyword):
    """Insert one keyword into the ``keywords`` table, then close the connection.

    The keyword is passed to the driver as a query parameter rather than
    being interpolated into the SQL text, so a word containing a quote
    (e.g. "What's") can no longer break the statement.

    Args:
        db: Open database connection; it is always closed before returning.
        keyword: The keyword to store.
    """
    cursor = db.cursor()
    sql = "INSERT INTO keywords(keyword) VALUES (%s)"
    try:
        cursor.execute(sql, (keyword,))
        print("true")
        db.commit()
    except Exception:
        print("error word")
        # Undo the partial transaction on failure.
        db.rollback()
    finally:
        # Release the connection even if the rollback itself fails.
        db.close()
# Lower-cased stop words that are not kept as title keywords. Built once at
# import time; a frozenset gives constant-time membership tests.
_INVALID_WORDS = frozenset({
    "the", "a", "an", "and", "by", "of", "in", "on", "is", "to", "as",
    "from", "for", "with", "that", "have", "upon", "about", "above",
    "across", "among", "ahead", "after", "although", "at", "also", "along",
    "around", "always", "away", "any", "up", "under", "until", "before",
    "between", "beyond", "behind", "because", "what", "when", "would",
    "could", "who", "whom", "whose", "which", "where", "why", "without",
    "whether", "down", "during", "despite", "over", "off", "only", "other",
    "out", "than", "then", "through", "throughout", "these", "this",
    "those", "there", "therefore", "some", "such", "since", "so", "can",
    "many", "much", "more", "may", "might", "must", "ever", "even", "every",
    "each",
})


def checkword(word):
    """Return True if *word* should be kept as a keyword.

    The comparison is case-insensitive: the word is lower-cased and
    rejected when it appears in the stop-word set.

    Args:
        word: A single whitespace-delimited token from a paper title.

    Returns:
        False for stop words, True for every other token.
    """
    return word.lower() not in _INVALID_WORDS
if __name__ == '__main__':
    # Conference-day listing pages to crawl, in order. Commented-out entries
    # are currently disabled.
    day_pages = (
        # "https://openaccess.thecvf.com/CVPR2018?day=2018-06-20",
        "https://openaccess.thecvf.com/CVPR2018?day=2018-06-21",
        "https://openaccess.thecvf.com/CVPR2019?day=2019-06-18",
        # "https://openaccess.thecvf.com/CVPR2019?day=2019-06-19",
        # "https://openaccess.thecvf.com/CVPR2019?day=2019-06-20",
        "https://openaccess.thecvf.com/CVPR2020?day=2020-06-16",
        # "https://openaccess.thecvf.com/CVPR2020?day=2020-06-17",
        # "https://openaccess.thecvf.com/CVPR2020?day=2020-06-18",
        # "https://openaccess.thecvf.com/CVPR2018?day=2018-06-19",
    )
    for day_page in day_pages:
        getdata(day_page)
__EOF__

本文作者:往心。
本文链接:https://www.cnblogs.com/lx06/p/14907379.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角【推荐】一下。您的鼓励是博主的最大动力!
本文链接:https://www.cnblogs.com/lx06/p/14907379.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角【推荐】一下。您的鼓励是博主的最大动力!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构
· AI与.NET技术实操系列(六):基于图像分类模型对图像进行分类