大二下学期第二次个人作业第一阶段
今日主要完成了CVPR部分的论文的爬取,一共爬取了2020,2019,2018三年的内容,并对代码做了些修改。
def get_tencent_data(year="2018"):
    """Crawl CVPR open-access paper metadata and insert it into the DB.

    NOTE(review): the name is kept for backward compatibility, but this
    function does not fetch Tencent data — it scrapes the CVPR open-access
    index at openaccess.thecvf.com.

    For the given conference year it follows the last three links on the
    index page (the per-day listing pages), then for every paper page it
    extracts title, authors and abstract and inserts one row into the
    ``paper_data`` table via ``get_conn()`` / ``close_conn()``.

    :param year: CVPR year to crawl, e.g. "2018" (default preserves the
                 original hard-coded behavior).
    :return: None — results are written to the database as a side effect.
    """
    base_url = 'https://openaccess.thecvf.com/'
    index_url = base_url + 'CVPR' + year
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }

    # BUG FIX: the original called requests.get(url, headers), which passes
    # the dict positionally as query params; headers must be a keyword arg.
    index_res = requests.get(index_url, headers=headers)
    # Explicit parser avoids the bs4 "no parser specified" warning and
    # parser-dependent output.
    index_soup = BeautifulSoup(index_res.text, "html.parser")
    anchors = index_soup.select("a")
    # The last three <a> tags on the index page are the per-day listing pages.
    day_hrefs = [anchors[-k]['href'] for k in (1, 2, 3)]

    conn, cursor = get_conn()
    print(f"{time.asctime()}开始插入数据")
    try:
        for day_href in day_hrefs:
            day_url = base_url + day_href
            day_res = requests.get(day_url, headers=headers)
            day_soup = BeautifulSoup(day_res.text, "html.parser")
            # Each <dt> holds one paper entry; its first <a> links to the
            # paper's detail page.
            paper_urls = [
                base_url + entry.select("a")[0]['href']
                for entry in day_soup.find_all("dt")
            ]
            for paper_url in paper_urls:
                print(paper_url)
                paper_res = requests.get(paper_url, headers=headers)
                paper_soup = BeautifulSoup(paper_res.text, "html.parser")
                title = paper_soup.find("div", {"id": "papertitle"}).text
                authors = paper_soup.find("div", {"id": "authors"}).find("b").find("i").text
                abstract = paper_soup.find("div", {"id": "abstract"}).text
                sql = "insert into paper_data values(%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql, [title, authors, abstract, "", paper_url, year])
            conn.commit()  # 提交事务 update delete insert操作
    except Exception:
        # Log the failure but don't crash the caller (best-effort crawl,
        # matching the original behavior).
        traceback.print_exc()
    finally:
        # BUG FIX: the original closed the connection only on error,
        # leaking it on the success path.
        close_conn(conn, cursor)
    print(f"{time.asctime()}插入数据完毕")