Second Individual Assignment of the Sophomore Spring Semester: Phase 1

Today I mainly finished crawling the CVPR papers, covering the three years 2020, 2019 and 2018, and made some modifications to the code.

import time
import traceback

import requests
from bs4 import BeautifulSoup


def get_tencent_data():
    """
    Scrape CVPR papers from openaccess.thecvf.com and insert
    title, authors, abstract, page URL and year into the paper_data table.
    (The function name is left over from an earlier scraper.)
    """
    url_CVPR_2020 = 'https://openaccess.thecvf.com/CVPR2020'
    url_CVPR_2019 = 'https://openaccess.thecvf.com/CVPR2019'
    url_CVPR_2018 = 'https://openaccess.thecvf.com/CVPR2018'

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    # CVPR 2018 (2019 and 2020 are scraped the same way)
    res_2018 = requests.get(url_CVPR_2018, headers=headers)
    soup_2018 = BeautifulSoup(res_2018.text, "html.parser")
    context_2018 = soup_2018.select("a")
    # the last three <a> tags on the index page link to the three conference days
    url_child_2018 = []
    url_child_2018.append(context_2018[len(context_2018) - 1]['href'])
    url_child_2018.append(context_2018[len(context_2018) - 2]['href'])
    url_child_2018.append(context_2018[len(context_2018) - 3]['href'])

    conn, cursor = get_conn()  # database helpers defined elsewhere in the project
    for day in range(3):
        # per-day listing page: strip the trailing "CVPR2018" and append the day link
        url_CVPR_2018_1 = url_CVPR_2018[0:len(url_CVPR_2018) - 8] + url_child_2018[day]
        res_2018_1 = requests.get(url_CVPR_2018_1, headers=headers)
        soup_2018_1 = BeautifulSoup(res_2018_1.text, "html.parser")
        context_2018_1 = soup_2018_1.find_all("dt")
        # collect the detail-page URL of every paper listed for this day
        url_CVPR_2018_1_pager = []
        for dt in context_2018_1:
            url_CVPR_2018_1_pager.append("https://openaccess.thecvf.com/" + dt.select("a")[0]['href'])
        print(f"{time.asctime()} start inserting data")
        try:
            for paper_url in url_CVPR_2018_1_pager:
                print(paper_url)
                res_2018_1_paper = requests.get(paper_url, headers=headers)
                soup_2018_1_paper = BeautifulSoup(res_2018_1_paper.text, "html.parser")
                title_2018_1_paper = soup_2018_1_paper.find("div", {"id": "papertitle"}).text
                authors_2018_1_paper = soup_2018_1_paper.find("div", {"id": "authors"}).find("b").find("i").text
                abstract_2018_1_paper = soup_2018_1_paper.find("div", {"id": "abstract"}).text
                sql = "insert into paper_data values(%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql,
                               [title_2018_1_paper, authors_2018_1_paper, abstract_2018_1_paper,
                                "", paper_url, "2018"])
            conn.commit()  # commit the inserts for this day
        except Exception:
            traceback.print_exc()
    close_conn(conn, cursor)
    print(f"{time.asctime()} finished inserting data")

 

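The code above only shows the CVPR 2018 branch; the 2019 and 2020 pages were handled with near-identical copies. A rough sketch of how the per-year logic could be folded into one parameterized function follows (this is a refactor suggestion, not the exact code used for the assignment; it reuses the hypothetical get_conn/close_conn helpers above and assumes the same page structure for all three years).

import requests
from bs4 import BeautifulSoup

def scrape_cvpr_year(year, conn, cursor):
    """Scrape one CVPR year from openaccess.thecvf.com into paper_data."""
    base = "https://openaccess.thecvf.com/"
    headers = {'user-agent': 'Mozilla/5.0'}
    index = BeautifulSoup(requests.get(base + f"CVPR{year}", headers=headers).text,
                          "html.parser")
    day_links = [a['href'] for a in index.select("a")[-3:]]  # three day pages, as above
    for day in day_links:
        listing = BeautifulSoup(requests.get(base + day, headers=headers).text,
                                "html.parser")
        for dt in listing.find_all("dt"):
            paper_url = base + dt.select("a")[0]['href']
            paper = BeautifulSoup(requests.get(paper_url, headers=headers).text,
                                  "html.parser")
            cursor.execute("insert into paper_data values(%s,%s,%s,%s,%s,%s)",
                           [paper.find("div", {"id": "papertitle"}).text,
                            paper.find("div", {"id": "authors"}).find("b").find("i").text,
                            paper.find("div", {"id": "abstract"}).text,
                            "", paper_url, str(year)])
        conn.commit()  # commit after each day, matching the original code

# one shared connection for all three years
conn, cursor = get_conn()
for year in (2018, 2019, 2020):
    scrape_cvpr_year(year, conn, cursor)
close_conn(conn, cursor)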