Personal Assignment: Crawling Data and Working with It

I picked up web scraping on the fly during class and wrote a script that tries to crawl the Weibo hot-search list and save the results to a txt file.

Crawler code:

import requests
# import os
from bs4 import BeautifulSoup

cookies = {
    'PC_TOKEN': '460f44babc',
    'SUB': '_2AkMVITIXf8NxqwJRmPAUyGvgb4R_yArEieKjfcPMJRMxHRl-yT8XqmAstRB6PqEc-zaoslPVckFYC5nECECC2Gh_Bt8z',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFVkyBaH9Q3UvYbjxH2Mg80',
    '_s_tentry': 'passport.weibo.com',
    'Apache': '8055727688366.35.1652407589169',
    'SINAGLOBAL': '8055727688366.35.1652407589169',
    'ULV': '1652407589186:1:1:1:8055727688366.35.1652407589169:',
}

headers = {
    'authority': 's.weibo.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    # Requests sorts cookies= alphabetically
    # 'cookie': 'PC_TOKEN=460f44babc; SUB=_2AkMVITIXf8NxqwJRmPAUyGvgb4R_yArEieKjfcPMJRMxHRl-yT8XqmAstRB6PqEc-zaoslPVckFYC5nECECC2Gh_Bt8z; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFVkyBaH9Q3UvYbjxH2Mg80; _s_tentry=passport.weibo.com; Apache=8055727688366.35.1652407589169; SINAGLOBAL=8055727688366.35.1652407589169; ULV=1652407589186:1:1:1:8055727688366.35.1652407589169:',
    'referer': 'https://passport.weibo.com/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39',
}

params = {
    'cate': 'realtimehot',
}



# Fetch the hot-search page; the params, cookies and headers above were copied from the browser request
response = requests.get('https://s.weibo.com/top/summary', params=params, cookies=cookies, headers=headers)

response.encoding = 'utf-8'
# print(response.text)

# Parse the page; this CSS selector targets the <a> tags that hold the hot-search titles
content = "#pl_top_realtimehot > table > tbody > tr > td.td-02 > a"
main_page = BeautifulSoup(response.text, 'html.parser')


# Extracting data with BeautifulSoup: the first argument is the tag name, attrs filters by attributes
# find() returns the first match, find_all() returns every match
# e.g. main_page.find("div", attrs={"class": "TypeList"})

# Clean the data: keep only the text of each matched link
a = main_page.select(content)
# print(a)
for i in range(len(a)):
    a[i] = a[i].text
    print(a[i])
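
The description above mentions saving the results to a txt file, while the snippet only prints them. A minimal sketch of that last step, assuming it runs right after the loop above (the filename weibo_hot.txt is an illustrative choice, not from the original code):

# Hypothetical continuation: write the extracted hot-search titles to a txt file, one per line
with open("weibo_hot.txt", "w", encoding="utf-8") as f:
    for title in a:  # after the loop above, `a` holds plain strings
        f.write(title + "\n")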

Writing to the database:
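
The loader below reads five files in parallel; each file holds one field per line, and lines at the same index belong to the same paper (href, author, article title, abstract, year). Each set of lines is combined into a Paper object and inserted via paperMapper, a data-access mapper whose definition is not shown here.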

// Each file holds one field per line; lines at the same index describe the same paper.
BufferedReader hrefBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestHref.json"));
BufferedReader authorBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAuthor.json"));
BufferedReader articleBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestArticle.json"));
BufferedReader abstractBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAbstract.json"));
BufferedReader yearBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestYear.json"));

String lineHref = null;
String lineAuthor = null;
String lineArticle = null;
String lineAbstract = null; // read only inside the loop, so the first abstract is not skipped
String lineYear = null;

// Read the five files line by line in lockstep and insert one Paper record per line.
while ((lineHref = hrefBufferedReader.readLine()) != null) {
    lineAbstract = abstractBufferedReader.readLine();
    lineAuthor = authorBufferedReader.readLine();
    lineArticle = articleBufferedReader.readLine();
    lineYear = yearBufferedReader.readLine();
    Paper paper = new Paper();
    paper.setHref(lineHref);
    paper.setAuthor(lineAuthor);
    paper.setArticle(lineArticle);
    paper.setPaperAbstract(lineAbstract);
    paper.setYear(lineYear);
    paperMapper.insert(paper);
}

hrefBufferedReader.close();
authorBufferedReader.close();
articleBufferedReader.close();
abstractBufferedReader.close();
yearBufferedReader.close();
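
For completeness, a hedged sketch of how the Python side could produce those aligned per-line files; the list names (hrefs, authors, articles, abstracts, years) are placeholders for whatever the scraper actually collected and do not appear in the original code:

# Placeholder lists; in the real scraper these would hold the crawled fields.
hrefs, authors, articles, abstracts, years = [], [], [], [], []

# Hypothetical writer: all five lists must have the same length so line i of every
# file refers to the same paper, which is what the Java loader above relies on.
fields = {
    "newTestHref.json": hrefs,
    "newTestAuthor.json": authors,
    "newTestArticle.json": articles,
    "newTestAbstract.json": abstracts,
    "newTestYear.json": years,
}
for filename, values in fields.items():
    with open(filename, "w", encoding="utf-8") as f:
        for value in values:
            # keep each record on exactly one line so readLine() stays in sync
            f.write(str(value).replace("\n", " ") + "\n")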


posted @ 2022-06-01 13:09  闫闫不是那个严