Personal Assignment 2

Personal assignment: scrape data and operate on it

 

I picked up web scraping on the fly in class and wrote a script that tries to scrape the Weibo hot-search list and save it to a txt file; the script below prints the entries, and the file-writing step is sketched after the code block.

import requests
# import os
from bs4 import BeautifulSoup

cookies = {
    'PC_TOKEN': '460f44babc',
    'SUB': '_2AkMVITIXf8NxqwJRmPAUyGvgb4R_yArEieKjfcPMJRMxHRl-yT8XqmAstRB6PqEc-zaoslPVckFYC5nECECC2Gh_Bt8z',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFVkyBaH9Q3UvYbjxH2Mg80',
    '_s_tentry': 'passport.weibo.com',
    'Apache': '8055727688366.35.1652407589169',
    'SINAGLOBAL': '8055727688366.35.1652407589169',
    'ULV': '1652407589186:1:1:1:8055727688366.35.1652407589169:',
}

headers = {
    'authority': 's.weibo.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    # Requests sorts cookies= alphabetically
    # 'cookie': 'PC_TOKEN=460f44babc; SUB=_2AkMVITIXf8NxqwJRmPAUyGvgb4R_yArEieKjfcPMJRMxHRl-yT8XqmAstRB6PqEc-zaoslPVckFYC5nECECC2Gh_Bt8z; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFVkyBaH9Q3UvYbjxH2Mg80; _s_tentry=passport.weibo.com; Apache=8055727688366.35.1652407589169; SINAGLOBAL=8055727688366.35.1652407589169; ULV=1652407589186:1:1:1:8055727688366.35.1652407589169:',
    'referer': 'https://passport.weibo.com/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39',
}

params = {
    'cate': 'realtimehot',
}



# Fetch the page; the params/cookies/headers above were copied from a logged-in browser request
response = requests.get('https://s.weibo.com/top/summary', params=params, cookies=cookies, headers=headers)

response.encoding = 'utf-8'
# print(response.text)

# Parse the page
# Selector for the hot-search links: #pl_top_realtimehot > table > tbody > tr > td.td-02 > a
content = "#pl_top_realtimehot > table > tbody > tr > td.td-02 > a"
main_page = BeautifulSoup(response.text, 'html.parser')


# Extract data: the first argument is the tag name, attrs gives the attributes
# find() returns the first match, find_all() returns all matches
# main_page.find("div", attrs={"class": "TypeList"})

# Clean the data: keep only the text of each matched <a> tag
a = main_page.select(content)
# print(a)
for i in range(len(a)):
    a[i] = a[i].text
    print(a[i])
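
The script above only prints the titles. A minimal sketch of the txt-file step mentioned earlier, reusing the list `a` built above (the filename is an assumption, not from the original code):

# Sketch: save the cleaned hot-search titles to a txt file.
# The filename 'weibo_hot.txt' is an assumption.
with open('weibo_hot.txt', 'w', encoding='utf-8') as f:
    for title in a:  # 'a' is the list of title strings built above
        f.write(title + '\n')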

Code for scraping the data:

import requests
from bs4 import BeautifulSoup
import re
import pymysql

url = 'https://openaccess.thecvf.com/CVPR2020?day=2020-06-18'
response = requests.get(url)

obj1 = re.compile(r'<dt class="ptitle"><br>.*?.html">(?P<name>.*?)</a></dt>.*?'
                  r'\[<a href="(?P<pdf>.*?)">pdf</a>].*?'
                  r'author = {(?P<author>.*?)},<br>.*?'
                  r'title = {(?P<title>.*?)},<br>.*?'
                  r'booktitle = {(?P<booktitle>.*?)},<br>', re.S)

result = obj1.finditer(response.text)

# Connect to the database
conn = pymysql.connect(host='localhost', user='root', password='123456', database='exercise', charset='utf8', port=3306)
# Create a cursor object
cursor = conn.cursor()
sql = 'INSERT INTO cvpr(`name`, pdf, author, title, booktitle, `date`) values(%s,%s,%s,%s,%s,%s)'

for it in result:
    try:
        data = [it.group('name'), it.group('pdf'), it.group('author'),
                it.group('title'), it.group('booktitle'), 20200618]
        cursor.execute(sql, data)
        conn.commit()
    except Exception as e:
        print(e)


response.close()

# Close the cursor
cursor.close()
# Close the connection
conn.close()

print('over!!!')
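
The INSERT above assumes a `cvpr` table already exists in the `exercise` database. The post does not show the schema; a minimal sketch of a table that would accept those six fields (column types are assumptions):

import pymysql

# Minimal table sketch matching the INSERT statement; column types are assumptions.
ddl = """
CREATE TABLE IF NOT EXISTS cvpr (
    id INT AUTO_INCREMENT PRIMARY KEY,
    `name` VARCHAR(255),
    pdf VARCHAR(512),
    author TEXT,
    title VARCHAR(512),
    booktitle VARCHAR(255),
    `date` INT
)
"""

conn = pymysql.connect(host='localhost', user='root', password='123456',
                       database='exercise', charset='utf8', port=3306)
cursor = conn.cursor()
cursor.execute(ddl)
conn.commit()
cursor.close()
conn.close()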

Writing to the database (Java, reading the scraped fields from line-per-record files):

// Requires java.io.BufferedReader and java.io.FileReader; Paper and paperMapper
// are defined elsewhere in the project.
// Open the five parallel export files (one value per line per record).
BufferedReader hrefBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestHref.json"));
BufferedReader authorBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAuthor.json"));
BufferedReader articleBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestArticle.json"));
BufferedReader abstractBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAbstract.json"));
BufferedReader yearBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestYear.json"));

String lineHref = null;
String lineAuthor = null;
String lineArticle = null;
String lineAbstract = null;
String lineYear = null;

// Line N of every file belongs to paper N, so read the files in lockstep.
while ((lineHref = hrefBufferedReader.readLine()) != null) {
    lineAbstract = abstractBufferedReader.readLine();
    lineAuthor = authorBufferedReader.readLine();
    lineArticle = articleBufferedReader.readLine();
    lineYear = yearBufferedReader.readLine();
    Paper paper = new Paper();
    paper.setHref(lineHref);
    paper.setAuthor(lineAuthor);
    paper.setArticle(lineArticle);
    paper.setPaperAbstract(lineAbstract);
    paper.setYear(lineYear);
    paperMapper.insert(paper);
}

// Release the file handles.
hrefBufferedReader.close();
authorBufferedReader.close();
articleBufferedReader.close();
abstractBufferedReader.close();
yearBufferedReader.close();
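
The Java loop above expects five parallel files with one value per line, where line N of every file describes paper N. The Python side that exports these files is not shown in the post; a minimal sketch of what it could look like (the record contents here are placeholders):

# Hypothetical export step: write one field per file, one record per line.
# The records list is a placeholder; real values would come from the scraper.
records = [
    {'href': 'https://example.com/paper1', 'author': 'A. Author',
     'article': 'Some Paper Title', 'abstract': 'An abstract...', 'year': '2020'},
]

fields = {
    'newTestHref.json': 'href',
    'newTestAuthor.json': 'author',
    'newTestArticle.json': 'article',
    'newTestAbstract.json': 'abstract',
    'newTestYear.json': 'year',
}

for filename, key in fields.items():
    with open(filename, 'w', encoding='utf-8') as f:
        for rec in records:
            f.write(rec[key] + '\n')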