Individual Assignment: Crawling Data and Working with It
At the time I had only just learned web scraping in class, so I wrote a script that attempts to crawl the Weibo hot-search list and save the results to a txt file.
Code for crawling the data:
import requests
# import os
from bs4 import BeautifulSoup

cookies = {
    'PC_TOKEN': '460f44babc',
    'SUB': '_2AkMVITIXf8NxqwJRmPAUyGvgb4R_yArEieKjfcPMJRMxHRl-yT8XqmAstRB6PqEc-zaoslPVckFYC5nECECC2Gh_Bt8z',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFVkyBaH9Q3UvYbjxH2Mg80',
    '_s_tentry': 'passport.weibo.com',
    'Apache': '8055727688366.35.1652407589169',
    'SINAGLOBAL': '8055727688366.35.1652407589169',
    'ULV': '1652407589186:1:1:1:8055727688366.35.1652407589169:',
}

headers = {
    'authority': 's.weibo.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    # Requests sorts cookies= alphabetically
    # 'cookie': 'PC_TOKEN=460f44babc; SUB=_2AkMVITIXf8NxqwJRmPAUyGvgb4R_yArEieKjfcPMJRMxHRl-yT8XqmAstRB6PqEc-zaoslPVckFYC5nECECC2Gh_Bt8z; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFVkyBaH9Q3UvYbjxH2Mg80; _s_tentry=passport.weibo.com; Apache=8055727688366.35.1652407589169; SINAGLOBAL=8055727688366.35.1652407589169; ULV=1652407589186:1:1:1:8055727688366.35.1652407589169:',
    'referer': 'https://passport.weibo.com/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39',
}

params = {
    'cate': 'realtimehot',
}

# Fetch the page (cookies, headers and params were copied from the browser's dev tools;
# note to self: how exactly should these parameters be chosen?)
response = requests.get('https://s.weibo.com/top/summary', params=params, cookies=cookies, headers=headers)
response.encoding = 'utf-8'
# print(response.text)

# Parse the page
# CSS selector copied from the browser:
# #pl_top_realtimehot > table > tbody > tr > td.td-02 > a
content = "#pl_top_realtimehot > table > tbody > tr > td.td-02 > a"
main_page = BeautifulSoup(response.text, 'html.parser')

# Extracting data: the first argument is the tag name, attrs filters by attributes;
# find() returns the first match, find_all() returns all matches
# main_page.find("div", attrs={"class": "TypeList"})

# Clean the data: keep only the text of each hot-search link
a = main_page.select(content)
# print(a)
for i in range(0, len(a)):
    a[i] = a[i].text
    print(a[i])
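The code above only prints the hot-search entries to the console; the txt-file step mentioned at the start is not shown. A minimal sketch of that step, reusing the list a produced by the loop above (the file name hot_search.txt is a placeholder, not from the original code):

# Minimal sketch: write the cleaned hot-search titles to a txt file.
# Assumes the list `a` already holds the title strings from the loop above;
# hot_search.txt is a placeholder file name, not from the original code.
with open('hot_search.txt', 'w', encoding='utf-8') as f:
    for title in a:
        f.write(title + '\n')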
Writing to the database:
BufferedReader hrefBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestHref.json"));
BufferedReader authorBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAuthor.json"));
BufferedReader articleBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestArticle.json"));
BufferedReader abstractBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAbstract.json"));
BufferedReader yearBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestYear.json"));

String lineHref = null;
String lineAuthor = null;
String lineArticle = null;
String lineAbstract = null;
String lineYear = null;

// Each file stores one field per line; read the five files in lockstep,
// assemble a Paper record from each set of lines, and insert it via the mapper.
while ((lineHref = hrefBufferedReader.readLine()) != null) {
    lineAbstract = abstractBufferedReader.readLine();
    lineAuthor = authorBufferedReader.readLine();
    lineArticle = articleBufferedReader.readLine();
    lineYear = yearBufferedReader.readLine();

    Paper paper = new Paper();
    paper.setHref(lineHref);
    paper.setAuthor(lineAuthor);
    paper.setArticle(lineArticle);
    paper.setPaperAbstract(lineAbstract);
    paper.setYear(lineYear);
    paperMapper.insert(paper);
}

hrefBufferedReader.close();
authorBufferedReader.close();
articleBufferedReader.close();
abstractBufferedReader.close();
yearBufferedReader.close();
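The loader above assumes five parallel files with one field per line in matching order (despite the .json extension, they are read as plain line-oriented text). A minimal Python sketch of how such files could be written on the crawling side, assuming a hypothetical list papers of dicts holding the five fields (this structure is not part of the original code):

# Minimal sketch of producing the five parallel line-per-record files that the
# Java loader above expects. `papers` is a hypothetical list of dicts with the
# keys href/author/article/abstract/year; it does not come from the original code.
fields = {
    'newTestHref.json': 'href',
    'newTestAuthor.json': 'author',
    'newTestArticle.json': 'article',
    'newTestAbstract.json': 'abstract',
    'newTestYear.json': 'year',
}
for filename, key in fields.items():
    with open(filename, 'w', encoding='utf-8') as f:
        for paper in papers:
            # One field per line; strip newlines so each record stays on one line.
            f.write(str(paper.get(key, '')).replace('\n', ' ') + '\n')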