爬取校园新闻首页的新闻
"""Scrape the campus-news index page and the first article it links to.

The script downloads the index page at ``url``, collects the link target and
headline text of every item in the news list, then downloads the first linked
page and prints its body text, its info header, and every ``YYYY-MM-DD`` date
found in that header.
"""
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Seconds to wait on the server; without a timeout requests.get() can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

# add      -> link targets of the listed news items
# new_list -> headline text of the listed news items (parallel to ``add``)
# pa       -> body text blocks of the first article
# p_list is not used below; it is kept because it is a module-level name of
# the original script.
new_list, add, p_list, pa = [], [], [], []
url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

res = requests.get(url, timeout=REQUEST_TIMEOUT)
res.raise_for_status()  # fail loudly instead of parsing an HTTP error page
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

news = soup.select('div[class="list-container"] li a')
for link in news:
    # Read the attribute from the parsed tag rather than running a regex over
    # the serialized markup: the regex broke on anchors carrying any attribute
    # besides href and returned entity-escaped text.
    href = link.get('href')
    if not href:
        continue  # an anchor without a target cannot be followed
    add.append(href)
    new_list.append(link.get_text().strip())

if not add:
    # Previously this surfaced as a bare IndexError on add[0].
    raise SystemExit('no news links found on ' + url)

resd = requests.get(add[0], timeout=REQUEST_TIMEOUT)
resd.raise_for_status()
print(add)
print(new_list)

resd.encoding = 'utf-8'
soupd = BeautifulSoup(resd.text, 'html.parser')
passage = soupd.select('div[class="show-container"]')
title = soupd.select('div[class="show-info"]')

# Collect one body block per info block. zip() stops at the shorter list, so a
# page with fewer body blocks than info blocks no longer raises IndexError.
for body, _info in zip(passage, title):
    pa.append(body.get_text().strip())

print(pa)
print(title)

# Every YYYY-MM-DD looking token in the info header markup.
tm = re.findall(r'\d\d\d\d-\d\d-\d\d', str(title))
print(tm)

# NOTE: an earlier draft tried datetime.strftime(str(tm), '%Y-%m-%d'), which
# raises TypeError (strftime formats a datetime, it does not parse text). To
# get datetime objects, parse each entry of ``tm`` with
# datetime.strptime(entry, '%Y-%m-%d').