爬取校园新闻首页的新闻

import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime


# Scrape the campus-news index page, collect every article link and headline,
# then download the first article and print its body text and the date(s)
# found in its info bar.
new_list, add, p_list, pa = [], [], [], []
url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as news.
res = requests.get(url, timeout=10)
res.raise_for_status()
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
news = soup.select('div[class="list-container"] li a')
for link in news:
    # Read the attribute from the parsed tag instead of regex-matching the
    # serialized HTML: that breaks when <a> carries extra attributes and
    # yields entity-escaped URLs.
    href = link.get('href')
    if not href:
        # Anchor without a target; nothing to fetch for it.
        continue
    add.append(href)
    new_list.append(link.get_text().strip())

if not add:
    # Fail with an explicit message instead of an IndexError on add[0].
    raise SystemExit('no news links found on ' + url)

resd = requests.get(add[0], timeout=10)
resd.raise_for_status()
print(add)
print(new_list)
resd.encoding = 'utf-8'
soupd = BeautifulSoup(resd.text, 'html.parser')
passage = soupd.select('div[class="show-container"]')
title = soupd.select('div[class="show-info"]')
# Pair each body container with an info bar; zip() stops at the shorter
# selection, so a mismatch in counts can no longer index past the end.
for body, _info in zip(passage, title):
    pa.append(body.get_text().strip())
print(pa)
print(title)
# Pull every YYYY-MM-DD date out of the serialized info bar(s).
tm = re.findall(r'\d\d\d\d-\d\d-\d\d', str(title))
print(tm)
# NOTE: to convert these strings into datetime objects, call
# datetime.strptime(d, '%Y-%m-%d') on each element of tm; strftime formats
# a datetime into text and cannot parse.

  

 

posted on 2018-04-03 21:22  133饶敏  阅读(65)  评论(0)  编辑  收藏  举报

导航