Fetching All Campus News

Crawl the news pages of the Sun Yat-sen University official site:

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re


# Fetch the brief info shown on a news list page
def crawlOnePage(url):
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    cont = soup.select('li')
    for i in cont:
        links = i.select('a')
        if not links:  # skip <li> items that carry no link
            continue
        print()
        print('News URL: ' + 'http://news2.sysu.edu.cn/news01/' + links[0]['href'])
        # detailUrl = 'http://news2.sysu.edu.cn/news01/' + links[0]['href']
        print('News title: ' + links[0].text)
        # getDetail(detailUrl)
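
The two commented-out lines above show how each list item could be handed to getDetail. A minimal sketch of that wiring (hypothetical, reusing the imports at the top; getDetail is defined below):

def crawlOnePageWithDetail(url):
    # Walk a list page and fetch each article's detail on the spot
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for i in soup.select('li'):
        links = i.select('a')
        if links:
            getDetail('http://news2.sysu.edu.cn/news01/' + links[0]['href'])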




# Fetch the detailed information of a single news article
def getDetail(url):
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The third <p> carries the metadata line, with fields separated by '|'
    cont = soup.select('p')[2].text.split('|')
    # Date: split on whitespace (the original split('') raises ValueError)
    times = cont[4].split()[1]
    # Source
    source = cont[0]
    # Author
    author = cont[1]
    # Editor
    editor = cont[3]
    # Convert the date string into a datetime (no stray trailing space in the format)
    release_time = datetime.strptime(times, '%Y-%m-%d')
    print(source, author, editor, release_time)
    # The last <p> holds the article body
    content = soup.select('p')[-1].text
    print(content)
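
To make the indices above concrete, here is a hypothetical metadata line (the real page's wording and field order may differ) and how split('|') carves it up:

# Hypothetical metadata paragraph, NOT copied from the site:
meta = 'Source: News Center|Author: A. Zhang|Photo: B. Li|Editor: C. Wang|Date: 2018-04-10'
parts = meta.split('|')
# parts[0] -> source, parts[1] -> author, parts[3] -> editor
# parts[4].split()[1] -> '2018-04-10', which strptime('%Y-%m-%d') parses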


# Crawl the news from every list page
def getTotalPage(url):
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The first <strong> holds text like '1/123'; take the total after the '/'.
    # (lstrip('1/') strips characters, so it would also eat leading 1s from the count)
    n = int(soup.select('strong')[0].text.split('/')[1])
    for i in range(1, n):
        page = str(i)
        # NOTE: assumes continuation pages follow the index<N>.htm pattern
        geturl = 'http://news2.sysu.edu.cn/news01/index' + page + '.htm'
        crawlOnePage(geturl)



# Demo: crawl the first list page, one article's detail, then all list pages
crawlOnePage('http://news2.sysu.edu.cn/news01/index.htm')
getDetail('http://news2.sysu.edu.cn/news01/152940.htm')
getTotalPage('http://news2.sysu.edu.cn/news01/index.htm')
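
The calls above assume every request succeeds and every page decodes as UTF-8. A small fetch helper, sketched here as an assumption rather than part of the original script, would add a timeout and a status check:

def fetchPage(url):
    # Return the decoded HTML, or None on a network/HTTP error
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        res.encoding = 'UTF-8'
        return res.text
    except requests.RequestException:
        return None

Each crawl function could then start from fetchPage and simply skip pages that come back as None.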

Screenshot:

posted @ 2018-04-11 17:23  Hiro-D