Fetch All Campus News

1. Wrap the scraping of all news items on a single list page into a function.

2. Get the total number of news articles and work out how many list pages there are (see the arithmetic sketch after this list).

3. Fetch the full details of every news item across all list pages.
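
The page count in step 2 is plain ceiling division; the 10-items-per-page figure is carried over from the script's own paging logic below. A minimal sketch of the arithmetic:

def page_count(total_items, per_page=10):
    # Ceiling division: any remainder means one extra page
    return (total_items + per_page - 1) // per_page

print(page_count(257))  # 26
print(page_count(260))  # 26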

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import locale
import re


def getClickCount(newsUrl):
    # The article id is the last 4 characters of the URL path, e.g. ..._0404/9183.html -> 9183
    newsid = re.search(r'_(.*)\.html', newsUrl).group(1)[-4:]
    clicktimesurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
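    # Assumed response shape: the count API returns a JS snippet such as
    #   $('#hits').html('813');
    # so split('.html(')[-1] yields "'813');", and stripping the quote and ');'
    # leaves the bare number parsed below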
    clicktimes = int(requests.get(clicktimesurl).text.split(".html(")[-1].lstrip("'").rstrip("');"))
    return clicktimes


def getNewsDetail(newsUrl):
    resdet = requests.get(newsUrl)
    resdet.encoding = 'utf-8'
    soupdet = BeautifulSoup(resdet.text, 'html.parser')
    contentdetail = soupdet.select('#content')[0].text
    showinfo = soupdet.select('.show-info')[0].text
    # Pull the publication timestamp out of the info line by matching the date pattern directly
    date = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', showinfo).group()
    if showinfo.find('作者') > 0:
        author = re.search(r'作者:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
    else:
        author = 'none'
    if showinfo.find('审核') > 0:
        checker = re.search(r'审核:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
    else:
        checker = 'none'
    if showinfo.find('来源') > 0:
        # Non-greedy match that stops at the next field label (来/摄/点)
        source = re.search(r'来源:(.*?)\s*(来|摄|点)', showinfo).group(1)
    else:
        source = 'none'
    if showinfo.find('摄影') > 0:
        photographer = re.search(r'摄影:(.*?)\s*点', showinfo).group(1)
    else:
        photographer = 'none'
    clicktimes = getClickCount(newsUrl)
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    print("Published: {0}  Author: {1}  Checker: {2}  Source: {3}  Photographer: {4}  Clicks: {5}".format(
        dateTime, author, checker, source, photographer, clicktimes))
    print(contentdetail)


def getListDetail(ListPageUrl):
    resl = requests.get(ListPageUrl)
    resl.encoding = 'utf-8'
    soupl = BeautifulSoup(resl.text, 'html.parser')
    for news in soupl.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text
            description = news.select('.news-list-description')[0].text
            info = news.select('.news-list-info')[0].text
            address = news.select('a')[0]['href']
            print("\n标题: {0}\n描述: {1}\n信息: {2}\n链接: {3}".format(title, description, info, address))
            getNewsDetail(address)


locale.setlocale(locale.LC_CTYPE, 'chinese')  # 'chinese' is a Windows locale name
Listurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
res = requests.get(Listurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# Total article count, e.g. "257条" -> 257; each list page holds 10 items
ListCount = int(soup.select('.a1')[0].text.rstrip('条'))
if (ListCount % 10 > 0):
    pagecount = ListCount // 10 + 1
else:
    pagecount = ListCount // 10
for i in range(1, pagecount + 1):
    # Page 1 is the bare index page; later pages are named 2.html, 3.html, ...
    if i == 1:
        ListPageUrl = Listurl
    else:
        ListPageUrl = Listurl + '{}.html'.format(i)
    getListDetail(ListPageUrl)
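
The field extraction above leans on the show-info line keeping a fixed field order (发布时间 / 作者 / 审核 / 来源 / 摄影 / 点击). A quick shape check on a made-up sample line (hypothetical string, for illustration only):

import re

sample = '发布时间:2018-04-04 11:12:13  作者:张三  审核:李四  来源:校办  摄影:王五  点击:'
print(re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', sample).group())  # 2018-04-04 11:12:13
print(re.search(r'来源:(.*?)\s*(来|摄|点)', sample).group(1))  # 校办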

 

4. Pick a topic you are personally interested in, scrape its data, and run a word-segmentation analysis. The topic must not duplicate another student's.
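
The script below depends on jieba for segmentation; F1 names such as 维斯塔潘 are not in jieba's default dictionary, so they are registered with add_word before cutting. A minimal sketch of the effect (hypothetical sentence; the default segmentation may vary with jieba version and dictionary):

import jieba

sentence = '维斯塔潘超越了维特尔'  # hypothetical sample
print(list(jieba.cut(sentence)))  # without add_word the names may come out in fragments
jieba.add_word('维斯塔潘')
jieba.add_word('维特尔')
print(list(jieba.cut(sentence)))  # now each driver name is a single token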

import requests
from bs4 import BeautifulSoup
import jieba


def getnewsdetail(newsurl):
    resd = requests.get(newsurl)
    resd.encoding = 'gbk'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    paragraphs = soupd.select('.text')
    content = ''
    for p in paragraphs:
        content += p.text + '\n'
    # Image-only news items have no .text paragraphs; skip the word analysis for those
    if paragraphs:
        print(content + "\nWord frequency:")
        # Stopwords: punctuation and common function words to drop from the tally
        # (representative set; tune as needed for the corpus)
        delword = [',', '。', '、', ':', ';', '“', '”', '(', ')', '《', '》', '?', '!',
                   '的', '了', '在', '是', '和', '也', '我们', '-', '.', ' ', '', '\n']
        wordDict = {}
        newscontent = list(jieba.cut(content))
        wordset = set(newscontent) - set(delword)
        for i in wordset:
            wordDict[i] = newscontent.count(i)
        sort = sorted(wordDict.items(), key=lambda item: item[1], reverse=True)
        # Print at most the 20 most frequent words; slicing avoids an IndexError on short articles
        for item in sort[:20]:
            print(item)
    else:
        print('Image-only news item, skipped')


def getnewslist(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    for newsList in soup.select('.newslist')[0].select('li'):
        title = newsList.select('a')[0].text
        publishtime = newsList.select('.pub_time')[0].text
        address = newsList.select('a')[0]['href']
        print('\nTitle: {0}\nPublished: {1}\nLink: {2}\n'.format(title, publishtime, address))
        getnewsdetail(address)


# Register custom vocabulary (F1 driver and team names) so jieba keeps them whole
jieba.add_word('维斯塔潘')
jieba.add_word('维特尔')
jieba.add_word("范多恩")
jieba.add_word("加斯利")
jieba.add_word("托斯特")
jieba.add_word("小红牛")
jieba.add_word("大红牛")
jieba.add_word("库比卡")
jieba.add_word("马格努森")
jieba.add_word("倍耐力")

url = "http://sports.qq.com/l/f1/allf1news/list20100311191657.htm"
getnewslist(url)
for i in range(1, 101):
    if (i == 1):
        getnewslist(url)
    else:
        newsurl = "http://sports.qq.com/l/f1/allf1news/list20100311191657_{}.htm".format(i)
        getnewslist(newsurl)
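
As a design note, the counting loop above rescans the whole token list once per distinct word (list.count), which is quadratic on long articles; collections.Counter does the same tally in a single pass. A minimal sketch with hypothetical sample text and stopwords:

from collections import Counter

import jieba

text = '维斯塔潘和维特尔在排位赛中相撞'  # hypothetical sample text
stopwords = {'和', '在', '中'}
words = [w for w in jieba.cut(text) if w not in stopwords]
print(Counter(words).most_common(20))  # top 20 words by frequency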

 
