# 爬虫大作业 (web-scraping final assignment)

import requests
from bs4 import BeautifulSoup

# Fetch one article page and locate the first <li> that contains the
# info bar, then print it as a smoke test of the page structure.
url = 'http://news.sise.edu.cn/cms/6145.html'
res = requests.get(url)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('li'):
    # BUG FIX: '.list-unstyled list-inline' treated 'list-inline' as a TAG
    # name under class list-unstyled and could never match, so the loop never
    # broke.  The info bar carries both classes on one element, which in CSS
    # is '.list-unstyled.list-inline' (no space).
    if news.select('.list-unstyled.list-inline'):
        break
print(news)
def writeNewsDetail(content):
    """Append *content* to News.txt (UTF-8) in the current directory."""
    # 'with' guarantees the file handle is closed even if write() raises,
    # unlike the original manual open()/close() pair.
    with open('News.txt', 'a', encoding='utf-8') as f:
        f.write(content)
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# 获取网址 (build the article URL / click count)

def getClickCount(newsUrl):
    """Return the click count for one news article.

    The article id is the last path segment between ``cms/`` and ``.html``
    in *newsUrl*; the count endpoint is built from that id.
    """
    # BUG FIX: the original pattern '\cms(.*).html' contains the invalid
    # regex escape '\c' (re raises "bad escape"), and split('/')[1] would
    # IndexError when the captured group has no slash.  Use a raw string,
    # escape the dot, and take the last segment.
    newsId = re.search(r'cms/(.*)\.html', newsUrl).group(1).split('/')[-1]
    # BUG FIX: the '{}' placeholder was never filled and the same malformed
    # URL was fetched three times; format once and fetch once.
    clickUrl = 'http://news.sise.edu.cn/cms/{}.html'.format(newsId)
    res = requests.get(clickUrl)
    res.encoding = 'utf-8'
    # The endpoint apparently responds with "...html>('<count>');" — strip
    # the wrapper and parse the integer.
    # NOTE(review): response format assumed from the original parsing code;
    # confirm against the live endpoint.
    return int(res.text.split('.html')[-1].lstrip("('").rstrip("');"))

# 新闻内容 (scrape one article's details)

def getNewsDetail(newsUrl):  # 一篇新闻的全部信息 (all fields of one article)
    """Scrape one news detail page and return its fields as a dict.

    Returned keys: 'title', 'content', 'newsUrl', and — when the info bar is
    present — 'dt' (datetime) and 'click' (raw text).
    """
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')  # parse the detail page

    news = {}
    news['title'] = soupd.select('.text-muted-5')[0].text
    # BUG FIX: '.list-unstyled list-inline' treated 'list-inline' as a tag
    # name and matched nothing; the info bar has both classes on one element.
    info = soupd.select('.list-unstyled.list-inline')
    if info:
        items = info[0].select('li')
        # BUG FIX: BeautifulSoup results have no .xpath(); the original
        # li[5]/li[6] (XPath, 1-indexed) become list indices 4 and 5.
        # NOTE(review): positions and the '%H:%M:%S %Y-%m-%d' format are
        # assumed from the original code — confirm against the live page.
        news['dt'] = datetime.strptime(items[4].text.strip(),
                                       '%H:%M:%S %Y-%m-%d')
        news['click'] = items[5].text
    news['content'] = soupd.select('.MsoNormal')[0].text.strip()
    # writeNewsDetail(news['content'])
    news['newsUrl'] = newsUrl
    return news

# 全部新闻列表 (all news on one list page)

def getListPage(pageUrl):  # 一个列表页的全部新闻 (all news on one list page)
    """Fetch one list page and return the detail dict of every news entry."""
    response = requests.get(pageUrl)
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')

    # Each real news entry is an <li> containing a .media-body element;
    # its first <a> carries the link to the detail page.
    return [
        getNewsDetail(entry.select('a')[0].attrs['href'])
        for entry in page.select('li')
        if len(entry.select('.media-body')) > 0
    ]

def getPageN():
    """Return the number of list pages, assuming 12 news items per page."""
    res = requests.get('http://news.sise.edu.cn/cms/news/2.html')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # BUG FIX: .rstrip('') was a no-op, so int() would crash if the element
    # text carries any non-digit suffix (e.g. a trailing '条').  Extract the
    # first run of digits instead.
    text = soup.select('.a1')[0].text
    n = int(re.search(r'\d+', text).group())
    # Ceiling-style division: any remainder adds one more page.
    return n // 12 + 1

# Crawl the first list page, then every remaining list page.
newsTotal = []
firstPageUrl = 'http://news.sise.edu.cn/cms/news/2.html'
newsTotal.extend(getListPage(firstPageUrl))

n = getPageN()
# BUG FIX: range(n, n+1) fetched only the very last page; the remaining list
# pages are 2..n (page 1 is firstPageUrl above).
for i in range(2, n + 1):
    listPageUrl = 'http://news.sise.edu.cn/cms/news/2/p/{}.html'.format(i)
    newsTotal.extend(getListPage(listPageUrl))

 

# posted on 2018-05-03 21:42  370蔡轩  阅读(179)  评论(0)  (blog footer, kept as a comment)