爬取校园新闻首页的新闻

1. 用requests库和BeautifulSoup库，爬取校园新闻首页新闻的标题、链接、正文。

2. 分析字符串，获取每篇新闻的发布时间、作者、来源、摄影等信息。

3. 将其中的发布时间由str转换成datetime类型。

4. 将完整的代码及运行结果截图发布在作业上。

import requests
from  datetime import  datetime
# Fetch the campus-news listing page and parse it; `test` (the <body> element)
# is what the per-article loop further down iterates over.
url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# Without a timeout requests waits indefinitely on an unresponsive server.
res = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page into nothing.
res.raise_for_status()
# Decode the body as UTF-8 regardless of what requests guessed from headers.
res.encoding = 'utf-8'
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text, 'html.parser')
test = soup.body

def _labelled_field(info, label):
    """Return the whitespace-delimited token of *info* starting at *label*.

    The token includes the label itself (e.g. '作者：张三'). An empty string is
    returned when *label* does not occur, so callers can skip the field
    instead of slicing from index -1 and emitting the last character of
    *info*.
    """
    start = info.find(label)
    if start == -1:
        return ''
    # info[start:] begins with the (non-blank) label, so split() is non-empty.
    return info[start:].split()[0]


# Click-counter endpoint; the article id is filled in per news item.
CLICK_COUNT_URL = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'

# Walk every <li> of the listing page; only those holding a news title are
# news entries. For each one, fetch the detail page and print its metadata.
for i in test.select('li'):
    title_nodes = i.select('.news-list-title')
    if not title_nodes:
        continue

    place = i.select('.news-list-info')[0].contents[1].text  # source / department
    title = title_nodes[0].text  # headline
    description = i.select('.news-list-description')[0].text  # summary
    detailurl = i.select('a')[0].attrs['href']  # link to the full article

    resurl = requests.get(detailurl, timeout=10)
    resurl.raise_for_status()
    resurl.encoding = 'utf-8'
    detailsoup = BeautifulSoup(resurl.text, 'html.parser')
    detail = detailsoup.select('#content')[0].text
    author = detailsoup.select('.show-info')[0].text

    # Take the 19 characters ('YYYY-MM-DD HH:MM:SS') that follow the label.
    # partition() removes the label as a prefix; str.lstrip() would strip a
    # *set of characters* and only worked by accident.
    publish_time = author.partition('发布时间:')[2].lstrip()[:19]
    # Convert to datetime; raises ValueError if the page layout changed.
    published_at = datetime.strptime(publish_time, '%Y-%m-%d %H:%M:%S')

    # NOTE(review): assumes the article id is the basename of the detail URL
    # (e.g. '.../9167.html' -> '9167') -- confirm against the site. Previously
    # a fixed id was queried, so every article showed the same click count.
    news_id = detailurl.rstrip('/').rsplit('/', 1)[-1].split('.')[0]
    click_count = None
    if news_id.isdigit():
        res2 = requests.get(CLICK_COUNT_URL.format(news_id), timeout=10)
        res2.encoding = 'UTF-8'
        # The endpoint answers with ';'-separated JS statements; the 4th one
        # is expected to look like $('#hits').html('N'), so the piece after
        # the first '.' is "html('N')" and [6:-2] leaves N.
        str3 = res2.text.split(';')[3].split('.')[1]
        click_count = str3[6:-2]

    print("时间：" + publish_time)
    print("来源：" + place)
    print("新闻标题：" + title)
    print("新闻描述：" + description)
    print("新闻链接：" + detailurl)
    author_field = _labelled_field(author, '作者：')
    if author_field:
        print("新闻" + author_field)
    # Many articles carry no photographer credit; skip the line when absent.
    photo_field = _labelled_field(author, '摄影：')
    if photo_field:
        print(photo_field)
    if click_count is not None:
        print("点击次数：" + click_count)
    print("新闻详情：" + detail)

posted @ 2018-04-03 17:39 207钟程泰 阅读(172) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

悦动

爬取校园新闻首页的新闻

公告