# Crawl all campus news (爬取全部校园新闻)
import locale
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Windows-specific: make the C library character classification Chinese-aware.
# NOTE(review): a no-op / error on most non-Windows systems — confirm it is needed.
locale.setlocale(locale.LC_CTYPE, 'chinese')


def getClickCount(newsUrl):
    """Return the click count (as a string) for one news article URL.

    The count is not embedded in the article page; it is served by a separate
    counter API keyed on the numeric news id parsed out of the article URL.
    """
    # e.g. '.../2018/0404_9183.html' -> '9183' (raw string: '\_' is an invalid escape otherwise)
    newsId = re.findall(r'\_(.*).html', newsUrl)[0].split('/')[1]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    clickStr = requests.get(clickUrl).text
    # The API answers with a jQuery snippet like: $('#hits').html('123');
    return re.search(r"hits'\).html\('(.*)'\);", clickStr).group(1)


def getNewDetail(newsUrl):
    """Fetch one article page and print its title, URL, publish time, author and click count."""
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    print('标题:' + soupd.select('.show-title')[0].text)
    print('链接:' + newsUrl)
    info = soupd.select('.show-info')[0].text
    # The info line packs several fields separated by non-breaking spaces (\xa0).
    raw_time = re.search('发布时间:(.*) \xa0\xa0 \xa0\xa0作者:', info).group(1)
    dtime = datetime.strptime(raw_time, '%Y-%m-%d %H:%M:%S')
    print('发布时间:{}'.format(dtime))
    print('作者:' + re.search('作者:(.*)审核:', info).group(1))
    print('点击次数:' + getClickCount(newsUrl))


def getListPage(ListPageUrl):
    """Crawl one list page: visit every <li> entry that carries a news title."""
    res = requests.get(ListPageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        # Only items with a .news-list-title child are actual articles.
        if len(news.select('.news-list-title')) > 0:
            getNewDetail(news.a.attrs['href'])


def main():
    """Crawl the first list page, then pages 2..n-1 (break keeps the run short)."""
    firstUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    print('第1页:')
    getListPage(firstUrl)

    res = requests.get(firstUrl)
    res.encoding = 'utf-8'
    soupn = BeautifulSoup(res.text, 'html.parser')
    # '.a1' holds the total article count (e.g. '123条'); 10 articles per page.
    n = int(soupn.select('.a1')[0].text.rstrip('条')) // 10 + 1
    for i in range(2, n):
        pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        print('第{}页:'.format(i))
        getListPage(pageUrl)
        break  # deliberate crawl limiter: stop after one extra page


if __name__ == '__main__':
    main()
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch *url* and return the response body, or '' on any request failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only swallow network/HTTP errors,
        # keeping the original best-effort '' fallback for callers.
        return ""


def getContent(url):
    """Scrape one QQ news article: print it, save it to text.txt, build a dict."""
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")

    title = soup.select("div.hd > h1")
    print(title[0].get_text())
    time = soup.select("div.a_Info > span.a_time")
    print(time[0].string)
    author = soup.select("div.qq_articleFt > div.qq_toolWrap > div.qq_editor")
    print(author[0].get_text())
    paras = soup.select("div.Cnt-Main-Article-QQ > p.text")
    for para in paras:
        if len(para) > 0:
            print(para.get_text())
            print()

    # Persist the article; the context manager guarantees the file is closed
    # even on error, and an explicit encoding avoids platform-default codecs
    # that may not round-trip Chinese text.
    with open("text.txt", "w+", encoding="utf-8") as fo:
        fo.writelines(title[0].get_text() + "\n")
        fo.writelines(time[0].get_text() + "\n")
        for para in paras:
            if len(para) > 0:
                fo.writelines(para.get_text() + "\n\n")
        fo.writelines(author[0].get_text() + '\n')

    article = {
        'Title': title[0].get_text(),
        'Time': time[0].get_text(),
        'Paragraph': paras,
        'Author': author[0].get_text(),
    }
    print(article)


def main():
    url = "http://news.qq.com/a/20170504/012032.htm"
    getContent(url)


if __name__ == "__main__":
    main()