Using regular expressions, fetching click counts, and extracting functions

1. Use a regular expression to check whether an email address is well-formed.

import re

def is_valid_email(addr):
    # use a raw string so \w and \. reach the regex engine intact
    if re.match(r'\w+(\.\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*', addr):
        return True
    return False
tmail = 'www.694602156@qq.com'
fmail = 'asdasdasdasdasd'
t = is_valid_email(tmail)
f = is_valid_email(fmail)
print(t, f)
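
Note that re.match only anchors at the start of the string, so an address followed by trailing garbage still passes. A stricter sketch using re.fullmatch (the function name is mine, not the assignment's):

import re

def is_valid_email_strict(addr):
    # fullmatch requires the entire string to match, not just a prefix
    return re.fullmatch(r'\w+(\.\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*', addr) is not None

print(is_valid_email_strict('694602156@qq.com'))  # True
print(is_valid_email_strict('a@b.com and junk'))  # False; re.match would accept this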

2. Use a regular expression to find all the phone numbers.

import re

# footer text from the school's site: copyright line with address,
# office / admissions phone numbers, and ICP filing numbers
a = '''版权所有:广州商学院 地址:广州市黄埔区九龙大道206号
学校办公室:020-82876130 招生电话:020-82872773
粤公网安备 44011602000060号    粤ICP备15103669号'''
print(re.findall(r'(\d{3,4})-(\d{6,8})', a))
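
Because the pattern contains two capture groups, re.findall returns (area code, number) tuples such as ('020', '82876130'). A sketch that keeps the full numbers instead, simply by dropping the groups:

import re

text = '学校办公室:020-82876130 招生电话:020-82872773'
# with no capture groups, findall returns the whole match
print(re.findall(r'\d{3,4}-\d{6,8}', text))
# ['020-82876130', '020-82872773']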

3. Split English text into words with a regular expression: re.split(r'\W+', news).

import re

a = '''Once upon a time, a few mistakes ago
I was in your sights, you got me alone
You found me, you found me'''
b = re.split(r'\W+', a)  # split on runs of non-word characters
print(b)
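
One caveat: when the text starts or ends with punctuation, re.split(r'\W+', ...) produces empty strings at the edges. A minimal sketch that filters them out (illustrative input):

import re

s = 'You found me, you found me.'  # trailing period
tokens = [t for t in re.split(r'\W+', s) if t]  # drop the empty edge strings
print(tokens)  # ['You', 'found', 'me', 'you', 'found', 'me']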

4. Use a regular expression to extract the news ID.

import re

url1 = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'
id1 = re.search(r'_(.*)\.html', url1).group(1).split('/')[-1]
print(id1)
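
The greedy (.*) captures everything between the first underscore and .html ('0404/9183' here), which is why the extra split('/')[-1] is needed. An alternative sketch that grabs the trailing digits directly (my pattern, not the assignment's):

import re

url1 = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'
# the run of digits just before the final .html is the news ID
print(re.search(r'(\d+)\.html$', url1).group(1))  # 9183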

5. Build the Request URL for the click count.

import re

url = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'
id1 = re.findall(r'_(.*)\.html', url)[0].split('/')[-1]
req1 = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id1)
print(req1)

6. Fetch the click count.

import re
import requests

url1 = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'
id1 = re.findall(r'_(.*)\.html', url1)[0].split('/')[-1]
rurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id1)
res = requests.get(rurl)
# the count sits after the last '.html' in the response, wrapped as ('384');
times = int(res.text.split('.html')[-1].lstrip("(')").rstrip("');"))
print(times)
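
The chained lstrip/rstrip implies the API answers with jQuery-style JavaScript whose last statement looks roughly like $('#hits').html('384'); (that response shape is inferred from the string operations above, not from any documented API). A regex-based sketch of the same parse that fails loudly if the shape ever changes:

import re
import requests

def parse_click_count(api_url):
    res = requests.get(api_url)
    # assumption: the last .html('<digits>') call in the response carries the count
    counts = re.findall(r"\.html\('(\d+)'\)", res.text)
    if not counts:
        raise ValueError('unexpected count-API response: %r' % res.text[:80])
    return int(counts[-1])

print(parse_click_count('http://oa.gzcc.cn/api.php?op=count&id=9183&modelid=80'))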

7. Combine steps 4-6 into a single function: def getClickCount(newsUrl):

import requests
import re

def getClickCount(url1):
    id1 = re.findall(r'_(.*)\.html', url1)[0].split('/')[-1]
    rurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id1)
    res = requests.get(rurl)
    times = int(res.text.split('.html')[-1].lstrip("(')").rstrip("');"))
    return times

time = getClickCount('http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html')  # note: shadows the stdlib time module
print(time)

8. Wrap the news-detail scraping code in a function: def getNewDetail(newsUrl):

import requests
from bs4 import BeautifulSoup

def getNewDetail(newsUrl):
    res = requests.get(newsUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            t = news.select('.news-list-title')[0].text
            d = news.select('.news-list-description')[0].text
            a = news.select('a')[0].attrs['href']
            print('Title:', t)
            print('Description:', d)
            print('Link:', a)

            resd = requests.get(a)
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            info = soupd.select('.show-info')[0].text
            # the metadata line uses Chinese field labels:
            # 发布时间 = publish time, 作者 = author, 审核 = reviewer,
            # 来源 = source, 摄影 = photographer
            laiyuan = info[info.find('来源:'):].split()[0].lstrip('来源:')
            shenhe = info[info.find('审核:'):].split()[0].lstrip('审核:')
            sheying = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
            print(info)
            # lstrip removes a set of characters, not a prefix; it works
            # here because info begins with the 发布时间: label
            time = info.lstrip('发布时间:')[:19]
            print('Time:', time)
            print(info[info.find('作者:'):info.find('审核:')])  # author field
            print('Source:', laiyuan)
            print('Reviewer:', shenhe)
            print(sheying)

getNewDetail('http://news.gzcc.cn/html/xiaoyuanxinwen/')
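
Despite its name, getNewDetail takes a list page and loops over every article on it. A hedged refactor sketch in which a helper parses a single article URL, matching the name's intent (the two-function split is my restructuring; the selector and field labels are copied from the code above):

import requests
from bs4 import BeautifulSoup

def parseOneDetail(articleUrl):
    # parse one article page; '.show-info' holds the metadata line
    resd = requests.get(articleUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    info = soupd.select('.show-info')[0].text
    return {
        'time': info.lstrip('发布时间:')[:19],
        'source': info[info.find('来源:'):].split()[0].lstrip('来源:'),
        'reviewer': info[info.find('审核:'):].split()[0].lstrip('审核:'),
    }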

9. Extract all the news items from one list page, wrapped in a function: def getListPage(pageUrl):

import requests
from bs4 import BeautifulSoup

def getListPage(pageUrl):
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            time1 = news.select('.news-list-info')[0].contents[0].text
            title1 = news.select('.news-list-title')[0].text
            description1 = news.select('.news-list-description')[0].text
            url1 = news.select('a')[0].attrs['href']
            print(time1, title1, description1, url1)

getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/')
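
If later steps need the scraped fields rather than console output, the same selectors can feed a list of dicts. A sketch (getListPage2 is a hypothetical variant, not part of the assignment):

import requests
from bs4 import BeautifulSoup

def getListPage2(pageUrl):
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    items = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            items.append({
                'time': news.select('.news-list-info')[0].contents[0].text,
                'title': news.select('.news-list-title')[0].text,
                'url': news.select('a')[0].attrs['href'],
            })
    return items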

10. Get the total number of news items and compute the total number of pages: def getPageN():

import requests
from bs4 import BeautifulSoup

def getPageN():
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # the first link under #pages shows the total item count, e.g. 518条;
    # strip the trailing 条 ("items") before converting to int
    n = int(soup.select('#pages')[0].select('a')[0].text.rstrip('条'))
    return n // 10 + 1  # 10 news items per list page

print(getPageN())
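
For example, if the #pages link reads 518条 (an illustrative number), then 518 // 10 + 1 = 52 pages: 51 full pages of 10 items plus a partial last page. The formula over-counts by one page when the total is an exact multiple of 10; ceiling division avoids that edge case:

def page_count(total_items, per_page=10):
    # ceiling division: correct even when total_items divides evenly
    return (total_items + per_page - 1) // per_page

print(page_count(518))  # 52
print(page_count(520))  # 52, whereas 520 // 10 + 1 would give 53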

11. Fetch every news item's details from all the list pages.

n = getPageN()

def getallxinwen():
    for i in range(1, n + 1):
        if i == 1:
            newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
        else:
            newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        getListPage(newsurl)

getallxinwen()
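
Putting the pieces together, a sketch of a top-level driver that walks every list page and prints each article's click count (it combines getPageN and getClickCount from above with the hypothetical dict-returning getListPage2 sketched in step 9; untested against the live site):

def crawl_all():
    for i in range(1, getPageN() + 1):
        if i == 1:
            pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
        else:
            pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        for item in getListPage2(pageUrl):  # getListPage2: sketch from step 9
            print(item['time'], item['title'], getClickCount(item['url']))

crawl_all()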

Output: (results screenshot)

posted @ 2018-04-10 17:50  196-陈文豪