
1. 用正则表达式判定邮箱是否输入正确。

2. 用正则表达式识别出全部电话号码。

3. 用正则表达式进行英文分词,例如 re.split(r'\W+', news)(空模式 '' 无法按单词切分)。

import re
import requests
def checkEmail(Email):
    """Return True when ``Email`` looks like a well-formed e-mail address.

    The check is purely syntactic: one or more local-part characters, an
    ``@``, and a domain made of at least two dot-separated labels. It does
    not verify that the mailbox or the domain actually exists.

    Args:
        Email: the candidate address as a string.

    Returns:
        bool: True if the whole string matches the pattern, False otherwise.
    """
    # Raw string so the regex escapes are not interpreted by Python itself.
    eroll = r'[A-Za-z0-9_.+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+'
    # fullmatch anchors both ends, so trailing garbage makes the check fail.
    return re.fullmatch(eroll, Email) is not None

# Sample e-mail address; presumably the input intended for checkEmail.
Email = '123google@gmail.com'

def chenkPhonenum(phonenum):
    """Return every landline-style phone number found in ``phonenum``.

    A number is a 3-4 digit area code, a hyphen, and 6-8 further digits
    (for example ``020-82876130``).

    Args:
        phonenum: arbitrary text to scan.

    Returns:
        list[str]: the matched numbers in order of appearance; an empty
        list when the text contains none.
    """
    # Raw string keeps ``\d`` as a regex escape rather than a string escape.
    proll = r'\d{3,4}-\d{6,8}'
    return re.findall(proll, phonenum)

def getClickCount(newUrl):
    """Extract the news id from an article URL.

    Despite its name, this function performs no request and returns no
    click count: it only pulls the id out of the URL text.

    Args:
        newUrl: article URL, e.g.
            ``http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html``.

    Returns:
        str: the last ``/``-separated piece of the text found between the
        first ``_`` and the final ``.html`` (``'9183'`` for the example).

    Raises:
        ValueError: if the URL contains no ``_...html`` section.
    """
    # Raw string with an escaped dot: only a literal ".html" suffix matches.
    found = re.search(r'_(.*)\.html', newUrl)
    if found is None:
        raise ValueError('no news id found in URL: {!r}'.format(newUrl))
    return found.group(1).split('/')[-1]

# Sample text containing two hyphenated phone numbers; it shares its name with
# chenkPhonenum's parameter, so it is presumably that function's intended input.
phonenum = '''版权所有:广州商学院 地址:广州市黄埔区九龙大道206号
学校办公室:020-82876130 招生电话:020-82872773
粤公网安备 44011602000060号    粤ICP备15103669号'''

# Sample English passage; presumably the input for the word-splitting exercise.
article = '''The constant presence of a mobile phone has a "brain drain" 
effect that significantly reduces people's intelligence and attention spans, 
a study has found. 
Researchers at the University of Texas discovered that people are worse at 
conducting tasks and remembering information if they have a smartphone within
eye shot. In two experiments they found phones sitting on a desk or even in
a pocket or handbag would distract users and lead to worse test scores even
when it was set up not to disturb test subjects. '''



4. 使用正则表达式取得新闻编号

5. 生成点击次数的Request URL

6. 获取点击次数

7. 将第4、5、6步定义成一个函数 def getClickCount(newsUrl):

8. 将获取新闻详情的代码定义成一个函数 def getNewDetail(newsUrl):

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

def getClickCount(newUrl):
    """Extract the news id from an article URL.

    Despite its name, this function performs no request and returns no
    click count: it only pulls the id out of the URL text.

    Args:
        newUrl: article URL, e.g.
            ``http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html``.

    Returns:
        str: the last ``/``-separated piece of the text found between the
        first ``_`` and the final ``.html`` (``'9183'`` for the example).

    Raises:
        ValueError: if the URL contains no ``_...html`` section.
    """
    # Raw string with an escaped dot: only a literal ".html" suffix matches.
    found = re.search(r'_(.*)\.html', newUrl)
    if found is None:
        raise ValueError('no news id found in URL: {!r}'.format(newUrl))
    return found.group(1).split('/')[-1]

def _extract_field(about, label, default='null'):
    """Return the whitespace-delimited token of ``about`` starting at ``label``, or ``default``."""
    start = about.find(label)
    if start < 0:
        return default
    return about[start:].split()[0]


def g(a1):
    """Fetch one news article page and print its metadata and click count.

    Args:
        a1: URL of the article page.

    Returns:
        None. The title, publication time, source, auditor, author,
        photographer and click count are written to stdout.

    Raises:
        IndexError: if the page lacks a ``.show-title``, ``#content`` or
            ``.show-info`` element.
        ValueError: if the first 19 characters after the time label are not
            a ``YYYY-MM-DD HH:MM:SS`` timestamp.
    """
    res1 = requests.get(a1)
    res1.encoding = 'utf-8'
    soup1 = BeautifulSoup(res1.text, 'html.parser')
    title = soup1.select(".show-title")[0].text
    # The body text is read (and must exist) but is not part of the printed line.
    content = soup1.select("#content")[0].text
    about = soup1.select('.show-info')[0].text

    # lstrip removes a *set* of characters, not a prefix; that is harmless
    # here because the timestamp that follows starts with a digit.
    published = about.lstrip('发布时间:')[:19]
    published = datetime.strptime(published, '%Y-%m-%d %H:%M:%S')

    # Every optional field falls back to 'null' so a missing label can no
    # longer leave a name unbound when the line is printed.
    origin = _extract_field(about, '来源:')
    writer = _extract_field(about, '作者:')
    audit = _extract_field(about, '审核:')
    photograph = _extract_field(about, '摄影:')

    # The id parsed from the article URL is substituted into the count endpoint.
    cUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(getClickCount(a1))
    res = requests.get(cUrl)
    # NOTE(review): presumably the response ends with something like
    # ".html('123');" - the text after the last ".html" is stripped of the
    # surrounding quote/paren/semicolon characters. Confirm against the API.
    clicks = res.text.split(".html")[-1].lstrip("('").rstrip("');")
    print("标题:" + title, "发布时间:", published, origin, audit, writer, photograph,
          "点击次数:" + clicks)

# Download the campus-news listing page and print the details of each article
# linked from it.
url = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
resurl = requests.get(url)
resurl.encoding = 'utf-8'
soup = BeautifulSoup(resurl.text, 'html.parser')
a = soup.select('li')

for news in a:
    # Only <li> items that contain a .news-list-title element are processed.
    if len(news.select('.news-list-title')) > 0:
        a1 = news.select('a')[0].attrs['href']
        # Previously the URL was extracted and then dropped; hand it to the
        # detail printer so the loop actually produces output.
        g(a1)


posted @ 2018-04-11 00:58  088陈志鸿  阅读(162)  评论(0)  编辑  收藏  举报