# 正则表达式 (regular-expression exercises)
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# 1. Validate an e-mail address with a regular expression.
# Raw strings are used for every pattern below: '\w' / '\d' / '\_' in plain
# strings are invalid escapes and raise warnings on modern Python.
r = r'^(\w)+(\.\w+)*@(\w)+((\.\w{2,3}){1,3})$'
e = '286980036@qq.com'
m = re.match(r, e)  # match once instead of twice
if m:
    print(m.group(0))
else:
    print('这不是一个合法的邮箱。')

# 2. Find every phone number (area code - local number).
r2 = r'(\d{3,4})-(\d{6,8})'
e2 = '''版权所有:广州商学院 地址:广州市黄埔区九龙大道206号 学校办公室:020-82876130 招生电话:020-82872773 粤公网安备 44011602000060号 粤ICP备15103669号'''
rec = re.findall(r2, e2)
if rec:
    for i in rec:
        print(i)
else:
    print('error')

# 3. English word segmentation: split on whitespace and punctuation.
e3 = '''Chinese President *** has warned against a "Cold War mentality" as he vowed to open up parts of the country's economy.'''
print(re.split(r'[\s,.?\-]+', e3))

# 4. Extract the news id (last path component before ".html") from the URL.
url = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'
res = re.search(r'_(.*)\.html', url).group(1).split('/')[1]
print(res)

# 5./6. Build the click-count request URL and fetch the click count.
ress = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(res))
ress.encoding = 'utf-8'
text = ress.text
# The count is the last short run of digits in the returned JS snippet.
ct = re.findall(r'\d{1,4}', text)[-1]
print(ct)


# 7. Wrap steps 4-6 into a function.
def getClickCount(newsUrl):
    """Return the click count (as a str) for one news detail-page URL.

    Fix: the original accidentally nested a second ``def getClickCount`` inside
    the first, so the function body was never executed and every call
    returned ``None``.
    """
    news_id = re.search(r'_(.*)\.html', newsUrl).group(1).split('/')[1]
    resp = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id))
    resp.encoding = 'utf-8'
    return re.findall(r'\d{1,4}', resp.text)[-1]
# 9. Extract every news item from one list page, wrapped as a function.
def getListPage(pageUrl):
    """Fetch one news list page and print, for each article:
    title, date string, URL, then the parsed detail-page metadata
    (publish datetime, author, reviewer, source, photographer, click count).
    """
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        # Only <li> elements that actually carry a news title are articles.
        if len(news.select('.news-list-title')) > 0:
            d = news.select('.news-list-info')[0].contents[0].text
            t = news.select('.news-list-title')[0].text
            href = news.select('a')[0].attrs['href']
            clicknum = getClickCount(href)
            print(t, d, href)
            resd = requests.get(href)
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            infod = soupd.select('.show-info')[0].text
            # NOTE: str.lstrip('发布时间:') strips a *character set*, not a
            # prefix, and can eat leading characters of the value itself.
            # Slice past each label instead (labels are 3 chars + ':' ...
            # '发布时间:' is 5 chars total, '作者:'/'来源:'/'摄影:' are 3).
            dt = infod[infod.find('发布时间:') + 5:][:19]
            ts = soupd.select('#content')[0].text
            dati = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
            ther = infod[infod.find('作者:') + 3:infod.find('审核:')].split()[0]
            check = infod[infod.find('审核:') + 3:infod.find('来源')]
            source = infod[infod.find('来源:') + 3:].split()[0]
            photo = infod[infod.find('摄影:') + 3:].split()[0]
            print(dati, ther, check, source, photo, clicknum)


# 10. Get the total number of news articles (to derive the page count).
def getPageN():
    """Return the total article count (int) read from the pager of the
    first list page.

    # TODO(review): the per-page article count is not visible here; divide
    # the returned total by it at the call site to get the page count.
    """
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    news = soup.select('#pages')[0].select('a')[0].text.rstrip('条')
    print(news)
    return int(news)


getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/')