1. Use a regular expression to check whether an email address is entered correctly.
import re

r = r'^(\w)+(\.\w+)*@(\w)+((\.\w{2,3}){1,3})$'  # local part, @, domain, 1-3 dot-separated suffix labels
e = '23456521@qq.com'
m = re.match(r, e)  # match once and reuse the result
if m:
    print(m.group(0))
else:
    print('error')
2. Use a regular expression to find all phone numbers.
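There is no standalone snippet for this step; the pattern only appears inside the AlltitleAndUrl code under step 11. A minimal sketch using the same area-code/number pattern, run on a made-up sample string:

import re

text = 'Office: 020-85215535, fax 020-85215536'  # hypothetical sample text
print(re.findall(r'(\d{3,4})-(\d{6,8})', text))
# -> [('020', '85215535'), ('020', '85215536')]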
9. Extract all the news from one news list page, wrapped in a function def getListPage(pageUrl):
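The function itself is never written out below (steps 9 and 11 share the AlltitleAndUrl code), so this is only a minimal sketch reusing the same selectors:

import requests
from bs4 import BeautifulSoup

def getListPage(pageUrl):
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text           # title
            href = news.select('a')[0]['href']                        # link
            time = news.select('div .news-list-info > span')[0].text  # time
            print(time, title, href)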
11. Get the news details from all the news list pages.
import re
import requests
from bs4 import BeautifulSoup

def AlltitleAndUrl(url):
    reslist = requests.get(url)
    reslist.encoding = 'utf-8'
    soup_list = BeautifulSoup(reslist.text, 'html.parser')
    head = 'http://news.gzcc.cn/html/tongzhigonggao/'
    tail = '.html'

    # First list page
    for news in soup_list.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text              # title
            company = news.select('div .news-list-info > span')[1].text  # publishing unit
            href = news.select('a')[0]['href']                           # link
            time = news.select('div .news-list-info > span')[0].text     # time
            print('\n')
            print("Title: " + title)
            print("Link: " + href)
            print("Published by: " + company)
            print("Published at: " + time)

    # Match all phone numbers in the page footer
    for a in soup_list.select('#footer'):
        print(re.findall(r'(\d{3,4})-(\d{6,8})', a.select('div.container')[0].text))

    # Remaining list pages
    for i in range(2, 95):
        nexturl = '%s%s%s' % (head, i, tail)
        newcontent = requests.get(nexturl)
        newcontent.encoding = 'utf-8'
        soup_alllist = BeautifulSoup(newcontent.text, 'html.parser')
        for news in soup_alllist.select('li'):
            if len(news.select('.news-list-title')) > 0:
                title = news.select('.news-list-title')[0].text
                company = news.select('div .news-list-info > span')[1].text
                href = news.select('a')[0]['href']
                time = news.select('div .news-list-info > span')[0].text
                print('\n')
                print("Title: " + title)
                print("Link: " + href)
                print("Published by: " + company)
                print("Published at: " + time)
3. Use a regular expression to split English text into words: re.split('', news)
import re

news = '''Process finished with exit code'''
word = re.split(r'[\s,.?\-]+', news)  # split on runs of whitespace and punctuation
print(word)  # -> ['Process', 'finished', 'with', 'exit', 'code']
4. Use a regular expression to extract the news ID.
5. Build the Request URL for the click count.
import re

newsUrl = 'http://news.gzcc.cn/html/2017/xiaoyuanxinwen_095/8249.html'
num = re.search(r'_(.*).html', newsUrl).group(1)
print(num)  # -> 095/8249
newsId = re.search(r'_(.*).html', newsUrl).group(1).split('/')[-1]  # -> 8249
res = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)  # the id placeholder was missing
print(res)
6. Get the click count.
import requests

HitUrl = 'http://oa.gzcc.cn/api.php?op=count&id=9183&modelid=80'
# The endpoint returns a JavaScript snippet; the count sits between .html(' and ');
hitNumber = requests.get(HitUrl).text.split('.html')[-1].lstrip("('").rstrip("');")
print("Click count:", hitNumber)
7. Combine steps 4, 5, and 6 into a single function def getClickCount(newsUrl):
import re
import requests

def getClickCount(newsUrl):
    # Step 4: news ID (the original hardcoded id=9183 and an undefined url variable)
    newsId = re.search(r'_(.*).html', newsUrl).group(1).split('/')[-1]
    print('News ID:', newsId)
    # Step 5: request URL for the click count
    hitUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    # Step 6: click count
    hitNumber = requests.get(hitUrl).text.split('.html')[-1].lstrip("('").rstrip("');")
    print('Click count:', hitNumber)
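A quick usage check against the article URL from step 4 (the printed count depends on the live site):

getClickCount('http://news.gzcc.cn/html/2017/xiaoyuanxinwen_095/8249.html')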
8. Wrap the code that fetches the news details into a function def getNewDetail(newsUrl):
import requests
from bs4 import BeautifulSoup

def getNewDetail(url):
    # Walk one list page and fetch the full details of each article on it
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            title = news.select('.news-list-title')[0].text              # list-page title
            description = news.select('.news-list-description')[0].text  # list-page description
            info = news.select('.news-list-info')[0].text                # list-page info block
            href = news.select('a')[0]['href']                           # article link

            # Fetch the article page itself
            detail = requests.get(href)
            detail.encoding = 'utf-8'
            soup_detail = BeautifulSoup(detail.text, 'html.parser')
            newinfo = soup_detail.select('.show-info')[0].text  # article metadata line
            content = soup_detail.select('#content')[0].text    # article body
            date = newinfo.split()[0]         # date
            time = newinfo.split()[1]         # time of day
            author = newinfo.split()[2]       # author
            checker = newinfo.split()[3]      # reviewer
            source = newinfo.split()[4]       # source
            photography = newinfo.split()[5]  # photographer

            print("Title: " + title)
            print("\nDescription: " + description)
            print("\nInfo:\n" + date + ' ' + time + '\n' + author + '\n' + checker + '\n' + source + '\n' + photography)
            getClickCount(href)  # click count and news ID
            print("\nLink: " + href)
            print(content)
10. Get the total number of news articles and compute the total page count, wrapped in a function def getPageN():
import requests
from bs4 import BeautifulSoup

def getPageN():
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # '.a1' holds the total article count, e.g. '1234条'; strip the trailing
    # 条 counter word, then divide by 10 articles per page.
    pagenumber = int(soup.select('.a1')[0].text.rstrip('条'))
    page = pagenumber // 10 + 1
    return page
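To tie steps 9 to 11 together, a small driver sketch; it assumes the campus-news list pages follow the same index page / N.html naming pattern as the tongzhigonggao pages in step 11:

n = getPageN()
getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/')  # first page
for i in range(2, n + 1):
    getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i))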