Python web crawler
Using regular expressions to extract the stats of a Tieba forum
import urllib.request
import re

# Fetch the Tieba forum page and decode it, ignoring undecodable bytes
data = urllib.request.urlopen("https://tieba.baidu.com/f?kw=cpda&fr=ala0&tpl=5").read()
data2 = data.decode("utf-8", "ignore")

# Page title
pat = "<title>(.*?)</title>"
s1 = re.compile(pat).findall(data2)
print(s1)

# Labels shown on the forum's info card
pat2 = '<span class="card_numLabel">(.*?)</span>'
s2 = re.compile(pat2).findall(data2)
print(s2)

# Counts that go with those labels
pat3 = '<span class="card_menNum">(.*?)</span>'
s3 = re.compile(pat3).findall(data2)
print(s3)

pat4 = '<span class="card_infoNum">(.*?)</span>'
s4 = re.compile(pat4).findall(data2)
print(s4)
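Baidu sometimes serves an error page to the default urllib user agent. If the patterns above come back empty, a browser-like User-Agent header can be sent instead; this is a minimal sketch of that variant, and the header string is only an example:

import urllib.request
import re

url = "https://tieba.baidu.com/f?kw=cpda&fr=ala0&tpl=5"
# Any common browser UA string works here; this one is purely illustrative
headers = {"User-Agent": "Mozilla/5.0"}
req = urllib.request.Request(url, headers=headers)
html = urllib.request.urlopen(req).read().decode("utf-8", "ignore")
print(re.findall("<title>(.*?)</title>", html))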
Regular expression practice 2 -- fetching provider names from Douban Read
import urllib.request
import re

# Fetch the Douban Read provider list page
file = urllib.request.urlopen("https://read.douban.com/provider/all").read()
file2 = file.decode("utf-8", "ignore")

# Extract every name wrapped in <div class="name">...</div>
patn = '<div class="name">(.*?)</div>'
mydata = re.compile(patn).findall(file2)
print(mydata)

# Print the names one per line
for name in mydata:
    print(name + "\n")
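Printing is enough for a quick check, but the extracted names can also be written straight to disk. A minimal sketch, assuming the same page and pattern as above; the output filename douban_providers.txt is made up for illustration:

import urllib.request
import re

html = urllib.request.urlopen("https://read.douban.com/provider/all").read().decode("utf-8", "ignore")
names = re.compile('<div class="name">(.*?)</div>').findall(html)

# One provider name per line; the filename is illustrative
with open("douban_providers.txt", "w", encoding="utf-8") as fh:
    fh.write("\n".join(names))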
Fetching URL data -- exception handling -- downloading articles from Sina News
import urllib.request
import re

# Fetch the Sina News front page and collect every news link on it
data = urllib.request.urlopen("http://news.sina.com.cn/").read()
data2 = data.decode("utf-8", "ignore")
pat = 'href="(http://news.sina.com.cn/.*?)"'
allurl = re.compile(pat).findall(data2)

for i in range(0, len(allurl)):
    try:
        print("Crawl attempt " + str(i))
        thisurl = allurl[i]
        print(thisurl)
        file = "D:/sinanews/" + str(i) + ".html"
        print(file)
        # Download the article to the local file; this is the request the
        # try/except below is guarding
        urllib.request.urlretrieve(thisurl, filename=file)
        print("------- success -------")
    except urllib.error.URLError as e:
        # Report the HTTP status code and/or the reason for the failure
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
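urlretrieve fails with a file-not-found error when the target folder does not exist, so the D:/sinanews directory should be created before the loop runs. A small sketch, assuming the same save path as above:

import os

save_dir = "D:/sinanews"
# Create the folder (and any parents) if it is missing; no error if it already exists
os.makedirs(save_dir, exist_ok=True)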