Python web crawler

Using regular expressions to get the visit stats of a Baidu Tieba forum

import urllib.request
import re

# Fetch the Tieba forum page and decode it, ignoring undecodable bytes
data = urllib.request.urlopen("https://tieba.baidu.com/f?kw=cpda&fr=ala0&tpl=5").read()
data2 = data.decode("utf-8", "ignore")

# Page title
pat = "<title>(.*?)</title>"
s1 = re.compile(pat).findall(data2)
print(s1)

# Labels of the stats shown on the forum's info card
pat2 = '<span class="card_numLabel">(.*?)</span>'
s2 = re.compile(pat2).findall(data2)
print(s2)

# Follower count shown on the card
pat3 = '<span class="card_menNum">(.*?)</span>'
s3 = re.compile(pat3).findall(data2)
print(s3)

# Post count shown on the card
pat4 = '<span class="card_infoNum">(.*?)</span>'
s4 = re.compile(pat4).findall(data2)
print(s4)
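
If the page comes back empty or blocked, it may be because the site rejects the default urllib User-Agent; below is a minimal sketch of sending a browser-like header with urllib.request.Request (the header value is only an illustrative assumption, not something the original script requires):

import urllib.request
import re

# Build a Request carrying a browser-like User-Agent (value is illustrative)
url = "https://tieba.baidu.com/f?kw=cpda&fr=ala0&tpl=5"
headers = {"User-Agent": "Mozilla/5.0"}
req = urllib.request.Request(url, headers=headers)

# Fetch, decode, and pull out the page title as a quick sanity check
html = urllib.request.urlopen(req).read().decode("utf-8", "ignore")
print(re.findall("<title>(.*?)</title>", html))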

Regular expressions, part 2: fetching provider names from Douban Read

import urllib.request
import re

# Fetch the Douban Read provider list page and decode it
file = urllib.request.urlopen("https://read.douban.com/provider/all").read()
file2 = file.decode("utf-8", "ignore")

# Extract every provider name from its <div class="name"> element
patn = '<div class="name">(.*?)</div>'
mydata = re.compile(patn).findall(file2)
print(mydata)

# Print the names one per line
for name in mydata:
    print(name + "\n")
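
To keep the provider names after the script finishes, they can also be written to a local text file; a minimal sketch, assuming an example output path providers.txt:

import urllib.request
import re

# Fetch the provider list and extract the names, same as above
html = urllib.request.urlopen("https://read.douban.com/provider/all").read().decode("utf-8", "ignore")
names = re.findall('<div class="name">(.*?)</div>', html)

# Write one provider name per line (the output path is an illustrative assumption)
with open("providers.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(names))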

Fetching URL data with exception handling: crawling Sina News articles

import urllib.request
import re

# Fetch the Sina News homepage and collect all article links on it
data = urllib.request.urlopen("http://news.sina.com.cn/").read()
data2 = data.decode("utf-8", "ignore")
pat = 'href="(http://news.sina.com.cn/.*?)"'
allurl = re.compile(pat).findall(data2)

for i in range(0, len(allurl)):
    try:
        print("Crawl attempt " + str(i))
        thisurl = allurl[i]
        print(thisurl)
        file = "D:/sinanews/" + str(i) + ".html"
        print(file)
        # Download the article and save it locally (the directory must already exist)
        urllib.request.urlretrieve(thisurl, filename=file)
        print("-------success-------")
    except urllib.error.URLError as e:
        # Print the HTTP status code and/or failure reason if the download fails
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
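
urlretrieve will fail if the target folder does not exist, so it can help to create D:/sinanews/ before the loop runs; a minimal sketch using os.makedirs:

import os

# Make sure the output directory used by the script above exists before downloading
os.makedirs("D:/sinanews/", exist_ok=True)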

  

 


posted @ 2019-05-18 22:25  逐梦无惧_数据分析