python3.5爬虫基础urllib结合beautifulsoup实例
BeautifulSoup模块可以解析HTML并按标签提取内容,在很多场景下可以代替re模块(正则表达式)进行匹配
小例子1:用BeautifulSoup爬取淘宝首页所有链接(a标签)中的文字
import urllib.request

from bs4 import BeautifulSoup


def tecent(url):
    """Fetch *url* and print the text of every ``<a>`` tag that has one.

    The page is downloaded with ``urllib``, decoded as UTF-8 and parsed by
    BeautifulSoup using the ``html5lib`` parser. For each ``<a>`` tag whose
    ``.string`` is not ``None``, the type of the string and the string
    itself are printed to the console.

    Args:
        url: Address of the page to fetch.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    data = html.decode("utf-8")  # the page is assumed to be UTF-8 encoded
    soup = BeautifulSoup(data, "html5lib")
    for link in soup.find_all("a"):
        text = link.string
        if text is None:
            # Nothing printable for this tag; move on to the next link.
            continue
        print(type(text))
        # NOTE: output goes to the console only. To save it instead, open the
        # file in text mode (e.g. open("taobao1.txt", "a", encoding="utf-8"))
        # and write str(text); a file opened in binary mode rejects str data.
        print(text)


if __name__ == "__main__":
    url = "https://www.taobao.com/"
    tecent(url)
小例子2:用Beautiful soup编写一个抓取妹子图页面图片的代码
import os
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup


def _image_extension(image_url):
    """Return the extension (with leading dot) of the URL's path, or ''."""
    # Use only the path component so query strings and fragments such as
    # "pic.jpg?v=2" do not end up in the file name.
    url_path = urllib.parse.urlsplit(image_url).path
    return os.path.splitext(url_path)[1]


def taonvlang(url, path="G:/taonvlang/"):
    """Download every ``<img>`` on the page at *url* into directory *path*.

    The page is fetched with ``urllib``, decoded with the default codec
    (UTF-8) and parsed by BeautifulSoup using the ``html5lib`` parser. Each
    image that has a non-empty ``src`` attribute is downloaded and saved as
    a zero-padded, sequentially numbered file (``00001.jpg``, ``00002.png``,
    ...). Images that fail to download are reported and skipped, and do not
    consume a number.

    Args:
        url: Address of the page to scan for images.
        path: Directory to save the images in; created if it does not exist.
    """
    with urllib.request.urlopen(url) as response:
        data = response.read().decode()
    soup = BeautifulSoup(data, "html5lib")

    os.makedirs(path, exist_ok=True)

    count = 1  # sequence number used to name the saved images
    for tag in soup.find_all("img"):
        print(tag)  # the whole <img> tag
        attrs = tag.attrs  # the tag's attributes as a dictionary
        print(attrs)
        src = attrs.get("src")
        if not src:
            continue

        # Resolve relative and protocol-relative addresses against the page
        # URL; urlopen() only accepts absolute URLs.
        image_url = urllib.parse.urljoin(url, src)

        # Download before opening the output file so that a failed request
        # does not leave an empty file behind.
        try:
            with urllib.request.urlopen(image_url) as image_response:
                image_data = image_response.read()
        except (urllib.error.URLError, ValueError) as exc:
            print("skipped {}: {}".format(image_url, exc))
            continue

        filename = str(count).zfill(5) + _image_extension(image_url)
        with open(os.path.join(path, filename), "wb") as f:
            f.write(image_data)
        count += 1


if __name__ == "__main__":
    url = "http://www.mzitu.com/all"
    taonvlang(url)