day 3 爬虫实战练习
爬取文本网页
# Fetch the Baidu home page, print its HTML and save it to ``baidu.html``.
import requests  # third-party HTTP client

# Send a GET request to the Baidu home page and keep the response object.
# The timeout stops the script from blocking indefinitely on a dead connection.
response = requests.get(url='https://www.baidu.com/', timeout=10)

# Fail loudly on a 4xx/5xx answer instead of saving an error page.
response.raise_for_status()

# Force UTF-8 decoding before ``response.text`` is read.
response.encoding = 'utf-8'

# Print the response text.
print(response.text)

# Write the response text to a local file.
with open('baidu.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
爬取视频
# Download a single MP4 file and store it on disk.
import requests  # third-party HTTP client

VIDEO_URL = 'https://vd4.bdstatic.com/mda-ij7mjzskvmwpyb6q/sc/mda-ij7mjzskvmwpyb6q.mp4'
OUTPUT_PATH = '张艺兴.mp4'
CHUNK_SIZE = 64 * 1024  # bytes copied per iteration

bytes_written = 0
# stream=True defers the body download so the video is copied to disk chunk
# by chunk instead of being held in memory as one bytes object.
with requests.get(VIDEO_URL, stream=True, timeout=30) as response:
    # Do not save an HTTP error page under an .mp4 name.
    response.raise_for_status()
    with open(OUTPUT_PATH, 'wb') as f:
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            bytes_written += f.write(chunk)

# Report the size rather than dumping the raw binary payload to the console.
print(OUTPUT_PATH, bytes_written, 'bytes written')
1.先往梨视频主页发送请求
http://www.pearvideo.com/
解析获取所有视频的id
video_1570302
re.findall()
2.获取视频详情页url
.........................
.........................
re正则,用于解析文本数据
1.先往梨视频主页发送请求
re正则匹配获取所有的视频id
参数1:正则匹配规则
参数2:解析文本
参数3:匹配模式
# List the detail-page URL of every video linked from the Pear Video home page.
import re

HOME_URL = 'http://www.pearvideo.com/'

# Matches anchors such as <a href="video_1570302" ...> and captures the part
# after "video_".  The opening double quote after href= is required; without
# it the pattern cannot match a quoted href attribute.
VIDEO_ID_PATTERN = re.compile(r'<a href="video_(.*?)"', re.S)


def extract_video_ids(html):
    """Return the video ids referenced in *html*.

    Args:
        html: Text of the page to scan.

    Returns:
        A list of id strings in order of first appearance.  Repeated ids are
        dropped, in case the page links the same video more than once.
    """
    return list(dict.fromkeys(VIDEO_ID_PATTERN.findall(html)))


def build_detail_url(v_id):
    """Return the detail-page URL for the video id *v_id*."""
    return HOME_URL + 'video_' + v_id


def main():
    """Fetch the home page, then print its HTML, the ids found and their URLs."""
    # Imported here rather than at module top so the pure parsing helpers
    # above stay importable where the third-party package is not installed.
    import requests

    # 1. Send a request to the Pear Video home page.
    response = requests.get(HOME_URL, timeout=10)
    response.raise_for_status()
    print(response.text)

    # 2. Extract every video id with the regular expression.
    res_list = extract_video_ids(response.text)
    print(res_list)

    # 3. Build and print the detail-page URL of each video.
    for v_id in res_list:
        print(build_detail_url(v_id))


if __name__ == '__main__':
    main()
# Crawl the Pear Video home page and download every video it links to.
import re

HOME_URL = 'http://www.pearvideo.com/'
CHUNK_SIZE = 64 * 1024  # bytes copied per iteration while saving a video

# <a href="video_1570302" ...>  ->  "1570302"
VIDEO_ID_PATTERN = re.compile(r'<a href="video_(.*?)"', re.S)
# srcUrl="http://.../x.mp4"  ->  the quoted URL
SRC_URL_PATTERN = re.compile(r'srcUrl="(.*?)"', re.S)
# <h1 class="video-tt ...">Title</h1>  ->  "Title".  The rest of the class
# attribute and of the tag is consumed outside the group so that only the
# heading text is captured.
VIDEO_NAME_PATTERN = re.compile(r'<h1 class="video-tt[^"]*"[^>]*>(.*?)</h1>', re.S)


def extract_video_ids(html):
    """Return the video ids referenced in *html*.

    Args:
        html: Text of the home page.

    Returns:
        A list of id strings in order of first appearance, without repeats,
        so that no video is downloaded twice.
    """
    return list(dict.fromkeys(VIDEO_ID_PATTERN.findall(html)))


def parse_detail_page(html):
    """Extract the video source URL and title from a detail page.

    Args:
        html: Text of a video detail page.

    Returns:
        A ``(video_url, video_name)`` tuple, or ``None`` when either piece is
        missing from the page.
    """
    url_match = SRC_URL_PATTERN.search(html)
    name_match = VIDEO_NAME_PATTERN.search(html)
    if url_match is None or name_match is None:
        return None
    return url_match.group(1), name_match.group(1).strip()


def safe_filename(name):
    """Return *name* with characters that are unsafe in file names replaced.

    Path separators, the characters ``: * ? " < > |`` and tab/newline
    characters become ``_``.  An empty result falls back to ``'video'``.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', name.strip())
    return cleaned or 'video'


def main():
    """Download every video linked from the home page into the working directory."""
    # Imported here rather than at module top so the pure parsing helpers
    # above stay importable where the third-party package is not installed.
    import requests

    # 1. Request the home page and collect the video ids.
    home = requests.get(HOME_URL, timeout=10)
    home.raise_for_status()
    print(home.text)
    res_list = extract_video_ids(home.text)
    print(res_list)

    for v_id in res_list:
        detail_url = HOME_URL + 'video_' + v_id
        try:
            # 2. Request the detail page of this video.
            response = requests.get(url=detail_url, timeout=10)
            response.raise_for_status()
            print(response.text)

            # 3. Parse the video source URL and the video title.
            parsed = parse_detail_page(response.text)
            if parsed is None:
                # Skip instead of failing with IndexError on an empty match.
                print(detail_url, 'skipped: video url or title not found')
                continue
            video_url, video_name = parsed
            print(video_url)
            print(video_name)

            # 4. Request the video URL and stream the binary data to disk.
            with requests.get(video_url, stream=True, timeout=30) as v_response:
                v_response.raise_for_status()
                with open('%s.mp4' % safe_filename(video_name), 'wb') as f:
                    for chunk in v_response.iter_content(chunk_size=CHUNK_SIZE):
                        f.write(chunk)
        except requests.RequestException as exc:
            # One failing video should not abort the rest of the crawl.
            print(detail_url, 'failed:', exc)
            continue
        print(video_name, '视频爬取完成')


if __name__ == '__main__':
    main()