爬虫实例
1. 爬校花网图片:
这是一个很基础的爬虫小例子,爬取校花网的图片。其中用了requests-html库:
先获取每页的url,再解析每页中图片的地址和名称,最后下载图片并存入文件中。
from requests_html import HTMLSession
import os
class Spider():
    """Crawler that downloads the pictures listed on xiaohuar.com index pages.

    The crawl runs in three steps: build the index page URLs, extract an
    (image URL, file name) pair for every item on a page, then download
    each image into ``save_dir``.
    """

    # Site root; relative image URLs are resolved against it.
    base_url = 'http://www.xiaohuar.com'
    # Directory the downloaded pictures are written to.
    save_dir = 'pictures'
    # Characters removed from a picture title before it becomes a file name.
    _name_cleaner = str.maketrans('', '', '【】|\\/')

    def __init__(self):
        self.session = HTMLSession()

    def get_index_page(self):
        """Yield the URLs of index pages 1 to 3.

        The first page is ``index.html``; later pages are ``index_<n>.html``.
        """
        for i in range(1, 4):
            if i == 1:
                yield self.base_url + "/meinv/index.html"
            else:
                yield self.base_url + "/meinv/index_%s.html" % i

    def get_image_name(self, page_url):
        """Yield ``(image_url, image_name)`` for every item on ``page_url``.

        Items that have no ``img`` element, no title link or no ``src``
        attribute are skipped instead of raising ``AttributeError``.
        """
        r = self.session.get(url=page_url)
        for element in r.html.find('#images .items'):
            img = element.find('img', first=True)
            title = element.find('.p_title a', first=True)
            # find(..., first=True) gives None when nothing matches.
            if img is None or title is None:
                continue
            image_url = img.attrs.get('src')
            if not image_url:
                continue
            image_name = title.text.translate(self._name_cleaner) + '.jpg'
            yield image_url, image_name

    def save(self, image_url, image_name):
        """Download ``image_url`` and write it to ``save_dir/image_name``.

        ``image_url`` may be absolute or relative to the site root. The
        target directory is created when it does not exist yet.
        """
        from urllib.parse import urljoin

        # Without this the first open() fails on a fresh checkout.
        os.makedirs(self.save_dir, exist_ok=True)
        image_name = os.path.join(self.save_dir, image_name)
        # urljoin leaves absolute URLs untouched and resolves relative ones,
        # including paths that do not start with a slash.
        image_url = urljoin(self.base_url + '/', image_url)
        r = self.session.get(url=image_url)
        with open(image_name, 'wb') as f:
            f.write(r.content)
        print('%s下载完成' % image_name)

    def run(self):
        """Crawl every index page and save every picture found on it."""
        for page_url in self.get_index_page():
            for image_url, image_name in self.get_image_name(page_url):
                self.save(image_url, image_name)
if __name__ == '__main__':
    # Script entry point: build the picture crawler and start it.
    Spider().run()
2. 豆瓣电影排行信息
用爬虫获取豆瓣电影信息:这里对电影的筛选是通过url携带的参数实现的,所以先获取参数信息。
from requests_html import HTMLSession
class Spider:
    """Query the Douban movie search API page by page and print the results.

    The filter is sent as URL query parameters, which are collected from
    the user before the first request.
    """

    def __init__(self):
        self.api = "https://movie.douban.com/j/new_search_subjects?"
        self.session = HTMLSession()
        # Query parameters; filled in by get_params().
        self.params = None

    def get_params(self):
        """Prompt for the sort key and year range and store them in ``self.params``."""
        sort = input('请输入按什么排序(S评分)').strip()
        year_range = input('请输入年份:').strip()
        self.params = {
            'sort': sort,
            'year_range': year_range,
            'start': 0,
        }

    def get_message(self, pages=10, page_size=20):
        """Request ``pages`` result pages and print the decoded JSON of each.

        Args:
            pages: number of pages to request.
            page_size: step added to the ``start`` parameter for each page.

        The parameters are prompted for first when get_params() has not been
        called yet. Fetching stops at the first response that is not JSON.
        """
        if not self.params:
            self.get_params()
        for page in range(pages):
            self.params['start'] = page * page_size
            r = self.session.get(url=self.api, params=self.params)
            try:
                data = r.json()
            except ValueError:
                # The JSON decode errors raised by requests derive from ValueError.
                print('第%s页返回的不是JSON数据,停止抓取' % (page + 1))
                break
            print(data)

    def run(self):
        """Collect the query parameters, then fetch and print the results."""
        self.get_params()
        self.get_message()
if __name__ == '__main__':
    # Script entry point: build the Douban crawler and start it.
    Spider().run()
3. 爬取校花视频
校花视频是通过m3u8格式。有的视频会员有反爬机制,查看元素的播放链接是unknown,所以获取不到资源。
先获取m3u8格式的播放列表链接,然后发送请求,得到的文件内容是一行行以.ts结尾的条目(.ts也是一种文件格式)。对m3u8文件内容进行处理,逐个发送请求下载.ts文件并保存。
from requests_html import HTMLSession
import os
class spider():
    """Crawler that downloads the HLS (m3u8) videos listed on xiaohuar.com.

    For every detail page that exposes a playlist URL, the playlist is
    saved and each ``.ts`` segment it lists is downloaded into a directory
    named after the page title.
    """

    # Characters removed from a page title before it is used as a
    # directory name (path separators and characters Windows rejects).
    _invalid_chars = str.maketrans('', '', '\\/:*?"<>|')

    def __init__(self):
        self.session = HTMLSession()

    def get_index_page(self):
        """Yield the URLs of list pages 0 to 6."""
        for i in range(7):
            yield 'http://www.xiaohuar.com/list-3-%s.html' % i

    def parse_index_page(self, index_page):
        """Yield the detail page URL of every item linked from ``index_page``.

        Relative links are resolved against ``index_page``; anchors without
        an ``href`` are skipped.
        """
        from urllib.parse import urljoin

        r = self.session.get(url=index_page)
        for element in r.html.find('#images .items a[class="imglink"]'):
            href = element.attrs.get('href')
            if href:
                yield urljoin(index_page, href)

    def parse_detail_page(self, detail_page):
        """Yield ``(m3u8_url, m3u8_name)`` when the page exposes a playlist.

        The playlist URL is read from the ``var vHLSurl = "...";`` script
        assignment; the name is the page title with characters that are
        invalid in directory names removed. Nothing is yielded, and a
        message is printed, when the assignment is not found.
        """
        r = self.session.get(url=detail_page)
        # Force GBK decoding of the page before searching it.
        r.html.encoding = 'GBK'
        result_obj = r.html.search('var vHLSurl = "{}";')
        if result_obj:
            m3u8_url = result_obj[0]
            title = r.html.find('title', first=True).text
            m3u8_name = title.translate(self._invalid_chars).strip()
            yield m3u8_url, m3u8_name
        else:
            print("匹配失败,无资源")

    def save_m3u8(self, m3u8_url, m3u8_name):
        """Save the playlist and all of its ``.ts`` segments under ``m3u8_name``.

        The directory is created if needed, the playlist text is written to
        ``playlist.m3u8`` and every line ending in ``.ts`` is downloaded.
        """
        from urllib.parse import urljoin, urlsplit

        m3u8_dir = m3u8_name
        os.makedirs(m3u8_dir, exist_ok=True)
        print(m3u8_url)
        playlist = self.session.get(url=m3u8_url).text
        m3u8_path = os.path.join(m3u8_dir, 'playlist.m3u8')
        with open(m3u8_path, 'w', encoding='utf-8') as f:
            f.write(playlist)
        # Iterate the text already in memory, so the playlist file is not
        # held open while the segments download.
        for line in playlist.splitlines():
            line = line.strip()
            if line.startswith('#') or not line.endswith('.ts'):
                continue
            # Resolves plain names, paths and absolute segment URLs alike.
            ts_url = urljoin(m3u8_url, line)
            # Keep only the file name so an entry containing a path or a
            # full URL still maps to a valid local file.
            ts_file = os.path.basename(urlsplit(ts_url).path)
            ts_path = os.path.join(m3u8_dir, ts_file)
            r = self.session.get(url=ts_url)
            with open(ts_path, 'wb') as f1:
                f1.write(r.content)
            print('%s下载完毕' % line)

    def run(self):
        """Walk list pages, then detail pages, and save every playlist found."""
        for url in self.get_index_page():
            for detail_page in self.parse_index_page(url):
                for m3u8_url, m3u8_name in self.parse_detail_page(detail_page):
                    self.save_m3u8(m3u8_url, m3u8_name)
if __name__ == '__main__':
    # Script entry point: build the video crawler and start it.
    spider().run()