爬虫2(二)
一.校花网图片
import os

from requests_html import HTMLSession


class spider():
    """Download the images listed on the xiaohuar.com "meinv" index pages.

    The first three index pages are fetched; for every ``#images .items``
    entry the image URL and the text of its ``.p_title>a`` link are
    extracted, and the image is stored as ``<title>.jpg`` inside the
    ``校花图片`` directory (relative to the current working directory).
    """

    def __init__(self):
        self.session = HTMLSession()
        # Browser-like User-Agent sent with every request made by this class.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }

    def get_index_url(self):
        """Yield the URLs of index pages 1 to 3.

        Page 1 is ``index.html``; later pages carry a numeric suffix.
        """
        for i in range(1, 4):
            if i == 1:
                yield 'http://www.xiaohuar.com/meinv/index.html'
            else:
                yield 'http://www.xiaohuar.com/meinv/index_%s.html' % i

    def get_img_name(self, index_url):
        """Yield ``(img_url, img_name)`` for each item found on ``index_url``.

        Items that lack an ``<img>``, a ``src`` attribute or a title link are
        skipped instead of raising ``AttributeError``.
        """
        r = self.session.get(url=index_url, headers=self.headers)
        for element in r.html.find('#images .items'):
            img = element.find('img', first=True)
            title = element.find('.p_title>a', first=True)
            # find(first=True) yields None when nothing matches.
            if img is None or title is None:
                continue
            img_url = img.attrs.get('src')
            if not img_url:
                continue
            # Site-relative sources need the host prepended.
            if not img_url.startswith('http'):
                img_url = 'http://www.xiaohuar.com' + img_url
            # Path separators are removed so the title is a single file name.
            img_name = title.text.replace('\\', '').replace('/', '') + '.jpg'
            yield img_url, img_name

    def save_img(self, img_url, img_name):
        """Fetch ``img_url`` and write its bytes to ``校花图片/<img_name>``."""
        # Same headers as the page requests, for consistency.
        r = self.session.get(url=img_url, headers=self.headers)
        # The target directory is not guaranteed to exist; without this the
        # open() below raises FileNotFoundError on a fresh run.
        os.makedirs('校花图片', exist_ok=True)
        img_path = os.path.join('校花图片', img_name)
        with open(img_path, 'wb') as f:
            f.write(r.content)
        print('%s下载完毕' % img_name)

    def run(self):
        """Download every image from every index page, sequentially."""
        for index_url in self.get_index_url():
            for img_url, img_name in self.get_img_name(index_url):
                self.save_img(img_url, img_name)


if __name__ == '__main__':
    xiaohua = spider()
    xiaohua.run()
二.豆瓣
from requests_html import HTMLSession

# Scratch notes kept from the original exploration (test code, not executed):
#   session = HTMLSession()
#   url='https://movie.douban.com/tag/#/?sort=S&range=0,10&tags=2018'
#   r = session.get(url=url)
#   print(r.text)
# The word '电影' percent-encodes to %E7%94%B5%E5%BD%B1:
#   print(str('电影'.encode('utf-8')).strip("'b").replace('\\x','%').upper())


class spider():
    """Query the douban ``new_search_subjects`` JSON endpoint and print results.

    The user is prompted for a year range and a sort rule; ten requests are
    then issued with ``start`` offsets 0, 20, ..., 180 and each decoded JSON
    response is printed.
    """

    def __init__(self):
        self.session = HTMLSession()
        self.api = 'https://movie.douban.com/j/new_search_subjects?'

    def get_params(self):
        """Prompt for the query values and store them in ``self.params``."""
        # The year is an interval: two values separated by a comma.
        years = input('输入年份')
        ordering = input('输入排序规则(S按评分)')
        self.params = dict(year_range=years, sort=ordering, start=0)

    def get_data(self):
        """Request ten consecutive offsets and print each JSON payload."""
        for offset in range(0, 200, 20):
            self.params['start'] = offset
            response = self.session.get(url=self.api, params=self.params)
            print(response.json())

    def run(self):
        """Collect the parameters, then fetch and print the data."""
        self.get_params()
        self.get_data()


if __name__ == '__main__':
    douban = spider()
    douban.run()
三.校花网视频
import os
from urllib.parse import urljoin

from requests_html import HTMLSession


class spider():
    """Download the HLS videos linked from the xiaohuar.com ``list-3-*`` pages.

    For every detail page that embeds a ``var vHLSurl = "...";`` assignment,
    the referenced playlist is saved as ``<title>/playlist.m3u8`` and every
    ``.ts`` segment it lists is downloaded into the same directory.
    """

    def __init__(self):
        self.session = HTMLSession()

    def get_index_page(self):
        """Yield the list-page URLs ``list-3-0.html`` through ``list-3-6.html``."""
        for i in range(7):
            yield 'http://www.xiaohuar.com/list-3-%s.html' % i

    def parse_index_page(self, index_page):
        """Yield the detail-page URL of every ``a.imglink`` on ``index_page``.

        Anchors without an ``href`` are skipped. Relative links are resolved
        against ``index_page``; absolute links pass through unchanged.
        """
        r = self.session.get(url=index_page)
        for element in r.html.find('#images .items a[class="imglink"]'):
            href = element.attrs.get('href')
            if href:
                yield urljoin(index_page, href)

    def parse_detail_page(self, detail_page):
        """Yield ``(m3u8_url, m3u8_name)`` if the page exposes a playlist URL.

        The name is the page ``<title>`` text with backslashes removed. When
        no ``vHLSurl`` assignment is found a message is printed and nothing
        is yielded.
        """
        r = self.session.get(url=detail_page)
        r.html.encoding = 'GBK'
        result_obj = r.html.search('var vHLSurl = "{}";')
        if result_obj:
            m3u8_url = result_obj[0]
            m3u8_name = r.html.find('title', first=True).text.replace('\\', '')
            yield m3u8_url, m3u8_name
        else:
            print("匹配失败,无资源")

    def save_m3u8(self, m3u8_url, m3u8_name):
        """Save the playlist at ``m3u8_url`` and all of its ``.ts`` segments.

        Files are written into a directory named after ``m3u8_name``, which
        is created if it does not exist.
        """
        # A '/' in the title would make the name a nested path; drop it so
        # the title maps to a single directory.
        m3u8_dir = m3u8_name.replace('/', '')
        os.makedirs(m3u8_dir, exist_ok=True)
        print(m3u8_url)

        playlist = self.session.get(url=m3u8_url).text
        m3u8_path = os.path.join(m3u8_dir, 'playlist.m3u8')
        with open(m3u8_path, 'wt', encoding='utf-8') as f:
            f.write(playlist)

        # Iterate the text already in memory instead of re-reading the file.
        for line in playlist.splitlines():
            line = line.strip()
            if not line.endswith('.ts'):
                continue
            # urljoin resolves relative, root-relative and absolute segment
            # URIs against the playlist URL.
            ts_url = urljoin(m3u8_url, line)
            # Use only the last path component as the local file name so a
            # segment entry containing '/' does not need sub-directories.
            ts_path = os.path.join(m3u8_dir, line.rsplit('/', 1)[-1])
            segment = self.session.get(url=ts_url)
            with open(ts_path, 'wb') as f1:
                f1.write(segment.content)
            print('%s下载完毕' % line)

    def run(self):
        """Walk list pages -> detail pages -> playlists, downloading each."""
        for url in self.get_index_page():
            for detail_page in self.parse_index_page(url):
                for m3u8_url, m3u8_name in self.parse_detail_page(detail_page):
                    self.save_m3u8(m3u8_url, m3u8_name)


if __name__ == '__main__':
    xioahua = spider()
    xioahua.run()
四.tmall
from requests_html import HTMLSession


class spider():
    """Search list.tmall.com for a product and print each title and price.

    The user is prompted for a search term; the total page count is read
    from the first result page, then every page is fetched and each
    ``.product`` entry's title and price are printed.
    """

    def __init__(self):
        self.session = HTMLSession()
        self.api = 'http://list.tmall.com/search_product.htm?'

    def get_params(self):
        """Prompt for the search term and initialise ``self.params``."""
        pro = input("输入你要爬取的商品:")
        self.params = {
            'q': pro,
            'totalPage': 1,
            'jumpto': 1
        }

    def get_totalPage(self):
        """Read the page count from the ``totalPage`` form field.

        If the field is missing or its value is not an integer, the count
        falls back to 1 and a notice is printed, instead of raising
        ``AttributeError``, ``TypeError`` or ``ValueError``.
        """
        r = self.session.get(url=self.api, params=self.params)
        # find(first=True) yields None when the selector matches nothing.
        field = r.html.find('[name="totalPage"]', first=True)
        value = field.attrs.get('value') if field is not None else None
        try:
            total = int(value)
        except (TypeError, ValueError):
            print('未找到总页数,仅爬取第1页')
            total = 1
        self.params['totalPage'] = total

    def get_pro_info(self):
        """Fetch every result page and print title, price and a separator.

        Product nodes lacking a title link or a price element are skipped.
        """
        for i in range(1, self.params['totalPage'] + 1):
            self.params['jumpto'] = i
            r = self.session.get(url=self.api, params=self.params)
            for element_pro in r.html.find('.product'):
                title_el = element_pro.find('.productTitle a', first=True)
                price_el = element_pro.find('.productPrice em', first=True)
                if title_el is None or price_el is None:
                    continue
                print(title_el.text)
                # The price is read from the <em> element's title attribute.
                print(price_el.attrs.get('title'))
                print('-' * 30)

    def run(self):
        """Prompt, determine the page count, then print all products."""
        self.get_params()
        self.get_totalPage()
        self.get_pro_info()


if __name__ == '__main__':
    tmall = spider()
    tmall.run()