day1之校花网小试牛刀

一 利用生成器来完成爬取校花网视频

  

import requests 
import re
import os
import hashlib
import time

# Directory that download() joins with the generated file name when saving a
# video. NOTE: the constant is spelled "DOWLOAD" (sic); download() refers to
# it by this exact spelling.
DOWLOAD_PATH=r'D:\DOWNLOAD'

def get_page(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort crawl: report the failure and let the caller skip the page.
        print('请求失败', url, exc)
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index(index_contents):
    """Yield the detail-page URLs found in the HTML of an index page.

    Args:
        index_contents: HTML text of an index page, or ``None``/empty string
            when the page could not be fetched.

    Yields:
        One URL per ``class="items" ... href="..."`` match. Links that do not
        start with ``http`` are prefixed with ``http://www.xiaohuar.com``.
    """
    if not index_contents:
        # get_page() returns None when the request fails or is not HTTP 200;
        # treat that as "no links" instead of crashing inside re.findall.
        return
    detail_urls = re.findall(r'class="items".*?href="(.*?)"', index_contents, re.S)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        yield detail_url

def parse_detail(detail_contents):
    """Yield the video URL found in the HTML of a detail page, if any.

    Args:
        detail_contents: HTML text of a detail page, or ``None``/empty string
            when the page could not be fetched.

    Yields:
        The first ``id="media" ... src="..."`` match, and only when it ends
        with ``mp4``. Nothing is yielded otherwise.
    """
    if not detail_contents:
        # get_page() returns None on failure; nothing to parse in that case.
        return
    movie_urls = re.findall(r'id="media".*?src="(.*?)"', detail_contents, re.S)
    if movie_urls:
        movie_url = movie_urls[0]
        if movie_url.endswith('mp4'):
            yield movie_url

def download(movie_url):
    """Download one video into ``DOWLOAD_PATH``.

    The file name is the MD5 hex digest of the current timestamp plus the
    URL, with an ``.mp4`` suffix. Failures are reported and skipped rather
    than raised, so one bad link does not stop the crawl.

    Args:
        movie_url: Direct URL of the video file.
    """
    print(movie_url)
    digest = hashlib.md5()
    digest.update(str(time.time()).encode('utf-8'))
    digest.update(movie_url.encode('utf-8'))
    filepath = os.path.join(DOWLOAD_PATH, '%s.mp4' % digest.hexdigest())

    try:
        # stream=True avoids holding the whole video in memory; the timeout
        # keeps a stalled server from blocking forever.
        response = requests.get(movie_url, stream=True, timeout=30)
    except requests.RequestException as exc:
        print('下载失败', movie_url, exc)
        return

    try:
        if response.status_code != 200:
            return
        # The target directory may not exist yet.
        os.makedirs(DOWLOAD_PATH, exist_ok=True)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    except (OSError, requests.RequestException) as exc:
        print('下载失败', movie_url, exc)
        # Do not leave a truncated file behind.
        if os.path.exists(filepath):
            os.remove(filepath)
        return
    finally:
        response.close()
    print('下载成功', movie_url)

def main():
    """Crawl index pages 0-4 sequentially and download every video found.

    For each index page the detail links are extracted, each detail page is
    fetched and parsed for a video URL, and each video URL is downloaded.
    """
    url_template = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for page_num in range(5):
        # Fetch the index page and parse out the detail-page links.
        index_html = get_page(url_template.format(page_num=page_num))

        for detail_url in parse_index(index_html):
            # Fetch the detail page and parse out the video address.
            detail_html = get_page(detail_url)

            # Download the video(s).
            for movie_url in parse_detail(detail_html):
                download(movie_url)



if __name__ == '__main__':
    # Run the crawl and print how many seconds it took.
    started = time.time()
    main()
    elapsed = time.time() - started
    print(elapsed)

 

二 利用多线程优化上述代码

  

import requests #pip install requests
import re
import os
import hashlib
import time
from concurrent.futures import ThreadPoolExecutor

# Shared worker pool (at most 50 threads) that the fetch and download tasks
# are submitted to.
pool = ThreadPoolExecutor(max_workers=50)
# Directory that download() joins with the generated file name.
DOWLOAD_PATH = r'D:\DOWNLOAD'

def get_page(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    try:
        # Without a timeout a stalled server would tie up a worker forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort crawl: report the failure and let the callback skip it.
        print('请求失败', url, exc)
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index(index_contents):
    """Done-callback: parse a fetched index page and queue its detail pages.

    Every ``class="items" ... href="..."`` link is made absolute (links not
    starting with ``http`` get the ``http://www.xiaohuar.com`` prefix) and a
    ``get_page`` task is submitted for it with ``parse_detail`` attached as
    its done-callback.

    Args:
        index_contents: Completed future whose result is the index page HTML,
            or ``None`` when the fetch failed.
    """
    page_html = index_contents.result()
    if not page_html:
        # get_page() yields None on failure; re.findall would raise TypeError
        # on it, so skip the page instead.
        return
    detail_urls = re.findall(r'class="items".*?href="(.*?)"', page_html, re.S)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

def parse_detail(detail_contents):
    """Done-callback: parse a fetched detail page and queue its video download.

    Only the first ``id="media" ... src="..."`` match is considered, and a
    ``download`` task is submitted only when that URL ends with ``mp4``.

    Args:
        detail_contents: Completed future whose result is the detail page
            HTML, or ``None`` when the fetch failed.
    """
    page_html = detail_contents.result()
    if not page_html:
        # get_page() yields None on failure; nothing to parse in that case.
        return
    movie_urls = re.findall(r'id="media".*?src="(.*?)"', page_html, re.S)
    if movie_urls:
        movie_url = movie_urls[0]
        if movie_url.endswith('mp4'):
            pool.submit(download, movie_url)

def download(movie_url):
    """Download one video into ``DOWLOAD_PATH``.

    The file name is the MD5 hex digest of the current timestamp plus the
    URL, with an ``.mp4`` suffix. Failures are reported and skipped rather
    than raised.

    Args:
        movie_url: Direct URL of the video file.
    """
    digest = hashlib.md5()
    digest.update(str(time.time()).encode('utf-8'))
    digest.update(movie_url.encode('utf-8'))
    filepath = os.path.join(DOWLOAD_PATH, '%s.mp4' % digest.hexdigest())

    try:
        # stream=True avoids holding the whole video in memory; the timeout
        # keeps a stalled server from tying up a worker forever.
        response = requests.get(movie_url, stream=True, timeout=30)
    except requests.RequestException as exc:
        print('下载失败', movie_url, exc)
        return

    try:
        if response.status_code != 200:
            return
        # The target directory may not exist yet; exist_ok makes the call
        # harmless when it is already there.
        os.makedirs(DOWLOAD_PATH, exist_ok=True)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    except (OSError, requests.RequestException) as exc:
        print('下载失败', movie_url, exc)
        # Do not leave a truncated file behind.
        if os.path.exists(filepath):
            os.remove(filepath)
        return
    finally:
        response.close()
    print('下载成功', movie_url)

def main():
    """Queue the fetch of index pages 0-4 on the shared thread pool.

    Each fetch task gets ``parse_index`` attached as its done-callback. The
    function does not wait for the submitted tasks to finish.

    NOTE(review): because nothing waits for the pool, the callbacks submit
    further tasks after this function has returned; on newer CPython
    versions that may fail once the main thread has exited. Confirm on the
    target interpreter.
    """
    url_template = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for page_num in range(5):
        # Request the index page; parse_index extracts the detail-page links.
        index_future = pool.submit(get_page, url_template.format(page_num=page_num))
        index_future.add_done_callback(parse_index)


if __name__ == '__main__':
    # main() only submits tasks to the pool, so the number printed here is
    # the time spent queuing work, not the time spent downloading.
    started = time.time()
    main()
    elapsed = time.time() - started
    print(elapsed)

 

牛逼的代码

 

三 根据egon讲的grep命令的类似原理,自己写的爬取校花网图片的代码

  

import requests,re,os
def init(f):
    """Decorator that primes a generator-based coroutine.

    The wrapped callable is invoked, ``next()`` is called once on the object
    it returns so that it advances to its first ``yield``, and that object is
    returned ready to accept ``send()``.

    Args:
        f: Callable returning a generator (or any iterator).

    Returns:
        A wrapper with the same metadata as ``f``.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(f)  # preserve f's __name__/__doc__ on the wrapper
    def inner(*args, **kwargs):
        g = f(*args, **kwargs)
        next(g)
        return g
    return inner
def get(url, timeout=10):
    """Request ``url`` immediately and return a callable producing its text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A zero-argument function that sets the response encoding to ``gbk``
        and returns the decoded body.
    """
    # Without a timeout a stalled server would hang the script at this call.
    r = requests.get(url, timeout=timeout)

    def inner():
        # Force gbk before decoding instead of relying on the detected charset.
        r.encoding = 'gbk'
        return r.text
    return inner
# Fetch http://www.xiaohuar.com/2014.html once when the module is loaded;
# search() reads the decoded page text from xiaohua_contend.
xiaohua=get('http://www.xiaohuar.com/2014.html')
xiaohua_contend=xiaohua()
def search(target):
    """Send a ``(name, src)`` tuple to ``target`` for each image link matched.

    The page text is read from the module-level ``xiaohua_contend``; ``name``
    is the captured ``alt`` text and ``src`` the captured image address.

    Args:
        target: Primed coroutine (any object with ``send``) receiving tuples.
    """
    pattern = '<a href=.*? target=.*?><img width=.*?  alt="(?P<name>.*?)" src="(?P<src>.*?)" /></a>'
    for match in re.finditer(pattern, xiaohua_contend, re.S):
        # group() with two names returns the (name, src) tuple directly.
        target.send(match.group('name', 'src'))
@init
def handle(target):
    """Coroutine stage that makes image addresses absolute.

    Receives ``(name, src)`` tuples, prefixes ``src`` with
    ``http://www.xiaohuar.com`` unless it already starts with ``http``, and
    forwards the tuple to ``target``.

    Args:
        target: Primed coroutine receiving the ``(name, src)`` tuples.
    """
    while True:
        name, src = yield
        if not src.startswith('http'):
            src = 'http://www.xiaohuar.com' + src
        target.send((name, src))
@init
def download():
    """Coroutine sink that saves each received image as ``<name>.jpg``.

    Receives ``(name, src)`` tuples, fetches ``src`` and writes the body to
    ``D:\\校花网\\<name>.jpg``. Failed or non-200 requests are skipped so a
    single bad image does not terminate the whole pipeline.
    """
    save_dir = r'D:\校花网'
    while True:
        name, src = yield
        try:
            # Without a timeout a stalled server would hang the pipeline.
            r = requests.get(src, timeout=10)
        except requests.RequestException as exc:
            print('下载失败', src, exc)
            continue
        if r.status_code != 200:
            # Do not save an error page under a .jpg name.
            continue
        # name comes from the page's alt text; replace characters that are
        # not allowed in Windows file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        os.makedirs(save_dir, exist_ok=True)
        with open(save_dir + '\\' + safe_name + '.jpg', 'wb') as f:
            f.write(r.content)
# Wire the pipeline: search() feeds handle(), which feeds download().
search(handle(download()))

 

 

 总结:

egon授课。   

生成器与协程有紧密的联系。

生成器可以通过yield接收参数,通过send传值。

生成器与多线程也有关系吗?没有吧。

普通函数爬取视频也是可以用到多线程的。

 

优化的余地:可以加上进度条,利用类实现。大概就是这个样式,copy的。

def download_file(url, path):
    """Stream ``url`` to ``path`` in 10 KiB chunks, printing progress.

    Args:
        url: Address of the file to download.
        path: Destination file path; opened in binary write mode.

    Raises:
        KeyError: If the response carries no ``content-length`` header.
    """
    from contextlib import closing  # local import keeps this block self-contained

    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                # One progress line per chunk written.
                p.output()
class ProgressData(object):
    """Console progress reporter for a chunked download.

    Each call to ``output()`` counts one more block as loaded and prints
    either a progress line with a bar or a completion message.
    """

    def __init__(self, block, size, unit, file_name='', ):
        """Store the transfer parameters.

        Args:
            block: Size of one chunk; divided by 1000 before use.
            size: Total size; divided by 1000 before use.
            unit: Unit label printed after the numbers.
            file_name: Optional label printed at the start of each line.
        """
        self.file_name = file_name
        self.block = block/1000.0
        self.size = size/1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more block and print the current progress."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # Speed covers only the interval since the previous call.
        speed = self.block/elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count*self.block
        # A zero total would divide by zero; report it as complete instead.
        progress = round(loaded/self.size, 4) if self.size > 0 else 1.0
        if loaded >= self.size:
            print('%s下载完成\r\n' % self.file_name)
        else:
            print('{0}下载进度{1:.2f}{2}/{3:.2f}{4} 下载速度{5:.2%} {6:.2f}{7}/s'.format(
                self.file_name, loaded, self.unit,
                self.size, self.unit, progress, speed, self.unit))
            # The bar is drawn from the remaining fraction, so it shrinks
            # as the download advances.
            print('%50s' % ('/'*int((1-progress)*50)))

 

posted @ 2018-01-09 23:37  骑者赶路  阅读(158)  评论(0)  编辑  收藏  举报