爬虫

爬取校花网视频

import requests
import re
import time,os
import hashlib

# Destination directory for downloaded .mp4 files (Windows path).
DOWNLOAD_PATH=r'F:\DOWNLOAD'

def get_page(url):
    """GET `url` and return the raw response body as bytes.

    Returns None on a non-200 status or any request failure.  The original
    swallowed every exception with a bare `pass`, which hid DNS errors,
    timeouts and connection resets; we keep the best-effort contract but
    report the failure, and bound the request with a timeout so the crawl
    cannot hang indefinitely on a dead host.
    """
    print('GET %s' % url)
    try:
        response = requests.get(url, timeout=10)
        print('response:', response)  # e.g. <Response [200]>
        if response.status_code == 200:
            return response.content  # raw bytes of the listing/detail page
    except requests.RequestException as e:
        # Best-effort crawl: log and return None instead of crashing the run.
        print('GET %s failed: %s' % (url, e))

def parse_index(res):
    """Yield absolute detail-page URLs extracted from a gbk-encoded listing page.

    `res` is the raw bytes of a listing page; relative links are prefixed
    with the site root before being yielded.
    """
    pattern = re.compile('class="items.*?<a href="(.*?)"', re.S)
    for link in pattern.findall(res.decode('gbk')):
        yield link if link.startswith('http') else 'http://www.xiaohuar.com' + link

def parse_detail(res):
    """Return the first mp4 URL found on a gbk-encoded detail page, or None.

    A detail page carries a single <video id="media" src="..."> element, so
    only the first match matters.
    """
    matches = re.findall('id="media".*?src="(.*?)"', res.decode('gbk'), re.S)
    return matches[0] if matches else None


def save(movie_url):
    """Download the video at `movie_url` into DOWNLOAD_PATH.

    The filename is the md5 of (timestamp + url), which is unique and
    filesystem-safe.  Fixes two defects in the original: `stream=False`
    buffered the entire video in memory before writing, and the download
    directory was assumed to exist.
    """
    response = requests.get(movie_url, stream=True)
    if response.status_code == 200:
        m = hashlib.md5()
        m.update(str(time.time()).encode('utf-8'))
        m.update(movie_url.encode('utf-8'))

        os.makedirs(DOWNLOAD_PATH, exist_ok=True)  # create target dir on first run
        filepath = os.path.join(DOWNLOAD_PATH, '%s.mp4' % m.hexdigest())
        with open(filepath, 'wb') as f:
            # Stream to disk in 1 MiB chunks instead of holding the whole
            # video in RAM.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
        print('%s下载成功' % filepath)


def main():
    """Crawl the first 5 listing pages and download every video they link to."""
    index_url = 'http://www.xiaohuar.com/list-3-{0}.html'
    for page_no in range(5):
        print('*' * 50, page_no)
        # Fetch one listing page.
        listing = get_page(index_url.format(page_no))
        # parse_index is a generator of absolute detail-page URLs.
        for detail_url in parse_index(listing):
            detail_page = get_page(detail_url)
            movie_url = parse_detail(detail_page)
            print('movie_url==========', movie_url)
            if movie_url:
                save(movie_url)


if __name__ == '__main__':
    main()
爬视频
from concurrent.futures import ThreadPoolExecutor
import queue
import requests
import re
import time
import hashlib
from threading import current_thread

# Shared pool of 50 worker threads; page fetches, parse callbacks and
# downloads are all submitted to it.
p=ThreadPoolExecutor(50)

def get_page(url):
    """GET `url` from a pool thread and return the raw body bytes.

    Returns None on a non-200 status or any request failure; errors are
    printed so a worker thread never dies silently.
    """
    # current_thread().name replaces getName(), which is deprecated since
    # Python 3.10.
    print('%s GET %s' % (current_thread().name, url))
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.content
    except Exception as e:
        print(e)

def parse_index(res):
    """Done-callback for a listing-page future.

    Extracts every detail-page link from the fetched bytes and submits a
    fetch of each one to the pool, chaining `parse_detail` as its callback.
    """
    print('%s parse index ' % current_thread().getName())
    html = res.result().decode('gbk')
    for link in re.findall('class="items.*?<a href="(.*?)"', html, re.S):
        full_url = link if link.startswith('http') else 'http://www.xiaohuar.com' + link
        p.submit(get_page, full_url).add_done_callback(parse_detail)

def parse_detail(res):
    """Done-callback for a detail-page future.

    Pulls the single mp4 URL out of the page, appends it to db.txt as a flat
    log, and submits the download to the pool.
    """
    print('%s parse detail ' % current_thread().getName())
    matches = re.findall('id="media".*?src="(.*?)"', res.result().decode('gbk'), re.S)
    if matches:
        movie_url = matches[0]
        print('MOVIE_URL: ', movie_url)
        # Keep a record of every discovered link.
        with open('db.txt', 'a') as f:
            f.write('%s\n' % movie_url)
        p.submit(save, movie_url)
        print('%s下载任务已经提交' % movie_url)
def save(movie_url):
    """Download `movie_url` into ./movies/ under an md5-derived filename.

    Fixes three defects: `stream=False` buffered the whole video in memory;
    the ./movies directory was never created (the most likely exception the
    original try/except kept printing); and deprecated `getName()` is
    replaced by `.name`.
    """
    import os  # local import: this concurrent section has no top-level `os`
    print('%s SAVE: %s' % (current_thread().name, movie_url))
    try:
        response = requests.get(movie_url, stream=True)
        if response.status_code == 200:
            m = hashlib.md5()
            m.update(('%s%s.mp4' % (movie_url, time.time())).encode('utf-8'))
            filename = m.hexdigest()
            os.makedirs('./movies', exist_ok=True)  # ensure target dir exists
            with open(r'./movies/%s.mp4' % filename, 'wb') as f:
                # Stream to disk in 1 MiB chunks instead of holding the whole
                # video in RAM.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
    except Exception as e:
        print(e)

def main():
    """Seed the pool with the first 5 listing pages; callbacks drive the rest."""
    index_url = 'http://www.xiaohuar.com/list-3-{0}.html'
    for page_no in range(5):
        p.submit(get_page, index_url.format(page_no)).add_done_callback(parse_index)


if __name__ == '__main__':
    main()
并发爬取视频

通过requests模块访问网站

# Minimal requests demo: a GET with an encoded query string and a
# browser-like request header.
import requests

url = 'https://www.baidu.com/s'
# `params` URL-encodes the query: ?wd=美女 is sent as ?wd=%E7%BE%8E%E5%A5%B3.
query = {'wd': '美女', }
# Sites commonly check User-Agent (and sometimes cookies / Referer, which
# restricts requests to ones navigated from their own pages).
headers = {
    # Identify as a desktop browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
}
response = requests.get(url, params=query, headers=headers)
print(response.status_code)
print(response.text)
通过requests模块访问网站

 

posted @ 2018-06-28 16:45  程先生_Python  阅读(97)  评论(0编辑  收藏  举报