基于python的爬虫(一)

 

一.爬虫的基本流程:

# 1、发起请求: 
  使用http库向目标站点发起请求,即发送一个Request
  Request包含:请求头、请求体等

# 2、获取响应内容
  如果服务器能正常响应,则会得到一个Response
  Response包含:html,json,图片,视频等

# 3、解析内容
  解析html数据:正则表达式,第三方解析库如BeautifulSoup,pyquery等
  解析json数据:json模块
  解析二进制数据:以二进制模式(如'wb')写入文件

# 4、保存数据
  数据库
  文件

二.我们来爬一个校花网

import requests
import re

# 爬虫三部曲

# 一 发送请求
def get_page(url):
    """Send an HTTP GET to *url* and return the response body as text."""
    return requests.get(url).text

# 二 解析数据
# 解析主页
def parse_index(index_page):
    """Yield the detail-page URL of every item found on an index page.

    For each ``<div class="items">`` the first ``href`` that follows it is
    taken. Links that do not start with ``http`` are prefixed with the site
    root so that every yielded URL is absolute.
    """
    site_root = 'http://www.xiaohuar.com'
    for href in re.findall('<div class="items">.*?href="(.*?)"', index_page, re.S):
        yield href if href.startswith('http') else site_root + href

# 解析详情页
def parse_detail(detail_page):
    """Yield the video URL of a detail page, if it has one.

    Only the first ``src`` following ``id="media"`` is considered, and it is
    yielded only when it ends with ``.mp4``; otherwise nothing is yielded.
    """
    sources = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
    # At most one candidate: the first match on the page.
    for candidate in sources[:1]:
        if candidate.endswith('.mp4'):
            yield candidate

# 三 保存数据
import uuid
def save_video(video_url):
    """Download *video_url* into the hard-coded movies directory.

    The file name is a random UUID with an ``.mp4`` suffix. The download is
    best-effort: a failure is reported on stdout and the function returns
    normally, so one bad video does not stop the crawl.
    """
    try:
        res = requests.get(video_url)
        # Do not store an HTTP error page under an .mp4 name.
        res.raise_for_status()
        # The with-block flushes and closes the file on exit.
        with open(r'D:\pachong\movies\%s.mp4' % uuid.uuid4(), 'wb') as f:
            f.write(res.content)
    except Exception as exc:
        # Still best-effort, but no longer silent.
        print('%s failed: %r' % (video_url, exc))


if __name__ == '__main__':
    # Walk index pages list-3-0 .. list-3-4; for every item open its detail
    # page and download the video it links to.
    url_template = 'http://www.xiaohuar.com/list-3-{}.html'
    for page_no in range(5):
        listing_html = get_page(url_template.format(page_no))
        for detail_url in parse_index(listing_html):
            for video_url in parse_detail(get_page(detail_url)):
                save_video(video_url)

并发版:

# pip3 install requests
import requests
import re
from concurrent.futures import ThreadPoolExecutor

import threading

pool = ThreadPoolExecutor(50)

# Number of tasks that were submitted and whose callback has not finished yet.
# Guarded by `_idle`, which is notified when the count drops back to zero.
_in_flight = 0
_idle = threading.Condition()


def _task_done():
    """Count one tracked task as finished; wake the waiter when none remain."""
    global _in_flight
    with _idle:
        _in_flight -= 1
        if _in_flight == 0:
            _idle.notify_all()


def _submit(func, arg, callback=None):
    """Run ``func(arg)`` on the pool and track it until *callback* has run.

    The task stays counted while its callback executes, so follow-up tasks
    submitted by the callback are counted before this one is released.
    Returns the future created by the pool.
    """
    global _in_flight
    with _idle:
        _in_flight += 1

    def _on_done(future):
        try:
            if callback is not None:
                callback(future)
        finally:
            _task_done()

    try:
        future = pool.submit(func, arg)
    except BaseException:
        _task_done()
        raise
    # One combined callback keeps "run callback, then release" strictly ordered.
    future.add_done_callback(_on_done)
    return future


def _wait_until_idle():
    """Block until every tracked task and its callback have finished."""
    with _idle:
        _idle.wait_for(lambda: _in_flight == 0)


# The three steps of a crawler

# Step 1: send the request
def get_page(url):
    """GET *url* and return the response body as text."""
    print('%s GET start ...' % url)
    index_res = requests.get(url)
    return index_res.text


# Step 2: parse the data
# Parse an index page
def parse_index(index_page):
    """Done-callback for an index-page fetch.

    *index_page* is the finished future of ``get_page``. Every item link found
    in its HTML is made absolute and scheduled for download, with
    ``parse_detail`` as the next callback.
    """
    # Result of the index-page request (raises if the fetch failed).
    res = index_page.result()
    detail_urls = re.findall('<div class="items">.*?href="(.*?)"', res, re.S)

    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url

        _submit(get_page, detail_url, parse_detail)


# Parse a detail page
def parse_detail(detail_page):
    """Done-callback for a detail-page fetch.

    *detail_page* is the finished future of ``get_page``. If the first media
    ``src`` on the page ends with ``.mp4``, its download is scheduled.
    """
    res = detail_page.result()

    video_urls = re.findall('id="media".*?src="(.*?)"', res, re.S)

    if video_urls:
        video_url = video_urls[0]
        if video_url.endswith('.mp4'):
            _submit(save_video, video_url)


# Step 3: save the data
import uuid
def save_video(video_url):
    """Download *video_url* into the hard-coded movies directory.

    The file name is a random UUID with an ``.mp4`` suffix. Best-effort: a
    failure is reported on stdout instead of being raised.
    """
    try:
        res = requests.get(video_url)
        # Do not store an HTTP error page under an .mp4 name.
        res.raise_for_status()
        with open(r'D:\tank\day01\movies\%s.mp4' % uuid.uuid4(), 'wb') as f:
            f.write(res.content)
        print('%s done ...' % video_url)

    except Exception as exc:
        print('%s failed: %r' % (video_url, exc))


if __name__ == '__main__':
    base_url = 'http://www.xiaohuar.com/list-3-{}.html'
    for line in range(5):
        index_url = base_url.format(line)
        _submit(get_page, index_url, parse_index)

    # Keep the main thread alive until the whole callback chain has finished.
    # Otherwise interpreter shutdown starts right after the loop and the
    # callbacks can no longer submit their follow-up tasks to the pool.
    _wait_until_idle()
    pool.shutdown()

三.requests的基本使用

      1.get请求的两种方式:

import requests
from urllib.parse import urlencode

# Request URL: the query string is built by hand with urlencode.
base_url = 'https://www.baidu.com/s?' + urlencode({"wd": "美女"})
# Request headers
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

# HTTP method: GET
res = requests.get(base_url, headers=headers)
# print(res)            # a Response object
# print(res.text)       # the whole HTML text
# print(res.content)    # the body as raw bytes

# Save the page as UTF-8 HTML.
with open('meinv.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

       每次url编码会很麻烦,所以可以在GET内添加参数即可:

import requests

# Request URL; the query string is supplied through `params` below.
base_url = 'https://www.baidu.com/s?'

# Request headers
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
#
# HTTP method: GET. The query is passed as a dict via `params` instead of
# being URL-encoded by hand.
res = requests.get(base_url, headers=headers, params={"wd": "黄云"})

# Save the response text as UTF-8 HTML.
with open('小云云.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

       get请求访问知乎:

# Fetch the Zhihu "explore" page

# Request URL
zhi_url = 'https://www.zhihu.com/explore'

# Request headers
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'

}
# HTTP method: GET
res = requests.get(zhi_url, headers=headers)
# Save the response text as UTF-8 HTML.
with open('知乎.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

    get请求访问github:

# Fetch a page that requires a login by replaying the Cookie header of a
# logged-in browser session.
#
# SECURITY: a session cookie is a credential. It is read from the environment
# instead of being hard-coded, so it never ends up in published source.
#   GITHUB_COOKIE    - Cookie header captured on the logged-in home page
#   GITHUB_COOKIE_2  - optional Cookie header captured on the e-mail settings
#                      page; falls back to GITHUB_COOKIE when unset
import os

url='https://github.com/settings/emails'
# Request headers (cookie from the logged-in home page)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': os.environ.get('GITHUB_COOKIE', ''),
}
# Request headers (cookie from the e-mail settings page)
headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': os.environ.get('GITHUB_COOKIE_2', headers['Cookie']),
}
# HTTP method: GET. Passing headers=headers_2 instead works the same way.
res = requests.get(url, headers=headers)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
# Login check: looks for a marker string in the returned page (presumably
# part of the account's e-mail address -- confirm).
print('1059239165' in res.text)

# True

   2.post请求

 

# SECURITY: account credentials are read from the environment instead of
# being hard-coded in the source. Reading them first makes the script fail
# fast (KeyError) when they are not configured.
import os

github_login = os.environ['GITHUB_LOGIN']
github_password = os.environ['GITHUB_PASSWORD']

# Step 1: GET https://github.com/login to obtain the authenticity token
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'

}

login_res = requests.get('https://github.com/login', headers=headers)

# First value="..." that follows name="authenticity_token" in the login page.
# Raises IndexError when the page contains no such field.
authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]
print(
    authenticity_token
)
# Step 2: collect the cookies
cookies = {}


# Copy the cookies set by the login page into the cookies dict.
cookies.update(login_res.cookies.get_dict())
print(cookies)


# Step 3: POST the login form to https://github.com/session

# Request body
form_data = {
    "commit": "Sign in",
    "utf8": "",
    "authenticity_token": authenticity_token,
    "login": github_login,
    "password": github_password,
    "webauthn-support":" supported"

}

# `data=` sends the dict as a form-encoded body; `json=form_data` would send
# it as JSON instead. The cookies from step 1 are sent along explicitly.
res = requests.post('https://github.com/session', headers=headers, data=form_data, cookies=cookies)

# print(res.status_code)

with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

 

 3.response

import requests

# Tour of the attributes of a Response object.
baidu = 'https://www.baidu.com/'

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'

}

res = requests.get(baidu, headers=headers)

# Response status code
print(res.status_code)

# The Response object itself
print(res)
# Response headers
print(res.headers)

# Response body as text
print(res.text)

# URL of the response
print(res.url)
# Cookies of the response: the cookie jar, then the same as a plain dict
print(res.cookies)
print(res.cookies.get_dict())

# Encoding used to decode res.text
print(res.encoding)
# res.encoding = 'utf-8'
# print(res.encoding)

# Redirect history
print(res.history)

# Response body as raw bytes
print(res.content)

下载一张图片:

# Download an image in streaming mode and write it to disk chunk by chunk.
bo = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551942493340&di=afa19a1f5a3a4fbdec983baaeb1954f0&imgtype=0&src=http%3A%2F%2Fwww.xnnews.com.cn%2Fwenyu%2Flxsj%2F201611%2FW020161114828261827516.jpg'

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'

}

# stream=True: the body is not downloaded until it is iterated below.
res = requests.get(bo, headers=headers, stream=True)
with open('bo2.jpg', 'wb') as f:
    # iter_content() defaults to chunk_size=1, i.e. one write per byte;
    # ask for reasonably sized chunks instead. (Without streaming the whole
    # body could be written at once with f.write(res.content).)
    for chunk in res.iter_content(chunk_size=8192):
        f.write(chunk)

补充:

  取消重定向(allow_redirects 默认为True,即自动跟随重定向):
  allow_redirects=False

 4.session用法:

import requests
import re

# SECURITY: account credentials are read from the environment instead of
# being hard-coded in the source. Reading them first makes the script fail
# fast (KeyError) when they are not configured.
import os

github_login = os.environ['GITHUB_LOGIN']
github_password = os.environ['GITHUB_PASSWORD']

# A session keeps the cookies between requests, so they do not have to be
# copied from the login response by hand.
session = requests.session()


# Step 1: GET the login page to obtain the token. The page contains a hidden
# field of the form:
#   name="authenticity_token" value="<token>"

headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36',
}

login_res = session.get('https://github.com/login', headers=headers)

# First value="..." that follows name="authenticity_token".
# Raises IndexError when the page contains no such field.
authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]



# Step 2: POST the login form to https://github.com/session

# Request body
form_data = {
    "commit": "Sign in",
    "utf8": "",
    "authenticity_token": authenticity_token,
    "login": github_login,
    "password": github_password,
}

res = session.post('https://github.com/session', headers=headers, data=form_data)


print(res.status_code)

with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

5.json格式反序列化:

import requests
import json
# Fetch an endpoint and decode its body as JSON.
res = requests.get('https://www.toutiao.com/stream/widget/local_weather/city/')
print(res.text)
# Two ways to deserialize the JSON body: json.loads on the text, or the
# response's own .json() method.
print(json.loads(res.text))
print(res.json())

 补充:

'''
requests高级用法
了解!
'''

import requests
# SSL: plain HTTPS request with no verify/cert arguments.
# NOTE(review): presumably shown because it fails certificate verification
# for this site -- confirm.
res = requests.get('https://www.xiaohuar.com/')
print(res.text)

# Improvement 1: ignore the certificate (verify=False)
res = requests.get('https://www.xiaohuar.com/', verify=False)
print(res.text)


# Improvement 2: also suppress the warnings
import urllib3
urllib3.disable_warnings()
res = requests.get('https://www.xiaohuar.com/', verify=False)
print(res.text)

# Improvement 3: supply a certificate; the paths are placeholders for a
# certificate file and its key file.
import urllib3
urllib3.disable_warnings()
res = requests.get('https://www.xiaohuar.com/', verify=False,
                   cert=('/path/server.crt', '/path/key'))
print(res.text)


# Using a proxy.
# requests selects the proxy by the scheme of the requested URL, so the keys
# of `proxies` must be 'http' / 'https'; an unknown key such as 'sock' is
# ignored and the request is sent without a proxy.
# A plain HTTP proxy looks like: {'http': 'http://112.85.130.66:9999'}.
# SOCKS proxies use a 'socks5://' URL and need the optional dependency:
#   pip install requests[socks]
# 'ip:port' below is a placeholder -- replace it with a real proxy address.
res = requests.get('https://www.baidu.com/', headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
},
                   proxies={
                       'http': 'socks5://ip:port',
                       'https': 'socks5://ip:port',
                   })

print(res.text)


# 超时设置(超时报错)
import requests
# Timeout demo: the timeout (in seconds) is deliberately tiny so that the
# request is expected to fail with a timeout error.
respone=requests.get('https://www.baidu.com',
                     timeout=0.0001)
print(respone.text)

# 认证
import requests
from requests.auth import HTTPBasicAuth
# HTTP Basic authentication. 'xxx', 'user' and 'password' look like
# placeholders for a real URL and real credentials.
r=requests.get('xxx', auth=HTTPBasicAuth('user','password'))
print(r.status_code)

# 上传文件

import requests
# Upload a.jpg as the multipart form field 'file'. The with-block closes the
# file handle once the request has been sent.
with open('a.jpg', 'rb') as upload:
    files = {'file': upload}
    response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)

 

posted @ 2019-06-23 23:47  纵横捭阖行  阅读(620)  评论(0)  编辑  收藏  举报