爬虫之requests

发送请求

'''
1.发送请求:
    - pip3 install requests
'''

import requests
from urllib.parse import urlencode

# Demo 1: send a GET request whose query string is already
# percent-encoded ("%E7%BE%8E%E5%A5%B3" is the encoded search term).
search_url = 'https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3'

# Browser-like request headers so the server treats us as a real client
# instead of serving a stripped-down "unknown client" page.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}

# Fire the GET request; requests.get() returns a Response object.
resp = requests.get(search_url, headers=browser_headers)

print(resp.status_code)  # 200 means success
# Show the decoded body text.
print(resp.text)

# Persist the page for offline inspection.
with open('baidu.html', 'w', encoding='utf-8') as f:
    f.write(resp.text)



demo2:使用 urllib.parse.urlencode 手动编码查询参数
import requests
from urllib.parse import urlencode

# Demo 2: build the query string programmatically — urlencode() handles
# percent-encoding of the non-ASCII search term for us.
query_string = urlencode({'wd': '美女'})
search_url = 'https://www.baidu.com/s?' + query_string

# Browser-like request headers so the server treats us as a real client.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}

# Fire the GET request; requests.get() returns a Response object.
resp = requests.get(search_url, headers=browser_headers)

print(resp.status_code)  # 200 means success
# Show the decoded body text.
print(resp.text)

# Persist the page for offline inspection.
with open('meinv.html', 'w', encoding='utf-8') as f:
    f.write(resp.text)



demo3:使用 requests 的 params 参数自动编码查询参数
import requests

# Demo 3: let requests itself encode the query string — pass the raw
# parameters through the params= keyword instead of building the URL.
base_url = 'https://www.baidu.com/s?'

# Browser-like request headers so the server treats us as a real client.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}

# requests percent-encodes the params dict and appends it to the URL.
resp = requests.get(base_url, headers=browser_headers, params={'wd': '美女'})

print(resp.status_code)  # 200 means success
# Show the decoded body text.
print(resp.text)

# Persist the page for offline inspection.
with open('girl.html', 'w', encoding='utf-8') as f:
    f.write(resp.text)


通过get请求,绕过github登录

'''
1、携带github登录过后的cookies信息,访问github主页,绕过登录;

用户名:aaa
邮箱:bbb
密码:ccc



1.请求url:
    - https://github.com/settings/emails

2.请求方式:
    - GET

3.请求头:
    - cookies:
        Cookie: _octo=GH1.1.816288935.1571284371; has_recent_activity=1; tz=Asia%2FShanghai; _device_id=a00ea80a1caf3404bd890d1a520083a6; user_session=y_4qkWDvHylVnzfIezSMSCaNdAnH0fTBw7pmm9yk4ZLmh6Ha; __Host-user_session_same_site=y_4qkWDvHylVnzfIezSMSCaNdAnH0fTBw7pmm9yk4ZLmh6Ha; logged_in=yes; dotcom_user=Michaeljy; _gh_sess=Vy82WmxrcFZmREZvV0dnTUtIaENDVWhPY3M1dEpwRVNWelVmYjk3TWQzQVpVVS84S3hhOXgrMkhRR2lHZUhZNUwvQzU1U3RHUWxWNmhITDFVV3FFcTUreFdPaWsyT2hTZlllQ2Fld3dFVmR5Q0w2RDdKbEduZENidUxySkd6MXVOKzdXQURsRVpSZDVUUjY4TE83c0JCK3BrcWlmeFVBZzhxVGFiVDRPd01HUlI0M3FrVFNLZ3dyNlJ6K0pQaWdQZHFYUFdISFRHT1ZZc3k4UDNFaVB4eWhLY3NTZnRaK0JTb3JlZzhGd3o1eFBaeHh1eCtReFZ3dWFsQXBqSWdNc0ltODdVTTk2bjVrdytFekNyWGlEZnFSdGZLSTZ6eW1jY1pvUGJYWWRIU0RtK01BWC9yOFRqb2FPNENJRUZra3F2V250dVZWMys3ajl5ZHVXQThUVGRQSHhmSDNwUXdnTUFUb0grMytlT1Y5U1RqVEYzak4zV1RIRURVbktycXR5dG1NZVJYNWZLdGxVR1hZeFFmemV0dFhiZXhiZHRKTGw0RHZvYmh6NEJmdHRPVkdFSENzaHhRSjJzVWlya2R4N3N3S3FWVmxpVkl0cWs0bmUvNmhlMGpwU3hNdTFZKzI5eCt1c0RjRVViVjY4Z2E4YzFtbkRhVmtybUZabGg4VVVlaUtLV3FEUUVMdjRvUnU1amdpbnJUTStOV1NIbGpTRGNRc25pbjVlazZ5MzV0b2tVek1qaW1HUDhMWVU1bzhjN3BsYkQ1MXVkaWg2STROdDRCNWVxMUtCTUt2L0o1Y01aTWsxM1BQNUJSZUxVcnBWMXRYdStlTlA2ZFhiVERYRVcwYUZMeFlmNjNxUTN1a0dNSVQySlZQTnpiTTY2UDk2WWMycG9iQzB4SWlRNnFhM3RVRXFxUHF4ZHBmd2R6Rk0wcFQ3b0VvNkJCd1NQZGhGNThKaHdzcEtpSXZMUUxRWVVlOHUyVFFRQmZBdURBZnNXRk9JRTljQVMzWmExSXprWFI2RVMrYkNMaVJybGR0cC9vN09rWTVvYXdHRWxaMmQwdmYzenM5dDVLMXVpTEhkWnNXVkNyWTRVd3JiSWFUVkhDak9Qb0xUekxjTk5PWWxtb2d6V1hZSGNOMkhkdWlVNXY1TTExREc0RW9mUGlVK1FoYVNYLzRSM21OMGFabkZyWHBEOFBpNXBRSU9rYkRXYVczVi8yc3ZPYURJZGUzY1AwN3l0S0szci8yc1J6WmhkQ3ZFeVJuT1pkR2FkMys3d0xTa21QNE80M3hVby82TmJ4MG1DWFZ6ZDlDR3RMaUJBNXhSVHhnMmRwMWxNSTRTR2hRM1JXdncyRklEWlA3cTU2cGRpU0wyVUljZmtGVVNXN2ltTG54cW52QzloWitSZjRVdUh3NnNrcEVWdTFoM1JXNnFqVW55SXRDTmY5WXEvakQ0bk9lWlpFRTMwbm9qWkp0QXlWZkE2MzlQNXJUbk51WkttTFE4QVB3Y1pzZDc0Zlp0ei81U1FKS3JnUVV6SUZoMkhSZXBsL0M0VXBLTUNKd2YvQ2JJTmpVK2JtZ1J6VkE4MW9lVTlZRVVlOHRHd1dNUDJLYkFSbTFGbExuYmFtOHkyc0p6OTdOYnp4ZWJlTjdxR0V2eWZRYXlqeEhmTC92QlJG
a2lmaVU9LS1HQmU4cXZCL0trdVVTMW5QVXkzWUdnPT0%3D--e417927d7291752b7527f64c960dbe38450f2543
    - User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36

    - Referer: https://github.com/settings/profile

'''
import requests

# Target: the GitHub email-settings page, which is only served to an
# authenticated session.
url = 'https://github.com/settings/emails'

# Headers captured from a logged-in browser session.  The Cookie header
# carries the user_session / _gh_sess values GitHub uses to recognise an
# authenticated user, so sending it lets the GET skip the login form.
# NOTE(review): these session tokens expire — re-capture them from a live
# browser session before running this script.
github_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Cookie': '_octo=GH1.1.816288935.1571284371; has_recent_activity=1; tz=Asia%2FShanghai; _device_id=a00ea80a1caf3404bd890d1a520083a6; user_session=y_4qkWDvHylVnzfIezSMSCaNdAnH0fTBw7pmm9yk4ZLmh6Ha; __Host-user_session_same_site=y_4qkWDvHylVnzfIezSMSCaNdAnH0fTBw7pmm9yk4ZLmh6Ha; logged_in=yes; dotcom_user=Michaeljy; _gh_sess=Vy82WmxrcFZmREZvV0dnTUtIaENDVWhPY3M1dEpwRVNWelVmYjk3TWQzQVpVVS84S3hhOXgrMkhRR2lHZUhZNUwvQzU1U3RHUWxWNmhITDFVV3FFcTUreFdPaWsyT2hTZlllQ2Fld3dFVmR5Q0w2RDdKbEduZENidUxySkd6MXVOKzdXQURsRVpSZDVUUjY4TE83c0JCK3BrcWlmeFVBZzhxVGFiVDRPd01HUlI0M3FrVFNLZ3dyNlJ6K0pQaWdQZHFYUFdISFRHT1ZZc3k4UDNFaVB4eWhLY3NTZnRaK0JTb3JlZzhGd3o1eFBaeHh1eCtReFZ3dWFsQXBqSWdNc0ltODdVTTk2bjVrdytFekNyWGlEZnFSdGZLSTZ6eW1jY1pvUGJYWWRIU0RtK01BWC9yOFRqb2FPNENJRUZra3F2V250dVZWMys3ajl5ZHVXQThUVGRQSHhmSDNwUXdnTUFUb0grMytlT1Y5U1RqVEYzak4zV1RIRURVbktycXR5dG1NZVJYNWZLdGxVR1hZeFFmemV0dFhiZXhiZHRKTGw0RHZvYmh6NEJmdHRPVkdFSENzaHhRSjJzVWlya2R4N3N3S3FWVmxpVkl0cWs0bmUvNmhlMGpwU3hNdTFZKzI5eCt1c0RjRVViVjY4Z2E4YzFtbkRhVmtybUZabGg4VVVlaUtLV3FEUUVMdjRvUnU1amdpbnJUTStOV1NIbGpTRGNRc25pbjVlazZ5MzV0b2tVek1qaW1HUDhMWVU1bzhjN3BsYkQ1MXVkaWg2STROdDRCNWVxMUtCTUt2L0o1Y01aTWsxM1BQNUJSZUxVcnBWMXRYdStlTlA2ZFhiVERYRVcwYUZMeFlmNjNxUTN1a0dNSVQySlZQTnpiTTY2UDk2WWMycG9iQzB4SWlRNnFhM3RVRXFxUHF4ZHBmd2R6Rk0wcFQ3b0VvNkJCd1NQZGhGNThKaHdzcEtpSXZMUUxRWVVlOHUyVFFRQmZBdURBZnNXRk9JRTljQVMzWmExSXprWFI2RVMrYkNMaVJybGR0cC9vN09rWTVvYXdHRWxaMmQwdmYzenM5dDVLMXVpTEhkWnNXVkNyWTRVd3JiSWFUVkhDak9Qb0xUekxjTk5PWWxtb2d6V1hZSGNOMkhkdWlVNXY1TTExREc0RW9mUGlVK1FoYVNYLzRSM21OMGFabkZyWHBEOFBpNXBRSU9rYkRXYVczVi8yc3ZPYURJZGUzY1AwN3l0S0szci8yc1J6WmhkQ3ZFeVJuT1pkR2FkMys3d0xTa21QNE80M3hVby82TmJ4MG1DWFZ6ZDlDR3RMaUJBNXhSVHhnMmRwMWxNSTRTR2hRM1JXdncyRklEWlA3cTU2cGRpU0wyVUljZmtGVVNXN2ltTG54cW52QzloWitSZjRVdUh3NnNrcEVWdTFoM1JXNnFqVW55SXRDTmY5WXEvakQ0bk9lWlpFRTMwbm9qWkp0QXlWZkE2MzlQNXJUbk51WkttTFE4QVB3Y1pzZDc0Zlp0ei81U1FKS3JnUVV6SUZoMkhSZXBsL0M0VXBLTUNKd2YvQ2JJTmpVK2JtZ1J6VkE4MW9lVTlZRVVlOHRHd1dNUDJLYkFSbTFGbExuYmFtOHkyc0p6OTdOYnp4ZWJlTjdxR0V2eWZRYXlqeEhmTC92QlJGa
2lmaVU9LS1HQmU4cXZCL0trdVVTMW5QVXkzWUdnPT0%3D--e417927d7291752b7527f64c960dbe38450f2543'
}

# Send the GET request with the captured session headers attached.
response = requests.get(url, headers=github_headers)

print(response.status_code)  # expect 200
# The email from the account settings appears only when authenticated,
# so this prints True when the cookie bypass worked.
print('bbb' in response.text)
# Save the page so the result can be inspected in a browser.
with open('emails.html', 'w', encoding='utf-8') as f:
    f.write(response.text)


通过post请求永久绕过github登录

'''
用户名:aaa
邮箱:bbb
密码:ccc

1.先分析 http 的请求流程
    - 请求url:
        Request URL: https://github.com/session

    - 请求方式:
        Request Method: POST

    - 请求头:
        - Referer: https://github.com/login
        - User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36

    - 请求体: (form data):
        commit: Sign in
        utf8: ✓
        authenticity_token: finQLn5TxHAabDmefQ2EbBXV27jDGlWm6DUu+u5u4J6QnPXBmkc76/QlDpx61v1NFf3AP8r+vg1Cq31G9Wxenw==
        ga_id:
        login: aaa
        password: bbb
        webauthn-support: supported
        webauthn-iuvpaa-support: supported
        required_field_a359:
        timestamp: 1577696492100  # 时间戳
        timestamp_secret: 03e50e82485174cadc2dda90916b93bfeadef0ac92643cbfde40e6c7f598bbb6


    - 1) 先往https://github.com/login页面发送get请求,获取authenticity_token与timestamp_secret随机加密字符串
    - 2) 携带加密字符串与请求体所有的信息,一并通过post请求访问https://github.com/session


2.再写爬虫代码

'''
import requests
import re
import time

# Step 1: GET the login page to harvest the one-time form tokens.
# GitHub embeds authenticity_token and timestamp_secret as hidden form
# fields; both must be echoed back in the login POST.
url = 'https://github.com/login'
login_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}

login_response = requests.get(url, headers=login_headers)

# Step 2: extract the hidden-field values with regular expressions.
token_matches = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />',
                           login_response.text,
                           re.S)
secret_matches = re.findall('<input type="hidden" name="timestamp_secret" value="(.*?)" class="form-control" />',
                            login_response.text,
                            re.S)
if not token_matches or not secret_matches:
    # FIX: fail with a clear message instead of an opaque IndexError when
    # GitHub changes the login-page markup and the regexes stop matching.
    raise RuntimeError('could not locate authenticity_token/timestamp_secret on the login page')

authenticity_token = token_matches[0]
timestamp_secret = secret_matches[0]

# Step 3: POST the complete form to https://github.com/session.
form_data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'ga_id': '',
    'login': 'aaa',
    'password': 'bbb',
    'webauthn-support': 'supported',
    'webauthn-iuvpaa-support': 'supported',
    'required_field_a359': '',
    # FIX: the original hard-coded a stale millisecond timestamp
    # (1577696892274); the form expects the current time, so compute the
    # epoch-milliseconds at request time.
    'timestamp': int(time.time() * 1000),
    'timestamp_secret': timestamp_secret
}

session_url = 'https://github.com/session'
# FIX: send the User-Agent and Referer headers the request analysis above
# lists as required, and carry over the anonymous cookies from step 1 so
# the tokens validate against the same pre-login session.
session_headers = dict(login_headers, Referer='https://github.com/login')
session_response = requests.post(
    session_url,
    data=form_data,
    headers=session_headers,
    cookies=login_response.cookies
)

print(session_response.status_code)
print(session_response.cookies)

# After login, any page can be fetched by forwarding the authenticated
# cookies — either via a Cookie header or the cookies= keyword.

# Step 4: verify the login by fetching a settings page that requires
# authentication and checking that the account's username appears in it.
emails_response = requests.get('https://github.com/settings/emails',
                               headers=login_headers,
                               cookies=session_response.cookies)
print('aaa' in emails_response.text)



爬取梨视频

#爬取视频
#https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=48&mrd=0.9993282952193101&filterIds=1625835,1625642,1625837,1625841,1625870,1625869,1625813,1625844,1625801,1625856,1625857,1625847,1625838,1625827,1625787
#https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0
#获取视频
# Scrape video detail links from Pearvideo's category-loading endpoint
# and download each video file to the current directory.
import re
import requests  # FIX: this standalone demo never imported requests

res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')

# Each list item links to its detail page through this anchor class.
reg_text = '<a href="(.*?)" class="vervideo-lilink actplay">'

video_paths = re.findall(reg_text, res.text)
print(video_paths)
for path in video_paths:
    detail_url = 'https://www.pearvideo.com/' + path
    detail_res = requests.get(detail_url)
    # srcUrl is embedded in the detail page's inline JavaScript.
    src_matches = re.findall('srcUrl="(.*?)"', detail_res.text)
    if not src_matches:
        # FIX: skip pages whose markup changed instead of crashing on [0].
        print('no srcUrl found on', detail_url)
        continue
    video_url = src_matches[0]
    print(video_url)
    # Use the last path segment of the media URL as the local file name.
    name = video_url.rsplit('/', 1)[1]
    print(name)
    # FIX: stream the download and write in chunks — the original called
    # iter_content() with no chunk_size, which yields one byte at a time
    # and did not use stream=True, so the whole body was buffered anyway.
    video_res = requests.get(video_url, stream=True)
    with open(name, 'wb') as f:
        for chunk in video_res.iter_content(chunk_size=8192):
            f.write(chunk)
posted @ 2019-12-30 19:38  Micheal_L  阅读(525)  评论(0编辑  收藏  举报