urllib_9: AJAX GET request for the first page of Douban movies
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 21:46.
@Author: haifei
"""
import time
from urllib import request, parse
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
_request = request.Request(url=url, headers=headers)
response = request.urlopen(_request)
content = response.read().decode('utf-8')
print(content)
# classic style: open, write, close explicitly
fp = open('./download/douban-page1.json', 'w', encoding='utf-8')
fp.write(content)
fp.close()
# preferred style: the with-statement closes the file automatically
with open('./download/douban-page2.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
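The endpoint answers with a JSON array, so the saved file can be inspected programmatically. A minimal parsing sketch, assuming each element carries 'title' and 'score' fields (inspect the downloaded file to confirm; the Douban schema is not guaranteed):

import json

with open('./download/douban-page1.json', encoding='utf-8') as fp:
    movies = json.load(fp)  # the endpoint returns a JSON array of movie objects
for movie in movies:
    # 'title' and 'score' are assumed field names; check the file to confirm
    print(movie.get('title'), movie.get('score'))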
urllib_10: AJAX GET requests for the first ten pages of Douban movies
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 21:46.
@Author: haifei
"""
import time
from urllib import request, parse
"""
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
# 以下在在页面上为ajax异步请求实现,后端为分页
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=20&limit=20'
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=40&limit=20'
page 1 2 3 4 ...
start 0 20 40 60 ...
找到分页规律:start =(page - 1)* 20
1. 请求对象定制
2. 获取响应数据
3. 下载数据
"""
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
def create_request(page):
    # translate the page number into start/limit query parameters
    data = {
        'start': (page - 1) * 20,
        'limit': 20
    }
    url = base_url + parse.urlencode(data)  # append the encoded query string
    _request = request.Request(url=url, headers=headers)
    return _request
def get_content(_request):
    response = request.urlopen(_request)
    content = response.read().decode('utf-8')
    return content
def download_data(content, page):
    file_name = 'doubanpage-' + str(page) + '.json'
    with open('./download/' + file_name, 'w', encoding='utf-8') as fp:
        fp.write(content)
if __name__ == '__main__':
    start = time.time()
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    if (start_page < 1) or (end_page < 1):
        print('[note: page numbers must be greater than 0]')
    elif start_page > end_page:
        print('[note: the start page must not be greater than the end page]')
    else:
        for page in range(start_page, end_page + 1):
            myrequest = create_request(page)
            mycontent = get_content(myrequest)
            download_data(mycontent, page)
        print('download finished')
    print('It takes', time.time() - start, "seconds.")
urllib_11: AJAX POST request for KFC store locations
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 22:51.
@Author: haifei
"""
import time
from urllib import request, parse
from pyfiglet import Figlet
"""
request headers中含有X-Requested-With: XMLHttpRequest时说明为ajax请求
'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
post请求的参数在payload的form data里,如下
cname: 北京
pid:
pageIndex: 1
pageSize: 10
cname: 北京
pid:
pageIndex: 2
pageSize: 10
cname: 北京
pid:
pageIndex: 2
pageSize: 10
"""
def hello_message():
    print('*' * 100)
    f = Figlet()
    print(f.renderText('irun2u'))
    print('Name: kfc store spider')
    print('Version: 1.0')
    print('Index: http://www.irun2u.top')
    print('*' * 100)
def legal(s):
    # strip a leading sign so isdigit() can validate inputs like '+3' or '-3';
    # s[:1] (rather than s[0]) keeps this safe for empty input
    if s[:1] in ('+', '-'):
        return s[1:]
    return s
def get_page():
    input_start = input('Enter the start page: ')
    input_end = input('Enter the end page: ')
    if not legal(input_start).isdigit() or not legal(input_end).isdigit():
        print('[note: page numbers must be numeric]')
    else:
        page_start = int(input_start)
        page_end = int(input_end)
        if (page_start < 1) or (page_end < 1):
            print('[note: page numbers must be greater than 0]')
        elif page_start > page_end:
            print('[note: the start page must not be greater than the end page]')
        else:
            return [page_start, page_end]
def create_request(page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    data = {
        'cname': '北京',  # Beijing
        'pid': '',
        'pageIndex': page,
        'pageSize': '10'
    }
    # POST data must be urlencoded and then encoded to bytes
    _data = parse.urlencode(data).encode('utf-8')
    _request = request.Request(url=base_url, headers=headers, data=_data)
    return _request
def get_content(myrequest):
    response = request.urlopen(myrequest)
    content = response.read().decode('utf-8')
    return content
def download_data(page, mycontent):
    file_name = 'kfcstore-' + str(page) + '.json'
    with open('./download/' + file_name, 'w', encoding='utf-8') as fp:
        fp.write(mycontent)
if __name__ == '__main__':
    start = time.time()
    hello_message()
    pages = get_page()
    if pages is not None:
        page_start = pages[0]
        page_end = pages[1]
        for page in range(page_start, page_end + 1):
            myrequest = create_request(page)
            mycontent = get_content(myrequest)
            download_data(page, mycontent)
        print('download finished')
    print('It takes', time.time() - start, "seconds.")
urllib_12: exception handling
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/29 21:59.
@Author: haifei
"""
import time
import urllib.request, urllib.error
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
url = 'https://blog.csdn.net/csdnnews/article/details/129774767' + '1111'  # garbage suffix forces a 404
try:
    _request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(_request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    print('[note: 404]')
url = 'https://www.dandan789.com/'  # presumably a dead host, meant to trigger URLError
try:
    _request2 = urllib.request.Request(url=url, headers=headers)
    response2 = urllib.request.urlopen(_request2)
    content2 = response2.read().decode('utf-8')
    print(content2)
except urllib.error.URLError:
    print('[note: URLError, host unreachable]')
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
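urllib.error.HTTPError is a subclass of urllib.error.URLError, so one handler can catch both; with separate handlers the more specific HTTPError must come first. A sketch combining the two cases above:

import urllib.error
import urllib.request

try:
    urllib.request.urlopen('https://www.dandan789.com/')
except urllib.error.HTTPError as e:  # the server answered with an error status
    print('HTTP error:', e.code)
except urllib.error.URLError as e:   # network-level failure (DNS, connection refused, ...)
    print('URL error:', e.reason)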
urllib_13: Qzone login via cookies
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/29 22:13.
@Author: haifei
"""
import time
import urllib.request
'''
Scenario:
  Log in to Qzone in one browser; the profile URL is
  https://user.qzone.qq.com/xxxxx
  Open the same URL in a second browser: it prompts for login instead.
Why does a bare request fail?
  The request headers carry too little information, so the server rejects it.
Fix: in the logged-in browser, find the request to the target page and copy its
request headers into the headers dict below.
The decisive fields are cookie and referer:
  - cookie carries the login state
  - referer is hotlink protection: the server checks that the current page was
    reached from an expected previous page
'''
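Copying the cookie header by hand works for a one-off grab, but the standard library can also capture and replay cookies automatically via http.cookiejar. A minimal sketch of that handler-based approach (not specific to Qzone):

import http.cookiejar
import urllib.request

cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
# cookies set by one response are replayed on later requests through this opener
response = opener.open('http://www.baidu.com')
for cookie in cookie_jar:
    print(cookie.name, cookie.value)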
'''
url = 'https://user.qzone.qq.com/xxxxx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39'
}
# build the request object
_request = urllib.request.Request(url=url, headers=headers)
# send the request like a browser would
response = urllib.request.urlopen(_request)
# read the response data
content = response.read().decode('utf-8')
# save it locally: with only a User-Agent this yields the login page, not the profile
with open('./download/qqspace.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
'''
url = 'https://user.qzone.qq.com/xxxxx'
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    'cookie': '......',  # paste the real cookie copied from the logged-in browser here
    'if-modified-since': 'Wed, 29 Mar 2023 14:43:59 GMT',
    'referer': 'https://qzs.qq.com/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Microsoft Edge";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39',
}
_request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(_request)
content = response.read().decode('utf-8')
with open('./download/qqspace2.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
urllib_15: basic handler usage
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:17.
@Author: haifei
"""
import time
from urllib import request, parse
url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
_request = request.Request(url=url, headers=headers)
handler = request.HTTPHandler()         # handles plain http:// requests
opener = request.build_opener(handler)  # the opener plays the role of urlopen
response = opener.open(_request)
content = response.read().decode('utf-8')
print(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
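An opener built this way must be called explicitly; request.install_opener makes it the process-wide default so that plain request.urlopen goes through it as well:

from urllib import request

opener = request.build_opener(request.HTTPHandler())
request.install_opener(opener)  # from now on request.urlopen() uses this opener
response = request.urlopen('http://www.baidu.com')
print(response.status)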
urllib_16: basic proxy usage
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:40.
@Author: haifei
"""
import time
from urllib import request, parse
url = 'https://www.ip138.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
_request = request.Request(url=url, headers=headers)
proxies = {
    'http': 's562.kdltps.com:15818'
}
handler = request.ProxyHandler(proxies=proxies)  # routes matching requests through the proxy
opener = request.build_opener(handler)
response = opener.open(_request)
content = response.read().decode('utf-8')
with open('./download/daili-ip138.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
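One gotcha: the keys of the ProxyHandler dict are URL schemes. With only an 'http' entry, requests to https:// URLs such as the ip138 address above bypass the proxy entirely, so both schemes usually need entries (the proxy address below is the same one used above):

from urllib import request

proxies = {
    'http': 's562.kdltps.com:15818',   # used for http:// URLs
    'https': 's562.kdltps.com:15818',  # used for https:// URLs
}
handler = request.ProxyHandler(proxies=proxies)
opener = request.build_opener(handler)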
urllib_17: proxy pool
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:40.
@Author: haifei
"""
import time
from urllib import request, parse
import random
url = 'https://www.ip138.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
proxies_pool = [
    {'http': 's562.kdltps.com:15818'},
    {'http': '222.74.73.202:42055'},
    {'http': '121.13.252.62:41564'},
    {'http': '27.42.168.46:55481'}
]
proxies = random.choice(proxies_pool)  # pick one proxy at random for this run
print(proxies)
_request = request.Request(url=url, headers=headers)
handler = request.ProxyHandler(proxies=proxies)
opener = request.build_opener(handler)
response = opener.open(_request)
content = response.read().decode('utf-8')
with open('./download/daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
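random.choice gives no protection against a dead proxy. A natural extension is to retry with the remaining candidates until one answers; a hedged sketch built on the pool above (open_with_pool is an illustrative helper, not part of the original script):

import random
from urllib import error, request

def open_with_pool(url, headers, pool, timeout=5):
    # try the proxies in random order until one of them answers
    for proxies in random.sample(pool, len(pool)):
        opener = request.build_opener(request.ProxyHandler(proxies=proxies))
        try:
            return opener.open(request.Request(url=url, headers=headers), timeout=timeout)
        except error.URLError:
            continue  # dead or refused proxy, move on to the next one
    raise RuntimeError('all proxies in the pool failed')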