urllib_9: AJAX GET request for the first page of Douban movies
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 21:46.
@Author: haifei
"""
import time
from urllib import request, parse
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
_request = request.Request(url=url, headers=headers)
response = request.urlopen(_request)
content = response.read().decode('utf-8')
print(content)
# classic style: open, write, close explicitly
fp = open('./download/douban-page1.json', 'w', encoding='utf-8')
fp.write(content)
fp.close()
# preferred style: the with-statement closes the file automatically
with open('./download/douban-page2.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
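The endpoint answers with a JSON array, so the saved file can be inspected programmatically. A minimal parsing sketch, assuming each element carries 'title' and 'score' fields (inspect the downloaded file to confirm; the Douban schema is not guaranteed):

import json

with open('./download/douban-page1.json', encoding='utf-8') as fp:
    movies = json.load(fp)  # the endpoint returns a JSON array of movie objects
for movie in movies:
    # 'title' and 'score' are assumed field names; check the file to confirm
    print(movie.get('title'), movie.get('score'))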
urllib_10: AJAX GET requests for the first ten pages of Douban movies
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 21:46.
@Author: haifei
"""
import time
from urllib import request, parse
"""
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
# 以下在在页面上为ajax异步请求实现,后端为分页
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=20&limit=20'
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=40&limit=20'
page 1 2 3 4 ...
start 0 20 40 60 ...
找到分页规律:start =(page - 1)* 20
1. 请求对象定制
2. 获取响应数据
3. 下载数据
"""
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
def create_request(page):
    # translate the page number into start/limit query parameters
    data = {
        'start': (page - 1) * 20,
        'limit': 20
    }
    url = base_url + parse.urlencode(data)  # append the encoded query string
    _request = request.Request(url=url, headers=headers)
    return _request
def get_content(_request):
    response = request.urlopen(_request)
    content = response.read().decode('utf-8')
    return content
def download_data(content, page):
    file_name = 'doubanpage-' + str(page) + '.json'
    with open('./download/' + file_name, 'w', encoding='utf-8') as fp:
        fp.write(content)
if __name__ == '__main__':
    start = time.time()
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    if (start_page < 1) or (end_page < 1):
        print('[note: page numbers must be greater than 0]')
    elif start_page > end_page:
        print('[note: the start page must not be greater than the end page]')
    else:
        for page in range(start_page, end_page + 1):
            myrequest = create_request(page)
            mycontent = get_content(myrequest)
            download_data(mycontent, page)
        print('download finished')
    print('It takes', time.time() - start, "seconds.")
urllib_11: AJAX POST request for KFC store locations
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 22:51.
@Author: haifei
"""
import time
from urllib import request, parse
from pyfiglet import Figlet
"""
request headers中含有X-Requested-With: XMLHttpRequest时说明为ajax请求
'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
post请求的参数在payload的form data里,如下
cname: 北京
pid:
pageIndex: 1
pageSize: 10
cname: 北京
pid:
pageIndex: 2
pageSize: 10
cname: 北京
pid:
pageIndex: 2
pageSize: 10
"""
def hello_message():
    print('*' * 100)
    f = Figlet()
    print(f.renderText('irun2u'))
    print('Name: kfc store spider')
    print('Version: 1.0')
    print('Index: http://www.irun2u.top')
    print('*' * 100)
def legal(s):
    # strip a leading sign so isdigit() can validate inputs like '+3' or '-3';
    # s[:1] (rather than s[0]) keeps this safe for empty input
    if s[:1] in ('+', '-'):
        return s[1:]
    return s
def get_page():
    input_start = input('Enter the start page: ')
    input_end = input('Enter the end page: ')
    if not legal(input_start).isdigit() or not legal(input_end).isdigit():
        print('[note: page numbers must be numeric]')
    else:
        page_start = int(input_start)
        page_end = int(input_end)
        if (page_start < 1) or (page_end < 1):
            print('[note: page numbers must be greater than 0]')
        elif page_start > page_end:
            print('[note: the start page must not be greater than the end page]')
        else:
            return [page_start, page_end]
def create_request(page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    data = {
        'cname': '北京',  # Beijing
        'pid': '',
        'pageIndex': page,
        'pageSize': '10'
    }
    # POST data must be urlencoded and then encoded to bytes
    _data = parse.urlencode(data).encode('utf-8')
    _request = request.Request(url=base_url, headers=headers, data=_data)
    return _request
def get_content(myrequest):
    response = request.urlopen(myrequest)
    content = response.read().decode('utf-8')
    return content
def download_data(page, mycontent):
    file_name = 'kfcstore-' + str(page) + '.json'
    with open('./download/' + file_name, 'w', encoding='utf-8') as fp:
        fp.write(mycontent)
if __name__ == '__main__':
    start = time.time()
    hello_message()
    pages = get_page()
    if pages is not None:
        page_start = pages[0]
        page_end = pages[1]
        for page in range(page_start, page_end + 1):
            myrequest = create_request(page)
            mycontent = get_content(myrequest)
            download_data(page, mycontent)
        print('download finished')
    print('It takes', time.time() - start, "seconds.")
urllib_12: exception handling
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/29 21:59.
@Author: haifei
"""
import time
import urllib.request, urllib.error
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
url = 'https://blog.csdn.net/csdnnews/article/details/129774767' + '1111'  # garbage suffix forces a 404
try:
    _request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(_request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    print('[note: 404]')
url = 'https://www.dandan789.com/'  # presumably a dead host, meant to trigger URLError
try:
    _request2 = urllib.request.Request(url=url, headers=headers)
    response2 = urllib.request.urlopen(_request2)
    content2 = response2.read().decode('utf-8')
    print(content2)
except urllib.error.URLError:
    print('[note: URLError, host unreachable]')
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
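urllib.error.HTTPError is a subclass of urllib.error.URLError, so one handler can catch both; with separate handlers the more specific HTTPError must come first. A sketch combining the two cases above:

import urllib.error
import urllib.request

try:
    urllib.request.urlopen('https://www.dandan789.com/')
except urllib.error.HTTPError as e:  # the server answered with an error status
    print('HTTP error:', e.code)
except urllib.error.URLError as e:   # network-level failure (DNS, connection refused, ...)
    print('URL error:', e.reason)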
urllib_13: Qzone login via cookies
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/29 22:13.
@Author: haifei
"""
import time
import urllib.request
'''
Scenario:
  Log in to Qzone in one browser; the profile URL is
  https://user.qzone.qq.com/xxxxx
  Open the same URL in a second browser: it prompts for login instead.
Why does a bare request fail?
  The request headers carry too little information, so the server rejects it.
Fix: in the logged-in browser, find the request to the target page and copy its
request headers into the headers dict below.
The decisive fields are cookie and referer:
  - cookie carries the login state
  - referer is hotlink protection: the server checks that the current page was
    reached from an expected previous page
'''
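Copying the cookie header by hand works for a one-off grab, but the standard library can also capture and replay cookies automatically via http.cookiejar. A minimal sketch of that handler-based approach (not specific to Qzone):

import http.cookiejar
import urllib.request

cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
# cookies set by one response are replayed on later requests through this opener
response = opener.open('http://www.baidu.com')
for cookie in cookie_jar:
    print(cookie.name, cookie.value)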
'''
url = 'https://user.qzone.qq.com/xxxxx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39'
}
# build the request object
_request = urllib.request.Request(url=url, headers=headers)
# send the request like a browser would
response = urllib.request.urlopen(_request)
# read the response data
content = response.read().decode('utf-8')
# save it locally: with only a User-Agent this yields the login page, not the profile
with open('./download/qqspace.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
'''
url = 'https://user.qzone.qq.com/xxxxx'
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    'cookie': '......',  # paste the real cookie copied from the logged-in browser here
    'if-modified-since': 'Wed, 29 Mar 2023 14:43:59 GMT',
    'referer': 'https://qzs.qq.com/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Microsoft Edge";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39',
}
_request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(_request)
content = response.read().decode('utf-8')
with open('./download/qqspace2.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
urllib_15: basic handler usage
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:17.
@Author: haifei
"""
import time
from urllib import request, parse
url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
_request = request.Request(url=url, headers=headers)
handler = request.HTTPHandler()         # handles plain http:// requests
opener = request.build_opener(handler)  # the opener plays the role of urlopen
response = opener.open(_request)
content = response.read().decode('utf-8')
print(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
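An opener built this way must be called explicitly; request.install_opener makes it the process-wide default so that plain request.urlopen goes through it as well:

from urllib import request

opener = request.build_opener(request.HTTPHandler())
request.install_opener(opener)  # from now on request.urlopen() uses this opener
response = request.urlopen('http://www.baidu.com')
print(response.status)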
urllib_16: basic proxy usage
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:40.
@Author: haifei
"""
import time
from urllib import request, parse
url = 'https://www.ip138.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
_request = request.Request(url=url, headers=headers)
proxies = {
    'http': 's562.kdltps.com:15818'
}
handler = request.ProxyHandler(proxies=proxies)  # routes matching requests through the proxy
opener = request.build_opener(handler)
response = opener.open(_request)
content = response.read().decode('utf-8')
with open('./download/daili-ip138.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
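One gotcha: the keys of the ProxyHandler dict are URL schemes. With only an 'http' entry, requests to https:// URLs such as the ip138 address above bypass the proxy entirely, so both schemes usually need entries (the proxy address below is the same one used above):

from urllib import request

proxies = {
    'http': 's562.kdltps.com:15818',   # used for http:// URLs
    'https': 's562.kdltps.com:15818',  # used for https:// URLs
}
handler = request.ProxyHandler(proxies=proxies)
opener = request.build_opener(handler)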
urllib_17: proxy pool
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:40.
@Author: haifei
"""
import time
from urllib import request, parse
import random
url = 'https://www.ip138.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
proxies_pool = [
    {'http': 's562.kdltps.com:15818'},
    {'http': '222.74.73.202:42055'},
    {'http': '121.13.252.62:41564'},
    {'http': '27.42.168.46:55481'}
]
proxies = random.choice(proxies_pool)  # pick one proxy at random for this run
print(proxies)
_request = request.Request(url=url, headers=headers)
handler = request.ProxyHandler(proxies=proxies)
opener = request.build_opener(handler)
response = opener.open(_request)
content = response.read().decode('utf-8')
with open('./download/daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
if __name__ == '__main__':
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
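random.choice gives no protection against a dead proxy. A natural extension is to retry with the remaining candidates until one answers; a hedged sketch built on the pool above (open_with_pool is an illustrative helper, not part of the original script):

import random
from urllib import error, request

def open_with_pool(url, headers, pool, timeout=5):
    # try the proxies in random order until one of them answers
    for proxies in random.sample(pool, len(pool)):
        opener = request.build_opener(request.ProxyHandler(proxies=proxies))
        try:
            return opener.open(request.Request(url=url, headers=headers), timeout=timeout)
        except error.URLError:
            continue  # dead or refused proxy, move on to the next one
    raise RuntimeError('all proxies in the pool failed')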