A self-written GET request wrapper around http.client
Because requests' GET turned out to have all sorts of shortcomings (explained at the end of this post), I wrapped http.client into a client_get function of my own that supports setting a proxy, toggling SSL verification, and choosing whether to follow redirects.
import ssl
import urllib.parse
import http.client

def client_get(url, headers=None, proxies=None, verify=True, allow_redirect=True):
    # Only HTTPS targets are handled here, which is all the example below needs.
    parsed = urllib.parse.urlparse(url)
    port = parsed.port or 443
    context = None if verify else ssl._create_unverified_context()
    if proxies:
        # Connect to the proxy first, then open a CONNECT tunnel to the real host.
        proxy = urllib.parse.urlparse(proxies["http"])
        conn = http.client.HTTPSConnection(proxy.hostname, proxy.port, context=context)
        conn.set_tunnel(parsed.hostname, port)
    else:
        conn = http.client.HTTPSConnection(parsed.hostname, port, context=context)
    if parsed.query == "":
        path = parsed.path or "/"
    else:
        path = "%s?%s" % (parsed.path, parsed.query)
    if headers is None:
        headers = {}
    conn.request("GET", path, headers=headers)
    response = conn.getresponse()
    if response.status in (301, 302) and allow_redirect:
        location = response.getheader("Location", "")
        return client_get(location, headers=headers, proxies=proxies,
                          verify=verify, allow_redirect=allow_redirect)
    return response
Usage example
url = "https://pornhub.com/view_video.php?viewkey=ph635d67e1ecd60"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.74 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
    'Connection': 'close',
}
proxies = {
    "http": "http://127.0.0.1:8888",
    "https": "http://127.0.0.1:8888",
}
res = client_get(url, headers=headers, proxies=proxies, verify=False, allow_redirect=True)
print(res.read().decode('utf-8'))
Addendum:
Environment:
python 3.7.4
requests-2.28.1
urllib3-1.26.12
It all started when I used requests to fetch (https://pornhub.com/view_video.php?viewkey=ph635d67e1ecd60) and got back the wrong content, while the same URL worked fine in a browser. By sheer coincidence I then tried http.client and it returned the correct page, which made me realize the problem was in the Python-side API itself rather than the site. To pin it down, I re-implemented both requests.get and http.client's GET on top of raw sockets.
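For reference, this is roughly the comparison that started the investigation (a sketch only; the proxy address 127.0.0.1:7890 and the unverified-TLS setting are my own assumptions, adjust them to your environment). In my setup the requests call came back with the wrong body, while the http.client call through the same proxy returned the expected page.

import ssl
import http.client
import requests

url = "https://cn.pornhub.com/view_video.php?viewkey=ph635d67e1ecd60"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.74 Safari/537.36'}
proxies = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}

# 1) requests: returned unexpected page content in my environment
r = requests.get(url, headers=headers, proxies=proxies, verify=False)
print(r.status_code, len(r.text))

# 2) http.client through the same proxy: returned the expected page
conn = http.client.HTTPSConnection("127.0.0.1", 7890,
                                   context=ssl._create_unverified_context())
conn.set_tunnel("cn.pornhub.com")
conn.request("GET", "/view_video.php?viewkey=ph635d67e1ecd60", headers=headers)
resp = conn.getresponse()
print(resp.status, len(resp.read()))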
requests.get, re-implemented with raw sockets:
import email.parser
import socket
from http.client import HTTPMessage
from urllib3.util.connection import allowed_gai_family
from urllib3.util.ssl_ import create_urllib3_context, resolve_ssl_version, resolve_cert_reqs

_MAXLINE = 65536

def read_status(fp):
    # Read the "HTTP/1.1 200 OK" status line of a response.
    line = str(fp.readline(_MAXLINE + 1), "iso-8859-1")
    version, status, reason = line.split(None, 2)
    status = int(status)
    return version, status, reason

def parse_headers(fp, _class=HTTPMessage):
    """Parses only RFC2822 headers from a file pointer.

    email Parser wants to see strings rather than bytes.
    But a TextIOWrapper around self.rfile would buffer too many bytes
    from the stream, bytes which we later need to read as bytes.
    So we read the correct bytes here, as bytes, for email Parser
    to parse.
    """
    headers = []
    while True:
        line = fp.readline(_MAXLINE + 1)
        headers.append(line)
        if line in (b'\r\n', b'\n', b''):
            break
    hstring = b''.join(headers).decode('iso-8859-1')
    return email.parser.Parser(_class=_class).parsestr(hstring)

if __name__ == "__main__":
    host = "127.0.0.1"
    port = 7890  # change this to the port of your own proxy/VPN
    family = allowed_gai_family()
    tunnel_host = 'cn.pornhub.com'
    tunnel_port = 443
    af, socktype, proto, canonname, sa = socket.getaddrinfo(host, port, family, socket.SOCK_STREAM)[0]
    sock = socket.socket(af, socktype, proto)
    sock.settimeout(None)
    sock.connect(sa)
    # Ask the proxy to open a CONNECT tunnel to the target host.
    connect_str = "CONNECT %s:%d HTTP/1.0\r\n" % (tunnel_host, tunnel_port)
    connect_bytes = connect_str.encode("ascii")
    sock.sendall(connect_bytes)
    sock.sendall(b"\r\n")
    fp = sock.makefile("rb")
    (version, code, message) = read_status(fp)
    line = fp.readline(_MAXLINE + 1)
    print(line)
    # Build the TLS context the same way urllib3 (and therefore requests) does,
    # including the ALPN offer of http/1.1.
    context = create_urllib3_context(ssl_version=resolve_ssl_version(None), cert_reqs=resolve_cert_reqs('CERT_NONE'))
    context.load_default_certs()
    context.set_alpn_protocols(['http/1.1'])
    sock = context.wrap_socket(sock, server_hostname=tunnel_host)
    print(sock)
    msg = b'GET /view_video.php?viewkey=ph635d67e1ecd60 HTTP/1.1\r\nHost: cn.pornhub.com\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.74 Safari/537.36\r\nAccept-Encoding: identity\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nConnection: keep-alive\r\nAccept-Language: en-us,en;q=0.5\r\nSec-Fetch-Mode: navigate\r\n\r\n'
    sock.sendall(msg)
    sock.settimeout(None)
    fp = sock.makefile("rb")
    (version, code, message) = read_status(fp)
    headers = parse_headers(fp)
    print(headers)
    data = fp.read()
    print(data)
http.client, re-implemented with raw sockets:
import email.parser
import socket
from http.client import HTTPMessage
from urllib3.util import resolve_ssl_version, resolve_cert_reqs
from urllib3.util.connection import allowed_gai_family
from urllib3.util.ssl_ import create_urllib3_context

_MAXLINE = 65535

def read_status(fp):
    # Read the "HTTP/1.1 200 OK" status line of a response.
    line = str(fp.readline(_MAXLINE + 1), "iso-8859-1")
    version, status, reason = line.split(None, 2)
    status = int(status)
    return version, status, reason

def parse_headers(fp, _class=HTTPMessage):
    """Parses only RFC2822 headers from a file pointer.

    email Parser wants to see strings rather than bytes.
    But a TextIOWrapper around self.rfile would buffer too many bytes
    from the stream, bytes which we later need to read as bytes.
    So we read the correct bytes here, as bytes, for email Parser
    to parse.
    """
    headers = []
    while True:
        line = fp.readline(_MAXLINE + 1)
        headers.append(line)
        if line in (b'\r\n', b'\n', b''):
            break
    hstring = b''.join(headers).decode('iso-8859-1')
    return email.parser.Parser(_class=_class).parsestr(hstring)

host = "127.0.0.1"
port = 7890  # change this to the port of your own proxy/VPN
timeout = socket._GLOBAL_DEFAULT_TIMEOUT
msg = b'GET /view_video.php?viewkey=ph635d67e1ecd60 HTTP/1.1\r\nHost: cn.pornhub.com\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.74 Safari/537.36\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nSec-Fetch-Mode: navigate\r\nAccept-Encoding: identity\r\n\r\n'
tunnel_host = 'cn.pornhub.com'
tunnel_port = 443
sock = socket.create_connection((host, port), timeout, None)
# family = allowed_gai_family()
# af, socktype, proto, canonname, sa = socket.getaddrinfo(host, port, family, socket.SOCK_STREAM)[0]
# sock = socket.socket(af, socktype, proto)
# sock.settimeout(None)
# sock.connect(sa)
sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
# Ask the proxy to open a CONNECT tunnel to the target host.
connect_str = "CONNECT %s:%d HTTP/1.0\r\n" % (tunnel_host, tunnel_port)
connect_bytes = connect_str.encode("ascii")
sock.sendall(connect_bytes)
sock.sendall(b'\r\n')
fp = sock.makefile("rb")
(version, code, message) = read_status(fp)
line = fp.readline(_MAXLINE + 1)
print(line)
# Same TLS context as before, but this time ALPN is NOT offered: the decisive
# set_alpn_protocols call stays commented out, and the page comes back correctly.
context = create_urllib3_context(ssl_version=resolve_ssl_version(None), cert_reqs=resolve_cert_reqs('CERT_NONE'))
sock = context.wrap_socket(sock, server_hostname=tunnel_host)
# context.load_default_certs()
# context.set_alpn_protocols(['http/1.1'])
sock.sendall(msg)
# sock.settimeout(None)
fp = sock.makefile("rb")
(version, code, message) = read_status(fp)
headers = parse_headers(fp)
print(headers)
data = fp.read()
print(data)
After comparing the two versions and repeatedly tweaking and re-testing them, it turned out that context.set_alpn_protocols(['http/1.1']) is the decisive factor: as soon as it is set, the response body comes back wrong. Why exactly this happens I still don't know. The set_alpn_protocols call lives in the ssl_wrap_socket function in urllib3\util\ssl_.py; urllib3==1.24.2 does not contain that line yet, so if you don't want to switch to http.client, reinstalling (downgrading) urllib3 is enough.
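If downgrading urllib3 is not an option, one alternative I can think of (my own workaround sketch, not something from the experiment above, and it assumes set_alpn_protocols really is the only culprit) is to blank out ALPN at the ssl level before requests/urllib3 build their TLS context. Since the call goes through ssl.SSLContext, a process-wide no-op patch stops urllib3 1.26.x from offering http/1.1:

# Workaround sketch (assumption, not verified against every urllib3 release):
# make SSLContext.set_alpn_protocols a no-op so urllib3's ssl_wrap_socket
# cannot offer ALPN. Note this disables ALPN for the whole process.
import ssl
import requests

ssl.SSLContext.set_alpn_protocols = lambda self, alpn_protocols: None

url = "https://cn.pornhub.com/view_video.php?viewkey=ph635d67e1ecd60"
proxies = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
res = requests.get(url, proxies=proxies, verify=False)
print(res.status_code, len(res.text))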
ALPN (Application-Layer Protocol Negotiation)
ALPN is a TLS extension. TLS is an encryption protocol: once the client and server have finished negotiating the encryption parameters, they switch over to the application-data protocol, and ALPN lets them agree, during that same handshake, on which application protocol (such as http/1.1) to use. Without ALPN, extra round trips would be needed afterwards just to settle on the application protocol, which wastes time.
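As a small illustration of what ALPN looks like from Python (www.example.com is only a placeholder host; any TLS server works), the client offers a protocol list before the handshake and can read back what the server selected:

import socket
import ssl

ctx = ssl.create_default_context()
ctx.set_alpn_protocols(["h2", "http/1.1"])  # protocols the client offers

with socket.create_connection(("www.example.com", 443)) as raw_sock:
    with ctx.wrap_socket(raw_sock, server_hostname="www.example.com") as tls_sock:
        # The protocol the server picked during the handshake,
        # or None if the server ignored the ALPN extension entirely.
        print(tls_sock.selected_alpn_protocol())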