爬虫 http原理,梨视频,github登陆实例,requests请求参数小总结
回顾:HTTP协议基于请求-响应的方式。请求:请求首行、请求头({'key': 'value'})、请求体;响应:响应首行、响应头({'key': 'value'})、响应体。
import socket

# Minimal HTTP server: prints each raw request it receives and replies with
# the contents of login.html as a 200 response.
sock = socket.socket()
sock.bind(("127.0.0.1", 8808))
sock.listen(5)

while True:
    print("server waiting.....")
    conn, addr = sock.accept()
    # The context manager closes the connection even when reading the request
    # or the HTML file raises, so the socket is never leaked.
    with conn:
        data = conn.recv(1024)
        print("data", data)

        # Read the HTML file that is served as the response body.
        with open("login.html", "rb") as f:
            data = f.read()

        # sendall() keeps writing until the whole response has been handed to
        # the kernel; a plain send() may transmit only part of a large page.
        conn.sendall(b"HTTP/1.1 200 OK\r\nContent-type:text/html\r\n\r\n%s" % data)
''' GET请求 # 请求首行 GET / HTTP/1.1\r\n # get请求后面的参数 b'GET /?name=wd&age=11 HTTP/1.1\r\n # 请求头 Host: 127.0.0.1:8008\r\n Connection: keep-alive\r\n Cache-Control: max-age=0\r\n Upgrade-Insecure-Requests: 1\r\n User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181Safari/537.36\r\n Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\r\nAccept-Encoding: gzip, deflate, br\r\n Accept-Language: zh-CN,zh;q=0.9\r\n Cookie:csrftoken=7xx6BxQDJ6KB0PM7qS8uTA892ACtooNbnnF4LDwlYk1Y7S7nTS81FBqwruizHsxF\r\n\r\n' # 请求体(get请求,请求体为空) ''' b'' ''' POST请求 # 请求首行 b'POST /?name=wd&age=11 HTTP/1.1\r\n # 请求头 Host: 127.0.0.1:8008\r\n Connection: keep-alive\r\n Content-Length: 21\r\n Cache-Control: max-age=0\r\n Origin: http://127.0.0.1:8008\r\n Upgrade-Insecure-Requests: 1\r\n Content-Type: application/x-www-form-urlencoded\r\n User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36\r\n Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\r\n Referer: http://127.0.0.1:8008/?name=lqz&age=18\r\n Accept-Encoding: gzip, deflate, br\r\n Accept-Language: zh-CN,zh;q=0.9\r\n Cookie:csrftoken=7xx6BxQDJ6KB0PM7qS8uTA892ACtooNbnnF4LDwlYk1Y7S7nTS81FBqwruizHsxF\r\n\r\n' # 请求体 b'name=wd&password=11' '''
b"HTTP/1.1 200 OK\r\n Content-type:text/html\r\n\r\n %s"%data
http原理
梨视频案例
# A response can be consumed in three forms:
# 1. text    - match/extract the parts you need
# 2. content - raw bytes; save as an image, video, etc.
# 3. json    - deserialize into a dict or list


# Download helper
def download(videos, title):
    """Fetch the resource at URL ``videos`` and save it as ``video/<title>.mp4``.

    Args:
        videos: URL passed to ``requests.get``.
        title: File name (without extension) used inside the ``video`` directory.
    """
    # exist_ok=True avoids the check-then-create race that made os.mkdir raise
    # FileExistsError when several worker threads got here at the same time.
    os.makedirs('video', exist_ok=True)
    path = os.path.join('video', title) + '.mp4'
    res = requests.get(videos)
    # Write the binary body of the response to disk.
    with open(path, 'wb') as f:
        f.write(res.content)


# Run the downloads in worker threads
if __name__ == '__main__':
    from concurrent.futures import ThreadPoolExecutor

    # Leaving the with-block performs shutdown(wait=True), also when the loop
    # raises, so submitted downloads are always waited for.
    with ThreadPoolExecutor(10) as p:
        # parser_index, get_index, get_video and video_info are defined
        # outside this block; the result of video_info is indexed with the
        # keys 'video' and 'title'.
        for i in parser_index(get_index()):
            dic = video_info(get_video(i))
            print(dic)
            p.submit(download, dic['video'], dic['title'])
#注意问题:梨视频页面下滑时会继续加载视频,加载哪些内容由请求url中的参数决定(例如控制某个分类下显示多少条视频的参数)
github登陆实例
#get请求登陆页面 获取csrf随机字符串和cookies
#post请求执行登陆操作 必须携带的数据:csrf随机字符串、输入的用户名和密码等(请求体数据),以及cookies、user-agent、referer等(请求头数据)
数据属于请求体还是请求头? (我的理解:ajax里的data、表单提交的数据属于请求体;django视图返回的内容属于响应体;cookie则属于头部数据,例如django中 response.set_cookie('islogin', 'true') 会写入响应头Set-Cookie,浏览器之后的请求再通过请求头Cookie把它带上)
"""
1. Request the login page to obtain the token and the cookies.
2. Send the login POST request: the username, password and token go in the
   request body, the cookies go in the request headers.
"""
import requests
import re

login_url = "https://github.com/login"

# Browser identification sent with every request.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

# Request the login page.
res1 = requests.get(login_url, headers=headers)
print(res1.status_code)

# Extract the token from the response body. Fail with a clear message when the
# page does not contain it instead of an AttributeError on None.
token_match = re.search('name="authenticity_token" value="(.*?)"', res1.text)
if token_match is None:
    raise RuntimeError("authenticity_token not found in the login page")
token = token_match.group(1)

# Keep the cookies set by the login page.
login_cookie = res1.cookies.get_dict()
print(login_cookie)

# Send the login request.
res2 = requests.post(
    "https://github.com/session",
    headers=headers,
    cookies=login_cookie,
    data={
        "commit": "Sign in",
        "utf8": "✓",
        "authenticity_token": token,
        "login": "xxxxxxxxxxx",
        "password": "xxxxxxxxxxx",
    },
    # Do not follow redirects automatically, so res2 is the response to the
    # login POST itself and res2.cookies holds the cookies it set.
    allow_redirects=False,
)
print(res2.status_code)

# Cookies issued after the login request.
user_cookie = res2.cookies.get_dict()

# Visit the profile settings page carrying the user's cookies.
res3 = requests.get("https://github.com/settings/profile", cookies=user_cookie, headers=headers)
print(res3.status_code)
print(res3.text)
requests请求参数小总结
# GET request: query-string parameters are passed through `params`.
kwd = "吴秀波出轨门"
url = "https://www.baidu.com/s"
query = {"wd": kwd}
requests.get(url, params=query, headers=headers)

# POST request: form fields are passed through `data`.
session_url = "https://github.com/session"
post_headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
form = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": token,
    "login": "ssssss",
    "password": "ssssss",
}
requests.post(
    session_url,
    data=form,
    cookies=login_cookie,
    headers=post_headers,
    # Whether redirects are followed automatically.
    allow_redirects=False,
)

# Handling the return value
# response.cookies.get_dict()   # get the cookies
# response.status_code          # status code
# response.text                 # result as text
# response.content              # result as bytes
# response.json()               # deserialize the data into a dict or list