爬虫-requests库的使用
一、爬虫数据采集
1、按照采集对象分类
1、全网采集
2、全站采集
3、具体网站的指定数据采集
2、采集方案分类
1、利用http协议采集-页面分析
2、利用api接口采集-app数据采集
3、利用目标网站的api采集-微博、github、twitter、facebook
二、requests库爬虫
首先需要安装requests依赖包
进入虚拟环境,执行 pip install requests 安装requests包
三、requests实例
实例一:获取百度源码
request_test.py
import requests

# Fetch the Baidu home page and print its HTML source.
# Without a timeout, requests waits indefinitely on an unresponsive server.
res = requests.get("http://www.baidu.com", timeout=10)
print(res.text)
运行结果:
实例二:获取POST和GET请求
http_server.py
# Minimal threaded HTTP server built directly on a TCP socket.
import socket
import threading
import json

server = socket.socket()
# Bind to 0.0.0.0, port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()

# Response for GET: a login form that posts back to "/".
# The blank "\r\n" line separates the headers from the body.
GET_RESPONSE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: text/html\r\n"
    "\r\n"
    """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
"""
)

# Response for POST: "{}" is filled with the JSON-encoded course list.
POST_RESPONSE_TEMPLATE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: application/json\r\n"
    "\r\n"
    "{}"
)

# Payload returned to every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html",
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html",
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html",
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html",
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html",
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request and its first line are printed so the client's headers
    and body can be inspected. GET receives the HTML login form, POST
    receives the course list as JSON, and any other method gets no response.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; unused.
    """
    # The responses carry no Content-Length, so the client only knows the
    # body is complete once the connection is closed; ``with`` guarantees
    # that on every path, including errors.
    with sock:
        tmp_data = sock.recv(1024).decode("utf8")
        print(tmp_data)

        lines = tmp_data.splitlines()
        if not lines:
            # The client disconnected without sending a request.
            return
        request_line = lines[0]
        print(request_line)

        parts = request_line.split()
        if not parts:
            return
        method = parts[0]

        # sendall() keeps writing until the whole response is out;
        # send() may transmit only part of it.
        if method == "GET":
            sock.sendall(GET_RESPONSE.encode("utf8"))
        elif method == "POST":
            body = json.dumps(COURSES)
            sock.sendall(POST_RESPONSE_TEMPLATE.format(body).encode("utf8"))


while True:
    # Block until a client connects. The names deliberately avoid
    # rebinding ``socket``, which would shadow the module.
    client_sock, client_addr = server.accept()
    # Handle each connection on its own thread.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
request_test.py
import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}

# GET alternative: pass the dict through the ``params`` argument instead.
# response = requests.get(url, params=params, timeout=10)
# print(response.text)

# POST: pass the dict through the ``data`` argument.
# The timeout stops the script from hanging if the server never finishes
# its response.
res = requests.post(url, data=params, timeout=10)
print(res.text)
print(res.encoding)
print(res.json())
先运行http_server.py,再运行request_test.py
运行结果如下:
实例三:打印请求状态码
import requests

# Print the HTTP status code of the response.
# The timeout keeps the call from blocking forever on a stalled connection.
response = requests.get("https://www.baidu.com", timeout=10)
print(response.status_code)
输出结果:
实例四:自定义请求头(headers)并打印响应头
http_server.py
# Minimal threaded HTTP server built directly on a TCP socket.
import socket
import threading
import json

server = socket.socket()
# Bind to 0.0.0.0, port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()

# Response for GET: a login form that posts back to "/".
# The blank "\r\n" line separates the headers from the body.
GET_RESPONSE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: text/html\r\n"
    # NOTE(review): this looks like a misspelling of the CORS header
    # "Access-Control-Allow-Origin"; left unchanged, confirm the intent.
    "Access-Controller-Allow-Origin:https://localhost:63342\r\n"
    "\r\n"
    """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
"""
)

# Response for POST: "{}" is filled with the JSON-encoded course list.
POST_RESPONSE_TEMPLATE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: application/json\r\n"
    "\r\n"
    "{}"
)

# Payload returned to every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html",
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html",
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html",
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html",
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html",
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request and its first line are printed so the client's headers
    and body can be inspected. GET receives the HTML login form, POST
    receives the course list as JSON, and any other method gets no response.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; unused.
    """
    # The responses carry no Content-Length, so the client only knows the
    # body is complete once the connection is closed; ``with`` guarantees
    # that on every path, including errors.
    with sock:
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)

        lines = tmp_data.splitlines()
        if not lines:
            # The client disconnected without sending a request.
            return
        request_line = lines[0]
        print(request_line)

        parts = request_line.split()
        if not parts:
            return
        method = parts[0]

        # sendall() keeps writing until the whole response is out;
        # send() may transmit only part of it.
        if method == "GET":
            sock.sendall(GET_RESPONSE.encode("utf8"))
        elif method == "POST":
            body = json.dumps(COURSES)
            sock.sendall(POST_RESPONSE_TEMPLATE.format(body).encode("utf8"))


while True:
    # Block until a client connects. The names deliberately avoid
    # rebinding ``socket``, which would shadow the module.
    client_sock, client_addr = server.accept()
    # Handle each connection on its own thread.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
request_test.py
import requests

url = "http://127.0.0.1:8000"
# Custom request headers sent along with the GET request.
my_headers = {
    "user-agent": "requests",
    "imooc_uid": "321",
}
# The timeout stops the script from hanging if the server never finishes
# its response.
response = requests.get(url=url, headers=my_headers, timeout=10)
# ``response.headers`` holds the headers returned by the server.
print(response.headers)
在http_server.py上打上断点
可以看到requests.get方法传递过来的headers头部参数值
输出结果如下:
访问百度的headers
import requests

# Print the response headers returned by Baidu.
# The timeout keeps the call from blocking forever on a stalled connection.
response = requests.get("https://www.baidu.com", timeout=10)
print(response.headers)
输出结果如下:
实例五:POST请求通过data传递dict时,默认的Content-Type为application/x-www-form-urlencoded
http_server.py
# Minimal threaded HTTP server built directly on a TCP socket.
import socket
import threading
import json

server = socket.socket()
# Bind to 0.0.0.0, port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()

# Response for GET: a login form that posts back to "/".
# The blank "\r\n" line separates the headers from the body.
GET_RESPONSE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: text/html\r\n"
    # NOTE(review): this looks like a misspelling of the CORS header
    # "Access-Control-Allow-Origin"; left unchanged, confirm the intent.
    "Access-Controller-Allow-Origin:https://localhost:63342\r\n"
    "\r\n"
    """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
"""
)

# Response for POST: "{}" is filled with the JSON-encoded course list.
POST_RESPONSE_TEMPLATE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: application/json\r\n"
    "\r\n"
    "{}"
)

# Payload returned to every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html",
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html",
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html",
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html",
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html",
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request and its first line are printed so the client's headers
    and body can be inspected. GET receives the HTML login form, POST
    receives the course list as JSON, and any other method gets no response.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; unused.
    """
    # The responses carry no Content-Length, so the client only knows the
    # body is complete once the connection is closed; ``with`` guarantees
    # that on every path, including errors.
    with sock:
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)

        lines = tmp_data.splitlines()
        if not lines:
            # The client disconnected without sending a request.
            return
        request_line = lines[0]
        print(request_line)

        parts = request_line.split()
        if not parts:
            return
        method = parts[0]

        # sendall() keeps writing until the whole response is out;
        # send() may transmit only part of it.
        if method == "GET":
            sock.sendall(GET_RESPONSE.encode("utf8"))
        elif method == "POST":
            body = json.dumps(COURSES)
            sock.sendall(POST_RESPONSE_TEMPLATE.format(body).encode("utf8"))


while True:
    # Block until a client connects. The names deliberately avoid
    # rebinding ``socket``, which would shadow the module.
    client_sock, client_addr = server.accept()
    # Handle each connection on its own thread.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
request_test.py
import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# Passing a dict through ``data`` sends it as a form-encoded request body.
# The timeout stops the script from hanging if the server never finishes
# its response.
res = requests.post(url, data=params, timeout=10)
print(res.encoding)
断点输出结果如下:
四、data和json参数都可以传递两种数据类型:1、字符串 2、dict
当为data时
request_test.py
import json

import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# ``data`` receives an already-serialized JSON string here instead of a dict.
# The timeout stops the script from hanging if the server never finishes
# its response.
res = requests.post(url, data=json.dumps(params), timeout=10)
print(res.encoding)
http_server.py
# Minimal threaded HTTP server built directly on a TCP socket.
import socket
import threading
import json

server = socket.socket()
# Bind to 0.0.0.0, port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()

# Response for GET: a login form that posts back to "/".
# The blank "\r\n" line separates the headers from the body.
GET_RESPONSE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: text/html\r\n"
    # NOTE(review): this looks like a misspelling of the CORS header
    # "Access-Control-Allow-Origin"; left unchanged, confirm the intent.
    "Access-Controller-Allow-Origin:https://localhost:63342\r\n"
    "\r\n"
    """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
"""
)

# Response for POST: "{}" is filled with the JSON-encoded course list.
POST_RESPONSE_TEMPLATE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: application/json\r\n"
    "\r\n"
    "{}"
)

# Payload returned to every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html",
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html",
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html",
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html",
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html",
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request and its first line are printed so the client's headers
    and body can be inspected. GET receives the HTML login form, POST
    receives the course list as JSON, and any other method gets no response.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; unused.
    """
    # The responses carry no Content-Length, so the client only knows the
    # body is complete once the connection is closed; ``with`` guarantees
    # that on every path, including errors.
    with sock:
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)

        lines = tmp_data.splitlines()
        if not lines:
            # The client disconnected without sending a request.
            return
        request_line = lines[0]
        print(request_line)

        parts = request_line.split()
        if not parts:
            return
        method = parts[0]

        # sendall() keeps writing until the whole response is out;
        # send() may transmit only part of it.
        if method == "GET":
            sock.sendall(GET_RESPONSE.encode("utf8"))
        elif method == "POST":
            body = json.dumps(COURSES)
            sock.sendall(POST_RESPONSE_TEMPLATE.format(body).encode("utf8"))


while True:
    # Block until a client connects. The names deliberately avoid
    # rebinding ``socket``, which would shadow the module.
    client_sock, client_addr = server.accept()
    # Handle each connection on its own thread.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
执行结果如下:
当为json时
request_test.py
import json

import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# Deliberate demonstration: ``json`` is given an already-serialized string.
# requests JSON-encodes whatever it receives through ``json``, so the string
# is encoded a second time and arrives as a quoted JSON string, not an object.
# The timeout stops the script from hanging if the server never finishes
# its response.
res = requests.post(url, json=json.dumps(params), timeout=10)
print(res.encoding)
http_server.py
# Minimal threaded HTTP server built directly on a TCP socket.
import socket
import threading
import json

server = socket.socket()
# Bind to 0.0.0.0, port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()

# Response for GET: a login form that posts back to "/".
# The blank "\r\n" line separates the headers from the body.
GET_RESPONSE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: text/html\r\n"
    # NOTE(review): this looks like a misspelling of the CORS header
    # "Access-Control-Allow-Origin"; left unchanged, confirm the intent.
    "Access-Controller-Allow-Origin:https://localhost:63342\r\n"
    "\r\n"
    """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
"""
)

# Response for POST: "{}" is filled with the JSON-encoded course list.
POST_RESPONSE_TEMPLATE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: application/json\r\n"
    "\r\n"
    "{}"
)

# Payload returned to every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html",
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html",
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html",
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html",
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html",
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request and its first line are printed so the client's headers
    and body can be inspected. GET receives the HTML login form, POST
    receives the course list as JSON, and any other method gets no response.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; unused.
    """
    # The responses carry no Content-Length, so the client only knows the
    # body is complete once the connection is closed; ``with`` guarantees
    # that on every path, including errors.
    with sock:
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)

        lines = tmp_data.splitlines()
        if not lines:
            # The client disconnected without sending a request.
            return
        request_line = lines[0]
        print(request_line)

        parts = request_line.split()
        if not parts:
            return
        method = parts[0]

        # sendall() keeps writing until the whole response is out;
        # send() may transmit only part of it.
        if method == "GET":
            sock.sendall(GET_RESPONSE.encode("utf8"))
        elif method == "POST":
            body = json.dumps(COURSES)
            sock.sendall(POST_RESPONSE_TEMPLATE.format(body).encode("utf8"))


while True:
    # Block until a client connects. The names deliberately avoid
    # rebinding ``socket``, which would shadow the module.
    client_sock, client_addr = server.accept()
    # Handle each connection on its own thread.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
当为json时,结果如下,数据被转换为JSON格式
request_test.py
import json

import requests

url = "http://127.0.0.1:8000"
params = {
    "username": "bobby",
    "password": "bobby",
}
# Passing the dict through ``json`` lets requests serialize it to JSON itself.
# The timeout stops the script from hanging if the server never finishes
# its response.
res = requests.post(url, json=params, timeout=10)
print(res.encoding)
http_server.py
# Minimal threaded HTTP server built directly on a TCP socket.
import socket
import threading
import json

server = socket.socket()
# Bind to 0.0.0.0, port 8000.
server.bind(('0.0.0.0', 8000))
server.listen()

# Response for GET: a login form that posts back to "/".
# The blank "\r\n" line separates the headers from the body.
GET_RESPONSE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: text/html\r\n"
    # NOTE(review): this looks like a misspelling of the CORS header
    # "Access-Control-Allow-Origin"; left unchanged, confirm the intent.
    "Access-Controller-Allow-Origin:https://localhost:63342\r\n"
    "\r\n"
    """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
"""
)

# Response for POST: "{}" is filled with the JSON-encoded course list.
POST_RESPONSE_TEMPLATE = (
    "HTTP/1.1 200 OK\r\n"
    "Content-Type: application/json\r\n"
    "\r\n"
    "{}"
)

# Payload returned to every POST request.
COURSES = [
    {
        "name": "django打造在线教育",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/78.html",
    },
    {
        "name": "python高级编程",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/200.html",
    },
    {
        "name": "scrapy分布式爬虫",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/92.html",
    },
    {
        "name": "diango rest framework打造生鲜电商",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/131.html",
    },
    {
        "name": "tornado从入门到精通",
        "teacher": "bobby",
        "url": "https://coding.imooc.com/class/290.html",
    },
]


def handle_sock(sock, addr):
    """Serve a single HTTP request on ``sock`` and close the connection.

    The raw request and its first line are printed so the client's headers
    and body can be inspected. GET receives the HTML login form, POST
    receives the course list as JSON, and any other method gets no response.

    Args:
        sock: Connected client socket returned by ``server.accept()``.
        addr: Client address returned by ``server.accept()``; unused.
    """
    # The responses carry no Content-Length, so the client only knows the
    # body is complete once the connection is closed; ``with`` guarantees
    # that on every path, including errors.
    with sock:
        tmp_data = sock.recv(1024 * 10).decode("utf8")
        print(tmp_data)

        lines = tmp_data.splitlines()
        if not lines:
            # The client disconnected without sending a request.
            return
        request_line = lines[0]
        print(request_line)

        parts = request_line.split()
        if not parts:
            return
        method = parts[0]

        # sendall() keeps writing until the whole response is out;
        # send() may transmit only part of it.
        if method == "GET":
            sock.sendall(GET_RESPONSE.encode("utf8"))
        elif method == "POST":
            body = json.dumps(COURSES)
            sock.sendall(POST_RESPONSE_TEMPLATE.format(body).encode("utf8"))


while True:
    # Block until a client connects. The names deliberately avoid
    # rebinding ``socket``, which would shadow the module.
    client_sock, client_addr = server.accept()
    # Handle each connection on its own thread.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()
此时dict值被转换为JSON字符串
浏览器和requests最终都需要拼接出符合HTTP协议格式的字符串。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)