爬虫-requests库的使用

一、爬虫数据采集

1、按照采集对象分类

1、全网采集

2、全站采集

3、具体网站的指定数据采集

2、采集方案分类

1、利用http协议采集-页面分析

2、利用api接口采集-app数据采集

3、利用目标网站的api采集-微博、github、twitter、facebook

二、requests库爬虫

首先需要安装requests依赖包

进入虚拟环境,安装requests包(pip install requests)

 三、requests实例

实例一:获取百度源码

 request_test.py

1
2
3
4
import requests
 
# Download the Baidu home page over plain HTTP and print the decoded body.
res = requests.get(url="http://www.baidu.com")
print(res.text)

运行结果:

实例二:获取POST和GET请求

http_server.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# socket服务端
import socket
import threading
import json
 
# Listening socket: IPv4 + TCP (the constructor defaults, spelled out),
# bound to every local interface on port 8000.
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(("0.0.0.0", 8000))
server.listen()
 
 
# 获取客户端连接并启动线程去处理
 
def _http_response(content_type, body, extra_headers=()):
    """Return a complete HTTP/1.1 200 response for ``body`` as bytes.

    Every header line ends in CRLF and the header block is terminated by an
    empty CRLF line.  ``Content-Length`` is the size of the UTF-8 encoded
    body so the client knows exactly where the body ends.
    """
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 200 OK",
        "Content-Type: " + content_type,
        *extra_headers,
        "Content-Length: " + str(len(payload)),
        "Connection: close",
    ]
    return ("\r\n".join(header_lines) + "\r\n\r\n").encode("utf8") + payload


def handle_sock(sock, addr):
    """Answer one HTTP request on an accepted connection, then close it.

    Reads chunks from ``sock`` until one starts with a request line whose
    method is recognised: ``GET`` is answered with a small HTML login form,
    ``POST`` with a JSON list of courses.  Chunks with any other first line
    are printed and ignored.  Returns quietly if the client disconnects
    before sending a recognised request.

    Args:
        sock: Connected client socket.  It is always closed before returning.
        addr: Client address as returned by ``accept()``; not used.
    """
    login_page = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''
    courses = [
        {
            "name": "django打造在线教育",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/78.html"
        },
        {
            "name": "python高级编程",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/200.html"
        },
        {
            "name": "scrapy分布式爬虫",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/92.html"
        },
        {
            "name": "diango rest framework打造生鲜电商",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/131.html"
        },
        {
            "name": "tornado从入门到精通",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/290.html"
        },
    ]
    try:
        while True:
            raw = sock.recv(1024)
            if not raw:
                # recv() returns b"" once the peer has closed the connection;
                # there is nothing left to parse.
                return
            # A fixed-size read can cut a multi-byte character in half, so
            # undecodable bytes are replaced instead of raising.
            text = raw.decode("utf8", errors="replace")
            print(text)
            lines = text.splitlines()
            request_line = lines[0] if lines else ""
            print(request_line)
            tokens = request_line.split()
            if not tokens:
                # Blank first line: wait for the next chunk.
                continue
            method = tokens[0]
            if method == "GET":
                # sendall() keeps writing until the whole response is out;
                # send() may stop after a partial write.
                sock.sendall(_http_response("text/html", login_page))
                return
            if method == "POST":
                sock.sendall(
                    _http_response("application/json", json.dumps(courses))
                )
                return
    finally:
        # Closing on every exit path lets the client see end-of-response and
        # keeps file descriptors from leaking.
        sock.close()
 
 
while True:
    # Block until a client connects.  The result is bound to names that do
    # not clash with the imported ``socket`` module, so the module stays
    # usable after the first connection.
    client_sock, client_addr = server.accept()

    # Serve each connection on its own thread so this loop can go straight
    # back to waiting for the next client.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()

 request_test.py  

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import requests
 
# res=requests.get("http://www.baidu.com")
# print(res.text)
# Address of the local demo server (http_server.py must already be running).
url = "http://127.0.0.1:8000"
# Login fields; a dict passed through ``data=`` is sent form-encoded.
params = dict(username="bobby", password="bobby")
# GET variant of the same call, kept for reference:
#   response = requests.get(url, params=params)
#   print(response.text)
res = requests.post(url=url, data=params)
# Raw body text first, then the encoding requests picked for it.
print(res.text, res.encoding, sep="\n")
# Finally the body parsed as JSON.
print(res.json())

 先运行http_server.py,再运行request_test.py

运行结果如下:

 实例三:打印请求状态码

1
2
3
4
import requests
 
# Request the Baidu home page over HTTPS and print the numeric HTTP status.
response = requests.get(url="https://www.baidu.com")
print(response.status_code)

  输出结果:

 实例四:自定义请求头(headers)并打印响应头

http_server.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# socket服务端
import socket
import threading
import json
 
# Listening socket: IPv4 + TCP (the constructor defaults, spelled out),
# bound to every local interface on port 8000.
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(("0.0.0.0", 8000))
server.listen()
 
 
# 获取客户端连接并启动线程去处理
 
def _http_response(content_type, body, extra_headers=()):
    """Return a complete HTTP/1.1 200 response for ``body`` as bytes.

    Every header line ends in CRLF and the header block is terminated by an
    empty CRLF line.  ``Content-Length`` is the size of the UTF-8 encoded
    body so the client knows exactly where the body ends.
    """
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 200 OK",
        "Content-Type: " + content_type,
        *extra_headers,
        "Content-Length: " + str(len(payload)),
        "Connection: close",
    ]
    return ("\r\n".join(header_lines) + "\r\n\r\n").encode("utf8") + payload


def handle_sock(sock, addr):
    """Answer one HTTP request on an accepted connection, then close it.

    Reads chunks from ``sock`` until one starts with a request line whose
    method is recognised: ``GET`` is answered with a small HTML login form
    (plus a CORS header), ``POST`` with a JSON list of courses.  Chunks with
    any other first line are printed and ignored.  Returns quietly if the
    client disconnects before sending a recognised request.

    Args:
        sock: Connected client socket.  It is always closed before returning.
        addr: Client address as returned by ``accept()``; not used.
    """
    login_page = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''
    courses = [
        {
            "name": "django打造在线教育",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/78.html"
        },
        {
            "name": "python高级编程",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/200.html"
        },
        {
            "name": "scrapy分布式爬虫",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/92.html"
        },
        {
            "name": "diango rest framework打造生鲜电商",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/131.html"
        },
        {
            "name": "tornado从入门到精通",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/290.html"
        },
    ]
    # The CORS response header is spelled "Access-Control-Allow-Origin";
    # browsers ignore any other spelling.
    cors_headers = ("Access-Control-Allow-Origin: https://localhost:63342",)
    try:
        while True:
            raw = sock.recv(1024 * 10)
            if not raw:
                # recv() returns b"" once the peer has closed the connection;
                # there is nothing left to parse.
                return
            # A fixed-size read can cut a multi-byte character in half, so
            # undecodable bytes are replaced instead of raising.
            text = raw.decode("utf8", errors="replace")
            print(text)
            lines = text.splitlines()
            request_line = lines[0] if lines else ""
            print(request_line)
            tokens = request_line.split()
            if not tokens:
                # Blank first line: wait for the next chunk.
                continue
            method = tokens[0]
            if method == "GET":
                # sendall() keeps writing until the whole response is out;
                # send() may stop after a partial write.
                sock.sendall(
                    _http_response("text/html", login_page, cors_headers)
                )
                return
            if method == "POST":
                sock.sendall(
                    _http_response("application/json", json.dumps(courses))
                )
                return
    finally:
        # Closing on every exit path lets the client see end-of-response and
        # keeps file descriptors from leaking.
        sock.close()
 
 
while True:
    # Block until a client connects.  The result is bound to names that do
    # not clash with the imported ``socket`` module, so the module stays
    # usable after the first connection.
    client_sock, client_addr = server.accept()

    # Serve each connection on its own thread so this loop can go straight
    # back to waiting for the next client.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()

request_test.py

1
2
3
4
5
6
7
8
9
import requests
 
# Address of the local demo server (http_server.py must already be running).
url = "http://127.0.0.1:8000"
# Custom request headers handed to requests for this call.
my_headers = {"user-agent": "requests", "imooc_uid": "321"}
response = requests.get(url, headers=my_headers)
# Print the headers of the server's response.
print(response.headers)

 在http_server.py上打上断点

 可以看到requests.get方法传递过来的headers头部参数值

 输出结果如下:

访问百度的headers

1
2
3
4
import requests
 
# Request the Baidu home page over HTTPS and print its response headers.
response = requests.get(url="https://www.baidu.com")
print(response.headers)

输出结果如下:

实例五:POST请求用data参数传递dict时,默认的Content-Type为application/x-www-form-urlencoded

http_server.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# socket服务端
import socket
import threading
import json
 
# Listening socket: IPv4 + TCP (the constructor defaults, spelled out),
# bound to every local interface on port 8000.
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(("0.0.0.0", 8000))
server.listen()
 
 
# 获取客户端连接并启动线程去处理
 
def _http_response(content_type, body, extra_headers=()):
    """Return a complete HTTP/1.1 200 response for ``body`` as bytes.

    Every header line ends in CRLF and the header block is terminated by an
    empty CRLF line.  ``Content-Length`` is the size of the UTF-8 encoded
    body so the client knows exactly where the body ends.
    """
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 200 OK",
        "Content-Type: " + content_type,
        *extra_headers,
        "Content-Length: " + str(len(payload)),
        "Connection: close",
    ]
    return ("\r\n".join(header_lines) + "\r\n\r\n").encode("utf8") + payload


def handle_sock(sock, addr):
    """Answer one HTTP request on an accepted connection, then close it.

    Reads chunks from ``sock`` until one starts with a request line whose
    method is recognised: ``GET`` is answered with a small HTML login form
    (plus a CORS header), ``POST`` with a JSON list of courses.  Chunks with
    any other first line are printed and ignored.  Returns quietly if the
    client disconnects before sending a recognised request.

    Args:
        sock: Connected client socket.  It is always closed before returning.
        addr: Client address as returned by ``accept()``; not used.
    """
    login_page = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''
    courses = [
        {
            "name": "django打造在线教育",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/78.html"
        },
        {
            "name": "python高级编程",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/200.html"
        },
        {
            "name": "scrapy分布式爬虫",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/92.html"
        },
        {
            "name": "diango rest framework打造生鲜电商",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/131.html"
        },
        {
            "name": "tornado从入门到精通",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/290.html"
        },
    ]
    # The CORS response header is spelled "Access-Control-Allow-Origin";
    # browsers ignore any other spelling.
    cors_headers = ("Access-Control-Allow-Origin: https://localhost:63342",)
    try:
        while True:
            raw = sock.recv(1024 * 10)
            if not raw:
                # recv() returns b"" once the peer has closed the connection;
                # there is nothing left to parse.
                return
            # A fixed-size read can cut a multi-byte character in half, so
            # undecodable bytes are replaced instead of raising.
            text = raw.decode("utf8", errors="replace")
            print(text)
            lines = text.splitlines()
            request_line = lines[0] if lines else ""
            print(request_line)
            tokens = request_line.split()
            if not tokens:
                # Blank first line: wait for the next chunk.
                continue
            method = tokens[0]
            if method == "GET":
                # sendall() keeps writing until the whole response is out;
                # send() may stop after a partial write.
                sock.sendall(
                    _http_response("text/html", login_page, cors_headers)
                )
                return
            if method == "POST":
                sock.sendall(
                    _http_response("application/json", json.dumps(courses))
                )
                return
    finally:
        # Closing on every exit path lets the client see end-of-response and
        # keeps file descriptors from leaking.
        sock.close()
 
 
while True:
    # Block until a client connects.  The result is bound to names that do
    # not clash with the imported ``socket`` module, so the module stays
    # usable after the first connection.
    client_sock, client_addr = server.accept()

    # Serve each connection on its own thread so this loop can go straight
    # back to waiting for the next client.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()

request_test.py

1
2
3
4
5
6
7
8
9
import requests
 
# Address of the local demo server (http_server.py must already be running).
url = "http://127.0.0.1:8000"
# Login fields; a dict passed through ``data=`` is sent form-encoded.
params = dict(username="bobby", password="bobby")
res = requests.post(url=url, data=params)
# Print the text encoding requests picked for the response.
print(res.encoding)

断点输出结果如下:

四、data和json参数都可以传递两种数据类型:1、字符串 2、dict

当为data时

request_test.py

1
2
3
4
5
6
7
8
9
10
import json
import requests
 
# Address of the local demo server (http_server.py must already be running).
url = "http://127.0.0.1:8000"
params = dict(username="bobby", password="bobby")
# Serialise the dict up front so ``data=`` receives a str instead of a dict.
res = requests.post(url=url, data=json.dumps(params))
# Print the text encoding requests picked for the response.
print(res.encoding)

   http_server.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# socket服务端
import socket
import threading
import json
 
# Listening socket: IPv4 + TCP (the constructor defaults, spelled out),
# bound to every local interface on port 8000.
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(("0.0.0.0", 8000))
server.listen()
 
 
# 获取客户端连接并启动线程去处理
 
def _http_response(content_type, body, extra_headers=()):
    """Return a complete HTTP/1.1 200 response for ``body`` as bytes.

    Every header line ends in CRLF and the header block is terminated by an
    empty CRLF line.  ``Content-Length`` is the size of the UTF-8 encoded
    body so the client knows exactly where the body ends.
    """
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 200 OK",
        "Content-Type: " + content_type,
        *extra_headers,
        "Content-Length: " + str(len(payload)),
        "Connection: close",
    ]
    return ("\r\n".join(header_lines) + "\r\n\r\n").encode("utf8") + payload


def handle_sock(sock, addr):
    """Answer one HTTP request on an accepted connection, then close it.

    Reads chunks from ``sock`` until one starts with a request line whose
    method is recognised: ``GET`` is answered with a small HTML login form
    (plus a CORS header), ``POST`` with a JSON list of courses.  Chunks with
    any other first line are printed and ignored.  Returns quietly if the
    client disconnects before sending a recognised request.

    Args:
        sock: Connected client socket.  It is always closed before returning.
        addr: Client address as returned by ``accept()``; not used.
    """
    login_page = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''
    courses = [
        {
            "name": "django打造在线教育",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/78.html"
        },
        {
            "name": "python高级编程",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/200.html"
        },
        {
            "name": "scrapy分布式爬虫",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/92.html"
        },
        {
            "name": "diango rest framework打造生鲜电商",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/131.html"
        },
        {
            "name": "tornado从入门到精通",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/290.html"
        },
    ]
    # The CORS response header is spelled "Access-Control-Allow-Origin";
    # browsers ignore any other spelling.
    cors_headers = ("Access-Control-Allow-Origin: https://localhost:63342",)
    try:
        while True:
            raw = sock.recv(1024 * 10)
            if not raw:
                # recv() returns b"" once the peer has closed the connection;
                # there is nothing left to parse.
                return
            # A fixed-size read can cut a multi-byte character in half, so
            # undecodable bytes are replaced instead of raising.
            text = raw.decode("utf8", errors="replace")
            print(text)
            lines = text.splitlines()
            request_line = lines[0] if lines else ""
            print(request_line)
            tokens = request_line.split()
            if not tokens:
                # Blank first line: wait for the next chunk.
                continue
            method = tokens[0]
            if method == "GET":
                # sendall() keeps writing until the whole response is out;
                # send() may stop after a partial write.
                sock.sendall(
                    _http_response("text/html", login_page, cors_headers)
                )
                return
            if method == "POST":
                sock.sendall(
                    _http_response("application/json", json.dumps(courses))
                )
                return
    finally:
        # Closing on every exit path lets the client see end-of-response and
        # keeps file descriptors from leaking.
        sock.close()
 
 
while True:
    # Block until a client connects.  The result is bound to names that do
    # not clash with the imported ``socket`` module, so the module stays
    # usable after the first connection.
    client_sock, client_addr = server.accept()

    # Serve each connection on its own thread so this loop can go straight
    # back to waiting for the next client.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()

 执行结果如下:

当为json时

request_test.py

1
2
3
4
5
6
7
8
9
10
import json
import requests
 
# Address of the local demo server (http_server.py must already be running).
url = "http://127.0.0.1:8000"
params = dict(username="bobby", password="bobby")
# ``json=`` is deliberately given an already-serialised str here; requests
# JSON-encodes whatever it receives through ``json=``, so the string is
# encoded a second time.
res = requests.post(url=url, json=json.dumps(params))
# Print the text encoding requests picked for the response.
print(res.encoding)

 http_server.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# socket服务端
import socket
import threading
import json
 
# Listening socket: IPv4 + TCP (the constructor defaults, spelled out),
# bound to every local interface on port 8000.
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(("0.0.0.0", 8000))
server.listen()
 
 
# 获取客户端连接并启动线程去处理
 
def _http_response(content_type, body, extra_headers=()):
    """Return a complete HTTP/1.1 200 response for ``body`` as bytes.

    Every header line ends in CRLF and the header block is terminated by an
    empty CRLF line.  ``Content-Length`` is the size of the UTF-8 encoded
    body so the client knows exactly where the body ends.
    """
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 200 OK",
        "Content-Type: " + content_type,
        *extra_headers,
        "Content-Length: " + str(len(payload)),
        "Connection: close",
    ]
    return ("\r\n".join(header_lines) + "\r\n\r\n").encode("utf8") + payload


def handle_sock(sock, addr):
    """Answer one HTTP request on an accepted connection, then close it.

    Reads chunks from ``sock`` until one starts with a request line whose
    method is recognised: ``GET`` is answered with a small HTML login form
    (plus a CORS header), ``POST`` with a JSON list of courses.  Chunks with
    any other first line are printed and ignored.  Returns quietly if the
    client disconnects before sending a recognised request.

    Args:
        sock: Connected client socket.  It is always closed before returning.
        addr: Client address as returned by ``accept()``; not used.
    """
    login_page = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''
    courses = [
        {
            "name": "django打造在线教育",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/78.html"
        },
        {
            "name": "python高级编程",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/200.html"
        },
        {
            "name": "scrapy分布式爬虫",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/92.html"
        },
        {
            "name": "diango rest framework打造生鲜电商",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/131.html"
        },
        {
            "name": "tornado从入门到精通",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/290.html"
        },
    ]
    # The CORS response header is spelled "Access-Control-Allow-Origin";
    # browsers ignore any other spelling.
    cors_headers = ("Access-Control-Allow-Origin: https://localhost:63342",)
    try:
        while True:
            raw = sock.recv(1024 * 10)
            if not raw:
                # recv() returns b"" once the peer has closed the connection;
                # there is nothing left to parse.
                return
            # A fixed-size read can cut a multi-byte character in half, so
            # undecodable bytes are replaced instead of raising.
            text = raw.decode("utf8", errors="replace")
            print(text)
            lines = text.splitlines()
            request_line = lines[0] if lines else ""
            print(request_line)
            tokens = request_line.split()
            if not tokens:
                # Blank first line: wait for the next chunk.
                continue
            method = tokens[0]
            if method == "GET":
                # sendall() keeps writing until the whole response is out;
                # send() may stop after a partial write.
                sock.sendall(
                    _http_response("text/html", login_page, cors_headers)
                )
                return
            if method == "POST":
                sock.sendall(
                    _http_response("application/json", json.dumps(courses))
                )
                return
    finally:
        # Closing on every exit path lets the client see end-of-response and
        # keeps file descriptors from leaking.
        sock.close()
 
 
while True:
    # Block until a client connects.  The result is bound to names that do
    # not clash with the imported ``socket`` module, so the module stays
    # usable after the first connection.
    client_sock, client_addr = server.accept()

    # Serve each connection on its own thread so this loop can go straight
    # back to waiting for the next client.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()

 当json参数传入的是字符串时,requests会对该字符串再做一次JSON序列化,请求体是带转义引号的JSON字符串,数据结果如下

 request_test.py

1
2
3
4
5
6
7
8
9
10
import json
import requests
 
# Address of the local demo server (http_server.py must already be running).
url = "http://127.0.0.1:8000"
params = dict(username="bobby", password="bobby")
# Hand the dict straight to ``json=`` and let requests serialise it.
res = requests.post(url=url, json=params)
# Print the text encoding requests picked for the response.
print(res.encoding)

  http_server.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# socket服务端
import socket
import threading
import json
 
# Listening socket: IPv4 + TCP (the constructor defaults, spelled out),
# bound to every local interface on port 8000.
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(("0.0.0.0", 8000))
server.listen()
 
 
# 获取客户端连接并启动线程去处理
 
def _http_response(content_type, body, extra_headers=()):
    """Return a complete HTTP/1.1 200 response for ``body`` as bytes.

    Every header line ends in CRLF and the header block is terminated by an
    empty CRLF line.  ``Content-Length`` is the size of the UTF-8 encoded
    body so the client knows exactly where the body ends.
    """
    payload = body.encode("utf8")
    header_lines = [
        "HTTP/1.1 200 OK",
        "Content-Type: " + content_type,
        *extra_headers,
        "Content-Length: " + str(len(payload)),
        "Connection: close",
    ]
    return ("\r\n".join(header_lines) + "\r\n\r\n").encode("utf8") + payload


def handle_sock(sock, addr):
    """Answer one HTTP request on an accepted connection, then close it.

    Reads chunks from ``sock`` until one starts with a request line whose
    method is recognised: ``GET`` is answered with a small HTML login form
    (plus a CORS header), ``POST`` with a JSON list of courses.  Chunks with
    any other first line are printed and ignored.  Returns quietly if the
    client disconnects before sending a recognised request.

    Args:
        sock: Connected client socket.  It is always closed before returning.
        addr: Client address as returned by ``accept()``; not used.
    """
    login_page = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Title</title>
</head>
<body>
<form action="/" method="POST">
    <input type="text" value="name" />
    <input type="password" value="password">
    <input type="submit" value="登录">
</form>
</body>
</html>
'''
    courses = [
        {
            "name": "django打造在线教育",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/78.html"
        },
        {
            "name": "python高级编程",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/200.html"
        },
        {
            "name": "scrapy分布式爬虫",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/92.html"
        },
        {
            "name": "diango rest framework打造生鲜电商",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/131.html"
        },
        {
            "name": "tornado从入门到精通",
            "teacher": "bobby",
            "url": "https://coding.imooc.com/class/290.html"
        },
    ]
    # The CORS response header is spelled "Access-Control-Allow-Origin";
    # browsers ignore any other spelling.
    cors_headers = ("Access-Control-Allow-Origin: https://localhost:63342",)
    try:
        while True:
            raw = sock.recv(1024 * 10)
            if not raw:
                # recv() returns b"" once the peer has closed the connection;
                # there is nothing left to parse.
                return
            # A fixed-size read can cut a multi-byte character in half, so
            # undecodable bytes are replaced instead of raising.
            text = raw.decode("utf8", errors="replace")
            print(text)
            lines = text.splitlines()
            request_line = lines[0] if lines else ""
            print(request_line)
            tokens = request_line.split()
            if not tokens:
                # Blank first line: wait for the next chunk.
                continue
            method = tokens[0]
            if method == "GET":
                # sendall() keeps writing until the whole response is out;
                # send() may stop after a partial write.
                sock.sendall(
                    _http_response("text/html", login_page, cors_headers)
                )
                return
            if method == "POST":
                sock.sendall(
                    _http_response("application/json", json.dumps(courses))
                )
                return
    finally:
        # Closing on every exit path lets the client see end-of-response and
        # keeps file descriptors from leaking.
        sock.close()
 
 
while True:
    # Block until a client connects.  The result is bound to names that do
    # not clash with the imported ``socket`` module, so the module stays
    # usable after the first connection.
    client_sock, client_addr = server.accept()

    # Serve each connection on its own thread so this loop can go straight
    # back to waiting for the next client.
    client_thread = threading.Thread(
        target=handle_sock, args=(client_sock, client_addr)
    )
    client_thread.start()

 此时dict值转换为json

 浏览器和requests最终都是需要拼接满足http的字符串。

posted @   leagueandlegends  阅读(16)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
点击右上角即可分享
微信分享提示