# 1. Fetch the Baidu home page and print the page contents.
# First crawler example.
import requests  # crawler library; its request helpers are unavailable without this import

response = requests.get("http://www.baidu.com")  # build a Response object
# Set the encoding used to decode the body to the one requests detects
# from the content.
response.encoding = response.apparent_encoding
print(f"状态码:{response.status_code}")  # print the status code
print(response.text)  # print the fetched page
输出
状态码:200
<!DOCTYPE html>
<!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>百度一下,你就知道</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=百度一下 class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">更多产品</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>关于百度</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>©2017 Baidu <a href=http://www.baidu.com/duty/>使用百度前必读</a> <a href=http://jianyi.baidu.com/ class=cp-feedback>意见反馈</a> 京ICP证030173号 <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html>
# 2. GET method example (and parameter-passing examples).
# Second example: a plain GET.
import requests  # import the crawler library first, otherwise its functions cannot be called

url = "http://httpbin.org/get"
response = requests.get(url)  # GET method
# Status code on the first line, response body after it.
print(response.status_code, response.text, sep="\n")

# Recorded output:
# 200
# {
#   "args": {},
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.19.1",
#     "X-Amzn-Trace-Id": "Root=1-5f8d9277-4092b3b60f705da706dc8bb4"
#   },
#   "origin": "",
#   "url": "http://httpbin.org/get"
# }
# 3. POST method example.
import requests  # import the crawler module

url = "http://httpbin.org/post"
response = requests.post(url)  # access the URL with the POST method
# Status code on the first line, response body after it.
print(response.status_code, response.text, sep="\n")

# Recorded output:
# 200
# {
#   "args": {},
#   "data": "",
#   "files": {},
#   "form": {},
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Content-Length": "0",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.19.1",
#     "X-Amzn-Trace-Id": "Root=1-5f8d9f13-374fc5886007f75336920956"
#   },
#   "json": null,
#   "origin": "",
#   "url": "http://httpbin.org/post"
# }
# 4. PUT method example.
import requests  # import the crawler module

url = "http://httpbin.org/put"
response = requests.put(url)  # access the URL with the PUT method
# Status code on the first line, response body after it.
print(response.status_code, response.text, sep="\n")

# Recorded output:
# 200
# {
#   "args": {},
#   "data": "",
#   "files": {},
#   "form": {},
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Content-Length": "0",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.19.1",
#     "X-Amzn-Trace-Id": "Root=1-5f8d9439-082da1a602cc68a9057238a7"
#   },
#   "json": null,
#   "origin": "",
#   "url": "http://httpbin.org/put"
# }
# 5. GET with parameters, example 1.
# To pass several parameters, join them with "&" in the query string.
import requests  # import the crawler module

url = "http://httpbin.org/get?name=kevin&age=30"
response = requests.get(url)  # GET with the parameters written into the URL
# Status code on the first line, response body after it.
print(response.status_code, response.text, sep="\n")

# Recorded output:
# 200
# {
#   "args": {
#     "age": "30",
#     "name": "kevin"
#   },
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.19.1",
#     "X-Amzn-Trace-Id": "Root=1-5f8d959b-0a6253ed3c94f7f66ccde535"
#   },
#   "origin": "",
#   "url": "http://httpbin.org/get?name=kevin&age=30"
# }
# 6. GET with parameters, example 2.
import requests  # import the crawler module

data = dict(name="kevin", age=30)
url = "http://httpbin.org/get"
# The dict is handed to requests through params=; the recorded "url" field
# below shows it ending up in the query string.
response = requests.get(url, params=data)
# Status code on the first line, response body after it.
print(response.status_code, response.text, sep="\n")

# Recorded output:
# 200
# {
#   "args": {
#     "age": "30",
#     "name": "kevin"
#   },
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.19.1",
#     "X-Amzn-Trace-Id": "Root=1-5f8d9678-7cc154e278d1ac3c58ac4405"
#   },
#   "origin": "",
#   "url": "http://httpbin.org/get?name=kevin&age=30"
# }
# 7. POST with parameters.
import requests  # import the crawler module

data = {"name": "kevin", "age": 30}
# The fields must be passed as a keyword argument, separate from the URL.
# Writing them inside the URL string ("http://httpbin.org/post,params=data")
# requests a path that does not exist and never sends `data`, which is why
# the run recorded below received a 404 page.
# `data=` sends the fields as a form-encoded request body; use `params=`
# instead if they should go into the query string.
response = requests.post("http://httpbin.org/post", data=data)  # POST with parameters
print(response.status_code)  # status code
print(response.text)

# Output recorded from the original, malformed call (kept for reference):
# 404
# <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
# <title>404 Not Found</title>
# <h1>Not Found</h1>
# <p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
#
# NOTE(review): with the corrected call httpbin is expected to answer 200 and
# echo the fields under "form"; that output has not been re-recorded here.
# 8. Fetch a page and save it locally.
# The save path assumes a folder named 爬虫 already exists on drive D, so
# create it first. Pay attention to the encoding used when saving the file.
# Fetch one HTML page and save it.
import requests

url = "http://www.baidu.com"
response = requests.get(url)
response.encoding = "utf-8"  # encoding used to decode the received body
print("\nr的类型" + str(type(response)))
print("\n状态码是:" + str(response.status_code))
print("\n头部信息:" + str(response.headers))
print("\n响应内容:")
print(response.text)

# Save to a file. Mode "w" creates the file when it does not exist; "wb" is
# not needed because the content is written as text, not as binary.
# The with-block closes the file even if write() raises.
# ("utf" is accepted by Python as an alias of UTF-8.)
with open("D:\\爬虫\\baidu.html", "w", encoding="utf") as out_file:
    out_file.write(response.text)

# Recorded output:
#
# r的类型<class 'requests.models.Response'>
#
# 状态码是:200
#
# 头部信息:{'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 'Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html', 'Date': 'Mon, 19 Oct 2020 14:05:20 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:36 GMT', 'Pragma': 'no-cache', 'Server': 'bfe/', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
#
# 响应内容:
# <!DOCTYPE html>
# <!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>百度一下,你就知道</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=百度一下 class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">更多产品</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>关于百度</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>©2017 Baidu <a href=http://www.baidu.com/duty/>使用百度前必读</a> <a href=http://jianyi.baidu.com/ class=cp-feedback>意见反馈</a> 京ICP证030173号 <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html>
# 9. Fetch an image and save it locally.
# Saves the Baidu logo image to disk.
import requests  # import the crawler library

response = requests.get("http://www.baidu.com/img/baidu_jgylogo3.gif")
# "wb" opens the file in binary mode, for writing only.
# The with-block closes the file even if write() raises.
with open("D:\\爬虫\\baidu_logo.gif", "wb") as image_file:
    image_file.write(response.content)  # write the raw response bytes
# After the script finishes, check the target folder to confirm the file
# was saved.

# Recorded output: (nothing is printed)
# 10. Getting past anti-crawler checks, using Zhihu as the example.
import requests  # import the crawler module

response = requests.get("http://www.zhihu.com")  # first visit to Zhihu, without setting any headers
# status_code is an int, so it must be converted with str() before it is
# concatenated to the message; "text" + int raises TypeError.
# Per the original author's note: no headers were supplied, so the page
# cannot be crawled normally and the status code is not 200.
print("第一次,不设置头部信息,状态码:" + str(response.status_code))

# Recorded output:
