python16_day36 [Crawler 1]

I. requests

  1. GET requests

# 1. GET request without parameters

import requests

ret = requests.get('https://github.com/timeline.json')

print(ret.url)
print(ret.text)


# 2. GET request with parameters

import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)

print(ret.url)
print(ret.text)
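
Beyond .text, the Response object also carries the status code, the headers, and a JSON helper. A minimal sketch of the attributes commonly inspected before parsing (httpbin.org simply echoes the request back):

import requests

ret = requests.get("http://httpbin.org/get", params={'q': 'test'})

print(ret.status_code)              # HTTP status code, e.g. 200
print(ret.headers['Content-Type'])  # response headers behave like a dict
print(ret.json())                   # decode a JSON body in one call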

 

  2. POST requests

 

# 1. Basic POST example

import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)

print(ret.text)


# 2. Sending request headers and a JSON body

import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)

print(ret.text)
print(ret.cookies)
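
Newer versions of requests (2.4.2+) accept a json= parameter that serializes the payload and sets the Content-Type header in one step; a minimal sketch equivalent to example 2 above:

import requests

url = 'https://api.github.com/some/endpoint'
ret = requests.post(url, json={'some': 'data'})  # body dumped to JSON, header set automatically

print(ret.status_code)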

 

  3. Other request methods

requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)

# All of the methods above are thin wrappers around this one
requests.request(method, url, **kwargs)
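
Because every verb is such a wrapper, requests.request can also be called directly when the method is only known at runtime; a minimal sketch against httpbin.org:

import requests

method = 'GET'  # could just as well come from configuration or user input
ret = requests.request(method, 'http://httpbin.org/get', params={'k': 'v'})

print(ret.url)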

 

  4. Autohome news (www.autohome.com.cn)

import requests
from bs4 import BeautifulSoup

response = requests.get("http://www.autohome.com.cn/news/")
# response.text is a str; response.content is the raw bytes
response.encoding = 'gbk'   # the site is served as GBK

soup = BeautifulSoup(response.text, 'html.parser')    # parse the returned HTML
tag = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})  # locate the div by id
li_list = tag.find_all('li')                          # every <li> inside it

for li in li_list:
    h3 = li.find('h3')      # the <h3> tag holds the title

    if not h3:              # skip <li> entries that are not articles
        continue
    print("\033[33;1mTitle: {0}\033[0m".format(h3.text))
    print("\033[34;1mURL: http://{0}\033[0m".format(li.find('img').attrs['src']))
    print("\033[34;1mSummary: {0}\033[0m".format(li.find('p').text))
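
The same traversal can save the article thumbnails to disk. A sketch under two assumptions: the img src values are protocol-relative paths, as the "http://{0}" formatting above implies, and the imgs/ directory name is arbitrary:

import os

import requests
from bs4 import BeautifulSoup

response = requests.get("http://www.autohome.com.cn/news/")
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})

os.makedirs('imgs', exist_ok=True)
for li in tag.find_all('li'):
    img = li.find('img')
    if not img:
        continue
    src = img.attrs['src'].lstrip('/')                 # drop the protocol-relative slashes
    img_bytes = requests.get('http://' + src).content  # .content gives the raw bytes
    file_name = os.path.join('imgs', src.rsplit('/', 1)[-1])
    with open(file_name, 'wb') as f:
        f.write(img_bytes)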

 

  5. GitHub login

#!/usr/bin/env python
# -*-coding:utf8-*-
# __author__ = "willian"

import requests
from bs4 import BeautifulSoup
# First request: fetch the authenticity token and the initial cookies
r1 = requests.get('https://github.com/login')
b1 = BeautifulSoup(r1.text, 'html.parser')
# get token
auth_token = b1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
# get cookies
r1_cookie_dict = r1.cookies.get_dict()


# Second request: submit the credentials
r2 = requests.post("https://github.com/session",
                   data={
                       'commit': "Sign in",
                       'utf8': '',
                       'authenticity_token': auth_token,
                       'login': '',
                       'password': ""
                   }, cookies=r1_cookie_dict)
# get cookies
r2_cookie_dict = r2.cookies.get_dict()

# Merge the cookies from both responses
all_cookie_dict = {}
all_cookie_dict.update(r1_cookie_dict)
all_cookie_dict.update(r2_cookie_dict)


# Third request: a personal page that is only reachable after logging in
r3 = requests.get('https://github.com/settings/emails', cookies=all_cookie_dict)
print(r3.text)
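
requests.Session keeps cookies across requests automatically, which makes the manual cookie merging above unnecessary. A minimal sketch of the same three-step flow (credentials left blank, as in the original):

import requests
from bs4 import BeautifulSoup

s = requests.Session()  # carries cookies from one request to the next

r1 = s.get('https://github.com/login')
token = BeautifulSoup(r1.text, 'html.parser').find(
    name='input', attrs={'name': 'authenticity_token'}).get('value')

s.post('https://github.com/session', data={
    'commit': 'Sign in',
    'utf8': '',
    'authenticity_token': token,
    'login': '',
    'password': '',
})

r3 = s.get('https://github.com/settings/emails')
print(r3.status_code)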

 

   6. Upvoting on Chouti (dig.chouti.com)

#!/usr/bin/env python
# -*-coding:utf8-*-
# __author__ = "willian"

import requests

# 1. Initial request: pick up the site's anonymous cookies
r0 = requests.get("http://dig.chouti.com")
r0_cookie_dict = r0.cookies.get_dict()


# 2. Log in (phone and password elided)
r1 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': 'xx',
        'password': 'xx',
        'oneMonth': 1
    },
    cookies=r0_cookie_dict
)
r1_cookie_dict = r1.cookies.get_dict()

# Merge the cookies from both responses so the vote request
# carries everything the site has issued so far
all_cookies = {}
all_cookies.update(r0_cookie_dict)
all_cookies.update(r1_cookie_dict)

# 3. Upvote a post
r2 = requests.post(url='http://dig.chouti.com/link/vote?linksId=14808951', cookies=all_cookies)
print(r2.text)

 

II. BeautifulSoup4
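
A minimal sketch of the core BeautifulSoup4 calls the examples above rely on (find, find_all, .text, attrs); the inline HTML snippet is invented for illustration:

from bs4 import BeautifulSoup

html = """
<div id='articles'>
    <li><h3>title one</h3><p>summary one</p><img src='//img.example.com/1.jpg'/></li>
    <li><h3>title two</h3><p>summary two</p></li>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')
div = soup.find(name='div', attrs={'id': 'articles'})  # first match by tag name and attributes
for li in div.find_all('li'):                          # all matching descendants
    print(li.find('h3').text)                          # .text collects the inner text
    img = li.find('img')
    if img:                                            # not every <li> has an image
        print(img.attrs.get('src'))                    # tag attributes behave like a dict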

 

 

III. wechat

posted @ 2017-10-21 15:46 by willianflasky