python16_day36 [Crawler 1]
I. requests
1. GET requests
# 1. GET request without parameters

import requests

ret = requests.get('https://github.com/timeline.json')

print(ret.url)
print(ret.text)


# 2. GET request with parameters

import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)

print(ret.url)
print(ret.text)
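Besides .url and .text, the response object exposes a few other attributes that come up constantly when crawling; a quick sketch, again using httpbin.org as a test endpoint:

import requests

ret = requests.get("http://httpbin.org/get", params={'key1': 'value1'})

print(ret.status_code)   # HTTP status code, e.g. 200
print(ret.headers)       # response headers (dict-like)
print(ret.json())        # parse a JSON body directly into a dict
print(ret.content)       # raw body as bytes (vs. ret.text, which is str)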
2. POST requests
# 1. Basic POST example

import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)

print(ret.text)


# 2. POST with request headers and a JSON body

import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)

print(ret.text)
print(ret.cookies)
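As a side note, requests can also serialize the JSON body itself through the json= parameter (listed in the method signatures in the next section), so the json.dumps call and the content-type header are optional; a minimal sketch:

import requests

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}

# requests serializes the dict and sets Content-Type: application/json for us
ret = requests.post(url, json=payload)

print(ret.status_code)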
3. Other request methods
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)

# All of the methods above are built on top of this one
requests.request(method, url, **kwargs)
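Calling requests.request directly behaves the same as the shortcut methods; a sketch with a few of the commonly used keyword arguments (the header and cookie values here are just placeholders):

import requests

ret = requests.request(
    method='GET',
    url='http://httpbin.org/get',
    params={'key1': 'value1'},            # query string
    headers={'User-Agent': 'my-spider'},  # custom request headers
    cookies={'session': 'xxx'},           # cookies to send
    timeout=5                             # seconds to wait before giving up
)

print(ret.url)
print(ret.status_code)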
4. Autohome news
import requests
from bs4 import BeautifulSoup

# First version (kept for reference):
# response = requests.get("http://www.autohome.com.cn/news/")
# # response.text is a str
# # response.content is bytes (binary)
#
# response.encoding = 'gbk'  # the site uses gbk
# root = BeautifulSoup(response.text, 'html.parser')  # parse the returned HTML with bs4
# outer_div_obj = root.find(name='div', id='auto-channel-lazyload-article')  # locate the div with this id
# li_obj_list = outer_div_obj.find_all(name='li')  # get every li inside it
#
# for li_obj in li_obj_list:
#     if not li_obj.find('h3'):
#         continue
#     title_obj = li_obj.find('h3')   # the h3 tag object
#     summary_obj = li_obj.find('p')  # the p tag object
#     img_obj = li_obj.find('img')    # the img tag object
#     src = img_obj.attrs.get('src')  # the src attribute of the img tag
#
#     print(src, title_obj.text, summary_obj.text)

response = requests.get("http://www.autohome.com.cn/news/")
response.encoding = 'gbk'

soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
li_list = tag.find_all('li')

for li in li_list:
    h3 = li.find('h3')

    if not h3:
        continue
    print("\033[33;1mTitle: {0}\033[0m".format(h3.text))
    print("\033[34;1mImage: http://{0}\033[0m".format(li.find('img').attrs['src']))
    print("\033[34;1mSummary: {0}\033[0m".format(li.find('p').text))
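Since response.content holds the raw bytes, the same page structure can be reused to save the thumbnails to disk; a sketch under the assumption that the img src values need an http:// prefix, as in the print statement above:

import requests
from bs4 import BeautifulSoup

response = requests.get("http://www.autohome.com.cn/news/")
response.encoding = 'gbk'

soup = BeautifulSoup(response.text, 'html.parser')
div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})

for li in div.find_all('li'):
    img = li.find('img')
    if not img:
        continue
    src = img.attrs.get('src')
    if not src:
        continue
    img_url = "http://" + src.lstrip('/')      # normalize the URL (assumption)
    img_bytes = requests.get(img_url).content  # .content is the raw bytes
    file_name = img_url.rsplit('/', 1)[-1]
    with open(file_name, 'wb') as f:
        f.write(img_bytes)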
5. GitHub login
#!/usr/bin/env python
# -*-coding:utf8-*-
# __author__ = "willian"

import requests
from bs4 import BeautifulSoup

# First request: get the csrf token and the cookies
r1 = requests.get('https://github.com/login')
b1 = BeautifulSoup(r1.text, 'html.parser')
# get the token
auth_token = b1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
# get the cookies
r1_cookie_dict = r1.cookies.get_dict()


# Second request: send the user credentials
r2 = requests.post("https://github.com/session",
                   data={
                       'commit': "Sign in",
                       'utf8': '✓',
                       'authenticity_token': auth_token,
                       'login': '',
                       'password': ""
                   }, cookies=r1_cookie_dict)
# get the cookies
r2_cookie_dict = r2.cookies.get_dict()

# merge the cookies from both responses
all_cookie_dict = {}
all_cookie_dict.update(r1_cookie_dict)
all_cookie_dict.update(r2_cookie_dict)


# Third request: a personal page that is only reachable after logging in
r3 = requests.get('https://github.com/settings/emails', cookies=all_cookie_dict)
print(r3.text)
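The cookie merging can also be left to requests.Session, which carries cookies across requests automatically; a minimal sketch of the same three-step flow (credentials left blank as above):

import requests
from bs4 import BeautifulSoup

session = requests.Session()

# Step 1: fetch the login page and pull out the csrf token
r1 = session.get('https://github.com/login')
auth_token = BeautifulSoup(r1.text, 'html.parser').find(
    name='input', attrs={'name': 'authenticity_token'}).get('value')

# Step 2: post the credentials; the session keeps the cookies for us
session.post("https://github.com/session", data={
    'commit': "Sign in",
    'utf8': '✓',
    'authenticity_token': auth_token,
    'login': '',       # fill in your username
    'password': ''     # fill in your password
})

# Step 3: a page that only works after login
r3 = session.get('https://github.com/settings/emails')
print(r3.text)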
6. Chouti upvote
#!/usr/bin/env python
# -*-coding:utf8-*-
# __author__ = "willian"

import requests
from bs4 import BeautifulSoup

# 1. First request: get the initial cookies
r0 = requests.get("http://dig.chouti.com")
r0_cookie_dict = r0.cookies.get_dict()


# 2. Log in to authorize the cookies
r1 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': 'xx',
        'password': 'xx',
        'oneMonth': 1
    },
    cookies=r0_cookie_dict
)
r1_cookie_dict = r1.cookies.get_dict()

all_cookies = {}
all_cookies.update(r0_cookie_dict)
all_cookies.update(r1_cookie_dict)

# 3. Upvote a post
r2 = requests.post(url='http://dig.chouti.com/link/vote?linksId=14808951', cookies=all_cookies)
print(r2.text)
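If the requests above come back with an anti-crawler page instead of the expected response, one common cause is the missing browser-like User-Agent header; a sketch of passing one along (the User-Agent string below is only an example, and the same headers dict can be added to every request in the flow):

import requests

headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0 Safari/537.36')
}

r0 = requests.get("http://dig.chouti.com", headers=headers)
print(r0.status_code)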
II. BeautifulSoup4
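As a quick reference for this section, a minimal sketch of the BeautifulSoup4 calls used in the examples above (find, find_all, attrs, text), run against a small inline HTML snippet:

from bs4 import BeautifulSoup

html = """
<div id='auto-channel-lazyload-article'>
    <li><h3>title</h3><p>summary</p><img src='/a.jpg'/></li>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')

div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
for li in div.find_all('li'):
    print(li.find('h3').text)               # text of a tag
    print(li.find('img').attrs.get('src'))  # an attribute of a tag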
III. WeChat