Step 1: requests

GET request
# -*- coding:utf-8 -*-
# Date: 2018/5/15 17:46
# Author: 小鼠标
import requests

url = "http://www.baidu.com"
# res = requests.get(url)           # method 1
res = requests.request('get', url)  # method 2
print('Status code:', res.status_code)
print('Response body:', res.text)
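A quick aside: GET requests often carry query-string parameters, and requests can URL-encode them for you via the params argument instead of you concatenating them into the URL by hand. A minimal sketch (the /s search path and the wd parameter are Baidu-specific assumptions, used only for illustration):

# -*- coding:utf-8 -*-
import requests

url = "http://www.baidu.com/s"    # assumed search endpoint, for illustration
params = {'wd': 'python'}         # encoded into the query string as ?wd=python
res = requests.get(url, params=params)
print('Final URL:', res.url)      # shows the encoded query string
print('Status code:', res.status_code)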
POST request

# -*- coding:utf-8 -*-
# Date: 2018/5/15 17:46
# Author: 小鼠标
import requests

url = "http://www.baidu.com"
data = {
    'username': 'xiaoshubiao',
    'pwd': 'xiaoshubiao'
}
res = requests.post(url, data=data)  # a dict is sent form-encoded
print('Status code:', res.status_code)
print('Response body:', res.text)
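Note that passing a dict as data sends it form-encoded (application/x-www-form-urlencoded). Many APIs expect JSON instead, which requests supports through the json argument; it serializes the payload and sets the Content-Type header for you. A minimal sketch against the public echo service httpbin.org (the URL is just a convenient test target, not part of the original example):

# -*- coding:utf-8 -*-
import requests

url = "http://httpbin.org/post"  # echo service that reflects the request back
payload = {'username': 'xiaoshubiao', 'pwd': 'xiaoshubiao'}

res = requests.post(url, json=payload)     # sent as JSON, Content-Type: application/json
print('Status code:', res.status_code)
print('Echoed JSON:', res.json()['json'])  # httpbin echoes the parsed body back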
Step 2: Impersonating a browser and forging cookies

# -*- coding:utf-8 -*-
# Date: 2018/5/15 17:46
# Author: 小鼠标
import requests

url = "http://www.baidu.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.39'
                  '64.2 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0'
              '.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive'
}
cookies = dict(name='xiaoshubiao')
res = requests.get(url, headers=headers, cookies=cookies)
print('Status code:', res.status_code)
print('Response body:', res.text)
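When you want the same headers and cookies on every request, and want any cookies the server sets (e.g. after a login) to persist automatically, a requests.Session is more convenient than passing them to each call. A minimal sketch of the same idea (the UA string is trimmed here for brevity; use a full one in practice):

# -*- coding:utf-8 -*-
import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 ...'})  # trimmed; use a full UA string
session.cookies.set('name', 'xiaoshubiao')

# Both requests reuse the headers and cookies, and any Set-Cookie
# from the server is stored on the session automatically.
res1 = session.get('http://www.baidu.com')
res2 = session.get('http://www.baidu.com')
print(res1.status_code, res2.status_code)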
Step 3: Using a proxy IP

# -*- coding:utf-8 -*-
# Date: 2018/5/15 17:46
# Author: 小鼠标
import requests

url = "http://www.baidu.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.39'
                  '64.2 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0'
              '.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive'
}
cookies = dict(name='xiaoshubiao')
proxies = {'http': '218.73.134.234:36602'}
res = requests.get(url, headers=headers, cookies=cookies, proxies=proxies)
print('Status code:', res.status_code)
print('Response body:', res.text)
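Free proxies like the one above are often dead or slow, so in practice it pays to set a timeout and catch connection errors, falling back to a direct request (or the next proxy in your pool). A sketch of that pattern, reusing the same illustrative proxy address, which has likely expired by now:

# -*- coding:utf-8 -*-
import requests

url = "http://www.baidu.com"
proxies = {'http': 'http://218.73.134.234:36602'}  # illustrative; probably dead by now

try:
    res = requests.get(url, proxies=proxies, timeout=5)   # fail fast instead of hanging
    print('Via proxy:', res.status_code)
except requests.exceptions.RequestException as e:         # covers proxy, connect, timeout errors
    print('Proxy failed, falling back to a direct request:', e)
    res = requests.get(url, timeout=5)
    print('Direct:', res.status_code)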
Step 4: Parsing the content

# -*- coding:utf-8 -*-
# Date: 2018/5/15 17:46
# Author: 小鼠标
import requests
from bs4 import BeautifulSoup

url = "http://news.sina.com.cn/guide/"
res = requests.get(url)
res.encoding = 'utf-8'
web_data = res.text

# Parse the HTML
soup = BeautifulSoup(web_data, 'lxml')
title_list = soup.select('title')    # tags matching the selector, returned as a list
a_list = soup.select('a')
ul_list = soup.select('ul.list01')   # <ul> elements with class "list01", as a list
div_list = soup.select('div#tab01')  # the <div> with id "tab01", as a list
for title, a in zip(title_list, a_list):
    title_content = title.get_text()  # text content of the tag
    a_href = a.get('href')            # value of the tag's href attribute
    print(title_content, a_href)
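Because zip() stops at the shorter list and a page has only one <title>, the loop above prints a single pair. To walk every link on the page, iterate the <a> tags directly and guard against anchors that have no href attribute. A minimal sketch of that variant:

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

url = "http://news.sina.com.cn/guide/"
res = requests.get(url)
res.encoding = 'utf-8'

soup = BeautifulSoup(res.text, 'lxml')
for a in soup.select('a'):
    href = a.get('href')           # None when the tag has no href attribute
    text = a.get_text(strip=True)  # link text with surrounding whitespace removed
    if href:                       # skip anchors without a target
        print(text, href)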
I believe luck favors those who work hard, and that the future will give me a pair of wings for my dreams.