Web Scraping Basics
# The flow below fails to complete the upvote
import requests
# Applies to both POST and GET requests
# Watch out for anti-scraping measures: usually it is enough to add a request header
# Log in
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613125397685',
        'password': '478324asd',
        'oneMonth': '1'
    },
    # with a request header added, the request is accepted
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
)
cookies_dict = response_login.cookies.get_dict()  # simply a dict
# print(cookies_dict)  # prints the cookie info
# Upvote
r1 = requests.get(
    url='',  # with only a URL the request may be blocked, so add a request header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=cookies_dict
)
print(r1.text)
Unlike the attempt above, this version completes the upvote: Chouti authorizes the cookie issued by the first, unauthenticated GET, so that same cookie must accompany both the login POST and the vote POST.

# Upvotes correctly
import requests
# steps 2 and 3 are both POST requests
# 1: visit the Chouti hot list and obtain a cookie (not yet authorized)
r1 = requests.get(
    url='https://dig.chouti.com/all/hot/recent/1',  # with only a URL the request may be blocked, so add a request header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
)
r1_cookie_dict = r1.cookies.get_dict()
# 2: send the username and password for authentication, plus the (not yet authorized) cookie
# note the anti-scraping measures
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613125397685',
        'password': '478324asd',
        'oneMonth': '1'
    },
    # with a request header added, the request is accepted
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=r1_cookie_dict
)
# 3: upvote
r1 = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=22900531',  # with only a URL the request may be blocked, so add a request header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=r1_cookie_dict  # now authorized
)
print(r1.text)
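As a side note, the same three steps can be written with requests.Session, which stores and resends cookies automatically. This is a minimal sketch using the same endpoints and form fields as above, not a drop-in replacement tested against the site:

import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
})
# 1: the first GET issues the cookie that will later be authorized
session.get('https://dig.chouti.com/all/hot/recent/1')
# 2: log in; the session resends the cookie from step 1 automatically
session.post('https://dig.chouti.com/login', data={
    'phone': '8613125397685',
    'password': '478324asd',
    'oneMonth': '1'
})
# 3: upvote with the now-authorized cookie
r = session.post('https://dig.chouti.com/link/vote?linksId=22900531')
print(r.text)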
To comment out multiple lines:
1. Select the lines.
2. Press Ctrl + /.
To remove the comments:
1. Select the commented lines.
2. Press Ctrl + / again.
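For example, selecting these two lines and pressing Ctrl + / toggles them between

print(ret.text)
print(ret.status_code)

and

# print(ret.text)
# print(ret.status_code)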
When working in PyCharm you often need to indent or unindent several lines at once; PyCharm provides shortcuts for both:

1. Indent multiple lines: select the lines with the mouse, then press Tab; each press indents by four characters.

2. Move multiple lines left: select the lines with the mouse, then press Shift + Tab; each press moves them left by four characters.
# When a bookmark marker appears in the left gutter, press F11 to remove it
import requests  # fake a browser request to a given address
from bs4 import BeautifulSoup  # parse an HTML-formatted string

# 1: download the page
ret = requests.get(url='https://www.autohome.com.cn/news/')
# print(ret)  ret is a Response object
# print(ret.content)  raw bytes
# print(ret.text)  a string, but it may come out garbled
# ret.encoding = 'gbk'  you can set the encoding yourself
# print(ret.text)  with the right encoding there is no more garbled text
# print(ret.apparent_encoding)  returns the page's own encoding
ret.encoding = ret.apparent_encoding  # set it directly to the page's own encoding
# print(ret.text)

# 2: extract the content you want with BeautifulSoup
soup = BeautifulSoup(ret.text, 'html.parser')  # the parser; no space allowed inside 'html.parser'
# print(type(soup))  # soup is an object
# div = soup.find(name='div', id='focus-1')
div = soup.find(name='div', attrs={'id': 'focus-1', 'class': 'focusimg focusimg02'})
print(div)
li_list = div.find_all('li')  # a list
# print(li_list)
for li in li_list:
    h2 = li.find('h2')
    a = li.find('a')
    p = li.find('p')  # the first positional argument is name
    img = li.find('img')
    src = img.get('src')
    file_name = src.rsplit('__', maxsplit=1)[1]
    ret_img = requests.get(url='https:' + src)
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)
    print(h2.text, a.get('href'))  # the browser address bar adds http/https to href automatically
    print(p.text)
    print('=' * 15)
    # print(a.attrs)  all attributes of a
    # print(a.get('href'))  fetch one attribute of a
    # print(h2)
    # print(h2.text)
    # print(a.text)
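As a side note, BeautifulSoup also understands CSS selectors through select() and select_one(). A minimal sketch of the same lookup, assuming the page structure used above:

# CSS-selector variant of the lookup above (soup comes from the code above)
for li in soup.select('#focus-1 li'):  # select() returns a list of matching tags
    h2 = li.select_one('h2')           # select_one() returns the first match or None
    if h2 is not None:
        print(h2.text)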
# speed up downloads
pip install requests -i https://pypi.douban.com/simple
# If you forget the URL, search Baidu for "python 豆瓣下载源" (the Douban package mirror);
# the cnblogs post "使用douban源下载python包 - 中国陆特 - 博客园" has the details.
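To avoid typing -i on every install, the mirror can also be stored in pip's configuration (the pip config command is available since pip 10):

pip config set global.index-url https://pypi.douban.com/simple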
# Upvote across multiple pages
import requests
from bs4 import BeautifulSoup

for page_num in range(8, 9):
    r1 = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % page_num,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
    )
    # print(r1.text)
    r1_cookie_dict = r1.cookies.get_dict()
    response_login = requests.post(
        url='https://dig.chouti.com/login',
        data={
            'phone': '8613125397685',
            'password': '478324asd',
            'oneMonth': '1'
        },
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        },
        cookies=r1_cookie_dict
    )
    # response_index = requests.get(
    #     url='https://dig.chouti.com/',
    #     headers={
    #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    #     }
    # )
    soup = BeautifulSoup(r1.text, 'html.parser')  # r1.text is the right source here, not response_index
    div = soup.find(attrs={'id': 'content-list'})
    items = div.find_all(attrs={'class': 'item'})
    for item in items:
        tag = item.find(attrs={'class': 'part2'})
        nid = tag.get('share-linkid')
        print(nid)
        r1 = requests.post(  # reusing the name r1 here causes no problem
            url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
            },
            cookies=r1_cookie_dict
        )
        print(r1.text)
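One design note on the loop above: it logs in once per page, but the authorized cookie does not change between pages, so the login can be hoisted out of the loop. A minimal restructuring sketch using the same endpoints and fields (an untested rearrangement, not the original notes' code):

# Sketch: authorize the cookie once, then only fetch + vote inside the loop
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
first = requests.get(url='https://dig.chouti.com/all/hot/recent/1', headers=UA)
cookie_dict = first.cookies.get_dict()
requests.post(url='https://dig.chouti.com/login',
              data={'phone': '8613125397685', 'password': '478324asd', 'oneMonth': '1'},
              headers=UA, cookies=cookie_dict)
for page_num in range(8, 9):
    page = requests.get(url='https://dig.chouti.com/all/hot/recent/%s' % page_num, headers=UA)
    soup = BeautifulSoup(page.text, 'html.parser')
    for item in soup.find(attrs={'id': 'content-list'}).find_all(attrs={'class': 'item'}):
        nid = item.find(attrs={'class': 'part2'}).get('share-linkid')
        requests.post(url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
                      headers=UA, cookies=cookie_dict)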
import requests
from bs4 import BeautifulSoup

# find the hidden token's value
r1 = requests.get(
    url='https://github.com/login'
)
s1 = BeautifulSoup(r1.text, 'html.parser')
token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
# print(r1.text)
# print(token)
r1_cookie_dict = r1.cookies.get_dict()
r2 = requests.post(
    url='https://github.com/session',
    data={
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': token,
        'login': 'clttyou',
        'password': '9430'
    },
    cookies=r1_cookie_dict
)
print(r2.text)  # print the login response (r1.text would only show the login page again)
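If further authenticated requests are needed after the login, the cookies from both responses have to be carried forward. A hedged sketch: the plain-dict merge below is an assumption about which cookies authorize later pages, and the /settings/profile URL is only an illustrative target:

# Assumption: r2's cookies carry the logged-in session; merge and reuse them
all_cookies = {**r1_cookie_dict, **r2.cookies.get_dict()}
r3 = requests.get(
    url='https://github.com/settings/profile',  # hypothetical target page
    cookies=all_cookies
)
print(r3.status_code)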
# template for scraping a homepage
import requests

def getHtml(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise an exception on 4xx/5xx responses
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "an exception occurred"

if __name__ == "__main__":
    url = "https://www.taobao.com/"
    print(getHtml(url))
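r.raise_for_status() is what routes bad responses into the except branch: it raises requests.HTTPError for 4xx/5xx status codes and does nothing otherwise. A small standalone demonstration (the /no-such-page path is made up and assumed to return 404):

import requests
try:
    r = requests.get('https://www.taobao.com/no-such-page', timeout=30)  # hypothetical 404
    r.raise_for_status()
except requests.HTTPError as e:
    print('HTTP error:', e)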
str = "00000003210Runoob01230000000"; print(str.strip('0')); # 去除首尾字符 0 str2 = " Runoob "; # 去除首尾空格 print(str2.strip()); ''' 3210Runoob0123 Runoob '''