Web Scraping Basics
# The flow below fails to complete the upvote
import requests
# Applies to both POST and GET requests
# Watch out for anti-scraping measures: usually it is enough to add a request header
# Log in
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613125397685',
        'password': '478324asd',
        'oneMonth': '1'
    },
    # with a request header added, the request is accepted
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
)
cookies_dict = response_login.cookies.get_dict()  # simply a dict
# print(cookies_dict)  # prints the cookie info
# Upvote
r1 = requests.get(
    url='',  # with only a URL the request may be blocked, so add a request header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=cookies_dict
)
print(r1.text)
Unlike the attempt above, this version completes the upvote: Chouti authorizes the cookie issued by the first, unauthenticated GET, so that same cookie must accompany both the login POST and the vote POST.

# Upvotes correctly
import requests
# steps 2 and 3 are both POST requests
# 1: visit the Chouti hot list and obtain a cookie (not yet authorized)
r1 = requests.get(
    url='https://dig.chouti.com/all/hot/recent/1',  # with only a URL the request may be blocked, so add a request header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
)
r1_cookie_dict = r1.cookies.get_dict()
# 2: send the username and password for authentication, plus the (not yet authorized) cookie
# note the anti-scraping measures
response_login = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613125397685',
        'password': '478324asd',
        'oneMonth': '1'
    },
    # with a request header added, the request is accepted
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=r1_cookie_dict
)
# 3: upvote
r1 = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=22900531',  # with only a URL the request may be blocked, so add a request header
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    },
    cookies=r1_cookie_dict  # now authorized
)
print(r1.text)
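As a side note, the same three steps can be written with requests.Session, which stores and resends cookies automatically. This is a minimal sketch using the same endpoints and form fields as above, not a drop-in replacement tested against the site:

import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
})
# 1: the first GET issues the cookie that will later be authorized
session.get('https://dig.chouti.com/all/hot/recent/1')
# 2: log in; the session resends the cookie from step 1 automatically
session.post('https://dig.chouti.com/login', data={
    'phone': '8613125397685',
    'password': '478324asd',
    'oneMonth': '1'
})
# 3: upvote with the now-authorized cookie
r = session.post('https://dig.chouti.com/link/vote?linksId=22900531')
print(r.text)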
To comment out multiple lines:
1. Select the lines.
2. Press Ctrl + /.
To remove the comments:
1. Select the commented lines.
2. Press Ctrl + / again.
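For example, selecting these two lines and pressing Ctrl + / toggles them between

print(ret.text)
print(ret.status_code)

and

# print(ret.text)
# print(ret.status_code)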
When working in PyCharm you often need to indent or unindent several lines at once; PyCharm provides shortcuts for both:

1. Indent multiple lines: select the lines with the mouse, then press Tab; each press indents by four characters.

2. Move multiple lines left: select the lines with the mouse, then press Shift + Tab; each press moves them left by four characters.
# When a bookmark marker appears in the left gutter, press F11 to remove it
import requests  # fake a browser request to a given address
from bs4 import BeautifulSoup  # parse an HTML-formatted string

# 1: download the page
ret = requests.get(url='https://www.autohome.com.cn/news/')
# print(ret)  ret is a Response object
# print(ret.content)  raw bytes
# print(ret.text)  a string, but it may come out garbled
# ret.encoding = 'gbk'  you can set the encoding yourself
# print(ret.text)  with the right encoding there is no more garbled text
# print(ret.apparent_encoding)  returns the page's own encoding
ret.encoding = ret.apparent_encoding  # set it directly to the page's own encoding
# print(ret.text)

# 2: extract the content you want with BeautifulSoup
soup = BeautifulSoup(ret.text, 'html.parser')  # the parser; no space allowed inside 'html.parser'
# print(type(soup))  # soup is an object
# div = soup.find(name='div', id='focus-1')
div = soup.find(name='div', attrs={'id': 'focus-1', 'class': 'focusimg focusimg02'})
print(div)
li_list = div.find_all('li')  # a list
# print(li_list)
for li in li_list:
    h2 = li.find('h2')
    a = li.find('a')
    p = li.find('p')  # the first positional argument is name
    img = li.find('img')
    src = img.get('src')
    file_name = src.rsplit('__', maxsplit=1)[1]
    ret_img = requests.get(url='https:' + src)
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)
    print(h2.text, a.get('href'))  # the browser address bar adds http/https to href automatically
    print(p.text)
    print('=' * 15)
    # print(a.attrs)  all attributes of a
    # print(a.get('href'))  fetch one attribute of a
    # print(h2)
    # print(h2.text)
    # print(a.text)
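As a side note, BeautifulSoup also understands CSS selectors through select() and select_one(). A minimal sketch of the same lookup, assuming the page structure used above:

# CSS-selector variant of the lookup above (soup comes from the code above)
for li in soup.select('#focus-1 li'):  # select() returns a list of matching tags
    h2 = li.select_one('h2')           # select_one() returns the first match or None
    if h2 is not None:
        print(h2.text)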
# speed up downloads
pip install requests -i https://pypi.douban.com/simple
# If you forget the URL, search Baidu for "python 豆瓣下载源" (the Douban package mirror);
# the cnblogs post "使用douban源下载python包 - 中国陆特 - 博客园" has the details.
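To avoid typing -i on every install, the mirror can also be stored in pip's configuration (the pip config command is available since pip 10):

pip config set global.index-url https://pypi.douban.com/simple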
# Upvote across multiple pages
import requests
from bs4 import BeautifulSoup

for page_num in range(8, 9):
    r1 = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % page_num,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
    )
    # print(r1.text)
    r1_cookie_dict = r1.cookies.get_dict()
    response_login = requests.post(
        url='https://dig.chouti.com/login',
        data={
            'phone': '8613125397685',
            'password': '478324asd',
            'oneMonth': '1'
        },
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        },
        cookies=r1_cookie_dict
    )
    # response_index = requests.get(
    #     url='https://dig.chouti.com/',
    #     headers={
    #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    #     }
    # )
    soup = BeautifulSoup(r1.text, 'html.parser')  # r1.text is the right source here, not response_index
    div = soup.find(attrs={'id': 'content-list'})
    items = div.find_all(attrs={'class': 'item'})
    for item in items:
        tag = item.find(attrs={'class': 'part2'})
        nid = tag.get('share-linkid')
        print(nid)
        r1 = requests.post(  # reusing the name r1 here causes no problem
            url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
            },
            cookies=r1_cookie_dict
        )
        print(r1.text)
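One design note on the loop above: it logs in once per page, but the authorized cookie does not change between pages, so the login can be hoisted out of the loop. A minimal restructuring sketch using the same endpoints and fields (an untested rearrangement, not the original notes' code):

# Sketch: authorize the cookie once, then only fetch + vote inside the loop
UA = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
first = requests.get(url='https://dig.chouti.com/all/hot/recent/1', headers=UA)
cookie_dict = first.cookies.get_dict()
requests.post(url='https://dig.chouti.com/login',
              data={'phone': '8613125397685', 'password': '478324asd', 'oneMonth': '1'},
              headers=UA, cookies=cookie_dict)
for page_num in range(8, 9):
    page = requests.get(url='https://dig.chouti.com/all/hot/recent/%s' % page_num, headers=UA)
    soup = BeautifulSoup(page.text, 'html.parser')
    for item in soup.find(attrs={'id': 'content-list'}).find_all(attrs={'class': 'item'}):
        nid = item.find(attrs={'class': 'part2'}).get('share-linkid')
        requests.post(url='https://dig.chouti.com/link/vote?linksId=%s' % nid,
                      headers=UA, cookies=cookie_dict)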
import requests
from bs4 import BeautifulSoup

# find the hidden token's value
r1 = requests.get(
    url='https://github.com/login'
)
s1 = BeautifulSoup(r1.text, 'html.parser')
token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
# print(r1.text)
# print(token)
r1_cookie_dict = r1.cookies.get_dict()
r2 = requests.post(
    url='https://github.com/session',
    data={
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': token,
        'login': 'clttyou',
        'password': '9430'
    },
    cookies=r1_cookie_dict
)
print(r2.text)  # print the login response (r1.text would only show the login page again)
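If further authenticated requests are needed after the login, the cookies from both responses have to be carried forward. A hedged sketch: the plain-dict merge below is an assumption about which cookies authorize later pages, and the /settings/profile URL is only an illustrative target:

# Assumption: r2's cookies carry the logged-in session; merge and reuse them
all_cookies = {**r1_cookie_dict, **r2.cookies.get_dict()}
r3 = requests.get(
    url='https://github.com/settings/profile',  # hypothetical target page
    cookies=all_cookies
)
print(r3.status_code)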
# template for scraping a homepage
import requests

def getHtml(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise an exception on 4xx/5xx responses
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "an exception occurred"

if __name__ == "__main__":
    url = "https://www.taobao.com/"
    print(getHtml(url))
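r.raise_for_status() is what routes bad responses into the except branch: it raises requests.HTTPError for 4xx/5xx status codes and does nothing otherwise. A small standalone demonstration (the /no-such-page path is made up and assumed to return 404):

import requests
try:
    r = requests.get('https://www.taobao.com/no-such-page', timeout=30)  # hypothetical 404
    r.raise_for_status()
except requests.HTTPError as e:
    print('HTTP error:', e)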
str = "00000003210Runoob01230000000"; print(str.strip('0')); # 去除首尾字符 0 str2 = " Runoob "; # 去除首尾空格 print(str2.strip()); ''' 3210Runoob0123 Runoob '''