python3+selenium3+requests爬取我的博客粉丝的名称

爬取目标

1.本次代码是在python3上运行通过的

  • selenium3 + firefox59.0.1(最新)
  • BeautifulSoup
  • requests

2.爬取目标网站,我的博客:https://home.cnblogs.com/u/lxs1314
爬取内容:爬我的博客的所有粉丝的名称,并保存到txt

3.由于博客园的登录需要人机验证,无法直接用账号密码登录,需借助selenium加载已登录过的firefox配置文件来获取登录后的cookies

直接贴代码:

# coding:utf-8
# __author__ = 'Carry'

import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time



# Path of the Firefox profile directory that get_cookies() loads into selenium
profile_directory = r'C:\Users\Administrator\AppData\Roaming\Mozilla\Firefox\Profiles\pxp74n2x.default'

s = requests.session()  # module-wide session shared by the functions below
url = "https://home.cnblogs.com/u/lxs1314"  # base URL of the blog home page to crawl

def get_cookies(url):
    '''Open the followers page in Firefox and return the browser's cookies.

    The browser is started with the profile stored at ``profile_directory``
    (presumably a profile that is already logged in to the site -- verify),
    so the returned cookies can be reused by a requests session.

    Args:
        url: Base URL of the blog home page; "/followers" is appended to it.

    Returns:
        The list of cookie dicts reported by ``driver.get_cookies()``.
    '''
    # Load the saved Firefox profile.
    profile = webdriver.FirefoxProfile(profile_directory)
    # Start the browser with that profile.
    driver = webdriver.Firefox(profile)
    try:
        driver.get(url+"/followers")

        # Give the page a moment to settle before reading the cookies.
        time.sleep(3)
        cookies = driver.get_cookies()
        print(cookies)
    finally:
        # Always close the browser, even when navigation or cookie
        # retrieval fails, so no Firefox process is left behind.
        driver.quit()
    return cookies

def add_cookies(cookies):
    '''Copy browser cookies into the shared requests session ``s``.

    Args:
        cookies: Iterable of cookie dicts; only the "name" and "value"
            entries of each dict are used.
    '''
    jar = requests.cookies.RequestsCookieJar()
    for cookie in cookies:
        cookie_name, cookie_value = cookie["name"], cookie['value']
        jar.set(cookie_name, cookie_value)

    # Merge the collected cookies into the session's own cookie jar.
    s.cookies.update(jar)

def get_ye_nub(url, per_page=45):
    '''Return how many pages the follower list of the blog at *url* has.

    Requests ``url + "/relation/followers"`` through the shared session
    ``s``, reads the follower count out of the first element with class
    ``current_nav`` (text of the form "我的粉丝(N)") and converts the count
    into a number of pages.

    Args:
        url: Base URL of the blog home page.
        per_page: Number of followers listed on one page (45 by default).

    Returns:
        The page count as an int; at least 1, so the first page is always
        visited.

    Raises:
        ValueError: If ``per_page`` is not positive, or if the follower
            count cannot be located in the response.
    '''
    if per_page <= 0:
        raise ValueError("per_page must be a positive integer")

    # Send the request.
    r1 = s.get(url+"/relation/followers")
    soup = BeautifulSoup(r1.content, "html.parser")
    # Grab the element that carries the follower count.
    fensinub = soup.find_all(class_="current_nav")
    if not fensinub:
        raise ValueError("no element with class 'current_nav' in the response")
    # get_text() also works when the tag has nested children, where
    # .string would be None.
    nav_text = fensinub[0].get_text()
    print (nav_text)
    # Raw string: "\(" is not a valid escape in a normal string literal.
    num = re.findall(r"我的粉丝\((.+?)\)", nav_text)
    if not num:
        raise ValueError("follower count not found in %r" % nav_text)
    print (u"我的粉丝数量:%s"%str(num[0]))

    # Ceiling division: an exact multiple of per_page must not add an
    # extra, empty page.
    total = int(num[0])
    ye = max(1, (total + per_page - 1) // per_page)
    print (u"总共分页数:%s"%str(ye))
    return ye

def save_name(nub):
    '''Fetch one page of the follower list and append the names to name.txt.

    The page is requested through the shared session ``s`` relative to the
    module-level ``url``. Every element with class ``avatar_name`` yields
    one name; newlines and spaces are stripped from it before it is printed
    and written as one line.

    Args:
        nub: Page number; any value <= 1 fetches the first page.
    '''
    # The first page has no "page" query parameter.
    if nub <= 1:
        url_page = url+"/relation/followers"
    else:
        url_page = url+"/relation/followers?page=%s" % str(nub)
    print (u"正在抓取的页面:%s" %url_page)
    r2 = s.get(url_page)
    soup = BeautifulSoup(r2.content, "html.parser")
    fensi = soup.find_all(class_="avatar_name")
    if not fensi:
        # Nothing to save; do not create or touch the output file.
        return

    # Open the file once per page instead of once per name, and pin the
    # encoding so non-ASCII names do not depend on the platform default.
    with open("name.txt", "a", encoding="utf-8") as f:  # append mode
        for i in fensi:
            # get_text() also handles tags with nested children, where
            # .string would be None.
            name = i.get_text().replace("\n", "").replace(" ","")
            print (name)
            f.write(name+"\n")

if __name__ == "__main__":
    # Take the cookies from the browser and hand them to the session.
    add_cookies(get_cookies(url))
    # Work out how many pages there are, then save each page's names.
    page_total = get_ye_nub(url)
    page_no = 1
    while page_no <= page_total:
        save_name(page_no)
        page_no += 1
View Code

 

原文链接:http://www.cnblogs.com/yoyoketang/p/8610779.html

 

posted @ 2018-03-22 15:27  彩笔杀手  阅读(296)  评论(0)  编辑  收藏  举报