【爬虫案例小结】
【案例】登录博客园
【1】思路分析
- 打开cnblogs
- 点进登录页面
- 输入用户名密码
- 点登录(可能会出现验证码)----手动操作跳过验证码
- 登录成功后
- 拿到cookie
- 保存到本地
- 关闭浏览器
- 开启selenium,打开浏览器
- 把本地的cookie写入到当前浏览器中
- 当前浏览器就是登录状态
【2】代码实现
from selenium import webdriver
from selenium.webdriver.common.by import By
import json
# Phase 1: log in once through a real browser and persist the session cookies.
# NOTE(review): the account name and password below are hard-coded in source;
# move them to environment variables or a local config before sharing this file.
browser = webdriver.Chrome()
try:
    # The login form is looked up right after a click that loads a new page;
    # an implicit wait lets find_element poll instead of failing immediately.
    browser.implicitly_wait(10)
    # Open cnblogs
    browser.get('https://www.cnblogs.com/')
    # Click the login link, located by XPath
    btn_login = browser.find_element(By.XPATH, '//*[@id="navbar_login_status"]/a[6]')
    btn_login.click()
    # Type the username and password
    login_username = browser.find_element(By.XPATH, '//*[@id="mat-input-0"]')
    login_password = browser.find_element(By.XPATH, '//*[@id="mat-input-1"]')
    login_username.send_keys('206**46849@qq.com')
    login_password.send_keys('zhaochunze521.')
    # Click the submit button of the sign-in form
    login_btn = browser.find_element(
        By.XPATH,
        "/html/body/app-root/app-sign-in-layout/div/div/app-sign-in/app-content-container/div/div/div/form/div/button/span[1]")
    login_btn.click()
    # A captcha may appear here and has to be solved by hand. Block until the
    # operator confirms, otherwise the cookies below are read before the login
    # has completed and do not carry a logged-in session.
    input('Finish the login (and any captcha) in the browser, then press Enter...')
    # Read the cookies of the logged-in session
    cookies = browser.get_cookies()
    # Save the cookies to a local file
    with open('cnblogs_cookie.json', 'w', encoding='utf-8') as f:
        json.dump(cookies, f)
finally:
    # quit() ends the whole driver session even if a step above raised
    browser.quit()

# Phase 2: start a fresh browser and restore the saved session.
browser = webdriver.Chrome()
try:
    # Open cnblogs first so the cookies are added for this site
    browser.get('https://www.cnblogs.com/')
    # Load the cookies saved in phase 1
    with open('cnblogs_cookie.json', 'r', encoding='utf-8') as f:
        cookies = json.load(f)
    # Add every cookie to the current browser
    for cookie in cookies:
        browser.add_cookie(cookie)
    # Reload so the page is requested with the restored cookies
    browser.refresh()
    # Continue with other operations here, while the browser is still open
    # ...
finally:
    # Shut the browser down
    browser.quit()
【案例】抽屉网半自动点赞
# -*-coding: Utf-8 -*-
# @File : 03chouti .py
# author: Chimengmeng
# blog_url : https://www.cnblogs.com/dream-ze/
# Time:2023/8/19
import json
import threading
import requests
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from lxml import etree
class ChouTi:
    """Semi-automatic upvoter for the dig.chouti.com front page.

    Link ids are scraped with ``requests``; the login is performed in an Edge
    browser to obtain cookies, which are then reused for the vote requests.
    """

    def __init__(self):
        # A random User-Agent is sent with every requests call
        self.headers = {
            'User-Agent': UserAgent().random,
        }
        self.browser = webdriver.Edge()

    def get_link_id(self):
        """Return the ``data-id`` values found in the front-page item list.

        Nodes matched by the XPath that carry no ``data-id`` attribute are
        skipped instead of raising ``IndexError``.
        """
        link_id_list = []
        response = requests.get('https://dig.chouti.com/', headers=self.headers)
        tree = etree.HTML(response.text)
        div_list = tree.xpath('/html/body/main/div/div/div[1]/div/div[2]/div[1]/div')
        for div in div_list:
            data_ids = div.xpath("@data-id")
            if data_ids:
                link_id_list.append(data_ids[0])
        return link_id_list

    def get_cookies(self, browser):
        """Log in through *browser* and dump its cookies to ``chouti_cookies.json``.

        The browser session is shut down before returning.

        NOTE(review): the username/password strings are empty placeholders and
        the cookies are read right after the login click; if the site needs
        time (or a captcha) to finish logging in, the saved cookies will not
        be authenticated. Confirm and add an explicit wait if needed.
        """
        # implicitly_wait configures how long element lookups keep polling; it
        # is not a sleep, so setting it once covers every find_element below.
        browser.implicitly_wait(3)
        btn_login = browser.find_element(By.XPATH, '//*[@id="login_btn"]')
        # Click through JavaScript rather than WebElement.click()
        browser.execute_script("arguments[0].click()", btn_login)
        username_input = browser.find_element(By.XPATH, '/html/body/div[4]/div/div[3]/div[1]/div[2]/input')
        password_input = browser.find_element(By.XPATH, '/html/body/div[4]/div/div[4]/div[1]/div/input[1]')
        username_input.send_keys('')
        password_input.send_keys('')
        login_btn = browser.find_element(By.XPATH, '/html/body/div[4]/div/div[4]/div[4]/button')
        browser.execute_script("arguments[0].click()", login_btn)
        cookies = browser.get_cookies()
        with open('chouti_cookies.json', 'w', encoding='utf-8') as fp:
            json.dump(cookies, fp)
        # quit() ends the driver session, not only the current window
        browser.quit()

    def up_blog(self, real_cookie, link_id):
        """POST one vote for *link_id* using the cookies in *real_cookie*.

        :param real_cookie: mapping of cookie name to cookie value
        :param link_id: id of the link to upvote
        """
        data = {
            'linkId': link_id
        }
        response = requests.post('https://dig.chouti.com/link/vote', headers=self.headers, data=data,
                                 cookies=real_cookie)
        if response.status_code == 200:
            # Include the id so the output of the worker threads can be told apart
            print(f'link_id:{link_id}>>>点赞成功')
        else:
            print(f'link_id:{link_id}>>>点赞失败, status_code:{response.status_code}')

    def get_news(self):
        """Placeholder; not implemented."""
        ...

    def main_up(self):
        """Scrape the link ids, log in, then upvote every link in its own thread."""
        link_id_list = self.get_link_id()
        self.browser.get('https://dig.chouti.com/')
        self.browser.implicitly_wait(3)
        self.get_cookies(self.browser)
        with open('chouti_cookies.json', 'r', encoding='utf-8') as fp:
            cookies = json.load(fp)
        # Selenium stores cookies as a list of dicts; requests wants name -> value
        real_cookie = {item['name']: item['value'] for item in cookies}
        task_list = []
        for link_id in link_id_list:
            # Each worker gets its own id (the whole list used to be passed here)
            task = threading.Thread(target=self.up_blog, args=(real_cookie, link_id))
            task.start()
            task_list.append(task)
        for task in task_list:
            task.join()

    def main(self):
        """Placeholder; not implemented."""
        ...


if __name__ == '__main__':
    ChouTi().main_up()
【案例】滑动验证
import time
from selenium import webdriver
from selenium.webdriver.common.by import By # 按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait # 等待页面加载某些元素
import cv2
from urllib import request
from selenium.webdriver.common.action_chains import ActionChains
def get_distance(background_path="background.png", gap_path="gap.png",
                 scale_numerator=278, scale_denominator=360):
    """Return the scaled x offset at which the gap image matches the background.

    Both images are read as grayscale and compared with
    ``cv2.matchTemplate(..., cv2.TM_CCOEFF_NORMED)``.

    :param background_path: path of the captcha background image
    :param gap_path: path of the slider-piece image
    :param scale_numerator: numerator of the factor applied to the raw offset
    :param scale_denominator: denominator of that factor
        (presumably 360 is the downloaded image width and 278 the width
        rendered on the page - TODO confirm)
    :raises FileNotFoundError: if either image cannot be read
    :return: ``x * scale_numerator / scale_denominator`` as a float
    """
    # Flag 0 is cv2.IMREAD_GRAYSCALE
    background = cv2.imread(background_path, 0)
    gap = cv2.imread(gap_path, 0)
    # imread signals failure by returning None instead of raising
    if background is None:
        raise FileNotFoundError(f"cannot read image: {background_path}")
    if gap is None:
        raise FileNotFoundError(f"cannot read image: {gap_path}")
    res = cv2.matchTemplate(background, gap, cv2.TM_CCOEFF_NORMED)
    # minMaxLoc returns (min_val, max_val, min_loc, max_loc); [2][0] is the x
    # coordinate of min_loc.
    # NOTE(review): with TM_CCOEFF_NORMED the best match is normally max_loc
    # (index 3). Index 2 is kept as written - confirm it is intentional.
    value = cv2.minMaxLoc(res)[2][0]
    print(value)
    return value * scale_numerator / scale_denominator
def main():
    """Fill the JD login form and drag the slider captcha by the computed offset.

    The two captcha images are downloaded to ``background.png`` and
    ``gap.png``, ``get_distance()`` turns them into a pixel offset, and the
    slider button is dragged horizontally by that offset. The browser is
    always shut down, even when a step fails.
    """
    chrome = webdriver.Chrome()
    try:
        chrome.implicitly_wait(5)
        chrome.get('https://passport.jd.com/new/login.aspx?')
        # Select the login tab identified by the 'login-tab-r' class
        login = chrome.find_element(By.CLASS_NAME, 'login-tab-r')
        login.click()
        loginname = chrome.find_element(By.ID, 'loginname')
        loginname.send_keys("123@qq.com")
        nloginpwd = chrome.find_element(By.ID, 'nloginpwd')
        nloginpwd.send_keys("987654321")
        loginBtn = chrome.find_element(By.CLASS_NAME, 'login-btn')
        loginBtn.click()
        # Download the background and the slider-piece images of the captcha
        img_src = chrome.find_element(By.XPATH, '//*[@class="JDJRV-bigimg"]/img').get_attribute("src")
        temp_src = chrome.find_element(By.XPATH, '//*[@class="JDJRV-smallimg"]/img').get_attribute("src")
        request.urlretrieve(img_src, "background.png")
        request.urlretrieve(temp_src, "gap.png")
        distance = int(get_distance())
        print("distance:", distance)
        print('第一步,点击滑动按钮')
        element = chrome.find_element(By.CLASS_NAME, 'JDJRV-slide-btn')
        # Press and hold the left mouse button on the slider
        ActionChains(chrome).click_and_hold(on_element=element).perform()
        ActionChains(chrome).move_by_offset(xoffset=distance, yoffset=0).perform()
        ActionChains(chrome).release(on_element=element).perform()
        # Keep the window open briefly so the outcome of the drag can be seen
        time.sleep(5)
    finally:
        # End the driver session so no browser process is left behind
        chrome.quit()


if __name__ == '__main__':
    main()
【案例】登录12306
import time
from selenium.webdriver import ActionChains
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
# Build the Chrome options object
options = Options()
# Disable the Blink "AutomationControlled" feature
options.add_argument("--disable-blink-features=AutomationControlled")
# Start Chrome with those options
bro = webdriver.Chrome(options=options)
try:
    # Open the 12306 login page
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    # Implicit wait for element lookups
    bro.implicitly_wait(5)
    # Maximise the window
    bro.maximize_window()
    # Click the login-method tab matched by the selector below.
    # NOTE(review): the original comment described this as choosing QR-code
    # login, yet the following steps fill the username/password form - confirm
    # which tab this selector actually targets.
    user_login = bro.find_element(
        By.CSS_SELECTOR,
        '#toolbar_Div > div.login-panel > div.login-box > ul > li.login-hd-code.active > a')
    user_login.click()
    time.sleep(1)
    # Locate the username field, password field and login button
    username = bro.find_element(By.ID, 'J-userName')
    password = bro.find_element(By.ID, 'J-password')
    submit_btn = bro.find_element(By.ID, 'J-login')
    # Replace the username and password below with real credentials
    username.send_keys('1**53675221')
    password.send_keys('')
    time.sleep(3)
    submit_btn.click()
    time.sleep(5)
    # Find the slider and drag it 300 px to the right
    span = bro.find_element(By.ID, 'nc_1_n1z')
    ActionChains(bro).click_and_hold(span).perform()
    ActionChains(bro).move_by_offset(xoffset=300, yoffset=0).perform()
    ActionChains(bro).release().perform()
    time.sleep(5)
finally:
    # End the driver session even if a step above raised
    bro.quit()
【案例】登录超级鹰
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from chaojiying import ChaojiyingClient
from PIL import Image
# Start the browser
bro = webdriver.Chrome()
try:
    # Open the login page
    bro.get('http://www.chaojiying.com/apiuser/login/')
    bro.implicitly_wait(10)
    bro.maximize_window()
    # Locate the username, password and captcha inputs and the submit button
    username = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input')
    password = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input')
    code = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input')
    btn = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input')
    # NOTE(review): credentials are hard-coded in source; move them out of the file.
    username.send_keys('306334678')
    password.send_keys('lqz123')
    # Save a screenshot of the page as main.png
    bro.save_screenshot('main.png')
    # Locate the captcha image element and read its position and size
    captcha_element = bro.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img')
    location = captcha_element.location
    size = captcha_element.size
    # Crop box as (left, top, right, bottom).
    # NOTE(review): this assumes screenshot pixels map 1:1 to the element
    # coordinates; on a scaled / HiDPI display the crop may be offset - confirm.
    img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
    # Cut the captcha out of the screenshot with Pillow and save it
    with Image.open('./main.png') as page_shot:
        fram = page_shot.crop(img_tu)
        fram.save('code.png')
    # Create the Chaojiying client (the three account arguments are left empty here)
    chaojiying = ChaojiyingClient('', '', '')
    # Read the captcha image bytes; the context manager closes the file handle
    with open('code.png', 'rb') as fp:
        im = fp.read()
    # Ask Chaojiying to recognise the captcha.
    # 1902 is presumably the captcha-type code - confirm against the service docs.
    res_code = chaojiying.PostPic(im, 1902)['pic_str']
    # Type the recognised text into the captcha input
    code.send_keys(res_code)
    time.sleep(5)
    btn.click()
    time.sleep(10)
except Exception as e:
    # Top-level boundary of the script: report the failure and fall through to cleanup
    print(e)
finally:
    # End the driver session
    bro.quit()
【案例】抓京东商品信息
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys # 键盘按键操作
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.edge.options import Options
def get_goods(bro):
    """Print name, price, comment count, image and link of every listed product.

    Processes the current result page, then clicks the link whose text
    contains '下一页' and repeats. Returns when no such link is found.
    Pages are walked in a loop rather than by recursion, so a long result
    list cannot exhaust the call stack.

    :param bro: a Selenium WebDriver already showing a search-result page
    """
    while True:
        # Scroll down once before collecting the items
        bro.execute_script('scrollTo(0,5000)')
        goods = bro.find_elements(By.CLASS_NAME, 'gl-item')
        print(len(goods))
        for good in goods:
            try:
                price = good.find_element(By.CSS_SELECTOR, 'div.p-price i').text
                url = good.find_element(By.CSS_SELECTOR, 'div.p-img>a').get_attribute('href')
                commit = good.find_element(By.CSS_SELECTOR, 'div.p-commit a').text
                name = good.find_element(By.CSS_SELECTOR, 'div.p-name em').text
                img = good.find_element(By.CSS_SELECTOR, 'div.p-img img').get_attribute('src')
                if not img:
                    # Fall back to the data-lazy-img attribute when src is empty
                    img = 'https:' + good.find_element(By.CSS_SELECTOR, 'div.p-img img').get_attribute('data-lazy-img')
                print(
                    '\n'
                    '商品名字:%s\n'
                    '商品价格:%s\n'
                    '商品评论:%s\n'
                    '商品图片:%s\n'
                    '商品链接:%s\n'
                    % (name, price, commit, img, url))
            except Exception as e:
                # Best effort: one malformed item must not stop the whole page
                print(e)
                continue
        # find_elements returns an empty list when nothing matches, so a
        # missing next-page link ends the loop instead of raising
        next_links = bro.find_elements(By.PARTIAL_LINK_TEXT, '下一页')
        if not next_links:
            return
        next_links[0].click()
options = Options()
# Disable the Blink "AutomationControlled" feature
options.add_argument("--disable-blink-features=AutomationControlled")
# Chrome alternative: bro = webdriver.Chrome(options=options)
# In newer Selenium versions 'chrome_options' has been replaced by 'options'
bro = webdriver.Edge(options=options)
try:
    bro.get('https://www.jd.com/')
    bro.maximize_window()
    bro.implicitly_wait(10)
    # Type the query into the search box and submit it with Enter
    search_input = bro.find_element(By.ID, 'key')
    search_input.send_keys('mac pro')
    search_input.send_keys(Keys.ENTER)
    get_goods(bro)
except Exception as e:
    # Top-level boundary of the script: report the failure and fall through to cleanup
    print(e)
finally:
    # End the driver session
    bro.quit()
本文来自博客园,作者:Chimengmeng,转载请注明原文链接:https://www.cnblogs.com/dream-ze/p/17647619.html