day6
昨日作业:自动登录抽屉新热榜
# Homework from the previous day: log in to dig.chouti.com automatically.
import time

from selenium import webdriver

# NOTE(review): the phone number and password used below are hard-coded in
# this script; move them to environment variables or a config file before
# sharing or committing it.
driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    # Maximize the browser window (inside the try so that a failure here
    # still reaches the cleanup in `finally`).
    driver.maximize_window()

    # Implicit wait: element lookups keep retrying for up to 10 seconds.
    driver.implicitly_wait(10)
    driver.get('https://dig.chouti.com/')
    time.sleep(5)

    # 1. Click the login button.
    login_btn = driver.find_element_by_id('login_btn')
    login_btn.click()
    time.sleep(2)

    # 2. Type the user name (phone number).
    phone = driver.find_element_by_class_name('login-phone')
    phone.send_keys('15622792660')

    # 3. Type the password.
    pwd = driver.find_element_by_class_name('pwd-password-input')
    pwd.send_keys('kermit46709394')

    # 4. Submit the login form.
    login_submit = driver.find_element_by_class_name('btn-large')
    login_submit.click()

    # Keep the browser open for a while so the result can be inspected.
    time.sleep(20)

# Top-level boundary of the script: report any failure instead of crashing.
except Exception as e:
    print(e)

finally:
    # quit() ends the whole WebDriver session (every window and the
    # chromedriver process); close() would only close the current window.
    driver.quit()
今日内容:
注意: selenium驱动的浏览器是干净的,没有任何缓存。
1、selenium剩余用法
2、selenium万能登录破解
3、selenium爬取京东商品信息
4、破解极验滑动验证码
Xpath语法:
今日作业:
1、总结课堂知识点,写博客
2、爬取京东商品信息
3、滑动验证(提高题)
1.selenium选择器之XPath:
# Demo: locating elements with XPath expressions through Selenium.
from selenium import webdriver

driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    # Implicit wait: configured before the get() request.
    driver.implicitly_wait(5)

    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

    # Explicit wait: would be used after the get() request.
    # wait.until(...)

    # Markup of the sample page, for reference:
    #
    # <html>
    #  <head>
    #   <base href='http://example.com/' />
    #   <title>Example website</title>
    #  </head>
    #  <body>
    #   <div id='images'>
    #    <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
    #    <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
    #    <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
    #    <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
    #    <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
    #   </div>
    #  </body>
    # </html>

    # "/" selects starting from the root node.
    html = driver.find_element_by_xpath('/html')
    # html = driver.find_element_by_xpath('/head')  # raises: head is not the root
    print(html.tag_name)

    # "//" matches a node anywhere in the document.
    div = driver.find_element_by_xpath('//div')
    print(div.tag_name)

    # "@" filters on an attribute: the div whose id is "images".
    div = driver.find_element_by_xpath('//div[@id="images"]')
    print(div.tag_name)
    print(div.text)

    # First <a> node.
    a = driver.find_element_by_xpath('//a')
    print(a.tag_name)

    # Every <a> node.
    a_s = driver.find_elements_by_xpath('//a')
    print(a_s)

    # href attribute of the first <a> node.
    # get_attribute() reads one attribute of the node.
    a = driver.find_element_by_xpath('//a').get_attribute('href')
    print(a)

finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the chromedriver process running.
    driver.quit()
2.selenium剩余操作:
# Remaining Selenium operations. Every demo below is intentionally left
# commented out; uncomment one section at a time to run it.

# ---------------------------------------------------------------------------
# Click and clear
# ---------------------------------------------------------------------------
# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys
# import time
#
# driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
#
# try:
#     driver.implicitly_wait(10)
#     # 1. Send a request to jd.com.
#     driver.get('https://www.jd.com/')
#     # Find the search box and type the first keyword.
#     input_tag = driver.find_element_by_id('key')
#     input_tag.send_keys('围城')
#     # Press Enter.
#     input_tag.send_keys(Keys.ENTER)
#     time.sleep(2)
#     # Find the search box again, clear it and type the second keyword.
#     input_tag = driver.find_element_by_id('key')
#     input_tag.clear()
#     input_tag.send_keys('墨菲定律')
#     # Find the search button and click it.
#     button = driver.find_element_by_class_name('button')
#     button.click()
#     time.sleep(10)
#
# finally:
#     driver.quit()


# ---------------------------------------------------------------------------
# Reading cookies (for reference)
# ---------------------------------------------------------------------------
# from selenium import webdriver
# import time
#
# driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
#
# try:
#     driver.implicitly_wait(10)
#     driver.get('https://www.zhihu.com/explore')
#     print(driver.get_cookies())
#
#     time.sleep(10)
# finally:
#     driver.quit()


# ---------------------------------------------------------------------------
# Tabs
# ---------------------------------------------------------------------------
# Tab management: a new tab can be opened with JavaScript (window.open) or
# with keyboard shortcuts such as ctrl+t; the JavaScript way is the most
# portable.
# import time
# from selenium import webdriver
#
# browser = webdriver.Chrome()
# try:
#     browser.get('https://www.baidu.com')
#
#     # execute_script: run JavaScript code in the page.
#     # Alert pop-up:
#     # browser.execute_script('alert("tank")')
#     # Open a new browser tab.
#     browser.execute_script(
#         '''
#         window.open();
#         '''
#     )
#     time.sleep(1)
#     print(browser.window_handles)  # handles of all open tabs
#     # Switch to the second tab.
#     # Current API:
#     browser.switch_to.window(browser.window_handles[1])
#     # Deprecated API:
#     # browser.switch_to_window(browser.window_handles[1])
#
#     # Load taobao in the second tab.
#     browser.get('https://www.taobao.com')
#     time.sleep(5)
#
#     # Switch back to the first tab (current API, as above).
#     browser.switch_to.window(browser.window_handles[0])
#     browser.get('https://www.sina.com.cn')
#
#     time.sleep(10)
# finally:
#     # Two tabs are open here: quit() closes both and ends the session,
#     # close() would only close the current one.
#     browser.quit()


# ---------------------------------------------------------------------------
# ActionChains
# ---------------------------------------------------------------------------
# from selenium import webdriver
# from selenium.webdriver import ActionChains
# import time
#
# driver = webdriver.Chrome()
# driver.implicitly_wait(10)
# driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
#
# try:
#
#     # driver.switch_to_frame('iframeResult')
#     # Switch into the frame whose id is iframeResult.
#     driver.switch_to.frame('iframeResult')
#
#     # Source element.
#     draggable = driver.find_element_by_id('draggable')
#
#     # Target element.
#     droppable = driver.find_element_by_id('droppable')
#
#     # ActionChains must be given the driver object;
#     # it returns an action-chain object, assigned to a variable here.
#     actions = ActionChains(driver)
#
#     # Option 1: robotic.
#     # Moves the source onto the target instantly.
#     # actions.drag_and_drop(draggable, droppable)  # queue the action
#     # actions.perform()  # run the queued actions
#
#
#     # Option 2: imitate a human.
#     source = draggable.location['x']
#     target = droppable.location['x']
#     print(source, target)
#
#     distance = target - source
#     print(distance)
#
#     # perform(): must be called for every action.
#
#     # Click and hold the source element.
#     ActionChains(driver).click_and_hold(draggable).perform()
#
#     s = 0
#     while s < distance:
#         # Move 2 pixels at a time.
#         ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
#         s += 2
#
#     # Release the mouse button.
#     ActionChains(driver).release().perform()
#
#     time.sleep(10)
#
#
# finally:
#     driver.quit()


# ---------------------------------------------------------------------------
# Back and forward
# ---------------------------------------------------------------------------
# from selenium import webdriver
# import time
#
# driver = webdriver.Chrome()
#
# try:
#     driver.implicitly_wait(10)
#     driver.get('https://www.jd.com/')
#     driver.get('https://www.baidu.com/')
#     driver.get('https://www.cnblogs.com/')
#
#     time.sleep(2)
#
#     # Go back.
#     driver.back()
#     time.sleep(1)
#     # Go forward.
#     driver.forward()
#     time.sleep(1)
#     driver.back()
#     time.sleep(10)
#
# finally:
#     driver.quit()
3.破解登录:
# Log in by reusing a Chrome user-data directory plus an injected cookie.
#
# Steps:
#   1. In the file explorer, enable "show hidden files".
#   2. Go to C:\Users\administortra\AppData\Local\Google\Chrome\User Data
#      and delete the Default folder.
#   3. Re-open the browser and log in to the Baidu account
#      - this creates a fresh Default cache folder.
#   4. Add the cookies (see below).
#   5. Close Chrome, then run this program.
import time

from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Options object that carries the launch arguments.
options = ChromeOptions()

# Location where Chrome stores the user's cookies/profile.
# NOTE(review): "administortra" looks like a misspelling of the Windows user
# name - confirm it matches the real profile directory on the machine.
profile_directory = r'--user-data-dir=C:\Users\administortra\AppData\Local\Google\Chrome\User Data'

# Point Chrome at that user-data directory.
options.add_argument(profile_directory)

# Hand the options to the driver; `chrome_options` is the keyword that
# receives the options object.
driver = webdriver.Chrome(chrome_options=options)

try:
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')

    # Add the user's cookie (BDUSS: *****).
    # The keys "name" and "value" must be lower-case.
    driver.add_cookie({"name": "BDUSS", "value": "用户session字符串"})

    # Reload the page so the cookie takes effect.
    driver.refresh()

    time.sleep(10)

finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the chromedriver process running.
    driver.quit()
4.selenium爬取京东商品信息:
# ---------------------------------------------------------------------------
# First version (kept commented out): scrapes the first result page only,
# then clicks "next page" once.
#
# Scrape JD product information:
#   request url: https://www.jd.com/
#   extracted fields: 1. detail-page link  2. name  3. price
#                     4. number of reviews  5. seller
# ---------------------------------------------------------------------------
# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys
# import time
#
# driver = webdriver.Chrome()
#
# try:
#     driver.implicitly_wait(10)
#     # 1. Send a request to the JD home page.
#     driver.get('https://www.jd.com/')
#
#     # 2. Type the product name and press Enter to search.
#     input_tag = driver.find_element_by_id('key')
#     input_tag.send_keys('macbook')
#     input_tag.send_keys(Keys.ENTER)
#     time.sleep(2)
#
#     # Scroll down with JavaScript so every product gets loaded.
#     js_code = '''
#         window.scrollTo(0,5000);
#     '''
#     driver.execute_script(js_code)  # run the JavaScript
#
#     # Wait for the data to load.
#     time.sleep(2)
#
#     # 3. Find every product node.
#     # good_div = driver.find_element_by_id('J_goodsList')
#     good_list = driver.find_elements_by_class_name('gl-item')
#     for good in good_list:
#         # Product link.
#         good_url = good.find_element_by_css_selector(
#             '.p-img a').get_attribute('href')
#
#         # Product name.
#         good_name = good.find_element_by_css_selector(
#             '.p-name em').text.replace("\n", "--")
#
#         # Product price.
#         good_price = good.find_element_by_class_name(
#             'p-price').text.replace("\n", ":")
#
#         # Number of reviews.
#         good_commit = good.find_element_by_class_name(
#             'p-commit').text.replace("\n", " ")
#
#         # Seller.
#         good_from = good.find_element_by_class_name(
#             'J_im_icon').text.replace("\n", " ")
#
#         good_content = f'''
#         商品链接: {good_url}
#         商品名称: {good_name}
#         商品价格: {good_price}
#         评价人数: {good_commit}
#         商品商家: {good_from}
#         \n
#         '''
#         print(good_content)
#         with open('jd.txt', 'a', encoding='utf-8') as f:
#             f.write(good_content)
#
#     next_tag = driver.find_element_by_link_text('下一页')
#
#     next_tag.click()
#
#     time.sleep(10)
#
#
# finally:
#     driver.quit()


# ---------------------------------------------------------------------------
# Current version: scrape JD product information page after page.
#   request url: https://www.jd.com/
#   extracted fields: 1. detail-page link  2. name  3. price
#                     4. number of reviews
# ---------------------------------------------------------------------------
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def _scrape_page(driver, out_file):
    """Print every product on the current result page and write it to out_file."""
    # Scroll down with JavaScript so the lazily loaded products appear.
    driver.execute_script('window.scrollTo(0,5000);')

    # Wait for the data to load.
    time.sleep(2)

    # Every product sits in a node with class "gl-item".
    for good in driver.find_elements_by_class_name('gl-item'):
        # Product link.
        good_url = good.find_element_by_css_selector(
            '.p-img a').get_attribute('href')

        # Product name.
        good_name = good.find_element_by_css_selector(
            '.p-name em').text.replace("\n", "--")

        # Product price.
        good_price = good.find_element_by_class_name(
            'p-price').text.replace("\n", ":")

        # Number of reviews.
        good_commit = good.find_element_by_class_name(
            'p-commit').text.replace("\n", " ")

        good_content = (
            '\n'
            f'商品链接: {good_url}\n'
            f'商品名称: {good_name}\n'
            f'商品价格: {good_price}\n'
            f'评价人数: {good_commit}\n'
            '\n\n'
        )
        print(good_content)
        out_file.write(good_content)


def get_good(driver, max_pages=None):
    """Scrape the search results shown by `driver`, following "next page" links.

    Each product is printed and appended to ``jd.txt``. The driver is shut
    down when the crawl ends, whether it succeeded or failed.

    Args:
        driver: a Selenium WebDriver already showing a JD search-result page.
        max_pages: optional upper bound on the number of pages to scrape;
            ``None`` (the default) keeps going until there is no usable
            "next page" button.
    """
    try:
        page = 0
        # Open the output file once instead of once per product.
        with open('jd.txt', 'a', encoding='utf-8') as f:
            # A loop replaces the former per-page recursion, which ran
            # `finally: driver.close()` at every recursion level.
            while max_pages is None or page < max_pages:
                _scrape_page(driver, f)
                page += 1
                if page == max_pages:
                    break

                # find_elements_* returns an empty list instead of raising
                # when the button is missing.
                next_tags = driver.find_elements_by_class_name('pn-next')
                if not next_tags:
                    break
                next_tag = next_tags[0]
                # NOTE(review): assumes the last page marks the button with
                # a "disabled" class - confirm against the live page.
                if 'disabled' in (next_tag.get_attribute('class') or ''):
                    break
                next_tag.click()

                time.sleep(2)

        # Keep the browser open briefly so the final page can be inspected.
        time.sleep(10)

    finally:
        # Runs exactly once; quit() ends the whole WebDriver session.
        driver.quit()


if __name__ == '__main__':

    good_name = input('请输入爬取商品信息:').strip()

    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(10)
        # 1. Send a request to the JD home page.
        driver.get('https://www.jd.com/')

        # 2. Type the product name and press Enter to search.
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(good_name)
        input_tag.send_keys(Keys.ENTER)
        time.sleep(2)
    except Exception:
        # get_good() has not taken over the driver yet, so release it here.
        driver.quit()
        raise

    get_good(driver)
5.破解极验滑动验证:
# Crack the Geetest slider captcha on the cnblogs login page.
#
# cnblogs login url:
#   https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F
# Steps:
#   1. Type user name and password, click login.
#   2. When the slider captcha pops up, capture the picture with the gap
#      and the complete picture.
#   3. Compare them pixel by pixel to get the sliding distance.
#   4. Build a human-like movement track.
#   5. Slide.
import random
import time

from PIL import Image  # pip3 install pillow
from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # used to drag the slider

option = webdriver.ChromeOptions()
option.add_argument('disable-infobars')

driver = webdriver.Chrome(chrome_options=option)


def get_snap(driver):
    """Return the captcha picture cropped out of a full-page screenshot.

    Saves the page screenshot to ``snap.png`` and crops it to the bounding
    box of the element with class ``geetest_canvas_img``.
    """
    # Selenium's built-in full-window screenshot.
    driver.save_screenshot('snap.png')

    img = driver.find_element_by_class_name('geetest_canvas_img')

    left = img.location['x']
    upper = img.location['y']
    right = left + img.size['width']
    lower = upper + img.size['height']

    img_obj = Image.open('snap.png')

    # Cut the captcha picture out of the screenshot.
    return img_obj.crop((left, upper, right, lower))


def _snap_with_fullbg(driver, display):
    """Set the CSS display of the ``geetest_canvas_fullbg`` canvas, then snap."""
    time.sleep(0.2)
    driver.execute_script(
        "document.getElementsByClassName('geetest_canvas_fullbg')[0]"
        ".style.display = arguments[0];",
        display,
    )
    # Pause after toggling the canvas and before the screenshot.
    time.sleep(1)
    return get_snap(driver)


def get_image1(driver):
    """Return the captcha picture taken with the full-background canvas shown."""
    return _snap_with_fullbg(driver, 'block')


def get_image2(driver):
    """Return the captcha picture taken with the full-background canvas hidden."""
    return _snap_with_fullbg(driver, 'none')


def get_distance(image1, image2):
    """Return the sliding distance derived from the two captcha pictures.

    Columns are scanned left to right starting at x = 60; the first pixel
    whose red, green or blue channel differs by 60 or more between the two
    pictures decides the result, ``x - 7``.

    Returns:
        The distance in pixels, or ``None`` when no such pixel exists.
    """
    # First column that is compared.
    # NOTE(review): presumably skips the area covered by the slider piece
    # in its start position - confirm.
    start = 60

    # Per-channel difference that counts as "different".
    color_num = 60

    # Load the pixel-access objects once instead of once per pixel.
    pixels1 = image1.load()
    pixels2 = image2.load()
    width, height = image1.size

    for x in range(start, width):
        for y in range(height):
            rgb1 = pixels1[x, y]
            rgb2 = pixels2[x, y]
            # Only the first three channels are compared.
            if any(abs(c1 - c2) >= color_num
                   for c1, c2 in zip(rgb1[:3], rgb2[:3])):
                # NOTE(review): the 7-pixel correction is an empirical
                # constant from the original script - confirm.
                return x - 7

    return None


def get_stacks(distance):
    """Build a human-like movement track for the slider.

    The track deliberately aims 20 pixels past `distance` and the fixed
    backward steps (which sum to -20) bring the slider back.

    Uniformly accelerated / decelerated motion:
        v = v0 + a * t
        s = v * t + 0.5 * a * (t ** 2)

    Returns:
        dict with ``'forward_stacks'`` (list of integer x offsets) and
        ``'back_stacks'`` (fixed list of negative x offsets).
    """
    distance += 20

    # Initial velocity.
    v0 = 0

    # Candidate magnitudes for the acceleration / deceleration.
    a_list = [3, 4, 5]

    # Duration of one step.
    t = 0.2

    # Position reached so far.
    s = 0

    # Forward track.
    forward_stacks = []

    # Accelerate over the first 3/5 of the way, decelerate afterwards.
    mid = distance * 3 / 5

    while s < distance:
        a = random.choice(a_list)
        if s >= mid:
            a = -a

        v = v0

        # Displacement of this step, rounded to whole pixels.
        stack = round(v * t + 0.5 * a * (t ** 2))

        s += stack
        v0 = v + a * t

        forward_stacks.append(stack)

    back_stacks = [-1, -1, -2, -3, -2, -3, -2, -2, -3, -1]

    return {'forward_stacks': forward_stacks, 'back_stacks': back_stacks}


def main():
    """Log in to cnblogs and drag the Geetest slider along a simulated track.

    Raises:
        RuntimeError: if the two captcha pictures show no difference, so no
            sliding distance can be computed.
    """
    try:
        driver.implicitly_wait(5)
        driver.get('https://passport.cnblogs.com/user/signin')

        # 1. Type user name and password, click login.
        # NOTE(review): the credentials are hard-coded; move them to
        # environment variables or a config file before sharing.
        username = driver.find_element_by_id('LoginName')
        password = driver.find_element_by_id('Password')
        login_button = driver.find_element_by_class_name('ladda-label')
        time.sleep(1)
        username.send_keys('_tank_')
        time.sleep(1)
        password.send_keys('k46709394.')

        # Wait until both fields are filled before clicking login,
        # otherwise the captcha dialog does not pop up.
        time.sleep(1)
        login_button.click()

        # 2. Click the slider button so the pictures get rendered.
        geetest_button = driver.find_element_by_class_name('geetest_slider_button')
        geetest_button.click()

        time.sleep(0.2)

        # 3. Capture the complete picture.
        image1 = get_image1(driver)

        # 4. Capture the picture with the gap.
        image2 = get_image2(driver)

        # 5. Compare both pictures to get the sliding distance.
        distance = get_distance(image1, image2)
        if distance is None:
            # Fail with a clear message instead of a TypeError later on.
            raise RuntimeError('no difference found between the two captcha pictures')

        # 6. Build the human-like movement track.
        stacks = get_stacks(distance)

        # 7. Slide along the track.
        forward_stacks = stacks['forward_stacks']
        back_stacks = stacks['back_stacks']

        slider_button = driver.find_element_by_class_name('geetest_slider_button')
        time.sleep(0.2)

        ActionChains(driver).click_and_hold(slider_button).perform()

        time.sleep(0.2)
        for forward_stack in forward_stacks:
            ActionChains(driver).move_by_offset(xoffset=forward_stack, yoffset=0).perform()
            time.sleep(0.1)
        for back_stack in back_stacks:
            ActionChains(driver).move_by_offset(xoffset=back_stack, yoffset=0).perform()
            time.sleep(0.1)

        time.sleep(0.2)

        # Small back-and-forth movement before releasing.
        ActionChains(driver).move_by_offset(xoffset=5, yoffset=0).perform()
        ActionChains(driver).move_by_offset(xoffset=-5, yoffset=0).perform()

        ActionChains(driver).release().perform()

        time.sleep(50)

    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the chromedriver process running.
        driver.quit()


if __name__ == '__main__':
    main()