爬虫随笔(一)
最近因为工作和研究方向的变动,开始学习爬虫。现在刚刚入门,简单写一些爬虫入门的建议。
一、基础知识
(1)掌握的编程语言:Python、HTML、JS、CSS
Python是必须的,虽然Java也可以实现相关功能,但是总归脚本写起来方便一点。如果有语言基础,直接去菜鸟教程,看语法,就差不多掌握了。
HTML、JS、CSS 这个三件套也是需要掌握的。
(2)工具
我自己常用的就是VS Code,方便易用。
执行用控制台(题外话,没想到,Windows 上我现在用 PowerShell 感觉很好用)
(3)插件
python的有两个肯定需要的:
selenium、openpyxl
安装的话,进入控制台,依次执行下面两条命令: pip install selenium 和 pip install openpyxl(也可以合并为一条: pip install selenium openpyxl)
主要用到 selenium 的 webdriver 相关功能;openpyxl 用来把结果保存为 Excel 文件。
二、登录功能
如下,是一个登录,然后获取cookie,并用cookie完成后续网页操作的方案。
这个是上一篇随笔的进阶版本,负责判断错题,并保存,方便后续学习,比较实用。上一篇是保存全部题目,也很实用。
# Log in to the exercise site with Selenium, walk through the answer cards of
# an exercise page, and save every wrongly answered question to an Excel file.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import time
from openpyxl import Workbook

LOGIN_URL = 'http://zjzx.zjnu.edu.cn/'
DEFAULT_EXERCISE_URL = 'http://zjzx.zjnu.edu.cn/bm/exercise/page/6/1994431#'

# Seconds the user has to type the captcha and finish logging in by hand.
CAPTCHA_WAIT_SECONDS = 10

# Upper bound on how many exercise pages can be processed in one run.
MAX_PAGES = 100

# CSS class names of the option widgets used by each question type.
RADIO_OPTION = 'layui-form-radio'
CHECKBOX_OPTION = 'layui-form-checkbox'

# (first card index, one-past-last card index, option widget class).
QUESTION_LAYOUT = (
    (0, 40, RADIO_OPTION),
    (40, 60, CHECKBOX_OPTION),
    (60, 80, RADIO_OPTION),
)

# For true/false questions the second green label reads '正确'/'错误' while
# the first one reads '对'/'错'; map the former onto the latter to compare.
JUDGEMENT_ALIASES = {'正确': '对', '错误': '错'}


def _is_wrong(given, expected):
    """Return True when the two answer labels do not denote the same answer."""
    return given != JUDGEMENT_ALIASES.get(expected, expected)


def _question_text(choose_button, drop_last_option):
    """Return the question title followed by its option texts, one per line.

    choose_button is the CSS class name used to find the option elements.
    When drop_last_option is true the last matched element is left out, but
    the first one is always kept.
    """
    title = driver.find_element(By.CLASS_NAME, 'sct_tit').text
    options = driver.find_elements(By.CLASS_NAME, choose_button)
    if drop_last_option:
        # NOTE(review): presumably the last 'layui-form-checkbox' element is
        # the toggle clicked in get_wrong rather than a real option - confirm.
        options = options[:max(len(options) - 1, 1)]
    return '\n'.join([title] + [option.text for option in options])


def get_wrong(number, element, choose_button):
    """Open one question and append it to the worksheet if it was answered wrong.

    number is the question index (only used for the progress message),
    element is the answer-card element to click, and choose_button is the CSS
    class name of the question's option widgets. Uses the module-level
    ``driver`` and ``ws``.
    """
    print('第', number, '题:')
    element.click()
    # The question body is rendered inside an iframe.
    iframe = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe')))
    driver.switch_to.frame(iframe)
    try:
        # Click the last check mark, then give the page a moment to update
        # before reading the two green labels.
        driver.find_elements(By.CLASS_NAME, 'layui-icon-ok')[-1].click()
        time.sleep(0.1)
        labels = driver.find_elements(By.CLASS_NAME, 'color_green')
        # Read each label once; every .text access is a WebDriver round trip.
        # NOTE(review): presumably labels[0] is the submitted answer and
        # labels[1] the reference answer (it is saved under the "答案" column).
        given, expected = labels[0].text, labels[1].text
        print(given, ':', expected)

        wrong = _is_wrong(given, expected)
        # Use the same verdict that decides saving, so true/false questions
        # ('对' vs '正确') are no longer always reported as wrong.
        print('回答错误' if wrong else '回答正确')
        if wrong:
            print('开始保存错题本')
            drop_last = (choose_button == CHECKBOX_OPTION
                         and expected not in JUDGEMENT_ALIASES)
            ws.append([_question_text(choose_button, drop_last), expected])
    finally:
        # Always leave the iframe so the next answer card can be located,
        # even if this question failed part-way through.
        driver.switch_to.default_content()


# Create the workbook and take its active worksheet.
wb = Workbook()
ws = wb.active

# Open the browser.
driver = webdriver.Edge()
try:
    driver.get(LOGIN_URL)
    # Pause one second so the login form can load.
    time.sleep(1)
    # '用户名' / '密码' are the literal strings typed into the form; they look
    # like placeholders to be replaced with real credentials.
    driver.find_element(By.ID, 'username').send_keys('用户名')
    driver.find_element(By.ID, 'password').send_keys('密码')
    print('登录,等待输入验证码,然后获取cookie')
    time.sleep(CAPTCHA_WAIT_SECONDS)
    print('倒计时结束')

    cookies = driver.get_cookies()
    print(cookies)
    # The third cookie is re-added to the session. Guard the index so a
    # shorter cookie list does not abort the run.
    if len(cookies) > 2:
        print("登录后的cookies值:", cookies[2])
        driver.add_cookie(cookies[2])

    for _ in range(MAX_PAGES):
        url = input("输入想要提取的网页:")
        if url == '':
            url = DEFAULT_EXERCISE_URL
        driver.get(url)
        ws.append(["题目", "答案"])

        answer_cards = driver.find_elements(By.CLASS_NAME, 'answerCard')
        for start, stop, option_class in QUESTION_LAYOUT:
            # Clamp to the cards actually present so a shorter page does not
            # raise IndexError.
            for index in range(start, min(stop, len(answer_cards))):
                get_wrong(index, answer_cards[index], option_class)

        name = input("输入保存的excel名称:")
        wb.save(name + ".xlsx")
        print("文件保存成功")
        # Clear the sheet so the next page starts from an empty worksheet.
        ws.delete_rows(1, ws.max_row)

        continue_get = input("是否继续(y、n)")
        if continue_get not in ('y', 'Y'):
            break

    print('5秒后退出')
    time.sleep(5)
finally:
    # Close the browser even when the run is interrupted by an error.
    driver.quit()