爬虫之selenium
概述
selenium模块的作用通过编写代码模拟人工对浏览器的事件,触发相关操作,从而获取网页信息,相对于使用requests模块,selenium模块对动态数据的爬取更为方便
安装selenium:pip install selenium -i https://pypi.douban.com/simple
使用:
1:使用内置的webdriver类实例化一个浏览器对象diver(实例化浏览器对象时,需要传入一个浏览器驱动的路径)如实例化一个谷歌浏览器对象diver = webdriver.Chrome(r'./chromedriver.exe')
2:采用实例化的对象中的方法进行模拟人工操作浏览器
常用内置方法:
打开网页:diver.get("要访问的url")
查询标签:diver.find_element_by_id("id值") # 根据标签id查找,可以修改为class,tagname等值,与js查找标签类似,找到标签返回一个obj对象
obj对象的方法: obj.click(点击)
obj.send_keys(输入)
获取网页源码:diver.page_source
关闭浏览器:diver.close()/diver.quit()
执行js代码:diver.execute_script("js代码") # 如将window.scrollTo(0,document.body.scrollHeight)放到js代码中,浏览器会执行滚轮下滑一定高度的动作
截图:diver.save_screenshot("图片保存路径和文件名")
前进:diver.forward()
后退:diver.back()
切换到iframe标签:diver.switch_to.frame("iframe标签")
实现鼠标按住不松手:先实例化动作链对象action = ActionChains(diver) # ActionChains从selenium.webdriver中导入
然后保持不松开:action.click_and_hold("标签对象") # 点击标签对象且不松开
移动标签:action.move_by_offset(x,y) #移动标签对象,如果是移动到另一个标签里,可以使用action.drag_and_drop(被移动标签对象, 目标标签对象)
执行上述动作:action.perform()
获取cookies值:diver.get_cookies()
使用无头浏览器
通过添加参数可以让selenium操作浏览器在后台运行,不会有界面显示
# Create an Options object used to launch Chrome in headless (no-UI) mode.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Instantiate a Chrome driver, passing the headless options.
# NOTE(review): `chrome_options=` is the deprecated keyword in selenium;
# newer versions expect `options=` — confirm against the installed version.
diver = webdriver.Chrome(r'./chromedriver.exe',chrome_options=chrome_options)
规避检测
# Adding this experimental option lowers the risk of the site's server
# detecting the browser as an automated (selenium-driven) program.
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path='./chromedriver.exe',options=options)
示例
# 使用selenium爬取网易新闻里面["国内", "国际", "军事", "航空"]四个版块里面的新闻数据
# Scrape news from four sections ("国内", "国际", "军事", "航空") of
# https://news.163.com with selenium (headless, detection-evading Chrome)
# and fan the work out over a thread pool.
import random
from multiprocessing.dummy import Pool

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build ONE merged Options object.  The original code passed both
# `chrome_options=` and `options=` to webdriver.Chrome; in selenium 3.x the
# deprecated `chrome_options` keyword overwrites `options`, silently dropping
# the anti-detection flag.  Merging everything into a single object fixes that.
options = Options()
options.add_argument("--headless")      # run without a visible browser window
options.add_argument("--disable-gpu")
# Reduce the chance of being flagged as an automated browser.
options.add_experimental_option('excludeSwitches', ['enable-automation'])


def _make_driver():
    """Create a headless, detection-evading Chrome driver."""
    return webdriver.Chrome('chromedriver.exe', options=options)


def get_new(url):
    """Scroll a section page to the bottom, clicking "load more" whenever it
    appears, and return the fully loaded page source.

    The "load_more_tip" element stays at `display: none;` until every article
    has been loaded, so we keep scrolling while it is hidden.
    """
    driver = _make_driver()
    try:
        driver.get(url)
        scroll_js = 'window.scrollTo(0,document.body.scrollHeight)'
        bottom_tip = driver.find_element_by_class_name("load_more_tip")
        while bottom_tip.get_attribute('style') == 'display: none;':
            driver.execute_script(scroll_js)
            more_btn = driver.find_element_by_class_name("post_addmore")
            if more_btn.get_attribute('style') == 'visibility: visible;':
                more_btn.click()
                driver.execute_script(scroll_js)
        return driver.page_source
    finally:
        # quit() (unlike close()) also terminates the chromedriver process,
        # so an exception while scrolling no longer leaks a browser.
        driver.quit()


def mark_url(html_text):
    """Return the detail-page URLs of every article on a section page."""
    mark_tree = etree.HTML(html_text)
    return mark_tree.xpath('//div[@class="ndi_main"]/div/div/div/h3/a/@href')


def get_new_detail(title_url_list):
    """Fetch each detail page and append its title and body text to a
    randomly named local .txt file.

    A single driver is reused for the whole list — the original opened a new
    Chrome per URL and never closed any of them (resource leak).
    """
    filename = str(random.randint(1000, 9999)) + ".txt"
    driver = _make_driver()
    try:
        with open(filename, "w", encoding="utf-8") as f:
            for title_url in title_url_list:
                driver.get(title_url)
                detail_tree = etree.HTML(driver.page_source)
                title = detail_tree.xpath('//div[@id="epContentLeft"]/h1/text()')[0]
                text = ''.join(detail_tree.xpath('//div[@id="endText"]/p/text()'))
                f.write(title)
                f.write(text)
    finally:
        driver.quit()


if __name__ == '__main__':
    # Load the front page once to resolve the four section URLs.
    driver = _make_driver()
    try:
        driver.get("https://news.163.com")
        tree = etree.HTML(driver.page_source)
    finally:
        driver.quit()

    url_list = [
        tree.xpath('//li[@class="menu_%s"]/a/@href' % section)[0]
        for section in ("guonei", "guoji", "war", "hangkong")
    ]

    # Thread pool: fetch sections, extract detail URLs, then save articles.
    pool = Pool(4)
    data_list = pool.map(get_new, url_list)
    title_url_list = pool.map(mark_url, data_list)
    pool.map(get_new_detail, title_url_list)
# 使用线程池爬取梨视频app的视频
# Download videos (10 or so) from pearvideo.com's category_4 listing with a
# thread pool: one pass resolves each detail page to its real mp4 URL, a
# second pass downloads the files.
import random
import re
from multiprocessing.dummy import Pool

import requests
from lxml import etree

# Reuse one Session for connection pooling.  The original rebound the
# `requests` module name itself (`requests = requests.Session()`), shadowing
# the module — give the session its own name instead.
session = requests.Session()

url = 'https://www.pearvideo.com/category_4'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    "connection": "close",
}


def get_data_url(page_url):
    """Return the real mp4 address embedded in a video detail page.

    The page inlines it in a script as:
        srcUrl="https://video.pearvideo.com/mp4/...-hd.mp4",vdoUrl=srcUrl,
    """
    response_text = session.get(url=page_url, headers=headers).text
    return re.findall('srcUrl="(.*?)",vdoUrl=srcUrl,', response_text)[0]


def get_data(data_url):
    """Download one mp4 and save it under a random 4-digit filename.

    NOTE(review): random names can collide and overwrite a previous video;
    deriving the name from the URL would be safer.
    """
    data = session.get(url=data_url, headers=headers).content
    filename = str(random.randint(1000, 9999)) + ".mp4"
    with open(filename, "wb") as f:
        f.write(data)


if __name__ == '__main__':
    response_text = session.get(url=url, headers=headers).text
    tree = etree.HTML(response_text)
    hrefs = tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href | //ul[@id="categoryList"]/li/div/a/@href')
    # The listing's hrefs are relative — prefix the site root.
    video_url_list = ['https://www.pearvideo.com/' + href for href in hrefs]

    pool = Pool(5)
    data_url_list = pool.map(get_data_url, video_url_list)
    pool.map(get_data, data_url_list)