Python Crawler Learning, Day 1: Driving the Browser with Selenium and chromedriver to Crawl Web Pages
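Before the full script, here is a minimal sketch of the core technique (not part of the original code): Selenium talks to a local chromedriver, which drives a real or headless Chrome instance, so the fully rendered page source can be handed to a parser.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')           # run Chrome without opening a window
browser = webdriver.Chrome(options=opts)  # assumes chromedriver is on PATH
browser.get('https://www.vip.com/')       # chromedriver loads and renders the page
print(browser.page_source[:200])          # rendered HTML, ready for a parser such as PyQuery
browser.quit()

The full vip.py below builds on exactly this pattern.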

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/12 21:10
# @Author  : chenxiaowei
# @Email   : chen1020xiaowei@163.com
# @File    : vip.py

from pymongo.errors import ConfigurationError
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery
from urllib3.exceptions import NewConnectionError, MaxRetryError
from config_vip import *
from multiprocessing import Pool
from selenium.webdriver.chrome.options import Options
import os
import pymongo
import requests
import hashlib
import time

# Pick the browser driver according to browser_method in config_vip.py
if browser_method == 0:
    browser = webdriver.Chrome()
    print('Using the Chrome() method...')
elif browser_method == 1:
    browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=false'])
    print('Using the PhantomJS() method...')
else:
    chrome_option = Options()
    chrome_option.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_option)
    print('Using headless Chrome...')

browser.set_window_size(1920, 1080)
wait = WebDriverWait(browser, 10)

try:
    client = pymongo.MongoClient(mongo_url)
    database = client[mongo_database]
except (TypeError, ConfigurationError):
    print('Failed to create the database connection'.center(130, '*'))
# Database client and handle created above; shared by save_to_mongodb()


def drop_down_scrollbar():
    # Scroll the page down step by step so lazy-loaded goods are rendered
    times = 1
    while times < total_times:
        js = "var q=document.documentElement.scrollTop={}".format(times * size)
        browser.execute_script(js)
        time.sleep(1)
        times += 1


def get_search(search_word):
    # Open the home page, search for the keyword and scrape the first result page
    browser.get(main_url)
    time.sleep(3)
    try:
        search_bar = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J-search > div.c-search-form > input')))
        enter_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J-search > div.c-search-form > a > span')))
        # Wait until the search box and search button are usable
        search_bar.send_keys(search_word)
        time.sleep(1)
        enter_button.click()
        # Type the keyword and click search
        time.sleep(5)
        drop_down_scrollbar()
        pages = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_pagingCt > a:nth-child(6)')))
        # Total number of result pages; used in main() as the loop limit
        print('Found {} pages of results for {}'.format(pages.text, search_word))
        time.sleep(3)
        print('Fetching page 1 for {}...'.format(search_word))
        get_page_detail(search_word)
        print('Finished page 1 for {}...'.format(search_word))
        return pages.text
    except TimeoutException as e:
        print('Page did not finish loading, cannot run the search!', e.args)
    except WebDriverException as e:
        print(e.args)


def get_next_page(search_word, page):
    # Jump straight to a given result page via the URL template and scrape it
    try:
        url = url_search.format(search_word, str(page))
        print('Fetching page {} for {}...\n'.format(page, search_word))
        browser.get(url)
        drop_down_scrollbar()
        get_page_detail(search_word)
        print('Finished page {} for {}...\n'.format(page, search_word))
    except TimeoutException as e:
        print('Timed out while loading the page!', e.args)
    except WebDriverException as e:
        print(e.args)


def get_page_detail(search_word):
    # Parse the current page and extract the details of every goods item
    try:
        source = browser.page_source
        html = PyQuery(source)
        print('Parsed the page source'.center(130, '*'))
        # Parse the rendered source with PyQuery
        good_items = html('.goods-list .goods-list-item').items()
        # items() yields one PyQuery object per goods item
        for item in good_items:
            goods = {
                'good-title': item.find('.goods-title-info ').text().split('\n')[1],
                'good-sells-price': item.find('.goods-info .goods-price-wrapper .goods-sells-price .price').text(),
                'good-market-price': item.find('.goods-info  .goods-market-price').text()[2:],
                'good-discount': item.find('.goods-info .goods-discount').text(),
                'good-brand': item.find('.goods-info .goods-brand').text(),
                'image': 'http:{}'.format(item.find('.goods-slide .goods-image-link .J_img').attr('src')),
                'detail': 'http:{}'.format(item.find(' .goods-slide .goods-image-link').attr('href'))
            }
            image_url = goods['image']
            content = get_image_content(image_url)
            if content:
                # Only download when the image request succeeded
                download_image(content, search_word, image_url)
            save_to_mongodb(goods, search_word)
        # Fields are extracted with find() and CSS selectors
    except TimeoutException as e:
        print('Timed out while scraping the page!', e.args)


def save_to_mongodb(goods, database_table):
    # Store one goods record in MongoDB, one collection per search keyword
    try:
        if database[database_table].insert_one(goods):
            # Insert succeeded
            print('Saved record to MongoDB'.center(130, '*'))
            print(goods, '\n')
    except Exception as e:
        print('Failed to write to the database!', e.args)


def get_image_content(url):
    # Request the image and return its raw bytes, or False on failure
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content
        print('Failed to request the image URL!')
        return False
    except (requests.exceptions.RequestException, NewConnectionError, MaxRetryError) as e:
        # requests wraps most urllib3 errors, but catch the raw ones as well
        print(e.args)
        return False


def download_image(content, folder, image_url):
    # Save the image bytes to disk, grouped by database name, date and keyword
    time_stamp = time.strftime("%Y%m%d", time.localtime())
    path = file_path.format(mongo_database, time_stamp, folder)
    if not os.path.exists(path):
        os.makedirs(path)
    # Use the MD5 hex digest of the content as the file name
    filename = hashlib.md5(content).hexdigest()
    with open(file_type.format(path, filename), 'wb') as f:
        f.write(content)
    # The file is opened in binary write mode and closed automatically by the with block
    print(' {} Image downloaded'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())).center(125, '*'))
    print(filename, image_url)


def main(search_word):
    result = get_search(search_word)
    if not result:
        # The search failed (e.g. a timeout), so there is nothing more to crawl
        return
    pages = int(result)
    if pages >= end:
        pages = end
    page = 2
    try:
        while page <= pages:
            get_next_page(search_word, page)
            page += 1
    except TimeoutException as e:
        print(e.args)


if __name__ == '__main__':
    pool = Pool(processes=2)
    pool.map(main, keywords)
    pool.close()
    # Stop accepting new tasks and wait for the workers to finish
    pool.join()
    os.system('taskkill /im chromedriver.exe /F')
    os.system('taskkill /im chrome.exe /F')
    # Kill any leftover chromedriver and chrome processes (Windows only)
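
To run the crawler, execute vip.py directly: the `if __name__ == '__main__'` block creates a pool of two worker processes and maps main() over the configured keywords, each process driving its own browser instance. All tunable settings are pulled in with `from config_vip import *` from the companion file below.
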
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/12 23:48
# @Author  : chenxiaowei
# @Email   : chen1020xiaowei@163.com
# @File    : config_vip.py
mongo_url = 'localhost'
mongo_database = 'vip'
# MongoDB address and database name
main_url = 'https://www.vip.com/'

total_times = 16
size = 500
# Number of scroll steps and step size (in pixels) for the scrollbar

browser_method = 2
# How to drive the browser: 0 = Chrome, 1 = PhantomJS, anything else = headless Chrome

start = 1
end = 45
# Last page to crawl; later pages may have no content and can easily raise exceptions
url_search = 'https://category.vip.com/suggest.php?keyword={}&page={}&count=100&suggestType=brand#catPerPos'
# URL template for jumping straight to a result page
file_path = 'H:/Python_download/{}/{}/image/{}/'
file_type = '{}{}.jpg'
# File type and folder layout for downloaded images


keywords = ['苹果', '雪梨', '香蕉']
# Search keywords: apple, pear, banana
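
As a quick illustration (not output from an actual run), here is how vip.py fills in these templates; the date stamp and MD5 digest are made-up example values:

# Hypothetical example values, only to show how the templates are formatted:
url_search.format('苹果', '2')
# -> 'https://category.vip.com/suggest.php?keyword=苹果&page=2&count=100&suggestType=brand#catPerPos'
path = file_path.format(mongo_database, '20180716', '苹果')
# -> 'H:/Python_download/vip/20180716/image/苹果/'
file_type.format(path, 'd41d8cd98f00b204e9800998ecf8427e')
# -> 'H:/Python_download/vip/20180716/image/苹果/d41d8cd98f00b204e9800998ecf8427e.jpg'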

 
