Python crawler learning, day 1 -- driving a browser with Selenium and chromedriver to scrape web pages
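The whole script below boils down to one idea: let Selenium drive a real Chrome instance through chromedriver, let the browser render the JavaScript-heavy page, and then read the rendered HTML back for parsing. Here is a minimal sketch of that core step (assuming chromedriver is installed and on the PATH; the URL is simply the site crawled below):

from selenium import webdriver

# Launch Chrome through chromedriver and fetch one rendered page.
browser = webdriver.Chrome()
browser.get('https://www.vip.com/')
print(browser.page_source[:200])  # first 200 characters of the rendered HTML
browser.quit()

The full crawler below wraps this in searching, scrolling, parsing, MongoDB storage and image downloading.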
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/12 21:10
# @Author : chenxiaowei
# @Email : chen1020xiaowei@163.com
# @File : vip.py

from pymongo.errors import ConfigurationError
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery
from config_vip import *
from multiprocessing import Pool
from selenium.webdriver.chrome.options import Options
import os
import pymongo
import requests
import hashlib
import time

if browser_method == 0:
    browser = webdriver.Chrome()
    print('Using the Chrome() driver...')
elif browser_method == 1:
    browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=false'])
    print('Using the PhantomJS() driver...')
else:
    chrome_option = Options()
    chrome_option.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_option)
    print('Using headless Chrome...')

browser.set_window_size(1920, 1080)
wait = WebDriverWait(browser, 10)

try:
    client = pymongo.MongoClient(mongo_url)
    database = client[mongo_database]
except (TypeError, ConfigurationError):
    print('Failed to create the database connection'.center(130, '*'))


# client and database above are the MongoDB objects the crawler writes to

def drop_down_scrollbar():
    # scroll the page down step by step so that lazy-loaded goods get rendered
    times = 1
    while times < total_times:
        js = "var q=document.documentElement.scrollTop={}".format(times * size)
        browser.execute_script(js)
        time.sleep(1)
        times += 1

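The fixed-step scrolling above works because vip.com lazy-loads goods as the page scrolls; total_times and size in config_vip.py control how far it goes. A hedged alternative sketch (not part of the original script) that keeps scrolling until the page height stops growing, so it adapts to result pages of any length:

def scroll_until_stable(driver, pause=1):
    # Keep scrolling to the bottom until the document height no longer changes.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
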
def get_search(search_word):
    # get_search(): open the home page, submit the keyword and crawl page 1
    url = main_url
    browser.get(url)
    # open the URL and load the page
    time.sleep(3)
    try:
        search_bar = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J-search > div.c-search-form > input')))
        enter_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J-search > div.c-search-form > a > span')))
        # wait until the search box and the search button are usable
        search_bar.send_keys(search_word)
        time.sleep(1)
        enter_button.click()
        # type the keyword and click search
        time.sleep(5)
        drop_down_scrollbar()
        pages = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_pagingCt > a:nth-child(6)')))
        # total number of result pages, used as the loop bound in main()
        print('Search for {} returned {} pages of results'.format(search_word, pages.text))
        time.sleep(3)
        print('Start fetching results for {}, page {}...'.format(search_word, str(1)))
        get_page_detail(search_word)
        print('Finished fetching results for {}, page {}...'.format(search_word, str(1)))
        return pages.text
    except TimeoutException as e:
        print('Page did not finish loading, cannot run the search!', e.args)
    except WebDriverException as e:
        print(e.args)


def get_next_page(search_word, page):
    # get_next_page(): jump straight to a given result page
    try:
        url1 = url_search.format(search_word, str(page))
        # the result pages follow a fixed URL pattern, so build them from a template
        print('Start fetching results for {}, page {}...\n'.format(search_word, page))
        browser.get(url1)
        drop_down_scrollbar()
        get_page_detail(search_word)
        print('Finished fetching results for {}, page {}...\n'.format(search_word, page))
    except TimeoutException as e:
        print('Timed out while loading the next page!', e.args)
    except WebDriverException as e:
        print(e.args)


def get_page_detail(search_word):
    # get_page_detail(): parse the current page and extract the goods data
    try:
        source = browser.page_source
        html = PyQuery(source)
        print('Parsed page source successfully'.center(130, '*'))
        # parse the page source with PyQuery
        good_items = html('.goods-list .goods-list-item').items()
        # items() yields one PyQuery object per goods item
        for item in good_items:
            goods = {
                'good-title': item.find('.goods-title-info').text().split('\n')[1],
                'good-sells-price': item.find('.goods-info .goods-price-wrapper .goods-sells-price .price').text(),
                'good-market-price': item.find('.goods-info .goods-market-price').text()[2:],
                'good-discount': item.find('.goods-info .goods-discount').text(),
                'good-brand': item.find('.goods-info .goods-brand').text(),
                'image': 'http:{}'.format(item.find('.goods-slide .goods-image-link .J_img').attr('src')),
                'detail': 'http:{}'.format(item.find('.goods-slide .goods-image-link').attr('href'))
            }
            image_url = goods['image']
            content = get_image_content(image_url)
            if content:
                # only download when the image URL actually responds
                download_image(content, search_word, image_url)
            save_to_mongodb(goods, search_word)
            # the fields above are extracted with find() and CSS selectors
    except TimeoutException as e:
        print('Timed out while crawling the page!', e.args)


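For readers new to PyQuery, here is a standalone sketch of the calls used above (items(), find(), text(), attr()) on a made-up HTML snippet; the class names mirror the selectors in get_page_detail, but the markup itself is illustrative, not vip.com's real page:

from pyquery import PyQuery

snippet = '''
<div class="goods-list">
  <div class="goods-list-item">
    <span class="goods-title-info">Demo item</span>
    <img class="J_img" src="//example.com/demo.jpg">
  </div>
</div>
'''
doc = PyQuery(snippet)
for item in doc('.goods-list .goods-list-item').items():
    print(item.find('.goods-title-info').text())  # Demo item
    print(item.find('.J_img').attr('src'))        # //example.com/demo.jpg
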
def save_to_mongodb(goods, database_table):
    # save_to_mongodb(): write one goods record into MongoDB, one collection per keyword
    try:
        if database[database_table].insert_one(goods):
            # insert succeeded
            print('Saved record to MongoDB'.center(130, '*'))
            print(goods, '\n')
    except Exception as e:
        print('Error while writing to MongoDB!', e.args)


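Each keyword gets its own collection, named after the search word, inside the database from config_vip.py. A small sketch, assuming the default config values, of how the stored records can be read back with pymongo:

import pymongo

client = pymongo.MongoClient('localhost')
for doc in client['vip']['苹果'].find().limit(5):
    # each document has the fields built in get_page_detail()
    print(doc['good-title'], doc['good-sells-price'])
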
def get_image_content(url):
    # get_image_content(): request the image URL and return its bytes, or False on failure
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content
        print('Failed to request the image URL!')
        return False
    except requests.RequestException as e:
        # covers connection errors, exhausted retries, timeouts, etc.
        print(e.args)
        return False


def download_image(content, folder, image_url):
    # download_image(): save the image bytes under a date/keyword folder
    time_stamp = time.strftime("%Y%m%d", time.localtime())
    path = file_path.format(mongo_database, time_stamp, folder)
    if not os.path.exists(path):
        os.makedirs(path)
    # use the MD5 hex digest of the content as the file name
    filename = hashlib.md5(content).hexdigest()
    with open(file_type.format(path, filename), 'wb') as f:
        f.write(content)
    # the with block closes the file automatically
    print(' {} image downloaded'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())).center(125, '*'))
    print(filename, image_url)


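Naming the file after the MD5 of its bytes is a cheap deduplication trick: the same image downloaded twice hashes to the same name and simply overwrites itself. A tiny illustration:

import hashlib

data = b'the raw bytes of some image'
name = hashlib.md5(data).hexdigest()  # 32 hex characters, stable for identical content
print(name + '.jpg')
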
def main(search_word):
    pages = get_search(search_word)
    if pages is None:
        # the search itself failed, nothing more to do for this keyword
        return
    pages = int(pages)
    if pages >= end:
        pages = end
    page = 2
    try:
        while page <= pages:
            get_next_page(search_word, page)
            page += 1
    except TimeoutException as e:
        print(e.args)


if __name__ == '__main__':
    pool = Pool(processes=2)
    pool.map(main, keywords)
    pool.close()
    # no new tasks will be submitted to the pool
    pool.join()
    os.system('taskkill /im chromedriver.exe /F')
    os.system('taskkill /im chrome.exe /F')
    # kill any leftover chromedriver and chrome processes (Windows only)
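One point worth knowing about the __main__ block: on Windows, multiprocessing.Pool starts workers with the spawn method, so each worker re-imports vip.py and therefore launches its own browser instance from the module-level setup at the top of the file. A stripped-down sketch of that fan-out pattern, with a dummy task standing in for main():

from multiprocessing import Pool

def crawl(keyword):
    print('crawling', keyword)

if __name__ == '__main__':
    with Pool(processes=2) as pool:
        pool.map(crawl, ['苹果', '雪梨', '香蕉'])

All the knobs the crawler reads (database name, scroll settings, page limits, keywords) live in config_vip.py, shown next.
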
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/12 23:48
# @Author : chenxiaowei
# @Email : chen1020xiaowei@163.com
# @File : config_vip.py
mongo_url = 'localhost'
mongo_database = 'vip'
# MongoDB address and database name
main_url = 'https://www.vip.com/'

total_times = 16
size = 500
# number of scroll steps and the pixel size of each step

browser_method = 2
# how to drive the browser: 0 = Chrome, 1 = PhantomJS, anything else = headless Chrome

start = 1
end = 45
# last page to crawl; later pages are often empty and tend to raise exceptions
url_search = 'https://category.vip.com/suggest.php?keyword={}&page={}&count=100&suggestType=brand#catPerPos'
# URL template for result pages
file_path = 'H:/Python_download/{}/{}/image/{}/'
file_type = '{}{}.jpg'
# image folder and file-name templates


keywords = ['苹果', '雪梨', '香蕉']
# search keywords (apple, pear, banana)
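To see how the two path templates expand at run time, here is a quick illustration with the values above (the date and keyword are example values):

file_path = 'H:/Python_download/{}/{}/image/{}/'
file_type = '{}{}.jpg'
path = file_path.format('vip', '20180712', '苹果')
print(path)                              # H:/Python_download/vip/20180712/image/苹果/
print(file_type.format(path, 'abc123'))  # H:/Python_download/vip/20180712/image/苹果/abc123.jpg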