Python 爬虫实例(9)—— 搜索并爬取淘宝商品
# coding:utf-8
"""Search Taobao with a Selenium-driven Chrome and print the listed items.

The script types a keyword into the Taobao search box, walks through the
result pages and, for every listing it can recognise in the page source,
prints the image URL, title, location, shop name, price and purchase count.
"""
import datetime
import json
import logging.handlers
import pickle
import re
import sys
import time

import redis
import requests
from bs4 import BeautifulSoup

# Python 2 only: make UTF-8 the implicit str/unicode codec so that
# ``str(page_source)`` and printing the scraped Chinese text do not raise.
# Python 3 has no ``reload`` builtin and does not need the hack.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')

session = requests.session()

# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences.
CHROMEDRIVER_PATH = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
TAOBAO_URL = "https://www.taobao.com/"
NEXT_PAGE_XPATH = '//a[@trace="srp_bottom_pagedown"]'

# All patterns run against page source from which every newline, carriage
# return, tab and space has been removed, hence e.g. ``<divclass=``.
_LISTING_RE = re.compile('<divclass="pic-boxJ_MouseEneterLeaveJ_PicBox">(.*?)</div><divclass="ctx-boxJ_MouseEneterLeaveJ_IconMoreNew">(.*?)</div><divclass="rowrow-4g-clearfix">(.*?)</div></div></div>')
_IMG_URL_RE = re.compile('data-src="(.*?)"alt=')
_NAME_RE = re.compile('alt="(.*?)"/></a></div><divclass=')
_LOCATION_RE = re.compile('<divclass="location">(.*?)</div>')
_SHOP_RE = re.compile('</span></span><span>(.*?)</span></a></div><divclass="location">')
_PRICE_RE = re.compile('<divclass="priceg_priceg_price-highlight"><span>¥</span><strong>(.*?)</strong></div>')
_DEALS_RE = re.compile('<divclass="deal-cnt">(.*?)人付款</div>')


def dateRange(start, end, step=1, format="%Y-%m-%d"):
    """Return the dates from ``start`` up to, but excluding, ``end``.

    Useful e.g. for enumerating every day of a year.

    Args:
        start: First date, as a string in ``format``.
        end: Exclusive upper bound, as a string in ``format``.
        step: Number of days between consecutive entries.
        format: ``strptime``/``strftime`` pattern used for parsing the
            arguments and for formatting the result.

    Returns:
        A list of date strings in ``format``; empty when ``end`` is not
        after ``start``.
    """
    first_day = datetime.datetime.strptime(start, format)
    last_day = datetime.datetime.strptime(end, format)
    span = (last_day - first_day).days
    return [
        (first_day + datetime.timedelta(offset)).strftime(format)
        for offset in range(0, span, step)
    ]


def _first_match(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or ''."""
    found = pattern.search(text)
    return found.group(1) if found else ''


def _parse_listings(page_source):
    """Extract the listings from one result page.

    Args:
        page_source: HTML of a search result page.

    Returns:
        A list of ``(image_url, name, location, shop_name, price,
        purchase_count)`` tuples, one per listing matched by
        ``_LISTING_RE``. A field that cannot be found is ``''`` instead of
        aborting the whole crawl.
    """
    compact = str(page_source)
    for blank in ('\n', '\r', '\t', ' '):
        compact = compact.replace(blank, '')

    listings = []
    for blocks in _LISTING_RE.findall(compact):
        picture_html, detail_html = blocks[0], blocks[1]
        listings.append((
            _first_match(_IMG_URL_RE, picture_html),
            _first_match(_NAME_RE, picture_html),
            _first_match(_LOCATION_RE, detail_html),
            _first_match(_SHOP_RE, detail_html),
            _first_match(_PRICE_RE, detail_html),
            _first_match(_DEALS_RE, detail_html),
        ))
    return listings


def spider(keyword=u'python', max_pages=100, driver_path=CHROMEDRIVER_PATH):
    """Search Taobao for ``keyword`` and print the listings of each page.

    For every page the number of matched listings is printed first, then
    the six fields of each listing on separate lines followed by a line of
    30 ``=`` characters.

    The first result page is scraped before any paging happens, the crawl
    stops quietly when there is no "next page" link left, and the browser
    is always closed, even when an error interrupts the crawl.

    Args:
        keyword: Text typed into the Taobao search box.
        max_pages: Maximum number of result pages to scrape.
        driver_path: Path of the chromedriver executable.

    NOTE: relies on the ``find_element_by_*`` helpers and the positional
    driver path of ``webdriver.Chrome`` that the original script used.
    """
    import os

    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException

    # Point Selenium at the chromedriver executable.
    os.environ["webdriver.chrome.driver"] = driver_path
    browser = webdriver.Chrome(driver_path)
    try:
        # Open the start page and submit the search.
        browser.get(TAOBAO_URL)
        time.sleep(1)
        browser.find_element_by_id("q").send_keys(keyword)
        browser.find_element_by_class_name("btn-search").click()
        time.sleep(5)

        for page in range(1, max_pages + 1):
            listings = _parse_listings(browser.page_source)
            print(len(listings))
            for fields in listings:
                for value in fields:
                    print(value)
                print("=" * 30)

            if page == max_pages:
                break
            try:
                browser.find_element_by_xpath(NEXT_PAGE_XPATH).click()
            except NoSuchElementException:
                # No "next page" link: this was the last result page.
                break
            # Give the next result page time to load before reading it.
            time.sleep(15)
    finally:
        # Close the browser and end the chromedriver process.
        browser.quit()


if __name__ == "__main__":
    spider()
如果觉得对您有帮助,麻烦您点一下推荐,谢谢!
好记性不如烂笔头
好记性不如烂笔头