Crawling JD.com Data
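The script below uses Selenium to drive a headless Chrome browser: it opens jd.com, submits a search keyword, walks the result pages via the "next page" button, extracts each product's price and image URL, downloads the images in background threads into a downloadSpider folder, and records everything in a SQLite database (phones.db). A small menu loop at the bottom drives crawling, display, and exit.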
# Steps: mimic a browser; set the download directory; create the database;
# initialize the downloadSpider folder; visit the JD page; read and write the database.
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
import threading
import datetime
import sqlite3
import urllib.request
import time
import os


class MySpider:
    # Mimic a browser: headers sent when downloading images
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00)"
    }
    # Download directory for product images
    imagePath = "downloadSpider"

    def StartUp(self, url, key):
        # Headless Chrome
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=options)
        self.driver.maximize_window()
        self.threads = []
        self.No = 0
        self.imgNo = 0
        # Create the database: drop any leftover phones table, then recreate it
        try:
            self.con = sqlite3.connect("phones.db")
            self.cursor = self.con.cursor()
            try:
                self.cursor.execute("drop table phones")
            except Exception:
                pass
            try:
                sql = ("create table phones (mNO varchar(32) primary key, mMark varchar(126), "
                       "mNote varchar(126), mPrice varchar(126), mFile varchar(126))")
                self.cursor.execute(sql)
            except Exception:
                pass
        except Exception as err:
            print(err)
        # Initialize the downloadSpider folder: create it if missing, clear old images
        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            for img in os.listdir(MySpider.imagePath):
                os.remove(os.path.join(MySpider.imagePath, img))
        except Exception as err:
            print(err)
        # Visit the JD page: open the home page and submit the search keyword
        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def CloseUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def InsertDB(self, mNO, mMark, mPrice, mNote, mFile):
        try:
            sql = "insert into phones (mNO,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
            self.cursor.execute(sql, (mNO, mMark, mPrice, mNote, mFile))
        except Exception as err:
            print(err)

    def ShowDB(self):
        try:
            con = sqlite3.connect("phones.db")
            cursor = con.cursor()
            cursor.execute("select mNO,mMark,mPrice,mFile from phones order by mNO")
            for row in cursor.fetchall():
                print(row[0], row[1], row[2], row[3])
            con.close()
        except Exception as err:
            print(err)

    def downloadSpider(self, src1, src2, mFile):
        # Try the direct image URL first, then fall back to the second URL
        data = None
        if src1:
            try:
                req = urllib.request.Request(src1, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=400)
                data = resp.read()
            except Exception:
                pass
        if not data and src2:
            try:
                req = urllib.request.Request(src2, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=400)
                data = resp.read()
            except Exception:
                pass
        if data:
            with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
            print("download", mFile)

    def ProcessSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            # All product cards on the current result page (class "gl-item" on JD)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            for li in lis:
                # src for eagerly loaded images; the second lookup targets the
                # lazy-load attribute (the original fetched "src" twice, which
                # looks like a copy slip; data-lazy-img is an assumption)
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                except Exception:
                    src1 = ""
                try:
                    src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                except Exception:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except Exception:
                    price = "0"
                # Product description and brand; the original dump used mark/note
                # without defining them, so this extraction is a reconstruction
                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name']//em").text
                    mark = note.split(" ")[0]
                except Exception:
                    note = ""
                    mark = ""
                # Zero-padded sequence number, used as primary key and file name
                self.No = self.No + 1
                no = str(self.No).zfill(6)
                mFile = ""
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    # Download each image in its own thread
                    T = threading.Thread(target=self.downloadSpider, args=(src1, src2, mFile))
                    T.daemon = False
                    T.start()
                    self.threads.append(T)
                self.InsertDB(no, mark, price, note, mFile)
            # Turn the page: recurse until the "next" button is disabled
            try:
                self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
            except Exception:
                nextpage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                nextpage.click()
                self.ProcessSpider()
        except Exception as err:
            print(err)
    def ExecuteSpider(self, url, key):
        starttime = datetime.datetime.now()
        self.StartUp(url, key)
        self.ProcessSpider()
        self.CloseUp()
        # Wait for the image-download threads to finish
        for t in self.threads:
            t.join()
        print("elapsed:", (datetime.datetime.now() - starttime).seconds, "seconds")


url = "http://www.jd.com"
spider = MySpider()
while True:
    print("1. crawl")
    print("2. show")
    print("3. quit")
    s = input("Enter a choice: ")
    if s == "1":
        spider.ExecuteSpider(url, "手机")  # keyword "手机" = mobile phones
    elif s == "2":
        spider.ShowDB()
    elif s == "3":
        break
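The script above uses the Selenium 3 find_element_by_* helpers, which were removed in Selenium 4. As a minimal sketch of the same page interactions under the Selenium 4 By API, assuming a local chromedriver on PATH and that JD's markup still matches the XPath strings above (they are copied from the script, not re-verified):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

driver.get("http://www.jd.com")
# Search box has id "key"; type the keyword and press Enter
driver.find_element(By.ID, "key").send_keys("手机", Keys.ENTER)
time.sleep(3)  # crude wait for the result page to render

# Product cards on the result page, same XPath as in the script above
for li in driver.find_elements(By.XPATH, "//div[@id='J_goodsList']//li[@class='gl-item']"):
    try:
        price = li.find_element(By.XPATH, ".//div[@class='p-price']//i").text
    except Exception:
        price = "0"
    print(price)

driver.quit()

Everything else (the threads, the SQLite table, the download folder) carries over unchanged; only the element-lookup calls differ between the two Selenium versions.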