Python爬虫实战——利用队列加快爬取速度

前面有篇文章介绍了利用selenium库爬取表格数据的代码。然而这种单纯利用selenium的方法速度太慢,我们可以把待爬取的任务放入队列,再由多个线程同时从队列中取任务执行,使速度大大提高,代码如下。提高速度的方法也不只这一种,下篇文章我将介绍基于scrapy的方法。

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import datetime
import json
import logging
import random
import threading
import time
from queue import Empty, Queue

import lxml
import pyodbc
from bs4 import BeautifulSoup
from selenium import webdriver
class GetThred(threading.Thread):
    """Worker thread that pulls hotel codes off a shared queue and scrapes them.

    The thread exits as soon as the queue has no more items to hand out.
    """

    def __init__(self, queue):
        """Remember the shared queue of hotel codes to process."""
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process queued codes one by one until the queue is drained."""
        while True:
            # get_nowait() fetches atomically. Checking empty() first and then
            # calling get() lets another worker steal the last item in
            # between, which would leave this thread blocked forever.
            try:
                code = self.queue.get_nowait()
            except Empty:
                break
            try:
                work(code)
            except Exception:
                # Keep the worker alive for the remaining codes, but make the
                # failure visible instead of losing it.
                logging.getLogger(__name__).exception("work(%r) failed", code)
            finally:
                # Always acknowledge the item; otherwise queue.join() in the
                # main thread would wait forever after a failure.
                self.queue.task_done()
def _room_detail(room_row):
    """Decode the JSON stored in the ``data-text`` attribute of one base-room row."""
    type_cell = room_row.findAll("div", {"class": "hroom_col hroom_col_type"})[0]
    base = type_cell.find("dl", {"class": "hroom_base"})
    raw = base.find("dd", {"class": "hroom_base_txt J_hroom_base_detail"}).get("data-text")
    return json.loads(str(raw))


def _count_sub_room_flags(room_row):
    """Return ``(free_wifi_count, smoking_count)`` over the sub-rooms of one row."""
    cols = room_row.find("div", {"class": "hroom_tr_cols"})
    sub_rooms = (cols.findAll("div", {"class": "hroom_tr_col J_subRoomlist"})
                 + cols.findAll("div", {"class": "hroom_tr_col J_subRoomlist hidden"}))
    free_wifi = 0
    smoking = 0
    for sub_room in sub_rooms:
        network_html = str(sub_room.find("div", {"class": "hroom_col hroom_col_network"}))
        if '免费' in network_html:
            free_wifi += 1
        # Counts cells that carry smoking information but do NOT say
        # "no smoking". NOTE(review): the original stored this count under the
        # name "nonsmoking"; the condition is kept as written - confirm intent.
        if '吸烟信息' in network_html and '不可吸烟' not in network_html:
            smoking += 1
    return free_wifi, smoking


def _first_service_values(detail):
    """Map each service-detail name to the value of its first occurrence."""
    first_value = {}
    for entry in detail['thisBaseRoomServiceDetailList']:
        first_value.setdefault(entry['thisDetailInfoName'], entry['thisDetailInfoVal'])
    return first_value


def _facility_columns(detail):
    """Build the ``|``-separated name and value strings for the facilities row."""
    names = []
    values = []
    for entry in detail['thisBaseRoomRoomInfoDetailsList']:
        names.append(entry['thisDetailInfoName'])
        values.append(','.join(entry['thisDetailInfoVal']))
    for entry in detail['thisBaseRoomServiceDetailList']:
        names.append(entry['thisDetailInfoName'])
        values.append(entry['thisDetailInfoVal'])
    return '|'.join(names), '|'.join(values)


def _execute_and_commit(cnxn, sql, params):
    """Run one parameterized write statement and commit it."""
    cursor = cnxn.cursor()
    try:
        cursor.execute(sql, params)
        cnxn.commit()
    finally:
        cursor.close()


def _scrape_and_store(cnxn, url_code, hotel_code):
    """Fetch both hotel pages and insert the room rows, then the facility rows."""
    page_cn, page_en = getHtml(url_code)
    room_rows = page_cn.findAll("div", {"class": "hroom_list"})[0].findAll(
        "div", {"class": "hroom_tr J_baseRoomlist "})
    type_lists = page_en.findAll("div", {"class": "m-hotel-type", "id": "room_table"})[0].findAll(
        "div", {"class": "hotel-type__list"})
    english_room_ids = {int(item.find("table").get("data-roomid")) for item in type_lists}

    # Pass 1: one CtripHotelRoomList_1 row per base room.
    details = []
    for room_row in room_rows:
        detail = _room_detail(room_row)
        details.append(detail)
        room_id = int(detail['comment_baseroomId'])
        room_name = detail['comment_baseroomName']
        free_wifi, smoking = _count_sub_room_flags(room_row)
        services = _first_service_values(detail)
        add_bed_info = services.get('可加床', '')
        area_info = services.get('建筑面积', '')

        room_en_name = ''
        if room_id in english_room_ids:
            english_tag = page_en.find("div", {"class": "h-type__img", "data-roomid": str(room_id)})
            room_en_name = str(english_tag.get("data-roomname"))

        # Bound parameters replace the original string-built SQL, so quotes in
        # scraped text can neither break nor alter the statement.
        _execute_and_commit(
            cnxn,
            "insert into CtripHotelRoomList_1([HCode],[RoomCode],[RoomNameEN],[RoomNameCN],"
            "[FreeWifi],[SmokingInfo],[AreaInfo],[AddBedInfo],[UpdateDate],[AddDate]) "
            "values(?,?,?,?,?,?,?,?,getdate(),getdate());",
            (hotel_code, room_id, room_en_name, str(room_name), free_wifi, smoking,
             str(area_info), str(add_bed_info)),
        )

    # Pass 2: one CtripHotelFacilities_1 row per base room, linked through the
    # Rid of a room-list row that has the same RoomCode.
    for detail in details:
        info_names, info_values = _facility_columns(detail)
        room_id = int(detail['comment_baseroomId'])
        cursor = cnxn.cursor()
        try:
            cursor.execute("SELECT Rid from CtripHotelRoomList_1 where RoomCode = ?", (room_id,))
            rid = cursor.fetchone()[0]
        finally:
            cursor.close()
        _execute_and_commit(
            cnxn,
            "insert into CtripHotelFacilities_1([HCode],[Rid],[TName],[TInfo],[RoomCode],"
            "[UpdateDate],[AddDate]) values(?,?,?,?,?,getdate(),getdate());",
            (hotel_code, rid, info_names, info_values, room_id),
        )


def work(url_code):
    """Scrape one hotel's rooms and facilities and store them in the database.

    The hotel is marked as processed (``roomdata.DStatus = 1``) whether or not
    scraping succeeded, so a failing hotel is not picked up again by the next
    run. Scraping and parsing errors are logged rather than raised.

    Args:
        url_code: Ctrip hotel id; it must be convertible with ``int()``.

    Raises:
        ValueError, TypeError: if ``url_code`` is not convertible to ``int``.
        pyodbc.Error: if connecting or the final status update fails.
    """
    # Numeric codes are bound as integers; the original embedded them with
    # '%d' - assumes the matching columns are numeric.
    hotel_code = int(url_code)
    # NOTE(review): credentials are hard-coded in the connection string;
    # consider moving them to configuration.
    cnxn = pyodbc.connect("DRIVER={SQL Server};SERVER=WIN-20160105DRP;DATABASE=CtripData;UID=sa;PWD=123")
    try:
        try:
            _scrape_and_store(cnxn, url_code, hotel_code)
        except Exception:
            logging.getLogger(__name__).exception("Scraping hotel %s failed", url_code)
        _execute_and_commit(cnxn, "update roomdata set DStatus=1 where CtripID=?", (hotel_code,))
    finally:
        cnxn.close()



def _render_page(url, wait_seconds):
    """Open *url* in a fresh Chrome session and return its parsed page source."""
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Fixed pause before reading the page source, as in the original.
        time.sleep(wait_seconds)
        return BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Quit even when loading or parsing fails, so no browser is left behind.
        driver.quit()


def getHtml(url_code, wait_seconds=18):
    """Fetch the ctrip.com and trip.com detail pages of one hotel.

    Both pages are requested with the same check-in date, chosen at random
    between 20 and 80 days from now.

    Args:
        url_code: Hotel id inserted into both URLs.
        wait_seconds: Seconds to wait after opening each page before reading
            its source.

    Returns:
        A two-element list ``[ctrip_page, trip_page]`` of ``BeautifulSoup``
        documents.
    """
    days_ahead = random.randint(20, 80)
    checkin = (datetime.datetime.now() + datetime.timedelta(days=days_ahead)).strftime("%Y-%m-%d")
    url_cn = 'http://hotels.ctrip.com/international/' + str(url_code) + '.html/' + '?&checkin=' + checkin
    # NOTE(review): the trip.com path hard-codes "london" - confirm this is
    # intended for every hotel code.
    url_en = 'https://www.trip.com/hotels/london-hotel-detail-' + str(url_code) + '/' + '?&checkin=' + checkin
    return [_render_page(url_cn, wait_seconds), _render_page(url_en, wait_seconds)]
def get_code():
    """Return the ``CtripID`` of every ``roomdata`` row with ``dstatus=0``.

    Returns:
        A list of the ``CtripID`` values, in the order the query returned them.
    """
    cnxn = pyodbc.connect("DRIVER={SQL Server};SERVER=WIN-20160105DRP;DATABASE=CtripData;UID=sa;PWD=123")
    try:
        cursor = cnxn.cursor()
        try:
            cursor.execute("SELECT CtripID from roomdata where dstatus=0")
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # The original never closed this connection.
        cnxn.close()
    return [row.CtripID for row in rows]

def main(thread_count=3):
    """Queue every pending hotel code and process them with worker threads.

    Args:
        thread_count: Number of ``GetThred`` workers to start.
    """
    pending = Queue()
    for code in get_code():
        pending.put(code)
    for _ in range(thread_count):
        worker = GetThred(pending)
        # Daemon threads do not keep the process alive once join() returns.
        # The attribute replaces the deprecated Thread.setDaemon().
        worker.daemon = True
        worker.start()
    # Block until every queued code has been acknowledged with task_done().
    pending.join()


if __name__ == "__main__":
    main()
posted @ 2018-08-11 12:08  kzl  阅读(606)  评论(0)  编辑  收藏  举报