# A small crawler for the kuku manga site (爬kuku漫画网站的小爬虫)

#coding:utf-8
import sys #用于设置默认编码
from selenium import webdriver #用于进行网页元素定位
import re #正则表达式
import os #用于操作文件夹
import urllib #用于下载图片
import time #用于暂停
import csv

def getinfo(url):
    """Scrape the chapter list from the comic index page and seed the progress file.

    Launches a headless Firefox (module-level ``options``), collects every
    <a> element whose text contains "Vol" (one per volume/chapter), and
    writes the initial download state to the "progress_list" CSV so an
    interrupted run can later resume.

    Returns a tuple:
        (links, names, index, img_count, error1, error2, resume_url, page_number)
    where index/img_count/page_number are strings and resume_url is "".
    """
    driver = webdriver.Firefox(firefox_options=options)
    driver.get(url)
    links_all = driver.find_elements_by_tag_name("a")  # every link on the page
    links_rexue = []        # chapter URLs
    name_rexue = []         # chapter titles, GBK-encoded for Windows folder names
    i = str(0)              # index of the chapter currently being downloaded
    img_Num = str(0)        # running count of downloaded images
    error1 = []             # pages whose image download failed
    error2 = []             # pages where next-page navigation failed
    download_continue = ""  # URL to resume from (none yet)
    page_number = str(1)    # page counter within the current chapter
    for link in links_all:
        try:
            if "Vol" in link.text:  # keep only volume/chapter links
                links_rexue.append(link.get_attribute("href"))
                name_rexue.append(link.text.encode("gbk"))
        except Exception:
            # stale element or link without readable text: skip it
            pass
    driver.close()
    # Persist the initial state.  Scalars are wrapped in one-element lists,
    # matching the checkpoints written by get_img().  The original wrote bare
    # strings: writerow("") produced an empty row for download_continue, so
    # the resume branch's data[6][0] raised IndexError.
    dataFile = open("progress_list", "wb")
    try:
        dataWrite = csv.writer(dataFile)
        dataWrite.writerow(links_rexue)
        dataWrite.writerow(name_rexue)
        dataWrite.writerow([i])
        dataWrite.writerow([img_Num])
        dataWrite.writerow(error1)
        dataWrite.writerow(error2)
        dataWrite.writerow([download_continue])
        dataWrite.writerow([page_number])
    finally:
        dataFile.close()
    return links_rexue, name_rexue, i, img_Num, error1, error2, download_continue, page_number

def get_img(url_start):
    """Download every page image of one chapter, starting at *url_start*.

    Walks the chapter page by page: extracts the first ``http...jpg`` URL
    from the page source, saves it as ``<cwd>\\<chapter>\\<page_number>.jpg``,
    checkpoints progress to "progress_list", then follows the next-page link
    until the exit page is reached.

    Relies on module-level globals: ``driver`` (an open Firefox session),
    ``links_rexue``/``name_rexue``, ``x`` (current chapter index),
    ``img_Num``, ``error1``/``error2`` and ``page_number``.
    """
    global links_rexue
    global name_rexue
    global x
    global img_Num
    global error1
    global error2
    global page_number
    link_start = str(url_start)
    key = re.compile("http.*?(jpg|JPG)")  # image-URL pattern (loop-invariant)
    while True:
        try:
            restart = 1
            while restart < 10:  # up to 9 silent retries
                try:
                    driver.get(link_start)
                    img = key.search(driver.page_source).group(0)  # first image URL
                    urllib.urlretrieve(img, os.getcwd() + "\\" + name_rexue[x] + "\\" + str(page_number) + ".jpg")
                    break
                except Exception:
                    print("重新下载:" + link_start)
                    restart += 1
            if restart == 10:
                # BUG FIX: the original tested ``restart == 11``, which the
                # ``while restart < 10`` loop can never produce (it exits at
                # exactly 10), so this final *uncaught* attempt was dead code
                # and ``img`` stayed unbound after 9 failures.  After the
                # retries are exhausted restart is exactly 10; a failure here
                # propagates to the outer except and is logged in error1.
                driver.get(link_start)
                img = key.search(driver.page_source).group(0)
                urllib.urlretrieve(img, os.getcwd() + "\\" + name_rexue[x] + "\\" + str(page_number) + ".jpg")
            img_Num += 1
            page_number += 1
            print(img + ".......ok!")
            # Checkpoint progress so a crashed run can resume from this page.
            dataFile = open("progress_list", "wb")
            dataWrite = csv.writer(dataFile)
            dataWrite.writerow(links_rexue)
            dataWrite.writerow(name_rexue)
            dataWrite.writerow([str(x)])
            dataWrite.writerow([str(img_Num)])
            dataWrite.writerow(error1)
            dataWrite.writerow(error2)
            dataWrite.writerow([link_start])
            dataWrite.writerow([str(page_number)])
            dataFile.close()
        except Exception:
            # Image download failed even after the final attempt: record the
            # page and move on so one bad page doesn't stop the chapter.
            print("error1:" + link_start)
            page_number += 1
            error1.append(link_start)

        try:
            # Page layout quirk: on page 2+ the 4th href points at "bbs", and
            # the next-page link is the 5th element; on page 1 the next-page
            # link is the 4th element.  "exit.htm" marks the last page.
            if "bbs" in driver.find_elements_by_xpath(".//*[@href]")[3].get_attribute("href"):
                if "exit.htm" not in driver.find_elements_by_xpath(".//*[@href]")[4].get_attribute("href"):
                    link_next = driver.find_elements_by_xpath(".//*[@href]")[4].get_attribute("href")
                else:
                    break  # last page of the chapter: caller moves to the next one
            else:
                link_next = driver.find_elements_by_xpath(".//*[@href]")[3].get_attribute("href")

            if link_next != link_start:
                link_start = link_next
            else:
                break  # next link loops back on itself: stop
        except Exception:
            # Navigation failed (unexpected layout): record and stop walking.
            print("error2:" + link_start)
            error2.append(link_start)

# ---------------------------------------------------------------------------
# Script entry: start a headless Firefox and either resume a previous run
# (when "progress_list" exists) or build a fresh task list and download all.
# NOTE(review): the original source was collapsed onto one line by the blog
# scrape; this is a faithful re-indentation with two reconstructed fixes in
# the fresh-download loop (see comments below).
# ---------------------------------------------------------------------------
options = webdriver.FirefoxOptions()
options.set_headless()  # run Firefox without a visible window
url = "http://comic.kukudm.com/comiclist/380/index.htm"

if os.path.isfile("progress_list"):
    # Resume: restore the state checkpointed by get_img()/getinfo().
    dataFile = open("progress_list")
    dataReader = csv.reader(dataFile)
    data = list(dataReader)
    links_rexue = data[0]                  # chapter URLs
    name_rexue = data[1]                   # chapter folder names
    i = int(data[2][0])                    # chapter index where the last run stopped
    img_Num = int(data[3][0])
    error1 = data[4]
    error2 = data[5]
    # Guard: a progress file written before any image finished may contain an
    # empty row here (older files wrote a bare "" for download_continue).
    download_continue = data[6][0] if data[6] else ""
    page_number = int(data[7][0])
    dataFile.close()
    print("loading progress_list.......")
    x = 0
    for link in links_rexue:
        if x == i:
            # The interrupted chapter: continue from the saved page URL.
            if os.path.isdir(name_rexue[x]):
                print("Folder is existence")
            else:
                os.mkdir(name_rexue[x])
                print("New Folder:" + name_rexue[x])
            page_number -= 1  # saved counter points one past the last saved image
            driver = webdriver.Firefox(firefox_options=options)
            driver.implicitly_wait(30)
            get_img(download_continue)
            driver.close()
        elif x > i:
            # Chapters after the interrupted one: start from their first page.
            page_number = 1
            if os.path.isdir(name_rexue[x]):
                print("Folder is existence")
            else:
                os.mkdir(name_rexue[x])
                print("New Folder:" + name_rexue[x])
            driver = webdriver.Firefox(firefox_options=options)
            driver.implicitly_wait(30)
            get_img(link)
            driver.close()
        x += 1
else:
    # Fresh run: scrape the chapter list and download from the beginning.
    downloadList = getinfo(url)
    links_rexue = downloadList[0]
    name_rexue = downloadList[1]
    x = int(downloadList[2])
    img_Num = int(downloadList[3])
    error1 = downloadList[4]
    error2 = downloadList[5]
    download_continue = downloadList[6]
    page_number = int(downloadList[7])
    print("set up progress_list.......")
    for link in links_rexue:
        if os.path.isdir(name_rexue[x]):
            print("Folder is existence")
        else:
            os.mkdir(name_rexue[x])
            print("New Folder:" + name_rexue[x])
        driver = webdriver.Firefox(firefox_options=options)
        driver.implicitly_wait(30)
        get_img(link)
        driver.close()
        # BUG FIX: the original fresh-download loop never advanced x nor
        # reset page_number, so every chapter landed in name_rexue[0]'s
        # folder with ever-increasing page numbers.  Mirror the resume
        # branch: advance to the next chapter and restart page numbering.
        x += 1
        page_number = 1

# All chapters done: write a summary, then discard the progress file.
dataFile = open("finish_info", "wb")
dataWrite = csv.writer(dataFile)
dataWrite.writerow([str(img_Num)])
dataWrite.writerow(error1)
dataWrite.writerow(error2)
dataFile.close()
os.remove("progress_list")
print("==========================finish download=========================")
print("download:" + str(img_Num))
print("error1:" + str(error1))
print("error2:" + str(error2))
posted on 2018-03-13 15:45  学习代码小仓库  阅读(1305)  评论(0编辑  收藏  举报