Python Selenium+PhantomJS 抓取斗图

很久之前写着玩的，最近拿出来发现还能用，等过段时间再优化一下。

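# Overall flow:
# 1. Open each article-list page.
# 2. Collect the href of every matching <a> tag on that page.
# 3. Open each collected URL in turn.
# 4. Locate the elements that hold the images.
# 5. Read each image's URL.
# 6. Save the image to local disk through a helper function.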

import time
import os
import re
import urllib.request
import uuid
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait


# Build a unique string that can be used as a file name.
def generateFileName():
    """Return a freshly generated version-1 UUID rendered as a string."""
    unique_id = uuid.uuid1()
    return str(unique_id)

# Create an empty placeholder file with the given name.
def createFileWithFileName(localPathParam, fileName):
    """Create an empty file called *fileName* inside *localPathParam*.

    Args:
        localPathParam: Directory that will contain the file. It is not
            created here, so it has to exist already.
        fileName: Name of the file to create inside that directory.

    Returns:
        The full path of the newly created file, or ``None`` when something
        already exists at that path (in which case nothing is touched).

    Raises:
        OSError: If the file cannot be created, e.g. the directory is missing.
    """
    # os.path.join picks the right separator for the host OS; a hard-coded
    # backslash only yields a valid path on Windows.
    totalPath = os.path.join(localPathParam, fileName)
    if os.path.exists(totalPath):
        return None
    # Opening in append mode creates the file; the context manager closes it
    # even if something goes wrong in between.
    with open(totalPath, 'a+'):
        pass
    return totalPath

def getAndSaveImg(imgUrl, img_name, save_dir="C:\\Downloads"):
    """Download the image at *imgUrl* and store it as ``<img_name>.jpg``.

    Args:
        imgUrl: URL of the image. ``None`` or an empty string is ignored.
        img_name: Base name for the saved file (without extension). When it
            is ``None`` or empty, a generated unique name is used instead.
        save_dir: Directory the image is written to. Defaults to the
            location this script has always used.

    Returns:
        None. Failures are reported on stdout and otherwise ignored so that a
        single bad image does not stop the whole crawl.
    """
    if not imgUrl:
        return

    baseName = img_name if img_name else generateFileName()
    # Replace every character that Windows forbids in file names, including
    # the backslash itself, which the previous pattern did not cover.
    fileName = re.sub(r'[\\/:*?"<>|]', '-', baseName + '.jpg')

    try:
        targetPath = createFileWithFileName(save_dir, fileName)
        if targetPath is None:
            # A file with this name already exists. Passing None on to
            # urlretrieve would download into a throw-away temp file, so skip.
            return
        urllib.request.urlretrieve(imgUrl, targetPath)
    except Exception:
        # Best effort: report and carry on. Unlike a bare ``except`` this
        # still lets KeyboardInterrupt / SystemExit stop the script.
        print("这图我没法下载")

# Collect the link of every list entry on the page that is currently loaded.
def get_list():
    """Append the ``href`` of each ``list-group-item`` element to ``list_info``.

    Reads the module-level ``driver`` and appends to the module-level
    ``list_info`` list; both are set up by the script's main section. Each
    collected URL is also printed. Elements without an ``href`` are skipped,
    because an empty entry cannot be opened by the caller.
    """
    for item in driver.find_elements_by_class_name("list-group-item"):
        href = item.get_attribute("href")
        if not href:
            continue
        print(href)
        # Store it in the shared result list.
        list_info.append(href)


if __name__=="__main__":

    # Crawl listing pages 29..50 of doutula.com, open every article linked
    # from each page and download all images found in the article body.
    # NOTE(review): PhantomJS and the find_element(s)_by_* helpers are only
    # available in older Selenium releases - confirm the pinned version.
    driver = webdriver.PhantomJS()
    try:
        driver.set_window_size(1400, 900)
        # Let element lookups poll for up to 2 s. Merely constructing a
        # WebDriverWait without calling .until() does not wait at all.
        driver.implicitly_wait(2)

        for m in range(28, 50):
            # get_list() appends into this module-level list, so it is reset
            # for every listing page.
            list_info = []
            url = "http://www.doutula.com/article/list?page=" + str(m + 1)
            driver.get(url)
            # Collect the article URLs of this page into list_info.
            get_list()

            # Open each collected article in turn.
            for j, article_url in enumerate(list_info):
                if not article_url:
                    continue
                driver.get(article_url)
                url_info = driver.find_elements_by_xpath("//div[@class='artile_des']/table/tbody")

                for x, row in enumerate(url_info):
                    # find_elements returns an empty list instead of raising,
                    # so a row without an image no longer aborts the crawl.
                    images = row.find_elements_by_tag_name("img")
                    if not images:
                        continue
                    img_url = images[0].get_attribute("src")
                    if not img_url:
                        continue
                    # A missing alt attribute comes back as None; normalise
                    # it so the name handling below never sees None.
                    img_name = images[0].get_attribute("alt") or ""
                    print("坐标" + str(m + 1) + ":" + str(j) + ":" + str(x))
                    print(img_url + "----->" + img_name)
                    getAndSaveImg(img_url, img_name)
    finally:
        # Always shut the browser down, otherwise the PhantomJS process
        # keeps running after the script exits or crashes.
        driver.quit()

 

posted @ 2017-06-13 15:09  洛雨寒殇  阅读(191)  评论(0)  编辑  收藏  举报