Python Baidu Image Crawler
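This script crawls Baidu Image search: it pages through the image.baidu.com acjson search API with requests, collects each page's data list, and saves the thumbURL thumbnails into a local folder; the pinyin package is only used to turn Chinese star names into ASCII file and folder prefixes. A minimal usage sketch of the two core functions defined below (the module name baidu_crawler is just a placeholder for wherever you save this file):

from baidu_crawler import getManyPages, getImg

dataList = getManyPages('韩雪', 2)        # fetch 2 result pages (30 thumbnails per page)
getImg(dataList, './hanxue', 'hanxue')    # saves ./hanxue/hanxue_0.jpg, hanxue_1.jpg, ...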

# -*- coding:utf-8 -*-
#https://blog.csdn.net/qq_32166627/article/details/60882964
import requests
import os
import pinyin

def getManyPages(keyword, pages):
    """Build one acjson request per result page (30 images each) and return each page's 'data' list."""
    params = []
    for i in range(30, 30 * pages + 30, 30):  # page offsets 30, 60, ...; note the first 30 results (pn=0) are skipped
        params.append({
                      'tn': 'resultjson_com',
                      'ipn': 'rj',
                      'ct': 201326592,
                      'is': '',
                      'fp': 'result',
                      'queryWord': keyword,
                      'cl': 2,
                      'lm': -1,
                      'ie': 'utf-8',
                      'oe': 'utf-8',
                      'adpicid': '',
                      'st': -1,
                      'z': '',
                      'ic': 0,
                      'word': keyword,
                      's': '',
                      'se': '',
                      'tab': '',
                      'width': '',
                      'height': '',
                      'face': 0,
                      'istype': 2,
                      'qc': '',
                      'nc': 1,
                      'fr': '',
                      'pn': i,
                      'rn': 30,
                      'gsm': '1e',
                      '1488942260214': ''
                  })
    url = 'https://image.baidu.com/search/acjson'
    urls = []
    for i in params:
        try:
            rgjson = requests.get(url, params=i).json().get('data')
        except ValueError:  # covers JSONDecodeError from both json and simplejson
            print('[Error] response is not valid JSON, skipping this page')
            continue
        if rgjson:
            urls.append(rgjson)

    return urls
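# Note on the data shape: each element of the list returned by getManyPages() is one
# page's 'data' list from the acjson response; its items are dicts, and the 'thumbURL'
# field (when present) is the thumbnail link that getImg() below downloads.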


def getImg(dataList, localPath, keyword):
    """Download every 'thumbURL' in dataList into localPath, naming files <keyword>_<n>.jpg."""
    if not os.path.exists(localPath):  # create the output folder if it does not exist
        os.mkdir(localPath)

    x = 0
    for page in dataList:
        for item in page:
            if item.get('thumbURL') is not None:
                print("down " + str(x) + " image " + item.get('thumbURL'))
                ir = requests.get(item.get('thumbURL'))
                with open(localPath + "/" + keyword + '_%d.jpg' % x, 'wb') as f:
                    f.write(ir.content)
                x += 1
            else:
                print('image not exist')


def convert():
    """Read stars_list.txt, take the name field of each comma-separated line, and write the de-duplicated names to stars_list_clean.txt."""
    fp = open("stars_list_clean.txt", 'w')
    with open("stars_list.txt", 'r') as face_file:
        stars_list = face_file.readlines()
        line_record = []
        for line in stars_list:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            line_split = line.strip().split(",")
            print(line_split[1])
            if line_split[1] not in line_record:
                line_record.append(line_split[1])
                fp.write('%s\n' % line_split[1])
            else:
                print(line_split[1], " already exists")
    fp.close()

def debug():

    # with open("stars_list_clean.txt",'r') as face_file:
    #   stars_list = face_file.readlines()
    #   index = 0
    #   for line in stars_list:
    #       line = line.replace('\r','').replace('\n','').replace('\t','')
    #       keyword_english = pinyin.get(line, format="strip")
    #       keyword = line
    #       index += 1
    #       if index > 0:
    #         break

    # print(keyword)
    # keyword1 = '胡因梦'
    # if keyword == keyword1:
    #     print("yes")
    # else:
    #     print("no")
    keyword = '胡因梦'
    keyword_english = "hym"
    dataList = getManyPages(keyword, 2)   # arg 1: keyword, arg 2: number of result pages to download
    getImg(dataList, './hanxue', keyword_english)  # arg 2: directory to save the images into

    # keyword = '韩雪'
    # dataList = getManyPages(keyword, 2)
    # getImg(dataList, './hanxue')


def run():
    """Crawl images for every name in stars_list_clean.txt into per-star folders under ./stars_srcimg/."""
    fp = open("stars_list_en.txt", 'w')
    with open("stars_list_clean.txt", 'r') as face_file:
        stars_list = face_file.readlines()
        for line in stars_list:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            keyword_english = pinyin.get(line, format="strip")
            fp.write('%s\n' % keyword_english)
    fp.close()
    face_ID_index = 0

    save_root = "./stars_srcimg/"

    # if os.path.exists(save_root):
    #     os.system("rm -rf " + save_root)

    if not os.path.exists(save_root):
        os.mkdir(save_root)

    pages = 5
    maxnum = pages * 30
    print(maxnum)

    for line in stars_list:
        line = line.replace('\r', '').replace('\n', '').replace('\t', '')
        keyword = line
        print(keyword)
        keyword_english = pinyin.get(keyword, format="strip")
        print(keyword_english)
        face_ID = str(face_ID_index) + "_" + keyword
        facesavepath = save_root + face_ID
        face_ID_index += 1
        print(facesavepath)
        if not os.path.exists(facesavepath):
            os.mkdir(facesavepath)
        else:
            print(keyword, " already exists")
            continue

        print("down " + keyword)

        dataList = getManyPages(keyword, pages)  # arg 1: keyword, arg 2: number of result pages to download
        getImg(dataList, facesavepath, face_ID)  # arg 2: directory to save the images into

if __name__ == '__main__':
    debug()
    # run()
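A note on the run() path, as inferred from convert() and run() above: stars_list.txt is expected to be a comma-separated file whose second field (index 1) is the star's name; convert() de-duplicates those names into stars_list_clean.txt (one name per line), and run() then creates a ./stars_srcimg/<index>_<name>/ folder per star and downloads pages * 30 thumbnails into each. Hypothetical stars_list.txt lines, only to show the expected shape (the first field can be anything, it is never used):

0,胡因梦
1,韩雪

To use this path, run convert() once to produce stars_list_clean.txt, then switch the __main__ block from debug() to run().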
