Python 百度图片爬虫
# -*- coding:utf-8 -*-
# https://blog.csdn.net/qq_32166627/article/details/60882964
"""Download thumbnails from Baidu image search for a list of names.

Entry points:
    convert() -- de-duplicate the names in ``stars_list.txt``.
    run()     -- download thumbnails for every name in ``stars_list_clean.txt``.
    debug()   -- download two result pages for one hard-coded name.
"""
import os

import pinyin
import requests
import simplejson

SEARCH_URL = 'https://image.baidu.com/search/acjson'

# Seconds to wait for any single HTTP request before giving up, so one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def _clean_line(line):
    """Return *line* with every CR, LF and TAB character removed."""
    return line.replace('\r', '').replace('\n', '').replace('\t', '')


def getManyPages(keyword, pages):
    """Query the Baidu image-search JSON endpoint for *keyword*.

    One request is sent per page, 30 results per page.

    Args:
        keyword: search term, sent as both ``queryWord`` and ``word``.
        pages: number of result pages to request.

    Returns:
        A list holding the ``data`` value of each response that could be
        fetched and decoded. Pages that fail (network error, invalid JSON,
        or no ``data`` key) are skipped, so the list may be shorter than
        *pages*.
    """
    urls = []
    # NOTE(review): offsets start at 30, so results 0-29 are never
    # requested -- kept as in the original; confirm whether intended.
    for offset in range(30, 30 * pages + 30, 30):
        params = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '1488942260214': '',
        }
        try:
            response = requests.get(SEARCH_URL, params=params,
                                    timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            print('request failed for pn=%d: %s' % (offset, err))
            continue
        try:
            data = response.json().get('data')
        except (simplejson.scanner.JSONDecodeError, ValueError):
            # ValueError also covers the stdlib json decoder, which
            # requests falls back to depending on its version/installation.
            print('【错误】simplejson.scanner.JSONDecodeError ')
            continue
        if data is None:
            # No 'data' key in the response: nothing to download, and
            # getImg() cannot iterate over None.
            continue
        urls.append(data)
    return urls


def getImg(dataList, localPath, keyword):
    """Download every thumbnail referenced in *dataList* into *localPath*.

    Files are named ``<keyword>_<n>.jpg`` where ``n`` counts the images
    actually saved, starting at 0.

    Args:
        dataList: list of pages as returned by getManyPages(); each page is
            iterated and each entry is queried with ``.get('thumbURL')``.
        localPath: destination directory; created when it does not exist.
        keyword: file-name prefix for the saved images.
    """
    if not os.path.exists(localPath):
        # Create the folder, including any missing parents.
        os.makedirs(localPath)
    saved = 0
    for page in dataList:
        for entry in page:
            thumb_url = entry.get('thumbURL')
            if thumb_url is None:
                print('image not exist')
                continue
            print("down " + str(saved) + " image " + thumb_url)
            try:
                image = requests.get(thumb_url, timeout=REQUEST_TIMEOUT)
                # Do not store an HTTP error page under a .jpg name.
                image.raise_for_status()
            except requests.exceptions.RequestException as err:
                print('download failed: %s' % err)
                continue
            target = os.path.join(localPath, keyword + '_%d.jpg' % saved)
            with open(target, 'wb') as out:
                out.write(image.content)
            saved += 1


def convert():
    """Write the unique names of ``stars_list.txt`` to ``stars_list_clean.txt``.

    Each input line is split on ``,`` and its second field is taken as the
    name. Names are written one per line in first-seen order; lines with
    fewer than two fields are skipped.
    """
    # Read the input first so a missing input file does not truncate the
    # output file.
    with open("stars_list.txt", 'r') as face_file:
        stars_list = face_file.readlines()
    seen = set()
    with open("stars_list_clean.txt", 'w') as fp:
        for line in stars_list:
            line_split = _clean_line(line).strip().split(",")
            if len(line_split) < 2:
                # Blank or malformed line: there is no name field.
                continue
            name = line_split[1]
            print(name)
            if name not in seen:
                seen.add(name)
                fp.write('%s\n' % name)
            else:
                print(name, " is exist")


def debug():
    """Download two result pages for one hard-coded name into ``./hanxue``."""
    keyword = '胡因梦'
    keyword_english = "hym"
    dataList = getManyPages(keyword, 2)  # arg 1: keyword, arg 2: page count
    getImg(dataList, './hanxue', keyword_english)  # arg 2: save directory


def run():
    """Download thumbnails for every name in ``stars_list_clean.txt``.

    First writes the pinyin form of each name to ``stars_list_en.txt``, then
    creates ``./stars_srcimg/<index>_<name>`` per name and fills it with the
    thumbnails of 5 result pages. A name whose folder already exists is
    skipped.
    """
    with open("stars_list_clean.txt", 'r') as face_file:
        names = [_clean_line(line) for line in face_file.readlines()]

    with open("stars_list_en.txt", 'w') as fp:
        for name in names:
            fp.write('%s\n' % pinyin.get(name, format="strip"))

    root_dir = "./stars_srcimg/"
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)

    pages = 5
    maxnum = pages * 30
    print(maxnum)

    for face_ID_index, keyword in enumerate(names):
        print(keyword)
        keyword_english = pinyin.get(keyword, format="strip")
        print(keyword_english)
        face_ID = str(face_ID_index) + "_" + keyword
        facesavepath = root_dir + face_ID
        print(facesavepath)
        if os.path.exists(facesavepath):
            # An existing folder is treated as "already downloaded".
            print(keyword, " exist")
            continue
        os.mkdir(facesavepath)
        print("down " + keyword)
        dataList = getManyPages(keyword, pages)  # arg 1: keyword, arg 2: page count
        getImg(dataList, facesavepath, face_ID)  # arg 2: save directory


if __name__ == '__main__':
    debug()
    #run()