google图片抓取
Google 图片搜索结果中的缩略图是以 base64 编码(注意:base64 只是编码,并不是加密)的形式内嵌在页面中的,而且编码后的数据放在 script 标签的文本里面,所以需要先取出 script 内容,再用正则提取并解码。
"""Fetch Google Images thumbnails for drug records and persist them.

Rows are read from the ``guo_cai_jin_kou_yao_pin`` table in pages of 1000.
For every row whose approval number has no entry in ``drugimages`` yet, a
worker thread runs a Google image search, extracts the base64-embedded JPEG
thumbnails from the result page's inline script, inserts them into
``drugimages`` and writes them under ``./images/page<N>/id<ID>/``.
"""
import base64
import logging
import os
import re
import ssl
import threading
import time
from threading import RLock
from urllib.parse import urlencode

import pymysql
import requests
from lxml import etree

# One connection/cursor is shared by every thread, so all DB access is
# serialized through this lock.
lock = RLock()

# NOTE(review): this disables certificate verification for stdlib HTTPS
# contexts process-wide. Kept for backward compatibility with the original
# script; confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context

# Logging configuration: append INFO-and-above records to a log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    filename='drugimagesError.log',
    filemode='a',
)

SEARCH_URL = 'https://www.google.com/search'
REQUEST_TIMEOUT = 30          # seconds; without it a stalled request blocks join() forever
THROTTLE_SECONDS = 3          # pause after launching each search
MAX_IMAGES_PER_DRUG = 30

SELECT_PAGE_SQL = (
    "select id, me_pizhunwenhao, me_name, me_jixing, me_key "
    "from guo_cai_jin_kou_yao_pin where id > %s order by id limit 1000"
)
SELECT_EXISTING_SQL = "select id from drugimages where approvalNumber = %s"
INSERT_IMAGE_SQL = (
    "insert into drugimages(approvalNumber, drugName, specifications, image, num, durgid) "
    "values(%s, %s, %s, %s, %s, %s)"
)

_SPEC_PATTERN = re.compile(r".+?,")
_IMAGE_PATTERN = re.compile(
    r'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD.*"\]?')


def _first_specification(raw):
    """Return the first comma-terminated chunk of *raw*, or *raw* itself.

    The matched chunk keeps its trailing comma, exactly as the original
    ``re.findall(r".+?,", raw)[0]`` did. Non-string values (e.g. ``None``)
    are returned unchanged.
    """
    if not isinstance(raw, str):
        return raw
    match = _SPEC_PATTERN.search(raw)
    return match.group(0) if match else raw


class google_images(object):
    """Crawler that walks the drug table and stores Google thumbnails.

    Instantiating the class opens the database connection and immediately
    runs the crawl loop until ``parse_page`` raises ``SystemExit``.
    """

    def __init__(self):
        # Rows with id greater than this value are fetched next.
        self.strat_record = 1
        # Crawling stops when a row with this id is reached.
        self.end_record = 10000001
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        self.db = pymysql.connect(host='localhost', port=3306,
                                  database='yao_zhi', user='root',
                                  password='root', charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
        }
        try:
            while True:
                self.parse_page()
        finally:
            # parse_page always joins its workers before returning or
            # raising, so nothing is still using the connection here.
            self.cursor.close()
            self.db.close()

    def parse_page(self):
        """Process the next page of up to 1000 drug rows.

        Starts one worker thread per row that has no stored images yet and
        waits for all of them before returning.

        Raises:
            SystemExit: when no rows are left, or when the row whose id
                equals ``self.end_record`` is reached (that row itself is
                not processed).
        """
        # Execute and fetch under the same lock: the cursor is shared.
        # "order by id" makes the id-based pagination deterministic.
        with lock:
            row_count = self.cursor.execute(SELECT_PAGE_SQL,
                                            (self.strat_record,))
            rows = self.cursor.fetchall()
        if not row_count:
            raise SystemExit

        workers = []
        reached_end = False
        try:
            for row in rows:
                record_id, approval_number, drug_name, dosage_form = row[:4]
                specifications = _first_specification(row[4])
                self.strat_record = record_id

                summary = ("id:%s approvalNumber:%s drugName:%s dosageForm:%s specifications:%s"
                           % (record_id, approval_number, drug_name,
                              dosage_form, specifications))
                logging.info(summary)
                print(summary)

                if str(record_id) == str(self.end_record):
                    reached_end = True
                    break

                with lock:
                    already_stored = self.cursor.execute(
                        SELECT_EXISTING_SQL, (approval_number,))
                if already_stored:
                    continue

                worker = threading.Thread(
                    target=self.parse_page_data,
                    args=(record_id, approval_number, drug_name,
                          dosage_form, specifications))
                worker.start()
                workers.append(worker)
                # Throttle so searches are not fired in a burst.
                time.sleep(THROTTLE_SECONDS)
        finally:
            # Join even on error/termination so no worker outlives the
            # database connection.
            for worker in workers:
                worker.join()

        if reached_end:
            raise SystemExit

    def parse_page_data(self, id, approvalNumber, drugName, dosageForm, specifications):
        """Search Google Images for one drug and store up to 30 thumbnails.

        Args:
            id: primary key of the source row; also used for the output
                directory and the ``durgid`` column.
            approvalNumber: approval number; used in the DB row and file name.
            drugName: drug name, first part of the search keyword.
            dosageForm: dosage form, second part of the search keyword.
            specifications: specification text, last part of the keyword.

        Network failures, a missing script node and undecodable images are
        logged and skipped instead of killing the worker thread.
        """
        print("id:%s approvalNumber:%s drugName:%s specifications:%s"
              % (id, approvalNumber, drugName, specifications))

        # Skip empty/None parts so a missing column cannot raise TypeError.
        keyword = ' '.join(
            str(part) for part in (drugName, dosageForm, specifications) if part)
        # urlencode so characters such as '&', '#' or spaces in the keyword
        # cannot corrupt the query string.
        url = SEARCH_URL + '?' + urlencode(
            {'biw': 1920, 'bih': 900, 'tbm': 'isch', 'q': keyword})
        print(url)

        try:
            response = requests.get(url=url, headers=self.headers,
                                    timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            logging.exception("id:%s request failed: %s", id, url)
            return

        tree = etree.HTML(response.content)
        scripts = (tree.xpath('//span[@id="xjs"]/script/text()')
                   if tree is not None else [])
        if not scripts:
            logging.warning("id:%s no image script found: %s", id, url)
            return

        record_id = int(id)
        target_dir = './images/page%d/id%s/' % (record_id // 1000, id)
        links = _IMAGE_PATTERN.findall(scripts[0])

        for num, link in enumerate(links[:MAX_IMAGES_PER_DRUG], start=1):
            try:
                # The page escapes characters such as '=' as \x3d.
                image = (link.replace('"]', '')
                         .encode('utf-8').decode('unicode_escape'))
                image_data = image.replace('data:image/jpeg;base64,', '')
                pic_content = base64.b64decode(image_data)
            except ValueError:
                # Covers binascii.Error and UnicodeDecodeError; decode
                # before inserting so a bad image leaves no orphan DB row.
                logging.warning("id:%s image %s could not be decoded", id, num)
                continue

            logging.info("id:%s approvalNumber:%s drugName:%s specifications:%s",
                         id, approvalNumber, drugName, specifications)
            # Parameterized insert: values are escaped by the driver.
            with lock:
                self.cursor.execute(
                    INSERT_IMAGE_SQL,
                    (approvalNumber, drugName, specifications, image,
                     num, record_id))
                self.db.commit()

            # exist_ok avoids the check-then-create race between threads.
            os.makedirs(target_dir, exist_ok=True)
            image_path = target_dir + str(approvalNumber) + '-' + str(num) + '.jpg'
            with open(image_path, 'wb') as image_file:
                image_file.write(pic_content)


if __name__ == '__main__':
    google_images()
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步