google图片抓取

Google 图片搜索结果里的缩略图是以 base64 编码(注意:base64 只是编码,并不是加密)的形式内嵌在页面中的,编码后的数据放在页面的 script 标签里面,需要先用正则从 script 文本中取出,再做 base64 解码才能得到图片。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pymysql
from lxml import etree
import logging
import requests
import time
import threading
from threading import RLock
import re
import os
 
# Re-entrant lock shared by the main thread and the worker threads started by
# the crawler; it is taken around every execute/commit issued on the shared
# database cursor and connection.
lock = RLock()
import base64
import ssl
 
# Swap the stdlib's default HTTPS context factory for the unverified one, so
# HTTPS clients that rely on this default skip certificate verification for
# the whole process.
# NOTE(review): this is insecure; presumably added to get past certificate
# errors while crawling — confirm it is still needed (the `requests` calls
# below may not consult this hook at all).
ssl._create_default_https_context = ssl._create_unverified_context
 
# Logging setup: crawl progress records are appended to a log file.
# The keyword arguments must be comma-separated and the call closed; without
# that the module cannot even be imported.
logging.basicConfig(
    level=logging.INFO,  # minimum severity written to the log file
    format='%(asctime)s  %(filename)s  %(levelname)s : %(message)s',  # record layout
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format used for %(asctime)s
    filename='drugimagesError.log',  # log file name
    filemode='a',  # "a" appends to the file; "w" would truncate it
)
 
 
class google_images(object):
    """Crawl Google image search for drug records and archive the thumbnails.

    Instantiating the class runs the whole crawl: the constructor connects to
    the local MySQL database and then calls :meth:`parse_page` in an endless
    loop.  The loop only ends when :meth:`parse_page` raises ``SystemExit``
    (no more rows, or the row whose id equals ``end_record`` was reached).

    For every drug row that has no entry in ``drugimages`` yet, a worker
    thread searches Google Images, extracts the base64 JPEG thumbnails that
    are embedded in the result page's script text, stores each one in the
    ``drugimages`` table and writes it below ``./images/``.
    """

    # Endpoint of the image search; the query string is passed via ``params``
    # so that the keyword is URL-encoded correctly.
    SEARCH_URL = 'https://www.google.com/search'
    # Matches an inline JPEG data URI (JFIF header in base64) up to the last
    # double quote on the same line, optionally followed by "]".
    IMAGE_PATTERN = re.compile(r'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD.*"\]?')
    DATA_URI_PREFIX = 'data:image/jpeg;base64,'
    # At most this many thumbnails are stored per drug.
    MAX_IMAGES = 30
    # Seconds to wait for Google before giving up on one search.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Open the database connection and run the crawl until SystemExit."""
        # Rows with an id greater than this value are fetched next.
        self.strat_record = 1
        # The crawl stops as soon as the row with this id is reached.
        self.end_record = 10000001
        self.db = pymysql.connect(host='localhost', port=3306, database='yao_zhi', user='root', password='root',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

        # parse_page() terminates the loop by raising SystemExit.
        while True:
            self.parse_page()

    @staticmethod
    def _first_specification(me_key):
        """Return the first comma-terminated entry of *me_key*.

        The trailing comma is kept, exactly as the stored data has always
        contained it.  Values without a comma, empty strings and non-string
        values (for example ``None``) are returned unchanged.
        """
        if not isinstance(me_key, str):
            return me_key
        found = re.search(r".+?,", me_key)
        return found.group(0) if found else me_key

    @staticmethod
    def _image_directory(id):
        """Return the directory (with trailing slash) for the images of *id*.

        Images are grouped in one ``page<N>`` directory per 1000 ids.
        """
        return './images/' + 'page' + str(int(id) // 1000) + '/' + 'id' + str(id) + '/'

    @classmethod
    def _extract_image_uris(cls, script_text):
        """Return up to ``MAX_IMAGES`` JPEG data URIs found in *script_text*.

        The closing ``"]`` of the surrounding JavaScript literal is stripped
        and JavaScript escapes such as ``\\u003d`` are decoded.
        """
        return [
            raw.replace('"]', '').encode('utf-8').decode('unicode_escape')
            for raw in cls.IMAGE_PATTERN.findall(script_text)[:cls.MAX_IMAGES]
        ]

    def _save_image(self, id, approvalNumber, num, image):
        """Decode the data URI *image* and write it as ``<approvalNumber>-<num>.jpg``."""
        directory = self._image_directory(id)
        # exist_ok avoids the check-then-create race between worker threads
        # that share the same page directory.
        os.makedirs(directory, exist_ok=True)
        payload = base64.b64decode(image.replace(self.DATA_URI_PREFIX, ''))
        with open(directory + str(approvalNumber) + '-' + str(num) + '.jpg', 'wb') as image_file:
            image_file.write(payload)

    def parse_page(self):
        """Process the next batch of up to 1000 drug rows.

        One worker thread is started (3 seconds apart) for every row whose
        approval number is not yet present in ``drugimages``; all workers are
        joined before the method returns.

        Raises:
            SystemExit: when no rows are left or the row with id
                ``end_record`` has been reached.
        """
        with lock:
            # ORDER BY makes the id-based paging deterministic; the values
            # are bound by the driver instead of being formatted into the SQL.
            row_count = self.cursor.execute(
                "select id, me_pizhunwenhao, me_name, me_jixing, me_key "
                "from guo_cai_jin_kou_yao_pin where id > %s order by id limit 1000",
                (self.strat_record,))
            rows = self.cursor.fetchall()
        if not row_count:
            raise SystemExit

        workers = []
        for row in rows:
            record_id, approvalNumber, drugName, dosageForm = row[0], row[1], row[2], row[3]
            specifications = self._first_specification(row[4])
            # Remember how far we got so the next batch resumes after this row.
            self.strat_record = record_id
            summary = "id:%s  approvalNumber:%s   drugName:%s   dosageForm:%s   specifications:%s" % (
                record_id, approvalNumber, drugName, dosageForm, specifications)
            logging.info(summary)
            print(summary)

            if str(record_id) == str(self.end_record):
                raise SystemExit
            with lock:
                already_stored = self.cursor.execute(
                    "select id from drugimages where approvalNumber = %s", (approvalNumber,))
            if not already_stored:
                worker = threading.Thread(target=self.parse_page_data,
                                          args=(record_id, approvalNumber, drugName, dosageForm, specifications,))
                worker.start()
                workers.append(worker)
                # Throttle the searches so Google is not hit in bursts.
                time.sleep(3)

        for worker in workers:
            worker.join()

    def parse_page_data(self, id, approvalNumber, drugName, dosageForm, specifications):
        """Search Google Images for one drug and store the thumbnails found.

        Args:
            id: primary key of the drug row; also used for the directory name.
            approvalNumber: approval number, used as key in ``drugimages`` and
                as file name prefix.
            drugName: drug name, first part of the search keyword.
            dosageForm: dosage form, second part of the search keyword.
            specifications: specification text, last part of the keyword.

        Network failures and result pages without the expected script node
        are logged and end the method early instead of raising in the thread.
        """
        print("id:%s  approvalNumber:%s   drugName:%s   specifications:%s" % (
            id, approvalNumber, drugName, specifications))
        # Missing (None/empty) parts are skipped instead of raising TypeError.
        keyword = ' '.join(str(part) for part in (drugName, dosageForm, specifications) if part)
        try:
            response = requests.get(
                self.SEARCH_URL,
                params={'biw': 1920, 'bih': 900, 'tbm': 'isch', 'q': keyword},
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            logging.exception("id:%s  approvalNumber:%s   image search request failed", id, approvalNumber)
            return
        print(response.url)

        scripts = etree.HTML(response.content).xpath('//span[@id="xjs"]/script/text()')
        if not scripts:
            logging.warning("id:%s  approvalNumber:%s   no inline image script in result page", id, approvalNumber)
            return

        insert_sql = ("insert into drugimages(approvalNumber, drugName, specifications, image, num, durgid) "
                      "values(%s, %s, %s, %s, %s, %s)")
        for num, image in enumerate(self._extract_image_uris(scripts[0]), start=1):
            # The base64 payload is left out of the console output on purpose.
            print('sql_data:%s %r' % (insert_sql, (approvalNumber, drugName, specifications, num, int(id))))
            logging.info("id:%s  approvalNumber:%s   drugName:%s   specifications:%s" % (
                id, approvalNumber, drugName, specifications))
            with lock:
                # The full data URI (prefix included) is what gets stored.
                self.cursor.execute(insert_sql, (approvalNumber, drugName, specifications, image, num, int(id)))
                self.db.commit()
            self._save_image(id, approvalNumber, num, image)
 
 
# Script entry point: constructing the crawler runs the entire crawl, because
# the constructor loops over parse_page() until the process exits.
if __name__ == "__main__":
    google_images()