Python 3.7 crawler: bulk-scraping novels from a novel site and writing them into MySQL (work in progress...). Unsolved problem: after a long run inserting more than a million rows, the MySQL table locks up or even crashes.

This is a practice project; the code still has plenty of rough edges and is being improved continuously.

Only 6 worker processes are started, on a bargain-bin Alibaba Cloud T5 server.
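
For the unsolved MySQL problem above, one direction worth trying is to batch the chapter inserts and commit once per batch instead of once per row, so the table is not hit with millions of tiny transactions. A minimal sketch, assuming the same text_del table layout used in the script below; the helper name and batch size are illustrative and not part of the original code:

# Hedged sketch: batched chapter inserts via MySQLdb cursor.executemany (assumed text_del schema).
import MySQLdb

def insert_chapters_batched(rows, batch_size=500):
    # rows: list of (l_id, title, content, add_time, order, froms) tuples
    db = MySQLdb.connect(host='localhost', user='root', passwd='LuoYang%684985',
                         db='python', port=3306, charset='utf8')  # same connection settings as the main script
    cur = db.cursor()
    sql = ("insert into text_del (l_id,title,content,add_time,`order`,froms) "
           "values (%s,%s,%s,%s,%s,%s)")
    for i in range(0, len(rows), batch_size):
        cur.executemany(sql, rows[i:i + batch_size])  # one round trip per batch
        db.commit()                                   # commit per batch, not per row
    cur.close()
    db.close()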

#encoding:utf-8
import requests  # HTTP requests
from lxml import html  # HTML parsing
import multiprocessing  # process pool
import random
import time
import os
import string
from fake_useragent import UserAgent
import base64
import MySQLdb
 
basepath = os.path.abspath('text')
imgpath = os.path.abspath('timg')
baseUrl = 'http://www.quanshuwang.com/list/1_1.html'
baseFrom = '全书网'
book_type = 2  # category id used in the list URL
 
def getList(page):  # fetch one page of the book list and return the detail-page URLs
    r = requests.get('http://www.quanshuwang.com/all/allvisit_{}_0_0_0_0_0_{}.html'.format(book_type, page), headers=getHeaders()).text
    doc = html.fromstring(r)
    urls = doc.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/@href')
    return urls
 
 
 
 
def getHeaders():  # build request headers with a random User-Agent
    headers = {
        'Referer': baseUrl,
        'Connection': 'close',
        'User-Agent': UserAgent().random
    }
    return headers
 
 
def upload_img(jpgLink, filename):  # download the cover image to disk
    with open(filename, "wb+") as jpg:
        jpg.write(requests.get(jpgLink).content)
        print('cover image downloaded')
 
 
def getInfo(url):  # scrape one book's detail page: description, name, cover, chapter-index link
    try:
        info = {}
        pro = 1
        r = requests.get(url, headers=getHeaders(), timeout=3)
        doc = html.fromstring(r.content)
        des = doc.xpath('//div[@id="waa"]/text()')[0]
        info['des'] = "".join(des.split())
        info['des'] = info['des'][3:]  # drop the first 3 characters (label prefix)
        info['name'] = doc.xpath('//div[@class="b-info"]/h1/text()')[0]
        links = doc.xpath('//div[@class="b-oper"]/a/@href')[0]
        imgurl = doc.xpath('//a[@class="l mr11"]/img/@src')[0]
        img = base64.b64encode(info['name'].encode('utf-8')) + b'.jpg'
        img = (img.decode()).replace('/', '')
        info['thumb'] = 'timg/' + img
        filename = imgpath + '/' + img
        info['from'] = links
        upload_img(imgurl, filename)  # download the cover image
        getBook(links, pro, info)  # download the chapters
    except requests.exceptions.Timeout:
        print('connection timed out, retrying...')
        getInfo(url)
    except Exception as e:
        print('error', e)
        getInfo(url)  # note: unconditional retry, may recurse forever on a permanently broken page
 
 
def insertList(info):  # insert a novel record, or return the existing id if it is already stored
    db = MySQLdb.connect(host='localhost', user='root', passwd='LuoYang%684985', db='python', port=3306,
                         charset='utf8')
    you = db.cursor()  # get a cursor
    you.execute('select id from text_list where name=%s', (info['name'],))
    is_repeat = you.fetchone()
    if is_repeat:
        print('novel {} already exists'.format(info['name']))
        db.close()  # release the connection before returning
        return is_repeat[0]
    else:
        you.execute("insert into text_list (type,thumb,description,name,author,froms,add_time) values (%s,%s,%s,%s,%s,%s,%s)",
                    (info['type'], info['thumb'], info['des'], info['name'], info['author'], info['from'], int(time.time())))
        you.execute("select last_insert_id();")
        data = you.fetchone()
        db.commit()
        db.close()  # release the connection
        print('downloading novel {}'.format(info['name']))
        return data[0]
 
def is_repeat(info, db):  # return -1 if this chapter is already stored, 1 otherwise
    you = db.cursor()  # get a cursor
    you.execute('select id from text_del where l_id=%s and title=%s', (info['l_id'], info['title']))
    repeat = you.fetchone()
    if repeat:
        time.sleep(0.1)
        return -1
    else:
        return 1

def insertContent(info, db):  # insert one chapter row
    you = db.cursor()  # get a cursor
    you.execute("insert into text_del (l_id,title,content,add_time,`order`,froms) values (%s,%s,%s,%s,%s,%s)",
                (info['l_id'], info['title'], info['content'], info['add_time'], info['num'], info['froms']))
    db.commit()
    you.close()  # close the cursor
 
 
def random_string(size=5, chars=string.ascii_uppercase + string.digits):  # timestamp plus random suffix, used for file/dir names
    return str(int(time.time())) + ''.join(random.choice(chars) for _ in range(size))
def getBook(link, pro, info):  # fetch the chapter index, register the novel, then store each chapter in MySQL
    try:
        r = requests.get(link, headers=getHeaders(), timeout=3)
        doc = html.fromstring(r.content)
        info['author'] = doc.xpath('//div[@class="chapName"]/span/text()')[0]
        info['author'] = info['author'][3:]  # drop the label prefix
        info['type'] = book_type
        res = {}
        res['l_id'] = insertList(info)
        links = doc.xpath('//div[@class="clearfix dirconone"]/li/a')
        book_dir = random_string()  # random directory name for this book's txt files
        bookpath = os.path.join(basepath, book_dir)
        if not os.path.exists(bookpath):
            os.mkdir(bookpath)
        num = 0
        db = MySQLdb.connect(host='localhost', user='root', passwd='LuoYang%684985', db='python', port=3306, charset='utf8')
        for i in links:
            num = num + 1
            res['num'] = num
            name = i.xpath('./text()')[0]
            res['title'] = name
            if is_repeat(res, db) != -1:  # skip chapters that are already stored
                downTxt(i, book_dir, pro, res, db)
        db.close()
    except requests.exceptions.Timeout:
        print('proxy connection timed out, retrying...')
        getBook(link, 0, info)
    except Exception as e:
        # print('error', e)
        getBook(link, 0, info)  # note: unconditional retry, may recurse forever on a permanently broken page
 
 
def downTxt(page, path, pro, res, db):  # download one chapter, save it as a txt file, and record it in MySQL
    res['add_time'] = int(time.time())
    url = page.xpath('./@href')[0]
    try:
        r = requests.get(url, headers=getHeaders())
        doc = html.fromstring(r.content)
        arc = doc.xpath('//div[@id="content"]/text()')
        arc = "".join(arc)
        chap_name = random_string()  # random file name for this chapter
        relname = 'text/{}/{}.txt'.format(path, chap_name)  # relative path stored in MySQL
        res['froms'] = url
        res['content'] = relname
        fileName = os.path.join(basepath, path, chap_name + '.txt')
        time.sleep(1)
        insertContent(res, db)
        with open(fileName, 'w+', encoding='utf-8') as txt:
            txt.write(arc)
    except requests.exceptions.Timeout:
        # print('proxy connection timed out, retrying...')
        downTxt(page, path, 0, res, db)
    except Exception as e:
        # print('error', e, 'retrying...')
        downTxt(page, path, 0, res, db)
 
 
def work(i):  # worker: scrape every book on one list page
    lists = getList(i)
    for s in lists:
        getInfo(s)
        time.sleep(10)

if __name__ == '__main__':
    multiprocessing.freeze_support()
    pool = multiprocessing.Pool(processes=6)  # 6 worker processes
    for i in range(1, 51):  # list pages 1 to 50
        pool.apply_async(work, (i, ))
    pool.close()
    pool.join()

  
