Python 3.7 crawler: bulk-scraping novels from a novel site and writing them to MySQL (work in progress...). Unsolved problem: after long runs that insert more than a million rows, the MySQL table locks up or even crashes.
A practice project; the code still has plenty of problems and is being improved over time.
Running on a bargain-basement Aliyun T5 server (the 99-yuan free-shipping kind), so only 6 worker processes are started.
```python
#encoding:utf-8
import requests                               # HTTP requests
from lxml import html                         # HTML parsing
from multiprocessing import Pool, Semaphore   # process pool (imported but unused below)
import random
import time
import os
import string
from fake_useragent import UserAgent
import multiprocessing
import base64
import MySQLdb

basepath = os.path.abspath('text')
imgpath = os.path.abspath('timg')
baseUrl = 'http://www.quanshuwang.com/list/1_1.html'
baseFrom = '全书网'
book_type = 2


def getList(page):  # fetch one page of the book list
    r = requests.get('http://www.quanshuwang.com/all/allvisit_{}_0_0_0_0_0_{}.html'.format(book_type, page),
                     headers=getHeaders()).text
    doc = html.fromstring(r)
    urls = doc.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/@href')
    return urls


def getHeaders():  # build request headers with a random User-Agent
    headers = {
        'Referer': baseUrl,
        'Connection': 'close',
        'User-Agent': UserAgent().random
    }
    return headers


def upload_img(jpgLink, filename):  # save the cover image to disk
    with open(filename, "wb+") as jpg:
        jpg.write(requests.get(jpgLink).content)
        print('Cover image downloaded')


def getInfo(url):  # parse one book's detail page
    try:
        info = {}
        pro = 1
        r = requests.get(url, headers=getHeaders(), timeout=3)
        doc = html.fromstring(r.content)
        des = doc.xpath('//div[@id="waa"]/text()')[0]
        info['des'] = "".join(des.split())
        info['des'] = info['des'][3:]
        info['name'] = doc.xpath('//div[@class="b-info"]/h1/text()')[0]
        links = doc.xpath('//div[@class="b-oper"]/a/@href')[0]
        imgurl = doc.xpath('//a[@class="l mr11"]/img/@src')[0]
        img = base64.b64encode(info['name'].encode('utf-8')) + b'.jpg'
        img = img.decode().replace('/', '')
        info['thumb'] = 'timg/' + img
        filename = imgpath + '/' + img
        info['from'] = links
        upload_img(imgurl, filename)     # download the cover image
        getBook(links, pro, info)        # download the chapters
    except requests.exceptions.Timeout:
        print('Connection timed out, retrying...')
        getInfo(url)
    except Exception as e:
        print('Error', e)
        getInfo(url)


def insertList(info):  # insert a new novel record
    db = MySQLdb.connect(host='localhost', user='root', passwd='LuoYang%684985',
                         db='python', port=3306, charset='utf8')
    you = db.cursor()  # get a cursor for this operation
    sql = 'select id from text_list where name={}'.format("'" + info['name'] + "'")
    you.execute(sql)
    is_repeat = you.fetchone()
    if is_repeat:
        print('Novel {} already exists'.format(info['name']))
        return is_repeat[0]
    else:
        you.execute("insert into text_list (type,thumb,description,name,author,froms,add_time) "
                    "values({},{},{},{},{},{},{})".format(info['type'], "'" + info['thumb'] + "'",
                                                          "'" + info['des'] + "'", "'" + info['name'] + "'",
                                                          "'" + info['author'] + "'", "'" + info['from'] + "'",
                                                          int(time.time())))
        you.execute("select last_insert_id();")
        data = you.fetchone()
        db.commit()
        db.close()  # release the database connection
        print('Downloading novel {}'.format(info['name']))
        return data[0]


def is_repeat(info, db):  # check whether this chapter is already stored
    you = db.cursor()  # get a cursor for this operation
    sql1 = 'select id from text_del where l_id={} and title={}'.format(info['l_id'], "'" + info['title'] + "'")
    you.execute(sql1)
    is_repeat = you.fetchone()
    if is_repeat:
        time.sleep(0.1)
        return -1
    else:
        return 1


def insertContent(info, db):  # insert a chapter record
    you = db.cursor()  # get a cursor for this operation
    sql = ("insert into text_del (l_id,title,content,add_time,`order`,froms) "
           "values({},{},{},{},{},{})".format(info['l_id'], "'" + info['title'] + "'",
                                              "'" + info['content'] + "'", info['add_time'],
                                              info['num'], "'" + info['froms'] + "'"))
    you.execute(sql)
    db.commit()
    you.close()  # close the cursor


def random_string(size=5, chars=string.ascii_uppercase + string.digits):
    return str(int(time.time())) + ''.join(random.choice(chars) for _ in range(size))


def getBook(link, pro, info):  # crawl the chapter list and store everything in MySQL
    try:
        r = requests.get(link, headers=getHeaders(), timeout=3)
        doc = html.fromstring(r.content)
        info['author'] = doc.xpath('//div[@class="chapName"]/span/text()')[0]
        info['author'] = info['author'][3:]
        info['type'] = book_type
        res = {}
        res['l_id'] = insertList(info)
        links = doc.xpath('//div[@class="clearfix dirconone"]/li/a')
        bookdir = random_string()
        bookpath = os.path.join(basepath, bookdir)
        if not os.path.exists(bookpath):
            os.mkdir(bookpath)
        num = 0
        db = MySQLdb.connect(host='localhost', user='root', passwd='LuoYang%684985',
                             db='python', port=3306, charset='utf8')
        for i in links:
            num = num + 1
            res['num'] = num
            name = i.xpath('./text()')[0]
            res['title'] = name
            if is_repeat(res, db) != -1:
                downTxt(i, bookdir, pro, res, db)
        db.close()
    except requests.exceptions.Timeout:
        print('Proxy connection timed out, retrying...')
        getBook(link, 0, info)
    except Exception as e:
        # print('Error', e)
        getBook(link, 0, info)


def downTxt(page, path, pro, res, db):  # download a single chapter
    res['add_time'] = int(time.time())
    url = page.xpath('./@href')[0]
    try:
        r = requests.get(url, headers=getHeaders())
        doc = html.fromstring(r.content)
        arc = doc.xpath('//div[@id="content"]/text()')
        arc = "".join(arc)
        fname = random_string()
        relname = 'text/{}/{}.txt'.format(path, fname)
        res['froms'] = url
        res['content'] = relname
        fileName = os.path.join(basepath, path, fname + '.txt')
        time.sleep(1)
        insertContent(res, db)
        with open(fileName, 'w+', encoding='utf-8') as txt:
            txt.write(arc)
    except requests.exceptions.Timeout:
        # print('Proxy connection timed out, retrying...')
        downTxt(page, path, 0, res, db)
    except Exception as e:
        # print('Error', e, 'retrying...')
        downTxt(page, path, 0, res, db)


def work(i):
    lists = getList(i)
    for s in lists:
        getInfo(s)
        time.sleep(10)


if __name__ == '__main__':
    multiprocessing.freeze_support()
    pool = multiprocessing.Pool(processes=6)
    for i in range(1, 51):
        pool.apply_async(work, (i,))
    pool.close()
    pool.join()
```
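The unsolved table-lock/crash problem mentioned in the title is aggravated by committing once per chapter and by building every INSERT with string formatting. Below is a minimal sketch of one possible mitigation, not the original code: collect a book's chapters and write them with parameterized `executemany` batches, committing once per batch. The function name `insert_chapters_batched` and the batch size are made up for illustration; the `text_del` columns are the ones used in the script above.

```python
# Hedged sketch: batched, parameterized chapter inserts.
# insert_chapters_batched and batch_size are hypothetical names, not part of the script above.
def insert_chapters_batched(db, rows, batch_size=500):
    """rows: iterable of dicts with keys l_id, title, content, add_time, num, froms."""
    you = db.cursor()
    sql = ("insert into text_del (l_id,title,content,add_time,`order`,froms) "
           "values (%s, %s, %s, %s, %s, %s)")      # placeholders instead of string formatting
    batch = []
    for r in rows:
        batch.append((r['l_id'], r['title'], r['content'],
                      r['add_time'], r['num'], r['froms']))
        if len(batch) >= batch_size:
            you.executemany(sql, batch)   # one round trip per batch
            db.commit()                   # one commit per batch, not per chapter
            batch = []
    if batch:
        you.executemany(sql, batch)
        db.commit()
    you.close()
```

Parameterized queries also sidestep the quoting problem in the current `insertList`/`insertContent` SQL, where a title containing a single quote breaks the statement.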
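A second rough spot: `getInfo`, `getBook`, and `downTxt` retry by calling themselves from their exception handlers, so a URL that keeps failing recurses without bound. A small sketch of a bounded alternative, assuming a hypothetical helper named `fetch_with_retry` with arbitrary backoff values:

```python
# Hedged sketch: bounded retries with backoff instead of recursing on every exception.
import time
import requests

def fetch_with_retry(url, headers, retries=3, timeout=3):
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print('Request failed (attempt {}): {}'.format(attempt, e))
            time.sleep(2 ** attempt)   # back off before the next attempt
    return None                        # give up; the caller can skip this URL
```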