【urllib 爬虫】笔趣阁:抓取《斗罗大陆》完整文本 与 《三寸天堂》
# coding=gbk  # the script runs in a GBK console window, hence the GBK source encoding
"""Scrape the full text of "Douluo Dalu" from a biquge mirror into MySQL.

Pipeline: source() fetches the catalogue page, respon() extracts the
chapter hrefs, spider() downloads each chapter and hands it to mysqlw(),
which inserts (text, url, chapter title) into the `douludalu` table.
mysqlr() reads a stored chapter back for inspection.
"""
import io
import os
import sys
import time
from urllib.request import Request, urlopen

import lxml
import pymysql
from lxml import etree

url = 'http://www.biquge.info/10_10218/'  # original catalogue URL (kept for reference)
UA = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
UA1 = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
       'Host': 'www.xxbiquge.com',
       'Referer': 'https://www.xxbiquge.com/2_2278/'}

# Re-wrap stdout so Chinese text prints cleanly in the GBK console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


def source(url):
    """Return the raw bytes of *url*, fetched with the basic UA headers."""
    response = urlopen(Request(url, None, UA), timeout=5)
    try:
        return response.read()
    finally:
        response.close()  # BUGFIX: the original never closed the HTTP response


def respon(text):
    """Parse the catalogue page *text* and return the list of chapter hrefs."""
    selector = etree.HTML(text)
    return selector.xpath("//*[@id='list']/dl/dd/a/@href")


def spider(url):
    """Download every chapter href in *url* and store each via mysqlw()."""
    for href in url:
        chapter_url = 'https://www.xxbiquge.com' + href
        page = urlopen(Request(chapter_url, None, UA1), timeout=5).read()
        selector = etree.HTML(page)
        # Join all text nodes of the content div in one pass
        # (the original's `c = c + aa` loop was quadratic).
        body = ''.join(selector.xpath('//*[@id="content"]/text()'))
        # The page <title> is "<chapter name>-<site name>"; keep the chapter part.
        title = selector.xpath('//html/head/title/text()')[0].split('-')[0]
        mysqlw(body, chapter_url, title)
        time.sleep(3)  # throttle so the site is not hammered


def mysqlw(text, url, chapter):
    """Insert one chapter (body *text*, source *url*, *chapter* title) into MySQL."""
    started = time.time()
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           passwd='liu', db='test', charset='utf8')
    try:
        cur = conn.cursor()
        print(url, chapter, 'w')
        # BUGFIX: parameterized query. The original %-interpolated the chapter
        # text into the SQL string, which broke (and was SQL-injectable) as
        # soon as the text contained a quote character.
        sql = "insert into douludalu(souce,html,chapter) values(%s,%s,%s)"
        print(sql)
        try:
            cur.execute(sql, (text, url, chapter))
            conn.commit()
            print("插入成功")
        except Exception as e:
            print(e)
            conn.rollback()
    finally:
        conn.close()  # BUGFIX: close even if execute/commit raises
    print("关闭", '耗时', time.time() - started)


def mysqlr(text):
    """Print the stored body and chapter title for the chapter URL *text*."""
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           passwd='liu', db='test', charset='utf8')
    try:
        cur = conn.cursor()
        # BUGFIX: the original format string
        #   'select * from douludalu where html="%s%s%%s"' % (',text,')
        # was malformed and raised TypeError; use a bound parameter instead.
        sql = 'select * from douludalu where html=%s'
        cur.execute(sql, (text,))
        print(sql)
        for row in cur.fetchall():
            # row[0] is the stored body ("souce" column); row[3] presumably
            # the chapter title — TODO confirm against the table schema.
            print(row[0], row[3])
    finally:
        conn.close()


def main():
    """Fetch the catalogue, then crawl and store every chapter."""
    catalogue = source('https://www.xxbiquge.com/2_2278/')
    chapter_links = respon(catalogue)
    spider(chapter_links)
    # mysqlr('https://www.xxbiquge.com/2_2278/1036550.html')


if __name__ == '__main__':  # BUGFIX: do not start crawling on mere import
    main()
——————————————————————————————————————————————————————————————————
三寸天堂
# coding=gbk  # the script runs in a GBK console window, hence the GBK source encoding
"""Scrape "三寸天堂" from www.biquge.com.tw into MySQL.

Pipeline: source() fetches the catalogue page, respon() extracts the
chapter hrefs, main() loops over them calling spider() once per chapter;
spider() downloads one chapter and hands it to mysqlw(), which inserts
(text, url, chapter title) into the `suibian` table.
"""
import io
import os
import sys
import threading
import time
from urllib.request import Request, urlopen

import lxml
import pymysql
from lxml import etree

url = 'http://www.biquge.info/10_10218/'  # original catalogue URL (kept for reference)
UA = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
UA1 = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
       'Host': 'www.biquge.com.tw',
       'Referer': 'http://www.biquge.com.tw/14_14055/',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}

# Re-wrap stdout so Chinese text prints cleanly in the GBK console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


def source(url):
    """Return the raw bytes of *url*, fetched with the basic UA headers."""
    response = urlopen(Request(url, None, UA), timeout=5)
    try:
        return response.read()
    finally:
        response.close()  # BUGFIX: the original never closed the HTTP response


def respon(text):
    """Parse the catalogue page *text* and return the list of chapter hrefs."""
    selector = etree.HTML(text)
    return selector.xpath("//*[@id='list']/dl/dd/a/@href")


def spider(url):
    """Download ONE chapter given its relative href *url* and store it."""
    chapter_url = 'http://www.biquge.com.tw/' + url
    print(chapter_url)
    # NOTE: read() returns bytes, never None — the original `if a is None`
    # guard was dead code and has been dropped.
    page = urlopen(Request(chapter_url, None, UA1), timeout=5).read()
    selector = etree.HTML(page)
    # Join all text nodes of the content div in one pass
    # (the original's `c = c + aa` loop was quadratic).
    body = ''.join(selector.xpath('//*[@id="content"]/text()'))
    title = selector.xpath('//html/head/title/text()')[0]  # full page title
    print(title)
    mysqlw(body, chapter_url, title)
    time.sleep(3)  # throttle so the site is not hammered


def mysqlw(text, url, chapter):
    """Insert one chapter (body *text*, source *url*, *chapter* title) into MySQL."""
    started = time.time()
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           passwd='liu', db='test', charset='utf8')
    try:
        cur = conn.cursor()
        print(url, chapter, '11111111111111111111111111111')
        # BUGFIX: parameterized query. The original %-interpolated the chapter
        # text into the SQL string, which broke (and was SQL-injectable) as
        # soon as the text contained a quote character.
        sql = "insert into suibian(souce,html,chapter) values(%s,%s,%s)"
        try:
            cur.execute(sql, (text, url, chapter))
            conn.commit()
            print("插入成功")
        except Exception as e:
            print(e)
            conn.rollback()
    finally:
        conn.close()  # BUGFIX: close even if execute/commit raises
    print("关闭", '耗时', time.time() - started)


def mysqlr(text):
    """Print the stored body and chapter title for the chapter URL *text*."""
    found = True  # kept from the original (`b1`); flipped when a row lacks a title
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           passwd='liu', db='test', charset='utf8')
    try:
        cur = conn.cursor()
        # BUGFIX: the original format string
        #   'select * from douludalu where html="%s%s%%s"' % (',text,')
        # was malformed and raised TypeError; use a bound parameter instead.
        sql = 'select * from douludalu where html=%s'
        cur.execute(sql, (text,))
        print(sql)
        for row in cur.fetchall():
            # row[0] is the stored body ("souce" column); row[3] presumably
            # the chapter title — TODO confirm against the table schema.
            print(row[0], row[3])
            if row[3] is None:
                found = False
    finally:
        conn.close()


def main():
    """Crawl the whole catalogue chapter by chapter, reporting elapsed time."""
    print(threading.current_thread().name)
    started = time.time()
    print('开始时间%s' % started)
    catalogue = source('http://www.biquge.com.tw/14_14055/')
    for href in respon(catalogue):
        spider(href)
    print('完成耗时%s' % (time.time() - started))


if __name__ == '__main__':  # BUGFIX: do not start crawling on mere import
    main()
特别需要注意的是:把请求头字典(UA/UA1)传给 Request 时容易出错——比如 Host 或 Referer 与目标站点不符时,服务器会直接拒绝请求,这时需要耐心逐项核对请求头,把问题解决
容易出现的错误【
1,协议中,referer错误,host错误
2,网页xpath错误,目测此网站的网页还是比较规则的
】
不是所有的成功都是坐享其成,联系作者v(13147256756)