爬虫(抄)费老劲抄完
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# python 3.6
"""Download novels from www.sjtxt.la, one text file per chapter.

The script reads a listing page, follows every novel found there to its
chapter index, and stores each chapter under ``novel/<novel name>/`` as
``<chapter name>.txt``.  Chapters whose file already exists are skipped, so
an interrupted run can simply be started again.
"""
__author_ = ''  # NOTE(review): probably meant to be ``__author__``; name kept as-is.

import os
import re

import requests

# (sic) "domian" is a misspelling of "domain"; the name is kept because it is
# the module's existing public constant.
domian = 'http://www.sjtxt.la'

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Characters that are path separators or are rejected in file names on
# common file systems.  Scraped titles are untrusted, so they are neutralised
# before being used as path components.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _fetch(url, encoding=None):
    """Return the decoded body of ``url``.

    :param url: absolute URL to request.
    :param encoding: when given, overrides the charset requests detected.
    :raises requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if encoding is not None:
        response.encoding = encoding
    return response.text


def _first_group(pattern, text, url, flags=0):
    """Return group 1 of the first match of ``pattern`` in ``text``.

    :raises IndexError: when the pattern does not match.  The exception type
        is the one the previous ``re.findall(...)[0]`` produced; the message
        now names the pattern and page so layout changes are easy to spot.
    """
    match = re.search(pattern, text, flags)
    if match is None:
        raise IndexError('no match for {!r} in page {}'.format(pattern, url))
    return match.group(1)


def _safe_filename(name):
    """Return ``name`` made safe for use as a single path component."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub('_', name).strip(' .')
    # An empty result (or a name made only of dots) would resolve to the
    # parent directory itself, so fall back to a placeholder.
    return cleaned or '_'


def get_novel_sort_list(list_url='http://www.sjtxt.la/soft/7/Soft_007_1.html'):
    """Return ``(novel_url, novel_name)`` tuples found on a listing page.

    :param list_url: absolute URL of the listing page; defaults to the page
        the script has always used.
    :return: list of ``(href, link text)`` tuples, empty when nothing matches.
    """
    # NOTE(review): unlike the other pages, the response encoding is not
    # forced to UTF-8 here -- confirm that this is intentional.
    result = _fetch(list_url)
    reg = r'<a href="([^=]*?)"><img src=".*?">(.*?)</a>'
    return re.findall(reg, result)


def get_novel_content(url):
    """Return the ``href`` of the first ``downButton`` link on a novel page.

    The script uses the returned value both as the chapter index location and
    as the prefix of every chapter URL.

    :param url: site-relative path of the novel page.
    :raises IndexError: when the page contains no such link.
    """
    url = '{}{}'.format(domian, url)
    result = _fetch(url, encoding='UTF-8')
    reg = r'''<a class="downButton" href='(.*?)' title'''
    return _first_group(reg, result, url)


def get_chapter_list(url):
    """Return ``(chapter_url, chapter_name)`` tuples from a chapter index.

    :param url: site-relative path of the chapter index page.
    :return: list of ``(href, link text)`` tuples, empty when nothing matches.
    """
    url = '{}{}'.format(domian, url)
    result = _fetch(url, encoding='UTF-8')
    reg = r'<li><a href="(.*?\.html)">(.*?)</a></li>'
    return re.findall(reg, result)


def get_chapter_content(url):
    """Return the raw markup of a chapter's text container.

    The text between ``id="content1">`` and the ``read_bot`` script tag is
    returned unmodified, i.e. any HTML inside it is preserved.

    :param url: site-relative path of the chapter page.
    :raises IndexError: when the container is not found.
    """
    url = '{}{}'.format(domian, url)
    result = _fetch(url, encoding='UTF-8')
    reg = r'id="content1">(.*?)<script type="text/javascript">read_bot'
    # re.S: the chapter body spans many lines, so "." must match newlines.
    return _first_group(reg, result, url, re.S)


def _save_chapter(directory, chapter_base_url, chapter_url, chapter_name):
    """Download one chapter into ``directory`` unless its file exists."""
    tmp_path = os.path.join(directory, _safe_filename(chapter_name) + '.txt')
    # Check before downloading so a re-run does not fetch chapters again.
    if os.path.exists(tmp_path):
        print('{}---章节存在,已跳过'.format(tmp_path))
        return
    chapter_content = get_chapter_content(chapter_base_url + chapter_url)
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent the chapter text.
    with open(tmp_path, 'w', encoding='utf-8') as fn:
        fn.write(chapter_content)
    print('{}---保存成功'.format(chapter_name))


def _download_novel(novel_url, novel_name):
    """Create the novel's directory and download its missing chapters."""
    path = os.path.join('novel', _safe_filename(novel_name))
    # Check whether the novel's directory already exists.
    if not os.path.exists(path):
        # makedirs also creates the parent "novel" directory on the first run.
        os.makedirs(path)
        print('创建目录成功---{}'.format(novel_name))
    else:
        print('{}---当前目录已经存在,跳过'.format(novel_name))
    # An existing directory is still scanned: chapters are checked one by one.
    chapter_url_content = get_novel_content(novel_url)
    for chapter_url, chapter_name in get_chapter_list(chapter_url_content):
        _save_chapter(path, chapter_url_content, chapter_url, chapter_name)


# Script body: runs as soon as the file is executed.
for novel_url, novel_name in get_novel_sort_list():
    _download_novel(novel_url, novel_name)