爬虫(抄)费老劲抄完

 1 #!/usr/bin/python
 2 # -*- coding: UTF-8 -*-
 3 # python 36
 4 __author_ = ''
 5 import requests
 6 import re
 7 import os
 8 
 9 domian = 'http://www.sjtxt.la'
10 
def get_novel_sort_list(timeout=10):
    """Fetch the category index page and list the novels linked from it.

    Args:
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of ``(novel_url, novel_name)`` tuples, one per anchor that
        wraps an ``<img>`` followed by text.  The list is empty when the
        page contains no such anchor.
    """
    response = requests.get(
        '{}/soft/7/Soft_007_1.html'.format(domian), timeout=timeout)
    # Decode the same way as the other fetch helpers so that the scraped
    # names are not garbled when the server omits a charset header.
    response.encoding = 'UTF-8'
    pattern = r'<a href="([^=]*?)"><img src=".*?">(.*?)</a>'
    return re.findall(pattern, response.text)
19 
def get_novel_content(url, timeout=10):
    """Fetch a novel's page and return the link behind its download button.

    Args:
        url: Site-relative path of the novel page; ``domian`` is prepended.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        The ``href`` of the first ``<a class="downButton" ...>`` anchor on
        the page.

    Raises:
        IndexError: If the page contains no download-button anchor.
    """
    page_url = '{}{}'.format(domian, url)
    response = requests.get(page_url, timeout=timeout)
    response.encoding = 'UTF-8'
    pattern = r'''<a class="downButton" href='(.*?)' title'''
    links = re.findall(pattern, response.text)
    if not links:
        # Same exception type as the bare `[0]` would raise, but the message
        # says which page was missing the link.
        raise IndexError('no downButton link found on {}'.format(page_url))
    return links[0]
28 
def get_chapter_list(url, timeout=10):
    """Fetch a chapter index page and list the chapters linked from it.

    Args:
        url: Site-relative path of the index page; ``domian`` is prepended.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        A list of ``(chapter_url, chapter_name)`` tuples, one per
        ``<li><a href="....html">`` entry.  The list is empty when the page
        contains no such entry.
    """
    response = requests.get('{}{}'.format(domian, url), timeout=timeout)
    response.encoding = 'UTF-8'
    pattern = r'<li><a href="(.*?\.html)">(.*?)</a></li>'
    return re.findall(pattern, response.text)
37 
def get_chapter_content(url, timeout=10):
    """Fetch a chapter page and return the markup of its text container.

    Args:
        url: Site-relative path of the chapter page; ``domian`` is prepended.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        Everything between ``id="content1">`` and the ``read_bot`` script
        tag, returned exactly as it appears in the page.

    Raises:
        IndexError: If the page does not contain that container.
    """
    page_url = '{}{}'.format(domian, url)
    response = requests.get(page_url, timeout=timeout)
    response.encoding = 'UTF-8'
    pattern = r'id="content1">(.*?)<script type="text/javascript">read_bot'
    # re.S lets `.` cross newlines, since the chapter body spans many lines.
    bodies = re.findall(pattern, response.text, re.S)
    if not bodies:
        # Same exception type as the bare `[0]` would raise, but the message
        # says which page could not be parsed.
        raise IndexError('no chapter content found on {}'.format(page_url))
    return bodies[0]
46 
def _safe_name(name):
    """Return *name* with characters that are unsafe in file names replaced."""
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()


def main():
    """Save every chapter of every listed novel under ``novel/<novel name>/``.

    Each chapter is written to its own ``.txt`` file.  Chapters whose file
    already exists are skipped without being downloaded again, so an
    interrupted run can simply be restarted.
    """
    for novel_url, novel_name in get_novel_sort_list():
        novel_name = _safe_name(novel_name)
        path = os.path.join('novel', novel_name)
        if not os.path.exists(path):  # check whether the directory exists
            # makedirs also creates the parent 'novel' directory on the
            # first run, where os.mkdir would raise FileNotFoundError.
            os.makedirs(path)
            print('创建目录成功---{}'.format(novel_name))
        else:
            print('{}---当前目录已经存在,跳过'.format(novel_name))

        chapter_url_content = get_novel_content(novel_url)
        for chapter_url, chapter_name in get_chapter_list(chapter_url_content):
            tmp_path = os.path.join(path, _safe_name(chapter_name) + '.txt')
            if os.path.exists(tmp_path):
                print('{}---章节存在,已跳过'.format(tmp_path))
                continue

            # Download only after the existence check so that chapters which
            # are already saved cost no request.
            chapter_content = get_chapter_content(
                chapter_url_content + chapter_url)
            # Explicit encoding: the locale default may be unable to encode
            # the chapter text.
            with open(tmp_path, 'w', encoding='utf-8') as fn:
                fn.write(chapter_content)
            print('{}---保存成功'.format(chapter_name))


if __name__ == '__main__':
    main()

 

posted on 2018-01-28 02:26  新手爱好者  阅读(142)  评论(0)  编辑  收藏  举报

导航