狂自私

导航

< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

统计

爬虫初体验-爬取小说内容

很简单:依赖第三方模块后,爬虫的编写很简单,虽然我写得乱七八糟的.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#小说类
import requests #第三方模块,需要额外安装
import re
class GrabNovel(object):
    """Scrape a novel from qu.la chapter by chapter and append it to a text file."""

    # NOTE(review): the novel name could also be derived from the given URL.
    def __init__(self, novel_name="小说.txt"):
        # Output file handle; closed in __del__.
        # (Attribute name keeps the original's spelling for compatibility.)
        self.novel_contnet = open(novel_name, mode='w', encoding='UTF-8')
        self.html = ""       # raw page source of the most recently fetched URL
        self.all_url = []    # list of [chapter_number, chapter_href] pairs
        self.url = ""        # URL to fetch next ("" means not initialised yet)
        self.str_len = 0     # length of the last chunk written (for seek-back)
        self.index = 0       # cursor into all_url

    def __del__(self):
        self.novel_contnet.close()

    def t(self, str):
        """Convert a Chinese-numeral string (e.g. '三百二十一') to an int.

        Only the characters 零一二三四五六七八九 and the units 十百千万 are
        handled; any other character raises KeyError. Empty input returns 0.
        (Parameter name shadows the builtin but is kept for interface
        compatibility.)
        """
        zhong = {'零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
                 '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
        danwei = {'十': 10, '百': 100, '千': 1000, '万': 10000}
        if len(str) == 0:
            return 0
        if len(str) == 1:
            # A lone '十' means 10; otherwise it must be a single digit.
            return 10 if str == '十' else zhong[str]
        num = 0
        # A leading '十' means 1x10 (e.g. '十五' == 15).
        if str[0] == '十':
            num = 10
        temp = 0
        for ch in str:
            if ch in zhong:
                temp = zhong[ch]
            elif ch in danwei:
                # Multiply the pending digit by its unit and accumulate.
                temp = temp * danwei[ch]
                num += temp
        # A trailing digit (no unit after it) is added as-is.
        if str[len(str) - 1] not in danwei:
            num += temp
        return num

    # Depends on the requests and re modules.
    # Fetches self.url via HTTP GET and stores the UTF-8 decoded page source
    # in self.html. On bad input an error-message string is stored in
    # self.html instead. Handles exactly one URL per call.
    def get_strHTML(self):
        if type(self.url) != type("http://baidu.com"):
            self.html = "传入的变量类型不是字符串类型"
            return  # early return avoids TypeError from len() on non-strings
        if len(self.url) == 0:
            self.html = "传入值为空"
            return
        # Accepted prefixes: http://  https://  ftp://  file://  www.
        # BUG FIX: the original tested 'gtp://' where 'ftp://' was meant.
        for prefix in ('http://', 'https://', 'ftp://', 'file://', 'www.'):
            if prefix in self.url:
                self.html = requests.get(self.url).content.decode('UTF-8')
                return
        self.html = "传入的值不是常见协议头."

    def sort_t(self, num):
        """Sort key for all_url entries: the chapter number (element 0)."""
        return num[0]

    def get_url(self, url="https://www.qu.la/book/4140/"):
        """First call: fetch the book's index page at *url* and build
        self.all_url as a list of [chapter_number, chapter_href] pairs,
        sorted by chapter number.  Subsequent calls: set self.url to the
        next chapter's full URL and advance the cursor."""
        if self.url == "":
            self.url = url
            self.get_strHTML()
            # NOTE: the hard-coded book id 4140 must be changed when
            # scraping a different book (inspect its index URL).
            # Each match looks roughly like: 5976000.html">第328
            temp = re.findall('(?<=<a style="" href="/book/4140/).*?第.*?(?=章)', self.html)
            for item in temp:
                num = re.findall('(?<=第).*?$', item)[0]
                try:
                    num = int(num)
                except ValueError:
                    # Chapter number written with Chinese numerals.
                    num = int(self.t(num))
                my_url = re.findall('.*?(?=">)', item)[0]
                self.all_url.append([num, my_url])
            # Sort chapters into reading order.
            self.all_url.sort(key=self.sort_t)
            print(len(self.all_url))
        else:
            # Advance to the next chapter URL.
            self.url = url + self.all_url[self.index][1]
            self.index += 1

    # Depends on the re module.
    # Extracts the chapter title (the <h1>…</h1> text) and the chapter body
    # (inside <div id="content">…</div>) from self.html and appends both to
    # the output file.  Known limitation: some pages' content is not matched.
    # (Method name keeps the original's misspelling: callers use it.)
    def wirite_title_content(self):
        # NOTE(review): these three checks only set self.chapter and fall
        # through — execution continues even on bad input, matching the
        # original behaviour.
        if type(self.html) != type("http://baidu.com"):
            self.chapter = []
        if len(self.html) == 0:
            self.chapter = []
        # Match "<!DOCTYPE html" or "<!doctype html".
        if len(re.findall('<![Dd][Oo][Cc][Tt][Yy][Pp][Ee]\s[Hh][Tt][Mm][Ll]\s*?', self.html)) == 0:
            self.chapter = []
        temp = re.findall('(?<=<h1>).*?(?=</h1>)', self.html)
        if len(temp) != 0:
            self.novel_contnet.write(temp[0] + '\n')
            print(temp)
        else:
            self.novel_contnet.write('\n')
            print("")
        # Strip newlines first: '.' does not match '\n', so the content
        # regex below would silently fail on a multi-line page otherwise.
        self.html = re.sub('\n', "", self.html)
        content_div = re.findall('(?<=content">).*?(?=</div>)', self.html)
        if 0 == len(content_div):
            print("\t\t\t\t\t\t\t\t没有获取到内容,正则表达式.")
        for i in range(0, len(content_div)):
            content_div[i] = re.sub('<br\s/>  ', '', content_div[i])
            content_div[i] = re.sub('    ', '\n', content_div[i])
            content_div[i] = re.sub('</br>.*?</script>', '', content_div[i])
            self.novel_contnet.write(content_div[i] + '\n')
            # Remember how much we wrote, so a caller can seek back past it.
            self.str_len = len(content_div[i] + '\n')
    

  

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Main driver: scrape one book with novel.GrabNovel.
import novel

def main():
    """Build the chapter list, then fetch and write every chapter in order."""
    my_novel = novel.GrabNovel(novel_name="太古神王.txt")

    my_novel.get_url()  # first call populates my_novel.all_url
    index = 1
    # BUG FIX: the original condition `len(all_url) != index` with index
    # starting at 1 ran only len-1 iterations and skipped the last chapter;
    # `<=` covers every entry exactly once.
    while index <= len(my_novel.all_url):
        my_novel.get_url()
        print("第" + str(index) + "个:\t" + str(my_novel.url))
        my_novel.get_strHTML()
        my_novel.wirite_title_content()
        index += 1

# Guard so importing this module does not start the scrape.
if __name__ == "__main__":
    main()

  本来想说点啥,但是发现注释写得还好(别骂),就不说了.

posted on   狂自私  阅读(718)  评论(0编辑  收藏  举报

编辑推荐:
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
阅读排行:
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
点击右上角即可分享
微信分享提示