Code for scraping a certain novel site

# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 16:42:02 2019

@author: Administrator
"""


import requests
from bs4 import BeautifulSoup
import re
import time


header ={
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Accept":" text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.8"
    }
    


index_url="http://www.zbjxs.net/4/4589/"
r=requests.get(index_url,headers=header)
r.raise_for_status()
r.encoding=r.apparent_encoding
index_html=r.text

index_soup=BeautifulSoup(index_html,"html.parser")
t7=index_soup.select("li span a")   links
=[] titles=[] for i in range(len(t7)): links.append("http://www.zbjxs.net"+t7[i]["href"]) titles.append(t7[i].get_text()) f=open("d:/小说.txt","a",encoding="utf-8") for i in range(len(links)): st=time.time() r1=requests.get(links[i]) r1.encoding=r1.apparent_encoding html=r1.text soup=BeautifulSoup(html,"html.parser") artitle=titles[i]+"\n"+soup.select(".article-con")[0].get_text() f.write(artitle) en=time.time() runtime=en-st print("已经写入第:",i+1,"章,剩余:",len(links)-i-1,"","用时:",runtime) f.close()

Below, the commented-out exploratory version is attached for later review.

# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 16:42:02 2019

@author: Administrator
"""


import requests
from bs4 import BeautifulSoup
import re
import time


header ={
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Accept":" text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.8"
    }
    


index_url="http://www.zbjxs.net/4/4589/"
r=requests.get(index_url,headers=header)
r.raise_for_status()
r.encoding=r.apparent_encoding
index_html=r.text

index_soup=BeautifulSoup(index_html,"html.parser")



"""

t0=index_soup.find_all("span")         #精确 22个
t1=index_soup.find_all("a")
t2=index_soup.find_all("a",href=re.compile("html$"))        #不精确24个
t3=index_soup.find_all("a",href=re.compile("\d{7}\.html$"))   #精确22个

t4=index_soup.find_all(href=re.compile("\d{7}\.html$"))     #精确22

t5=index_soup.find_all({'href':'re.compile("\d{7}")'})


#中文正则   ([\u4e00-\u9fa5]{2,4})   2-4汉字

#([\u4e00-\u9fa5]{2,4})  
 
t6=index_soup.find_all("a",string=re.compile('[\u4e00-\u9fa5]{2-4}'))
"""
t7=index_soup.select("li span a")      #真机吧好用 精确好用 22



"""
#下面每两个为一个功能注解

#用标签名查找
c0=index_soup.select("a") 
#找到所有a标签

c1=index_soup.select("li") 
#找到所有li标签

#类名查找
d0=index_soup.select(".home") 

d1=index_soup.select(".line") 



d2=index_soup.select("[class~=line]") 
#d2和d0等价,select只能接受class 我测试其他参数不可以

"""
"""
通过id获得标签:

soup.select("#link1") #通过设置参数为id来获取该id对应的tag
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("a#link2")  #这里区别于上一个单纯的使用id,又增添了tag属性,使查找更加具体
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
1
2
3
4
5
通过设置select函数的参数为列表,来获取tags。只要匹配列表中的任意一个则就可以捕获。

soup.select(“#link1,#link2”) #捕获id为link1或link2的标签
# [<a class=”sister” href=”http://example.com/elsie” id=”link1”>Elsie</a>, 
# <a class=”sister” href=”http://example.com/lacie” id=”link2”>Lacie</a>]
--------------------- 
作者:SuPhoebe 
来源:CSDN 
原文:https://blog.csdn.net/u013007900/article/details/54728408 
版权声明:本文为博主原创文章,转载请附上博文链接!

这些有用但是本文没有id这个属性,
可以理解的是bs4为了class和id这两个常用的属性,专门自订的一些功能

"""

# Below: the href-attribute selectors actually used in this script; these are more general-purpose.

"""
按照标签是否存在某个属性来获取:

soup.select('a[href]') #获取a标签中具有href属性的标签
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
1
2
3
4
通过某个标签的具体某个属性值来查找tags:

soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select('a[href^="http://example.com/"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href$="tillie"]')
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
1
2
3
4
5
6
7
8
9
10
11
12
13
这里需要解释一下: 
soup.select(‘a[href^=”http://example.com/”]’)意思是查找href属性值是以”http://example.com/“值为开头的标签,可以查看博客介绍。 
soup.select(‘a[href$=”tillie”]’)意思是查找href属性值是以tillie为结尾的标签。 
soup.select(‘a[href*=”.com/el”]’)意思是查找href属性值中存在字符串”.com/el”的标签,所以只有href=”http://example.com/elsie”一个匹配。

查询符合查询条件的第一个标签:

soup.select_one(".sister") #只查询符合条件的第一个tag
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
--------------------- 
作者:SuPhoebe 
来源:CSDN 
原文:https://blog.csdn.net/u013007900/article/details/54728408 
版权声明:本文为博主原创文章,转载请附上博文链接!


"""
#e=index_soup.select('a[href]') # every <a> tag that has an href attribute


# Below we go with t7

links=[]
titles=[]

    
for i in range(len(t7)):
    links.append("http://www.zbjxs.net"+t7[i]["href"])
    titles.append(t7[i].get_text())

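# One fragile spot above: string concatenation assumes every href is
# site-relative. As a sketch, the standard library's urljoin builds the
# same absolute URLs for this site and also copes with absolute hrefs:
from urllib.parse import urljoin

links=[urljoin(index_url,a["href"]) for a in t7]
titles=[a.get_text() for a in t7]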

f=open("d:/丝袜合集.txt","a",encoding="utf-8")


for i in range(len(links)):
    st=time.time()
    r1=requests.get(links[i],headers=header)   # send the same headers as the index request
    r1.encoding=r1.apparent_encoding
    html=r1.text 
    soup=BeautifulSoup(html,"html.parser") 
    artitle=titles[i]+"\n"+soup.select(".article-con")[0].get_text()
    f.write(artitle) 
    en=time.time()
    runtime=en-st
    print("Wrote chapter",i+1,", remaining:",len(links)-1-i,", took:",runtime)
    

f.close()
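
# For review: a sketch of the same download loop using a with-block (the
# file gets closed even if a request fails) plus a short pause between
# chapters. Kept inside a docstring, like the other alternatives in this
# file, so it does not run a second download; the 0.5 s delay is an
# arbitrary choice, not from the original post.
"""
with open("d:/丝袜合集.txt","a",encoding="utf-8") as out:
    for i in range(len(links)):
        r1=requests.get(links[i],headers=header)
        r1.raise_for_status()                      # fail fast on HTTP errors
        r1.encoding=r1.apparent_encoding
        soup=BeautifulSoup(r1.text,"html.parser")
        out.write(titles[i]+"\n"+soup.select(".article-con")[0].get_text())
        time.sleep(0.5)                            # polite delay between requests
"""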

"""
titles=[]

    titles.append(t7[i].get_text())
"""




"""

 r1=requests.get(links[i])
    r1.encoding=r1.apparent_encoding
    html=r1.text    
    soup=BeautifulSoup(html,"html.parser")
    #txt=soup.find_all(string=)
    #txt=titles[i]+html

"""

 

 

 

 

 
