第一课 爬虫基础
一、小说下载
小说网址是:http://www.biqukan.com
# Lesson 1: scraping basics — download a novel from www.biqukan.com
import requests
from bs4 import BeautifulSoup


class downloader(object):
    """Scrape a novel from biqukan.com chapter by chapter into a txt file."""

    def __init__(self):
        self.url = 'http://www.biqukan.com/1_1408/'   # table-of-contents page of the novel
        self.serve = 'http://www.biqukan.com'         # site root, prepended to relative hrefs
        self.page_url = []                            # chapter URLs, filled by get_page_url()
        self.page_name = []                           # chapter titles, parallel to page_url

    def get_page_url(self):
        """Collect every chapter's URL and title from the table-of-contents page."""
        html = requests.get(self.url, timeout=10)
        # fix: let requests sniff the real page encoding (site serves GBK-family pages);
        # without this, html.text can be mojibake
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, 'lxml')
        listmain = soup.find_all('div', class_="listmain")
        # fix: search the found tag directly instead of re-parsing str(tag)
        # through a second BeautifulSoup with no parser argument
        anchors = listmain[0].find_all('a')
        # the first 12 links are the "latest chapters" shortcut list — skip them
        # (assumes the site layout keeps 12 entries there — TODO confirm)
        for each in anchors[12:]:
            self.page_url.append(self.serve + each.get('href'))
            self.page_name.append(each.string)

    def get_html(self, url):
        """Download one chapter page and return its plain-text content."""
        html = requests.get(url, timeout=10)
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, 'lxml')
        content = soup.find_all('div', class_="showtxt")
        text = content[0].text
        # fix: .text has already stripped every tag, so replacing the literal
        # '<br/><br/>' could never match; the site renders paragraph breaks as
        # runs of 8 non-breaking spaces, so translate those into blank lines
        text = text.replace('\xa0' * 8, '\n\n')
        return text

    def writer(self, path, name, text):
        """Append one chapter (title line + body + blank separator) to *path*."""
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(text)
            f.write('\n\n')


if __name__ == '__main__':
    dl = downloader()
    dl.get_page_url()
    # walk titles and URLs in lockstep instead of indexing via range(len(...))
    for name, url in zip(dl.page_name, dl.page_url):
        dl.writer('小说.txt', name, dl.get_html(url))