抓取网页内容，生成 Kindle 电子书
参考:
- http://calibre-ebook.com/download_linux
- http://blog.codinglabs.org/articles/convert-html-to-kindle-book.html
The Linux Command Line
# TLCL.recipe
try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

from calibre.web.feeds.recipes import BasicNewsRecipe


class The_Linux_Command_Line(BasicNewsRecipe):
    """calibre recipe that turns the online 'The Linux Command Line' book
    into an e-book.

    The table-of-contents page at ``url_pre`` is read, and every link found
    inside its ``<div class="contents">`` element becomes one article of a
    single feed.
    """

    title = 'The Linux Command Line'
    description = 'The Linux Command Line'
    cover_url = 'http://img5.douban.com/lpic/s7056078.jpg'
    # Base URL of the book; also the address of the table-of-contents page.
    url_pre = 'http://billie66.github.io/TLCL/book/'
    no_stylesheets = True
    # Scope in which the article content is looked for.
    keep_only_tags = [{'class': 'typo'}]

    def parse_index(self):
        """Build the article list from the table-of-contents page.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where each
            article is a dict with the keys ``'title'`` and ``'url'``.

        Raises:
            ValueError: if the page has no ``<div class="contents">``
                element, i.e. the site layout is not the expected one.
        """
        soup = self.index_to_soup(self.url_pre)  # table-of-contents page
        # Only links inside this element are treated as chapters.
        toc = soup.find('div', {'class': 'contents'})
        if toc is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'No <div class="contents"> found at %s' % self.url_pre
            )

        articles = []
        for link in toc.findAll('a'):
            href = link.get('href')
            if not href:
                # Anchors without a target cannot be downloaded.
                continue
            # tag_to_string also copes with links whose first child is a
            # nested tag rather than plain text; fall back to the href when
            # the link carries no text at all.
            label = self.tag_to_string(link).strip() or href
            articles.append({
                'title': label,
                # urljoin resolves relative hrefs against the base exactly
                # like plain concatenation did, and additionally handles
                # absolute and root-relative hrefs correctly.
                'url': urljoin(self.url_pre, href),
            })

        return [('The Linux Command Line', articles)]