Python爬取17吉他网吉他谱
最近学习吉他,一张一张保存吉他谱太麻烦,写个小程序下载吉他谱。
安装 BeautifulSoup,BeautifulSoup是一个解析HTML的库。
pip install BeautifulSoup4
在这个程序中,BeautifulSoup 使用 html5lib 作为解析器,所以还需要安装 html5lib:
pip install html5lib
代码如下:
# -*- coding: utf-8 -*- #coding=UTF8 import os import sys import logging import urllib import urllib2 import chardet import re import cookielib import urlparse from bs4 import BeautifulSoup sysEncoding = sys.getfilesystemencoding() cookieJar = cookielib.CookieJar() def get(url): req = urllib2.Request(url) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) response = opener.open(req) return response.read() def download_guitar_image(url, target): print 'start download guitar image ...' req = urllib2.Request(url) req.add_header('Accept','image/webp,image/*,*/*;q=0.8') opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) response = opener.open(req) content = response.read() with open(target, 'wb') as code: code.write(content) #解析吉他谱图片页面链接地址 def parse_guitar_img_link(): page_list = [] url_base = 'http://www.17jita.com/' page = 1 while True: url = url_base + 'tab/img/index.php?page=' + str(page) print url html = get(url) soup = BeautifulSoup(html, "html5lib") list = soup.select('#ct dl > dt > a') if not list: break for item in list: page_list.append({ 'title' : item.text, 'link' : url_base + item['href'] }) page += 1 return page_list def download_guitar_image_link_list(url): image_link_list = [] page = 1 while True: page_url = url if page > 1: page_url = url.replace('.html', '' + str(page) + '.html') try: html = get(page_url) soup = BeautifulSoup(html, 'html5lib') img_list = soup.select('#article_contents a > img') for img in img_list: image_link_list.append(img['src']) except urllib2.URLError, e: msg = u'下载 ' + page_url + u' 出错, 原因: ' + e.reason print msg logging.error(msg) break page += 1 return image_link_list if __name__ == '__main__': logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', filename='guitar.log', filemode='a') path = 'guitar' if not os.path.exists(path): os.mkdir(path) page_list = parse_guitar_img_link() for page in 
page_list: print page['link'] + '(' + page['title'] + ')' guitar_path = path + '/' + (page['title']).encode('GBK') if not os.path.exists(guitar_path): os.mkdir(guitar_path) image_link_list = download_guitar_image_link_list(page['link']) for image_link in image_link_list: print '\t' + image_link filename = image_link[image_link.rindex('/'):] filepath = guitar_path + filename.encode('GBK') download_guitar_image(image_link, filepath)
程序中还存在一些问题尚待优化,比如下载中断后,无法继续下载剩下的吉他谱。