Python爬取17吉他网吉他谱

最近学习吉他,一张一张保存吉他谱太麻烦,写个小程序下载吉他谱。

安装 BeautifulSoup,BeautifulSoup是一个解析HTML的库。
pip install BeautifulSoup4

在这个程序中,BeautifulSoup 使用 html5lib 作为解析器,所以还要安装 html5lib:
pip install html5lib

代码如下:

# -*- coding: utf-8 -*-
#coding=UTF8

import os
import sys
import logging
import urllib
import urllib2
import chardet
import re
import cookielib
import urlparse

from bs4 import BeautifulSoup

# Encoding name reported by the interpreter for the local filesystem.
sysEncoding = sys.getfilesystemencoding()
# Module-level cookie jar; the request helpers below build their openers
# around it, so cookies are shared across requests.
cookieJar = cookielib.CookieJar()

def get(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent through an opener built around the module-level
    ``cookieJar``, so cookies received on earlier requests are reused.

    :param url: address to request.
    :return: whatever ``response.read()`` yields for the full body.
    :raises urllib2.URLError: not caught here; errors raised while opening
        the URL propagate to the caller.
    """
    req = urllib2.Request(url)

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    response = opener.open(req)
    try:
        return response.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        response.close()

def download_guitar_image(url, target):
    """Download the image at ``url`` and save it to the file ``target``.

    The whole body is read before ``target`` is opened, so a failed request
    does not leave an empty or truncated file behind.

    :param url: address of the image.
    :param target: path of the file to write (opened in binary mode).
    :raises urllib2.URLError: not caught here; errors raised while opening
        the URL propagate to the caller.
    """
    print('start download guitar image ...')

    req = urllib2.Request(url)
    req.add_header('Accept', 'image/webp,image/*,*/*;q=0.8')

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    response = opener.open(req)
    try:
        content = response.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        response.close()

    with open(target, 'wb') as image_file:
        image_file.write(content)

# Parse the link addresses of the guitar tab image pages.
def parse_guitar_img_link():
    """Walk the paginated tab index and collect every tab page it lists.

    Index pages ``tab/img/index.php?page=N`` are requested for N = 1, 2, ...
    and the walk stops at the first page on which the selector
    ``#ct dl > dt > a`` matches nothing.

    :return: list of dicts, each with ``'title'`` (the anchor text) and
        ``'link'`` (the anchor's href resolved against the site root).
    """
    page_list = []

    url_base = 'http://www.17jita.com/'

    page = 1
    while True:

        url = url_base + 'tab/img/index.php?page=' + str(page)

        print(url)

        html = get(url)

        soup = BeautifulSoup(html, "html5lib")

        anchors = soup.select('#ct dl > dt > a')

        if not anchors:
            break

        for anchor in anchors:
            href = anchor.get('href')
            if not href:
                # An anchor without a target cannot be followed.
                continue
            # urljoin gives the same result as plain concatenation for
            # site-relative hrefs, and also copes with absolute or
            # root-relative ones.
            page_list.append({
                'title': anchor.text,
                'link': urlparse.urljoin(url_base, href),
            })

        page += 1

    return page_list
    
    
def download_guitar_image_link_list(url):
    """Collect the image addresses found on each page of one tab article.

    The first page is ``url`` itself; further pages are derived from it by
    rewriting the ``.html`` suffix. Collection stops when a page request
    raises ``urllib2.URLError`` (which is printed and logged) or when a page
    contains no image matching ``#article_contents a > img``.

    :param url: address of the first page of the article.
    :return: list of the ``src`` values of all matched images, in page order.
    """
    image_link_list = []

    page = 1

    while True:

        page_url = url

        if page > 1:
            # NOTE(review): the empty '' prefix looks like a separator that
            # was lost between the article id and the page number - confirm
            # against the site's real pagination URLs.
            page_url = url.replace('.html', '' + str(page) + '.html')

        try:
            html = get(page_url)
        except urllib2.URLError as e:
            # ``reason`` may be a message string or another exception
            # instance; only strings can be concatenated into the message.
            reason = getattr(e, 'reason', e)
            if not isinstance(reason, basestring):
                reason = repr(reason)
            msg = u'下载 ' + page_url + u' 出错, 原因: ' + reason
            print(msg)
            logging.error(msg)
            break

        soup = BeautifulSoup(html, 'html5lib')

        img_list = soup.select('#article_contents a > img')

        if not img_list:
            # A page without images marks the end of the article; without
            # this guard a server answering 200 for missing pages would keep
            # the loop running forever.
            break

        for img in img_list:
            src = img.get('src')
            if src:
                image_link_list.append(src)

        page += 1

    return image_link_list

def _safe_filename(name):
    """Turn ``name`` into a byte string usable as one path component.

    Path separators, characters that Windows rejects in file names and
    line/tab whitespace are replaced with ``_``. The result is encoded with
    the filesystem encoding (UTF-8 when none is reported); characters that
    encoding cannot represent also become ``_``.
    """
    cleaned = re.sub(u'[\\\\/:*?"<>|\r\n\t]', u'_', name).strip()
    if not cleaned:
        cleaned = u'_'
    encoded = cleaned.encode(sysEncoding or 'utf-8', 'replace')
    # 'replace' substitutes '?', which is itself invalid in Windows names.
    return encoded.replace(b'?', b'_')


if __name__ == '__main__':

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        filename='guitar.log',
        filemode='a')

    # Root directory that receives one sub-directory per tab.
    path = 'guitar'
    if not os.path.exists(path):
        os.mkdir(path)

    page_list = parse_guitar_img_link()
    for page in page_list:

        print(page['link'] + '(' + page['title'] + ')')

        guitar_path = os.path.join(path, _safe_filename(page['title']))
        if not os.path.exists(guitar_path):
            os.mkdir(guitar_path)

        image_link_list = download_guitar_image_link_list(page['link'])
        for image_link in image_link_list:

            print('\t' + image_link)

            # Name the file after the last segment of the URL path so that a
            # query string never ends up in the file name.
            filename = urlparse.urlsplit(image_link).path.rsplit('/', 1)[-1]

            filepath = os.path.join(guitar_path, _safe_filename(filename))

            try:
                download_guitar_image(image_link, filepath)
            except (IOError, ValueError) as e:
                # IOError covers urllib2.URLError, socket errors and file
                # write failures; ValueError is raised for links without a
                # usable scheme. One bad image must not abort the whole run.
                logging.error(u'下载 %s 出错, 原因: %r', image_link, e)
                print('\tdownload failed, see guitar.log')
    
    
    
    
    
    
    

程序中还存在一些问题尚待优化,比如下载中断后,不能继续下载剩下的吉他谱。

posted on 2016-07-01 21:09  ^上善若水^  阅读(1119)  评论(0)  编辑  收藏  举报

导航