A Simple BeautifulSoup Example

Beautiful Soup Documentation, Chinese translation: http://www.crummy.com/software/BeautifulSoup/documentation.zh.html
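Before the full script, here is a minimal sketch of the two BeautifulSoup 3 calls the code below relies on (the HTML string is made up for illustration):

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, same import as the script below

html = '<ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>'
soup = BeautifulSoup(html)

# findAll returns a list of all matching tags...
print [str(a['href']) for a in soup.findAll('a')]  # ['/a', '/b']

# ...while find returns only the first match (or None if nothing matches).
print soup.find('a').string  # First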

#! /usr/bin/env python

#coding=utf-8
import mechanize
from BeautifulSoup import BeautifulSoup

def scrape_links(base_url, data):
    soup = BeautifulSoup(data)
    # Build a mechanize.Link for every anchor on the page.
    # findAll returns a list of every matching tag; find returns only the first match.
    links = [mechanize.Link(base_url=base_url,
                            url=str(anchor['href']),
                            text=str(anchor.string),
                            tag=str(anchor.name),
                            attrs=[(str(name), str(value))
                                   for name, value in anchor.attrs])
             for anchor in soup.findAll("a", href=True)]  # href=True skips anchors without an href

    return links

def scrape_articles(data):
    """
    Scrape the title and URL of every article on this page.
    """
    # The URL prefix is used to filter out other links,
    # such as the ones pointing to books
    ARTICLE_URL_PREFIX = 'http://www.packtpub.com/article/'

    soup = BeautifulSoup(data)
    articles = [{'title': str(anchor.string),
                 'url': str(anchor['href'])}
                for anchor in [li.a for li in soup.findAll('li')]
                if anchor is not None  # some <li> items contain no link at all
                and anchor.get('href', '').startswith(ARTICLE_URL_PREFIX)]

    return articles

def main():
    articles = []
    BASE_URL = "http://news.nwsuaf.edu.cn/index.html"  # ARTICLE_URL_PREFIX in scrape_articles must match this site, or no articles are found
    br = mechanize.Browser()
    data = br.open(BASE_URL).get_data()
    links = scrape_links(BASE_URL, data)
    
    # Scrape articles in main page
    articles.extend(scrape_articles(data))
    
    # Scrape articles in linked pages
    for link in links[1:]:
        data = br.follow_link(link).get_data()
        articles.extend(scrape_articles(data))
        br.back()
    
    # Output the list of titles and URLs for each article found
    print ("Article Network\n"
           "---------------")

    print "\n\n".join(['Title: "%(title)s"\nURL: "%(url)s"' % article
                       for article in articles])
    
if __name__ == "__main__":
    main()
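
Running the script under Python 2 (mechanize and BeautifulSoup 3 are Python 2 libraries) prints one title/URL pair per article, separated by blank lines, in the format defined above. Assuming the file is saved as scrape_articles.py, the output has this shape (the titles and URLs here are placeholders, not real results):

$ python scrape_articles.py
Article Network
---------------
Title: "<some article title>"
URL: "http://www.packtpub.com/article/<some-article>"

Title: "<another article title>"
URL: "http://www.packtpub.com/article/<another-article>"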

