A Simple BeautifulSoup Example

Beautiful Soup Documentation, Chinese translation: http://www.crummy.com/software/BeautifulSoup/documentation.zh.html
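Before the full script, here is a minimal sketch of the two BeautifulSoup 3 calls the code below relies on (the HTML string is made up for illustration):

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, same import as the script below

html = '<ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>'
soup = BeautifulSoup(html)

# findAll returns a list of all matching tags...
print [str(a['href']) for a in soup.findAll('a')]  # ['/a', '/b']

# ...while find returns only the first match (or None if nothing matches).
print soup.find('a').string  # First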

#! /usr/bin/env python

#coding=utf-8
import mechanize
from BeautifulSoup import BeautifulSoup

def scrape_links(base_url, data):
    soup = BeautifulSoup(data)
    # Build a mechanize.Link for every anchor on the page.
    # findAll returns a list of every matching tag; find returns only the first match.
    links = [mechanize.Link(base_url=base_url,
                            url=str(anchor['href']),
                            text=str(anchor.string),
                            tag=str(anchor.name),
                            attrs=[(str(name), str(value))
                                   for name, value in anchor.attrs])
             for anchor in soup.findAll("a", href=True)]  # href=True skips anchors without an href

    return links

def scrape_articles(data):
    """
    Scrape the title and URL of every article on this page.
    """
    # The URL prefix is used to filter out other links,
    # such as the ones pointing to books
    ARTICLE_URL_PREFIX = 'http://www.packtpub.com/article/'

    soup = BeautifulSoup(data)
    articles = [{'title': str(anchor.string),
                 'url': str(anchor['href'])}
                for anchor in [li.a for li in soup.findAll('li')]
                if anchor is not None  # some <li> items contain no link at all
                and anchor.get('href', '').startswith(ARTICLE_URL_PREFIX)]

    return articles

def main():
    articles = []
    BASE_URL = "http://news.nwsuaf.edu.cn/index.html"  # ARTICLE_URL_PREFIX in scrape_articles must match this site, or no articles are found
    br = mechanize.Browser()
    data = br.open(BASE_URL).get_data()
    links = scrape_links(BASE_URL, data)
    
    # Scrape articles in main page
    articles.extend(scrape_articles(data))
    
    # Scrape articles in linked pages
    for link in links[1:]:
        data = br.follow_link(link).get_data()
        articles.extend(scrape_articles(data))
        br.back()
    
    # Output the list of titles and URLs for each article found
    print ("Article Network\n"
           "---------------")

    print "\n\n".join(['Title: "%(title)s"\nURL: "%(url)s"' % article
                       for article in articles])
    
if __name__ == "__main__":
    main()
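
Running the script under Python 2 (mechanize and BeautifulSoup 3 are Python 2 libraries) prints one title/URL pair per article, separated by blank lines, in the format defined above. Assuming the file is saved as scrape_articles.py, the output has this shape (the titles and URLs here are placeholders, not real results):

$ python scrape_articles.py
Article Network
---------------
Title: "<some article title>"
URL: "http://www.packtpub.com/article/<some-article>"

Title: "<another article title>"
URL: "http://www.packtpub.com/article/<another-article>"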

