A simple BeautifulSoup example
Beautiful Soup documentation (Chinese): http://www.crummy.com/software/BeautifulSoup/documentation.zh.html
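Before the full script, here is a minimal sketch of the two BeautifulSoup calls it relies on; the HTML string below is invented purely for illustration:

from BeautifulSoup import BeautifulSoup

html = '<ul><li><a href="/a/1">first</a></li><li><a href="/a/2">second</a></li></ul>'
soup = BeautifulSoup(html)
print soup.find('a')     # find returns only the first matching tag (or None)
print soup.findAll('a')  # findAll returns a list of every matching tag
for anchor in soup.findAll('a'):
    print anchor['href'], anchor.string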
#! /usr/bin/env python
#coding=utf-8
import mechanize
from BeautifulSoup import BeautifulSoup
def scrape_links(base_url, data):
    soup = BeautifulSoup(data)
    # Build a mechanize.Link for every anchor on the page, so that
    # mechanize.Browser.follow_link() can navigate to each of them
    links = [mechanize.Link(base_url=base_url,
                            url=str(anchor['href']),
                            text=str(anchor.string),
                            tag=str(anchor.name),
                            attrs=[(str(name), str(value))
                                   for name, value in anchor.attrs])
             for anchor in soup.findAll("a")]  # findAll returns a list of all matches; find returns only the first one
    return links
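# Each mechanize.Link built above can be passed directly to
# mechanize.Browser.follow_link(), which is how main() walks the pages below.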
def scrape_articles(data):
    """
    Scrape the title and URL of all the articles in this page
    """
    # The URL prefix is used to filter out other links, such as the
    # ones pointing to books. Note that it must match the site actually
    # being scraped: here it still points at packtpub.com while main()
    # fetches news.nwsuaf.edu.cn, so no articles will match until it is
    # adjusted for the target site.
    ARTICLE_URL_PREFIX = 'http://www.packtpub.com/article/'
    soup = BeautifulSoup(data)
    articles = [{'title': str(anchor.string),
                 'url': str(anchor['href'])}
                for anchor in [li.a for li in soup.findAll('li')]
                if anchor is not None
                and anchor.get('href', '').startswith(ARTICLE_URL_PREFIX)]
    return articles
def main():
    articles = []
    BASE_URL = "http://news.nwsuaf.edu.cn/index.html"
    br = mechanize.Browser()
    data = br.open(BASE_URL).get_data()
    links = scrape_links(BASE_URL, data)
    # Scrape articles in the main page
    articles.extend(scrape_articles(data))
    # Scrape articles in the linked pages
    for link in links[1:]:
        data = br.follow_link(link).get_data()
        articles.extend(scrape_articles(data))
        br.back()
    # Output the list of titles and URLs for each article found
    print ("Article Network\n"
           "---------------")
    print "\n\n".join(['Title: "%(title)s"\nURL: "%(url)s"' % article
                       for article in articles])

if __name__ == "__main__":
    main()
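The script above targets Python 2 (BeautifulSoup 3 does not support Python 3). Purely as a hedged sketch, using requests and bs4 as substitutions of mine rather than anything from the original script, a rough Python 3 equivalent of scrape_articles could look like this:

# Python 3 sketch; assumes the third-party requests and beautifulsoup4 packages
import requests
from bs4 import BeautifulSoup

ARTICLE_URL_PREFIX = 'http://www.packtpub.com/article/'

def scrape_articles(data):
    soup = BeautifulSoup(data, 'html.parser')
    # Same list-comprehension filter as above, with bs4's find_all spelling
    return [{'title': anchor.get_text(strip=True), 'url': anchor['href']}
            for anchor in (li.a for li in soup.find_all('li'))
            if anchor is not None
            and anchor.get('href', '').startswith(ARTICLE_URL_PREFIX)]

if __name__ == "__main__":
    data = requests.get("http://news.nwsuaf.edu.cn/index.html").text
    print(scrape_articles(data))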