# BeautifulSoup 抓取网站 URL (crawl a website's URLs with BeautifulSoup)
# -*- coding:utf-8 -*-
"""Breadth-first crawl of a single site using BeautifulSoup.

Starting from ``url``, every queued page is downloaded, the targets of its
``<a href=...>`` tags are resolved to absolute URLs, and each link that
contains ``url`` and has not been seen before is queued for a later visit.

A page that cannot be downloaded is reported by printing its URL and is then
skipped. The number of URLs still waiting in the queue is printed after each
page has been handled.
"""
from collections import deque
from contextlib import closing

try:  # Python 2 module names, as the script was originally written
    import urllib2
    import urlparse
except ImportError:  # Python 3: same functions, relocated modules
    import urllib.parse as urlparse
    import urllib.request as urllib2

from bs4 import BeautifulSoup

url = "http://www.baidu.com"

urls = deque([url])  # FIFO queue of urls still to scrape
visited = {url}      # every url ever queued; a set keeps the lookup O(1)

while urls:
    page_url = urls.popleft()

    try:
        # closing() releases the connection even if read() raises.
        with closing(urllib2.urlopen(page_url)) as response:
            htmltext = response.read()
    except Exception:
        # Best-effort crawl: report the page that failed and move on rather
        # than parsing stale (or not yet defined) content. Catching Exception
        # instead of using a bare except keeps Ctrl-C working.
        print(page_url)
    else:
        soup = BeautifulSoup(htmltext, "html")

        for tag in soup.findAll("a", href=True):
            # Relative links are relative to the page they appear on, so
            # resolve them against that page, not against the start url.
            link = urlparse.urljoin(page_url, tag["href"])
            # Substring test: only links containing the start url are kept.
            if url in link and link not in visited:
                urls.append(link)
                visited.add(link)

    print(len(urls))