python 爬虫(四)
爬遍整个网络
1 当我们访问整个网络的时候,不可避免地会访问不同的网站,而不同的网站会有完全不同的结构和内容...
现在一步一步地构建访问整个网络的脚本。
I 从一个网站开始,每一次都爬向不同的网站。如果在一个页面找不到指向其他网站的链接,就转到本网站的其他页面继续查找,直到找到指向其他网站的链接。
# -*- coding:utf-8 -*-
"""Random walk across the web.

Start from one page and repeatedly hop to a randomly chosen link that points
to a different site. When a page has no such link, hop to another page of the
same site and keep looking from there.
"""
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from urllib.parse import urldefrag, urljoin
from random import choice
import re

basename = "http://en.wikipedia.org"
visitedpages = set()  # every URL the walk has already hopped to


def getInternalLinks(bsObj, includeUrl):
    """Return the hrefs of all <a> tags that point inside the current site.

    A link counts as internal when its href starts with "/" or contains
    includeUrl. The hrefs are returned exactly as written in the page, so
    they may be relative.
    """
    # re.escape: the dots of a host name must match literally, not "any char".
    pattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")
    return [tag.attrs["href"] for tag in bsObj.find_all("a", href=pattern)]


def getExternalLinks(bsObj, excludeUrl):
    """Return the hrefs of all <a> tags that point to another site.

    A link counts as external when its href starts with "http" or "www" and
    does not contain excludeUrl after that prefix.
    """
    pattern = re.compile("^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")
    return [tag.attrs["href"] for tag in bsObj.find_all("a", href=pattern)]


def splitAddress(address):
    """Split a URL into its "/"-separated parts after dropping the scheme.

    Element 0 is the host name; both "http://" and "https://" are stripped.
    """
    return re.sub(r"^https?://", "", address).split("/")


def resolveLink(pageUrl, href):
    """Turn an href found on pageUrl into an absolute URL without fragment."""
    if href.startswith("www."):
        # Scheme-less absolute link; urljoin would treat it as a relative path.
        href = "http://" + href
    return urldefrag(urljoin(pageUrl, href)).url


def chooseNextLink(bsObj, pageUrl):
    """Pick the next URL to visit from an already parsed page.

    Returns a random external link if the page has any, otherwise a random
    internal link, otherwise None. The result is always an absolute URL.
    """
    domain = splitAddress(pageUrl)[0]
    candidates = getExternalLinks(bsObj, domain)
    if not candidates:
        candidates = getInternalLinks(bsObj, domain)
    if not candidates:
        return None
    return resolveLink(pageUrl, choice(candidates))


def getRandomExternalLink(startingPage):
    """Download startingPage and return the next URL to hop to.

    Returns None when the page contains no usable link. Errors raised by
    urlopen are propagated to the caller.
    """
    # Imported at the point of use so the pure link helpers above can be
    # imported (and unit-tested) without BeautifulSoup being installed.
    from bs4 import BeautifulSoup

    with urlopen(startingPage) as html:
        bsObj = BeautifulSoup(html, "html.parser")
    return chooseNextLink(bsObj, startingPage)


def followExternalLink(startingPage):
    """Walk from startingPage, printing every hop, until a dead end.

    The walk stops when a page cannot be fetched, when a page has no links,
    or when the chosen link has been visited before. A loop is used instead
    of recursion so a long walk cannot exhaust the call stack.
    """
    currentPage = startingPage
    while True:
        try:
            nextLink = getRandomExternalLink(currentPage)
        except (HTTPError, URLError, ValueError) as e:
            # ValueError: urlopen rejects hrefs that are not valid URLs.
            print(e)
            return
        if nextLink is None:
            print("no links found on " + currentPage)
            return
        if nextLink in visitedpages:
            print("visited")
            return
        print("the random external link is " + nextLink)
        visitedpages.add(nextLink)
        currentPage = nextLink


if __name__ == "__main__":
    followExternalLink("http://www.oreilly.com/")
II 从一个网站开始,遍历这个网站的所有页面,收集整个网站中指向其他网站的链接。
# -*- coding:utf-8 -*-
"""Crawl every page of one site and collect the links that leave it."""
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from urllib.parse import urldefrag, urljoin, urlsplit
from random import choice
import re

allINlinks = set()  # absolute URLs of the site's own pages found so far
allEXlinks = set()  # hrefs pointing to other sites found so far


def getInternalLinks(bsObj, includeUrl):
    """Return the hrefs of all <a> tags that point inside the current site.

    A link counts as internal when its href starts with "/" or contains
    includeUrl. The hrefs are returned exactly as written in the page, so
    they may be relative.
    """
    # re.escape: the dots of a host name must match literally, not "any char".
    pattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")
    return [tag.attrs["href"] for tag in bsObj.find_all("a", href=pattern)]


def getExternalLinks(bsObj, excludeUrl):
    """Return the hrefs of all <a> tags that point to another site.

    A link counts as external when its href starts with "http" or "www" and
    does not contain excludeUrl after that prefix.
    """
    pattern = re.compile("^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")
    return [tag.attrs["href"] for tag in bsObj.find_all("a", href=pattern)]


def splitAddress(address):
    """Split a URL into its "/"-separated parts after dropping the scheme.

    Element 0 is the host name; both "http://" and "https://" are stripped.
    """
    return re.sub(r"^https?://", "", address).split("/")


def resolveLink(pageUrl, href):
    """Turn an href found on pageUrl into an absolute URL without fragment."""
    if href.startswith("www."):
        # Scheme-less absolute link; urljoin would treat it as a relative path.
        href = "http://" + href
    return urldefrag(urljoin(pageUrl, href)).url


def collectPageLinks(bsObj, pageUrl):
    """Record the links of one parsed page and return the pages still to crawl.

    New external hrefs are added to allEXlinks and printed. Internal hrefs
    are resolved against pageUrl; those on the same host that are not yet in
    allINlinks are added, printed and returned in page order.
    """
    domain = splitAddress(pageUrl)[0]

    print("************external*******************************")
    for link in getExternalLinks(bsObj, domain):
        if link not in allEXlinks:
            allEXlinks.add(link)
            print(link)

    print("************internal*******************************")
    newPages = []
    for href in getInternalLinks(bsObj, domain):
        link = resolveLink(pageUrl, href)
        # The internal regex also matches off-site URLs that merely mention
        # the domain (share links, "//cdn..." hrefs); keep the crawl on-site.
        if urlsplit(link).netloc.lower() != domain.lower():
            continue
        if link in allINlinks:
            continue
        allINlinks.add(link)
        print(link)
        newPages.append(link)
    return newPages


def fetchPage(pageUrl):
    """Download pageUrl and return it parsed with html.parser."""
    # Imported at the point of use so the pure link helpers above can be
    # imported (and unit-tested) without BeautifulSoup being installed.
    from bs4 import BeautifulSoup

    with urlopen(pageUrl) as html:
        return BeautifulSoup(html, "html.parser")


def getAllexternalLinks(startPage):
    """Crawl the site of startPage, filling allINlinks and allEXlinks.

    Pages are visited depth-first from an explicit stack, so a large site
    cannot exhaust the call stack. A page that cannot be fetched is reported
    with print and skipped.
    """
    # Mark the start page as seen so a "home" link does not crawl it again.
    allINlinks.add(urldefrag(startPage).url)
    pending = [startPage]
    while pending:
        pageUrl = pending.pop()
        try:
            bsObj = fetchPage(pageUrl)
        except (HTTPError, URLError, ValueError) as e:
            # ValueError: urlopen rejects hrefs that are not valid URLs.
            print(e)
            continue
        # Reversed so the first link found on the page is crawled first.
        pending.extend(reversed(collectPageLinks(bsObj, pageUrl)))


if __name__ == "__main__":
    getAllexternalLinks("http://www.oreilly.com/")
***************还存在问题的代码***************************