Python Crawler Basics: Depth-First and Breadth-First Traversal Strategies
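Depth-first traversal follows one chain of links as deep as it will go before backtracking; breadth-first traversal finishes every link at the current level before moving one level down. Before the crawler code, here is a minimal offline sketch of the two visit orders on a hypothetical in-memory link graph (the graph and page names are made up for illustration):

# A tiny hypothetical link graph: each page maps to the pages it links to
graph = {
    'A': ['B', 'C'],
    'B': ['D', 'E'],
    'C': ['F'],
    'D': [], 'E': [], 'F': [],
}

def dfs(page, visited=None):
    if visited is None:
        visited = []
    visited.append(page)        # visit the page, then dive into each child
    for child in graph[page]:
        if child not in visited:
            dfs(child, visited)
    return visited

def bfs(start):
    visited, queue = [], [start]
    while queue:
        page = queue.pop(0)     # take from the front: level by level
        visited.append(page)
        queue += [c for c in graph[page] if c not in visited and c not in queue]
    return visited

print(dfs('A'))  # ['A', 'B', 'D', 'E', 'C', 'F'] - dives down one branch first
print(bfs('A'))  # ['A', 'B', 'C', 'D', 'E', 'F'] - sweeps level by level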
1. Depth-first: the recursive approach
import re
import requests

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
}

# Fetch the page source; return an empty string if the request fails
def get_html(url):
    try:
        res = requests.get(url, headers=headers)
        return res.text
    except requests.RequestException:
        return ""

# Extract the list of child URLs from a page
def get_son_url(url):
    html = get_html(url)
    # NOTE: assumed regex for extracting href values from <a> tags
    html_re = r'<a.*?href="(.*?)"'
    href_list = re.findall(html_re, html, re.S)
    return href_list

# Depth-first crawl: recurse into each child URL before moving on to its siblings
def deep_path(url):
    if deepdict[url] > 3:  # stop once we are past level 3
        return
    print("\t" * deepdict[url], "Current level: %d" % deepdict[url])
    # Get the list of child URLs (returned as a list)
    sonurl_list = get_son_url(url)
    # Walk every child URL
    for sonurl in sonurl_list:
        # Keep only absolute http/https links
        if sonurl.startswith(('http://', 'https://')):
            if sonurl not in deepdict:  # skip URLs we have already seen
                deepdict[sonurl] = deepdict[url] + 1
                deep_path(sonurl)

if __name__ == '__main__':
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"
    # deepdict records the level of every URL seen, which also caps the depth
    deepdict = {}
    deepdict[url] = 1  # the start URL is level 1
    deep_path(url)
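The regex in get_son_url is the quickest way to pull links out of a page, but regular expressions are fragile against real-world HTML. As a more robust alternative, here is a minimal sketch of a drop-in get_son_url built on the standard library's html.parser (the LinkCollector class name is my own; it reuses get_html from above):

from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collects the href attribute of every <a> tag it sees."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def get_son_url(url):
    parser = LinkCollector()
    parser.feed(get_html(url))  # feed the page source into the parser
    return parser.links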
2. Breadth-first strategy: the queue approach
import re
import requests

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
}

# Fetch the page source; return an empty string if the request fails
def get_html(url):
    try:
        res = requests.get(url, headers=headers)
        return res.text
    except requests.RequestException:
        return ""

# Extract the list of child URLs from a page
def get_son_url(url):
    html = get_html(url)
    # NOTE: assumed regex for extracting href values from <a> tags
    html_re = r'<a.*?href="(.*?)"'
    href_list = re.findall(html_re, html, re.S)
    return href_list

# Breadth-first crawl
def vast_path(url):
    # A queue is first-in, first-out: a plain list can simulate one,
    # with append() as enqueue and pop(0) as dequeue
    url_queue = []
    url_queue.append(url)  # seed the queue with the start URL
    while len(url_queue) > 0:
        # Dequeue one URL at a time
        url = url_queue.pop(0)
        print("\t" * deepdict[url], "Current level: %d" % deepdict[url])
        if deepdict[url] < 3:
            # Get the list of child URLs
            sonurl_list = get_son_url(url)
            for sonurl in sonurl_list:
                # Keep only absolute http/https links
                if sonurl.startswith(('http://', 'https://')):
                    if sonurl not in deepdict:  # skip duplicate URLs
                        deepdict[sonurl] = deepdict[url] + 1
                        # Enqueue the child URL
                        url_queue.append(sonurl)

if __name__ == '__main__':
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"
    deepdict = {}  # records the level of every URL seen, which caps the depth
    deepdict[url] = 1  # the start URL is level 1
    vast_path(url)
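One caveat about simulating the queue with a plain list: pop(0) has to shift every remaining element, so each dequeue costs O(n). For anything bigger than a toy crawl, collections.deque from the standard library does the same job with O(1) operations at both ends. A sketch of vast_path with only the queue lines changed, assuming the helpers above:

from collections import deque

def vast_path(url):
    url_queue = deque()            # deque instead of a plain list
    url_queue.append(url)
    while len(url_queue) > 0:
        url = url_queue.popleft()  # O(1) dequeue instead of pop(0)
        print("\t" * deepdict[url], "Current level: %d" % deepdict[url])
        if deepdict[url] < 3:
            for sonurl in get_son_url(url):
                if sonurl.startswith(('http://', 'https://')):
                    if sonurl not in deepdict:
                        deepdict[sonurl] = deepdict[url] + 1
                        url_queue.append(sonurl)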
3. Depth-first strategy: the stack approach
import re
import requests

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
}

# Fetch the page source; return an empty string if the request fails
def get_html(url):
    try:
        res = requests.get(url, headers=headers)
        return res.text
    except requests.RequestException:
        return ""

# Extract the list of child URLs from a page
def get_son_url(url):
    html = get_html(url)
    # NOTE: assumed regex for extracting href values from <a> tags
    html_re = r'<a.*?href="(.*?)"'
    href_list = re.findall(html_re, html, re.S)
    return href_list

# Depth-first crawl (iterative)
def deep_path(url):
    # A stack is last-in, first-out: a plain list can simulate one,
    # with append() as push and pop() as pop
    url_stack = []
    url_stack.append(url)  # seed the stack with the start URL
    while len(url_stack) > 0:
        # Pop the most recently pushed URL
        url = url_stack.pop()
        print("\t" * deepdict[url], "Current level: %d" % deepdict[url])
        if deepdict[url] < 3:
            # Get the list of child URLs
            sonurl_list = get_son_url(url)
            for sonurl in sonurl_list:
                # Keep only absolute http/https links
                if sonurl.startswith(('http://', 'https://')):
                    if sonurl not in deepdict:  # skip duplicate URLs
                        deepdict[sonurl] = deepdict[url] + 1  # a child is one level deeper than its parent
                        # Push the child URL onto the stack
                        url_stack.append(sonurl)

if __name__ == '__main__':
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"
    deepdict = {}  # records the level of every URL seen, which caps the depth
    deepdict[url] = 1  # the start URL is level 1
    deep_path(url)
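The stack version visits the same pages as the recursive version in section 1, but in mirror order within each level: a stack is last-in, first-out, so the last link found on a page is explored first. If the recursive left-to-right order matters, push the children in reverse. A sketch of just the inner loop, assuming the variables from deep_path above:

# Push children in reverse so the first link on the page ends up
# on top of the stack and is therefore explored first
for sonurl in reversed(sonurl_list):
    if sonurl.startswith(('http://', 'https://')):
        if sonurl not in deepdict:
            deepdict[sonurl] = deepdict[url] + 1
            url_stack.append(sonurl)

As a bonus, the explicit stack never touches Python's recursion limit, so it keeps working no matter how far the level cap is raised.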