Get all links from a specified web page
A small project on GitHub that fetches all links on a specified web page using requests and bs4:
""" 一个小程序,获取指定网页上的所有链接 """ import requests from bs4 import BeautifulSoup url = input("请输入网址:") # 从终端输入网址 if ("https" or "http") in url: # 判定一下 webData = requests.get(url) # 获取网页响应 # print(webData) else: webData = requests.get("https://" + url) webData.encoding = webData.apparent_encoding # 编码 webData.raise_for_status() # webData.encoding = 'utf-8' # 编码 # print(webData.text) htmlData = webData.text # 解析网页数据 # soup = BeautifulSoup(htmlData, 'html.parser') soup = BeautifulSoup(htmlData, 'lxml') # print(soup) # 开始查找网页下所有链接 allLinksFromPage = [] links = soup.find_all('a') # print(links) for link in links: getLink = link.get('href') allLinksFromPage.append(getLink) # print(allLinksFromPage) # 开始存储 with open('myLinks.txt', 'w') as saved: print(allLinksFromPage[0:10], file=saved) # 保存前十条 saved.close()