Crawling all the URLs on a page with Python
1. Crawling valid URLs with urllib.request
# -*- coding: utf-8 -*-
from urllib import request
import re


# Crawl every valid URL found on a single page
def crawb():
    # 1. Pick the entry URL to crawl
    url = "http://www.baidu.com"
    # 2. Choose a regular expression that fits the page's markup
    pattern = '<a href=".*?"'
    # 3. Build an opener with a browser User-Agent (addheaders takes (name, value) tuples)
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')
    opener = request.build_opener()
    opener.addheaders = [headers]
    # 4. Fetch the page and collect every href match
    data = opener.open(url).read().decode('utf8')
    content_href = re.findall(pattern, data, re.I)
    # 5. Drop duplicate links
    sets = set(content_href)
    # 6. Post-processing, e.g. print the links or save them to a file
    file = "url"
    with open(file, 'w') as f:
        for ur in sets:
            # Strip the leading <a href=" and the trailing "
            ur = ur[9:-1]
            try:
                response = request.urlopen(ur)
                f.write(ur + "\n")
            except Exception:
                print(ur + " : is not a valid url")


if __name__ == "__main__":
    crawb()
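The regex above also matches relative hrefs (for example "/more/"), which urlopen cannot fetch on their own, so the try/except filter rejects them all. A minimal sketch of one way to keep them, assuming you want to resolve each href against the entry URL with urllib.parse.urljoin before validating (the validate_links helper is illustrative, not part of the original script):

from urllib import request
from urllib.parse import urljoin


# Sketch: resolve relative hrefs against the entry URL before validating them.
# urljoin leaves absolute URLs untouched, so full links behave exactly as before.
def validate_links(base_url, hrefs, out_path="url"):
    with open(out_path, 'w') as f:
        for href in set(hrefs):
            absolute = urljoin(base_url, href)
            try:
                request.urlopen(absolute)
                f.write(absolute + "\n")
            except Exception:
                print(absolute + " : is not a valid url")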
2. Crawling valid URLs with requests
# -*- coding: utf-8 -*-
from urllib import request
import re

import requests


def crawb():
    url = "http://www.baidu.com"
    file = "url"
    kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    with open(file, 'w') as f:
        r = requests.get(url, headers=kv)
        r.encoding = r.apparent_encoding
        pagetext = r.text
        # The regex captures the text between <a href=" and " (or between href=' and '),
        # i.e. every link URL on the current page, returned as a list
        pagelinks = re.findall(r'(?<=<a href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
        for link in pagelinks:
            try:
                response = request.urlopen(link)
                f.write(link + "\n")
            except Exception:
                print(link + " : is not a valid url")


if __name__ == "__main__":
    crawb()
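Checking every link with urlopen downloads each response in full, which gets slow on link-heavy pages. If you only need to know whether a link answers, a lighter variant is a HEAD request with a timeout; the sketch below shows this with requests.head (the check_link helper is illustrative, not part of the original script):

import requests


# Sketch: validate a link with a HEAD request and a timeout instead of downloading the body
def check_link(link, timeout=5):
    try:
        r = requests.head(link, timeout=timeout, allow_redirects=True)
        return r.status_code < 400
    except requests.RequestException:
        return False

# Usage inside crawb(): keep only links that respond
# pagelinks = [link for link in pagelinks if check_link(link)]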
3. Using BeautifulSoup to crawl the URLs on a page that start with http:
import re

import requests
from bs4 import BeautifulSoup


def crawb():
    url = 'http://www.baidu.com'
    page = requests.get(url).text
    pagesoup = BeautifulSoup(page, 'lxml')
    # Keep only <a> tags whose href starts with http:
    for link in pagesoup.find_all(name='a', attrs={"href": re.compile(r'^http:')}):
        print(link.get('href'))


if __name__ == "__main__":
    crawb()
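The pattern ^http: skips https: links, which most sites serve today. A small variant, assuming you also want to write the results to a file as the earlier examples do (crawb_https is an illustrative name):

import re

import requests
from bs4 import BeautifulSoup


# Sketch: match both http: and https: hrefs and save them to a file,
# mirroring the earlier scripts (the "url" file name follows those examples)
def crawb_https(url='http://www.baidu.com', out_path='url'):
    page = requests.get(url).text
    pagesoup = BeautifulSoup(page, 'lxml')
    with open(out_path, 'w') as f:
        for link in pagesoup.find_all(name='a', attrs={"href": re.compile(r'^https?:')}):
            f.write(link.get('href') + "\n")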