python3: 博客园列表爬取;
import sys

import html5lib  # not referenced directly; BeautifulSoup selects it via the 'html5lib' parser name
import requests
from bs4 import BeautifulSoup as bs

# Browser-like request headers sent with every page request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
}

# URL pattern of one cnblogs site-home listing page; %s is the page index.
_PAGE_URL_TEMPLATE = "https://www.cnblogs.com/sitehome/p/%s"


def _fetch_page(url):
    """Return the decoded HTML text of *url*, or None if the request fails."""
    try:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure instead of swallowing it silently.
        print("failed to fetch %s: %s" % (url, exc), file=sys.stderr)
        return None
    # Let requests guess the charset from the body rather than trusting
    # the (possibly missing) Content-Type header.
    r.encoding = r.apparent_encoding
    return r.text


def _print_posts(html):
    """Print the link and title of every <h3> entry that contains an anchor."""
    content = bs(html, 'html5lib')
    for ctx in content.find_all('h3'):
        anchor = ctx.a
        # Skip headings without a usable link instead of crashing on them.
        if anchor is None or anchor.get('href') is None:
            continue
        print(anchor['href'], anchor.string)


def download(first_page=1, last_page=4):
    """Crawl the cnblogs site-home listing and print each post's URL and title.

    Requests are sent with the browser-like ``header`` defined above.
    Crawling stops at the first page that cannot be fetched.

    :param first_page: index of the first listing page to fetch (inclusive).
    :param last_page: index of the last listing page to fetch (inclusive).
    :return: None; results are written to standard output.
    """
    for page_idx in range(first_page, last_page + 1):
        data = _fetch_page(_PAGE_URL_TEMPLATE % str(page_idx))
        if data is None:
            # Same stop-on-failure behavior as before, but now reported.
            return
        _print_posts(data)


if __name__ == "__main__":
    download()