Python Scraping Practice Project: Crawling Asynchronously Loaded Content
Project code
from bs4 import BeautifulSoup
import requests

url_prefix = 'https://knewone.com/discover?page='
infos = []

# Fetch the item data on a single page
def getAPage(url, data=None):
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # print(soup)
    images = soup.select('header > a > img')
    titles = soup.select('section > h4 > a')
    links = soup.select('a.cover-inner')
    likes = soup.select('span.fanciers_count')
    if data is None:
        for image, title, link, like in zip(images, titles, links, likes):
            data = {
                'image': image.get('src'),
                'title': title.get_text(),
                'link': 'https://knewone.com' + link.get('href'),
                'like': int(like.get_text())
            }
            print(data)
            infos.append(data)

# Fetch several asynchronously loaded pages in a row
def getMorePages(start, end):
    for url_suffix in range(start, end):
        getAPage(url_prefix + str(url_suffix))
        print('--------------- {} items collected so far ---------------'.format(len(infos)))

# Print the top entries ranked by number of likes
def getInfosByLikes(order, infos=infos):
    infos = sorted(infos, key=lambda info: info['like'], reverse=True)
    for info in infos[:order]:
        print(info['like'], info['title'], info['image'], info['link'])

getMorePages(1, 4)
getInfosByLikes(5)
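What makes this an "asynchronous loading" exercise is that knewone's discover feed appends new items via AJAX as you scroll; each new batch is simply the next value of the page query parameter, so getMorePages can fetch it with an ordinary GET request. The snippet below is a small sketch, not part of the original project, for confirming that pattern before a full run: it requests a couple of page numbers and counts the item links each one returns, reusing the URL prefix and the a.cover-inner selector that the project code relies on.

from bs4 import BeautifulSoup
import requests

# Quick check that incrementing ?page= really returns fresh, asynchronously
# loaded items (assumes the same markup the project code targets).
def probe_pages(prefix='https://knewone.com/discover?page=', pages=(1, 2)):
    for page in pages:
        resp = requests.get(prefix + str(page), timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'lxml')
        links = [a.get('href') for a in soup.select('a.cover-inner')]
        print('page {}: {} items, first link: {}'.format(
            page, len(links), links[0] if links else None))

if __name__ == '__main__':
    probe_pages()

If the two pages report different first links, the pagination pattern holds and the main scraper can safely loop over page numbers.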
Project notes:
[Repost] Synchronous loading, asynchronous loading, and lazy loading (referenced article on the loading concepts; see the lazy-loading sketch after this list)
Link to the scraped website: https://knewone.com/discover
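The reposted article above distinguishes synchronous, asynchronous, and lazy loading. One practical consequence for image scraping: on lazily loaded pages, the img tag's src often holds only a placeholder while the real URL sits in an attribute such as data-src. The project code reads src directly, which is fine here; the sketch below is a hedged fallback, and the data-src attribute name is an assumption rather than something confirmed for knewone.

from bs4 import BeautifulSoup

# Hedged sketch: lazy-loaded <img> tags often keep the real URL in a
# placeholder attribute such as data-src ('data-src' is an assumption,
# not confirmed for knewone); fall back to src when it is absent.
def get_image_url(img_tag):
    return img_tag.get('data-src') or img_tag.get('src')

sample = BeautifulSoup(
    '<img src="placeholder.gif" data-src="https://example.com/real.jpg">',
    'lxml')
print(get_image_url(sample.img))  # https://example.com/real.jpg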