爬虫大作业
打开酷我新歌排行榜,网址是http://www.kuwo.cn/bang/index
代码:
用requests库和BeautifulSoup库,爬取酷我新歌榜当前页面中每首歌的曲目、歌手、歌词、排行、链接等,并将获取歌曲详情的代码定义成一个函数。
import requests
from bs4 import BeautifulSoup

# Sample song-detail page; handy for trying getNewsDetail() on its own.
musicUrl = 'http://www.kuwo.cn/yinyue/41185134?catalog=yueku2016'
# Chart page whose song links are crawled by getListPage().
url = 'http://www.kuwo.cn/bang/index'
# Module-level record that getNewsDetail() fills in and returns.
gequ = {}
quan = {}


def getNewsDetail(musicUrl, pm):
    """Fetch one song page, record its details and log its title to quan2.txt.

    Args:
        musicUrl: URL of the song detail page.
        pm: Rank value stored under '排名' and written next to the title.

    Returns:
        1 when the page is treated as copyright-protected (nothing is
        recorded), otherwise the module-level ``gequ`` dict with the keys
        '曲目', '歌手', '歌词' and '排名'. The same dict object is reused on
        every call, so copy it before collecting several results.

    Raises:
        IndexError: if the page lacks the <p> elements or the selectors
            that are indexed below.
    """
    resd = requests.get(musicUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    paragraphs = soupd.select('p')
    info = paragraphs[1].text
    panduan = paragraphs[4].text

    # NOTE(review): `find(...) > 0` is False when the marker sits at index 0.
    # Kept as-is because the page layout cannot be confirmed from here.
    if info.find("返回") > 0:
        print('版权保护')
        return 1

    title = soupd.select('#lrcName')[0].text
    gequ['曲目'] = title
    gequ['歌手'] = soupd.select('.artist')[0].text
    if panduan.find("歌词") > 0:
        gequ['歌词'] = 'none'
    else:
        gequ['歌词'] = soupd.select('#llrcId')[0].text
    gequ['排名'] = pm

    # Append mode: every crawled song adds one "title rank" line.
    with open('quan2.txt', 'a', encoding='utf-8') as out:
        out.write(title + ' ' + str(pm) + '\n')
    return gequ


def getListPage(musicsurl):
    """Crawl the chart page and print the details of every linked song.

    Args:
        musicsurl: URL of the chart page to fetch.

    Returns:
        Always 1.
    """
    # Fetch the page the caller asked for rather than the global `url`.
    res = requests.get(musicsurl)
    res.encoding = 'utf-8'
    soupq = BeautifulSoup(res.text, 'html.parser')

    # Rank values count down from 200, one step per song link found.
    pm = 200
    for chart in soupq.select('.listMusic'):
        for name_cell in chart.select('.name'):
            for link in name_cell.select('a'):
                song_url = link['href']
                print(song_url)
                print(getNewsDetail(song_url, pm))
                pm -= 1
    return 1


if __name__ == '__main__':
    getListPage(url)
生成词云:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import ImageColorGenerator, WordCloud

# Picture passed to WordCloud as its mask.
abel_mask = np.array(Image.open("C:\\timg.jpg"))

# Text collected by the crawler; the context manager closes the file promptly.
with open('quan2.txt', 'r', encoding='utf-8') as text_file:
    f = text_file.read()

# Presumably chosen so that Chinese characters render; verify the font exists.
font = r'C:\Windows\Fonts\simkai.ttf'

wordcloud = WordCloud(
    background_color="white",
    width=1000,
    height=860,
    margin=2,
    font_path=font,
    mask=abel_mask,
).generate(f)

# Built from the mask image but not applied to the cloud shown below.
image_colors = ImageColorGenerator(abel_mask)

plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Saved after the preview, in the same order as the original script.
wordcloud.to_file('timg1.jpg')