爬虫大作业

 

打开酷我新歌排行榜,网址是http://www.kuwo.cn/bang/index

 

 

代码

用 requests 库和 BeautifulSoup 库,爬取酷我新歌榜当前页面每首歌曲的曲目、歌手、歌词、排名、链接等,并将获取歌曲详情的代码定义成一个函数。

# Example song-detail page URL.
musicUrl = 'http://www.kuwo.cn/yinyue/41185134?catalog=yueku2016'
# Chart page whose song links are crawled.
url = 'http://www.kuwo.cn/bang/index'
# Shared record that getNewsDetail fills in and returns.
gequ = dict()
# Empty dict; nothing in the visible code reads or writes it.
quan = dict()
def getNewsDetail(musicUrl, pm):
    """Fetch one song page, record its details, and log its title.

    The page at ``musicUrl`` is downloaded and parsed.  When the second
    ``<p>`` element contains "返回" (past its first character) the function
    prints '版权保护' and records nothing.  Otherwise the module-level
    ``gequ`` dict is overwritten with the song's fields and a
    ``"<title> <pm>"`` line is appended to ``quan2.txt``.

    Args:
        musicUrl: URL of the song detail page.
        pm: Value stored under the '排名' key and written next to the title.

    Returns:
        ``1`` when the '版权保护' branch is taken, otherwise the shared
        ``gequ`` dict (the same object on every call, so copy it before
        keeping it around).

    Raises:
        IndexError: If the page has fewer than five ``<p>`` elements or
            lacks a ``#lrcName`` / ``.artist`` / ``#llrcId`` node.
    """
    resd = requests.get(musicUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    paragraphs = soupd.select('p')
    info = paragraphs[1].text
    panduan = paragraphs[4].text

    # NOTE(review): ``find(...) > 0`` ignores a match at index 0.  Both
    # checks are kept as written because the page markup they depend on is
    # not visible here -- confirm before switching to ``in``.
    if info.find("返回") > 0:
        print('版权保护')
        return 1

    title = soupd.select('#lrcName')[0].text
    gequ['曲目'] = title
    gequ['歌手'] = soupd.select('.artist')[0].text
    if panduan.find("歌词") > 0:
        gequ['歌词'] = 'none'
    else:
        gequ['歌词'] = soupd.select('#llrcId')[0].text
    gequ['排名'] = pm

    # The context manager closes the file even if the write fails.
    with open('quan2.txt', 'a', encoding='utf-8') as f:
        f.write(title + ' ' + str(pm) + '\n')

    return gequ
# gequ=getNewsDetail(musicUrl)
# print(gequ)

def getListPage(musicsurl):
    """Crawl a chart page and process every song link found on it.

    Each ``<a>`` inside a ``.name`` element inside a ``.listMusic`` element
    is followed: its ``href`` is printed, passed to ``getNewsDetail``
    together with a descending counter, and the result is printed.

    Args:
        musicsurl: URL of the chart page to download.

    Returns:
        Always ``1``.
    """
    # Use the caller's URL rather than the module-level ``url``.
    res = requests.get(musicsurl)
    res.encoding = 'utf-8'
    soupq = BeautifulSoup(res.text, 'html.parser')

    # The first link gets 200 and every later link one less.
    pm = 200
    for song_list in soupq.select('.listMusic'):
        for name_node in song_list.select('.name'):
            for link in name_node.select('a'):
                musicUrl = link['href']
                print(musicUrl)
                print(getNewsDetail(musicUrl, pm))
                pm -= 1

    return 1

# Run the crawl, passing the module-level chart page URL.
getListPage(url)

 

生成词云:

# Build a word cloud from the text in quan2.txt, using an image as the mask.
abel_mask = np.array(Image.open("C:\\timg.jpg"))

# Read the text through a context manager so the file handle is closed.
with open('quan2.txt', 'r', encoding='utf-8') as text_file:
    f = text_file.read()

# Font file handed to WordCloud through font_path.
font = r'C:\Windows\Fonts\simkai.ttf'
wordcloud = WordCloud(
    background_color="white",
    width=1000,
    height=860,
    margin=2,
    font_path=font,
    mask=abel_mask,
).generate(f)

# NOTE(review): image_colors is created but never applied to the cloud;
# confirm whether a recolor step was intended before the image is shown.
image_colors = ImageColorGenerator(abel_mask)
plt.imshow(wordcloud)

plt.axis("off")
plt.show()
wordcloud.to_file('timg1.jpg')

 

posted @ 2018-04-27 23:57  黄俊熙  阅读(337)  评论(0)  编辑  收藏  举报