爬取网易云音乐(包括歌词和评论)
字段说明
Singer_tb是歌手表,包括的字段有:
Sger_id:用于计数和排序
Sger_name:歌手名称
Sger_num:歌手名称对应的编号,可用于多表联查
歌手数量:6位
Album_tb:是专辑表,包括的字段有:
Album_id:用于计数和排序
Album_name:专辑名称
Album_num:专辑编号,可用于和表song_tb歌曲表进行多表联查
Sger_num:歌手名称对应的编号,可用于和singer_tb歌手表进行多表联查
专辑数量:223张
Song_tb:是歌曲表,包括的字段有:
Song_id:用于计数和排序
Song_name:歌曲名称
Song_num:歌曲编号
Album_num:专辑编号,可用于和album_tb专辑表进行多表联查
Sger_num:歌手名称对应的编号,可用于和singer_tb歌手表进行多表联查
Song_lrc:歌词
Song_cmts:歌曲评论
歌曲数量:2286条
多表联查简单说明
每个表与表之间的对应关系已经匹配好,可用多表联查实现(具体操作可在数据库中实现)。
例如查看周杰伦的资料时,可以通过多表联查得到周杰伦的所有歌曲和专辑。
例如:在数据库终端中输入 select song_name, sger_name from song_tb, singer_tb
where song_tb.sger_num = singer_tb.sger_num; (这只是一个多表联查的例子)
遇到的问题:
- 歌词接口的问题:网页端的js加密方法比较复杂,所以我改用找到的api接口,可以直接获取歌词。
- 评论插入数据库的问题:含有emoji表情的评论插不进去,需要先把评论中的表情处理掉。
原因是数据库的编码问题:数据库使用的是utf8编码,在MySQL中utf8每个字符最多占3个字节,而emoji表情占4个字节,导致插入失败;这里用正则表达式把emoji表情替换掉。
其他的爬取过程比较顺利.
# Playlist listing URLs kept from the original notes (not requested by the code below):
# http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=35
# http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=70
from urllib import request,parse
import pymysql
from bs4 import BeautifulSoup
import re
import random
from selenium import webdriver
import time
import json
import requests

# NOTE(review): database credentials are hard-coded; consider moving them to
# configuration or environment variables.
# The connection charset is 'utf8', which is why 4-byte characters are replaced
# in the comment text before it is inserted (see word_cmts).
_DB_CONFIG = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': 'zb376100870',
    'database': '163_song',
    'charset': 'utf8',
}

# NOTE(review): this is a captured browser session cookie containing account
# details; it is reproduced unchanged so requests stay identical, but it should
# not live in source control.
_COOKIE = (
    '_ntes_nnid=9590bff1d254dd7fa3f25d8e0d311522,1510122577046; '
    '_ntes_nuid=9590bff1d254dd7fa3f25d8e0d311522; '
    '_ngd_tid=dGE34taznjKAcDgyTJtO2J7d%2Bg%2BOLdSZ; '
    'usertrack=ezq0pVqnqQJAj7faBRRpAg==; '
    '__f_=1521533778639; '
    'starttime=; '
    'NTES_SESS=gWckWp0kS7P8.9Ll7xa1C2UB_4KpPrwEUUnWVSgGwgUuP_OdPs6CJlnou4rhyTwLPIJVjVNPLy3yX18e7HET2ih10YmTlRyC72K7.chKLFGewToNDdDACY3ojifgYw5TipjDIF7JEcSBMG6jhgsdk4TJayFaVg0m3mSciKsZf0JgHiZjNW9Urz_X2s8tcyGw9.DPBx6s5eROyccVAQqxWLj.v33_K253y; '
    'S_INFO=1523348346|0|3&80##|m13349949963_1; '
    'P_INFO=m13349949963_1@163.com|1523348346|0|mail163|00&99|null&null&null#jix&360100#10#0#0|133963&1||13349949963@163.com; '
    'nts_mail_user=13349949963@163.com:-1:1; '
    'df=mail163_letter; '
    'Province=0790; '
    'City=0791; '
    'MUSIC_EMAIL_U=8e19f5c8cbc11303a2d71c0d3532255599dcb9d14e06692f6202bc9e762f9363b95b9657afa5636f8b99162d656ec71eead030040e2add0d4bcf6e8189568a96; '
    'playliststatus=visible; '
    'JSESSIONID-WYYY=dwTSf7No9xGH7HzrhqYcwnPQIVAnwgM6Pq%2FO%5ClmDiH2l5ScrkuvMSG%2BYZutH6wAz9WPwmNoo2evEm9Ee%2B%2Fa3%5Cx%5C%2B%2FUoToq37TQd%2BkzRzkSimgZlpbqnQXVP%5Cdu86phA4Se0w%2FQpgg15A8%2FjES3ahRByglGzMjzuiSSE8DRk%2B9ojksu33%3A1523358787887; '
    '_iuqxldmzr_=32; '
    '__utma=94650624.199382336.1520261898.1523349418.1523355839.5; '
    '__utmb=94650624.21.10.1523355839; '
    '__utmc=94650624; '
    '__utmz=94650624.1523349418.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
)

_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
]

# First run of digits in an href such as '/artist?id=6452'.
_ID_PATTERN = re.compile('[0-9]+')
# Bracketed tags in the lyric text (greedy within a line, so several
# consecutive tags on one line are removed together).
_LRC_TAG_PATTERN = re.compile(r'\[.*\]')
# Code points outside the Basic Multilingual Plane (emoji and similar). These
# need 4 bytes in UTF-8. Python 3.3+ always represents them as single code
# points, so no surrogate-pair fallback pattern is needed.
_ASTRAL_PATTERN = re.compile(u'[\U00010000-\U0010ffff]')


def _build_headers():
    """Return the request headers, with a User-Agent picked at random."""
    return {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Accept-Encoding is deliberately not sent: the response body is decoded
        # directly as UTF-8 with no decompression step.
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': _COOKIE,
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        'Upgrade-Insecure-Requests': 1,
        'User-Agent': random.choice(_USER_AGENTS),
    }


def _fetch_text(url, headers):
    """GET *url* with *headers* and return the body decoded as UTF-8.

    The response is closed as soon as the body has been read.
    """
    req = request.Request(url, headers=headers)
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')


def _extract_id(href):
    """Return the first run of digits in *href* as a string."""
    return _ID_PATTERN.findall(href)[0]


def _insert_row(sql, data):
    """Execute one parameterised INSERT and commit it.

    A connection is opened for the call and always closed afterwards, even if
    the statement fails. Keyword arguments are used for ``pymysql.connect``
    because recent PyMySQL releases no longer accept positional ones.
    """
    conn = pymysql.connect(**_DB_CONFIG)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, data)
        conn.commit()
    finally:
        conn.close()


def _clean_lyric(payload):
    """Return the lyric text from the lyric API *payload*, without bracketed tags.

    Returns an empty string when the payload has no 'lrc' entry or when its
    'lyric' value is missing or null.
    """
    lyric = (payload.get('lrc') or {}).get('lyric') or ''
    return _LRC_TAG_PATTERN.sub("", lyric).strip()


def singer():
    """Crawl the artist category page and store every listed singer.

    For each singer found in the ``.m-sgerlist`` element, a row is inserted
    into ``singer_tb`` and ``album`` is called to crawl that singer's albums.
    The same headers (one randomly chosen User-Agent) are reused for every
    request made during the crawl.
    """
    base_url = 'http://music.163.com/discover/artist/cat?id=1001'
    headers = _build_headers()

    soup = BeautifulSoup(_fetch_text(base_url, headers), 'lxml')
    singer_list = soup.select(".m-sgerlist")[0]

    for li in singer_list.find_all(['li']):
        link = li.select('a[class="nm nm-icn f-thide s-fc0"]')[0]
        sger_name = link.text
        sger_id = _extract_id(link.attrs['href'])

        sql = "insert into singer_tb(sger_name,sger_num) VALUE (%s,%s)"
        _insert_row(sql, (sger_name, int(sger_id)))

        # Crawl this singer's albums (and, through them, songs and comments).
        album(sger_id, headers)


def album(id,headers):
    """Crawl up to 10 album-list pages (12 albums each) for one singer.

    Args:
        id: Singer number; anything ``int()`` accepts.
        headers: Request headers to send with every HTTP request.

    Each album found is inserted into ``album_tb`` and then passed to ``song``.
    Pages without a ``#m-song-module`` element are skipped.
    """
    sger_num = int(id)
    for page in range(10):
        offset = page * 12
        sger_url = 'http://music.163.com/artist/album?id=%d&limit=12&offset=%d'%(sger_num,offset)
        soup = BeautifulSoup(_fetch_text(sger_url, headers), 'lxml')

        albums = soup.select('#m-song-module')
        if not albums:
            continue

        for li in albums[0].find_all(['li']):
            link = li.select('a[class="tit s-fc0"]')[0]
            album_name = link.text
            album_num = int(_extract_id(link.attrs['href']))

            sql = "insert into album_tb(album_name,album_num,sger_num) VALUE (%s,%s,%s)"
            _insert_row(sql, (album_name, album_num, sger_num))

            song(album_num,headers,sger_num)


def song(album_num,headers,sger_num):
    """Crawl one album page and process every song listed on it.

    Args:
        album_num: Album number (int) used in the album URL.
        headers: Request headers to send with every HTTP request.
        sger_num: Singer number stored alongside each song.

    Song names and numbers are read from the links inside the page's
    ``ul.f-hide`` element; each one is handed to ``word_cmts``, which fetches
    lyrics and comments and writes the ``song_tb`` row.
    """
    song_url = 'http://music.163.com/album?id=%d'%(album_num)
    soup = BeautifulSoup(_fetch_text(song_url, headers), 'lxml')

    track_lists = soup.select('ul[class="f-hide"]')
    if not track_lists:
        # Mirror album(): skip a page without the expected list rather than
        # aborting the whole crawl with an IndexError.
        print('no song list found for album', album_num)
        return

    for link in track_lists[0].find_all(['a']):
        song_name = link.text
        song_num = int(_extract_id(link.attrs['href']))
        print(song_name,song_num,album_num,sger_num)
        word_cmts(song_name, song_num,album_num, sger_num,headers)


def word_cmts(song_name, song_num,album_num, sger_num,headers):
    """Fetch lyrics and hot comments for one song and insert it into ``song_tb``.

    Args:
        song_name: Song title.
        song_num: Song number used in the lyric and comment API URLs.
        album_num: Album number stored with the song.
        sger_num: Singer number stored with the song.
        headers: Request headers to send with every HTTP request.

    The lyric is stored without bracketed tags. The hot comments are stored as
    the ``str()`` of a list of stripped comment texts, with every character
    outside the Basic Multilingual Plane replaced by ``??``.
    """
    print(song_num)
    word_url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(song_num) + '&lv=1&kv=1&tv=-1'
    lyric_payload = json.loads(_fetch_text(word_url, headers))
    print('~~~~~~~~~~~',lyric_payload)
    lrc = _clean_lyric(lyric_payload)

    cmts_url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_' + str(song_num) + '?id=' + str(song_num) + '&lv=1&kv=1&tv=-1'
    comment_payload = json.loads(_fetch_text(cmts_url, headers))
    # Treat an absent or null 'hotComments' as "no hot comments".
    hot_comments = comment_payload.get('hotComments') or []
    comments = [cmt['content'].strip() for cmt in hot_comments]

    # Replace 4-byte characters (emoji) so the text can be stored over the
    # 'utf8' connection.
    cmmt = _ASTRAL_PATTERN.sub(u'??', str(comments))

    sql = "insert into song_tb(song_name,song_num,album_num,sger_num,song_lrc,song_cmts) VALUE (%s,%s,%s,%s,%s,%s)"
    _insert_row(sql, (song_name, song_num, album_num, sger_num,lrc,cmmt))


if __name__ == '__main__':
    singer()