Baidu Music song spider:
1. Analyze the Baidu Music song download API and assemble its parameters
2. Decide whether login is required
a. using cookies
b. using selenium
3. Analyze the song detail page
4. Design the data tables
song type table (network_type in the code)
song table (network_music in the code)
The exact table layout doesn't matter much; design whatever suits you. A possible schema is sketched below.
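For reference, here is a minimal sketch of what the two tables could look like. The column names are taken from the INSERT statements in the script below; every type, length, and key choice is my assumption, not the original DDL.

# -*- coding: utf-8 -*-
# Hedged sketch of the two tables used by the spider. Column names match the
# INSERT statements in the script; all types/lengths/keys are assumptions.
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS network_type (
           id           INT PRIMARY KEY AUTO_INCREMENT,
           PID          INT,          -- the script inserts -1 for top-level types
           RESOURCETYPE CHAR(1),      -- 'm' for music
           TYPENAME     VARCHAR(64)
       )""",
    """CREATE TABLE IF NOT EXISTS network_music (
           ID             VARCHAR(16) PRIMARY KEY,  -- the script stores ids as strings
           NAME           VARCHAR(128),
           SINGER         VARCHAR(128),
           ALBUM          VARCHAR(128),
           PUBLISHTIME    VARCHAR(32),
           PUBLISHCOMPANY VARCHAR(128),
           COMPOSER       VARCHAR(128),
           LYRICS         VARCHAR(255),
           FILESIZE       VARCHAR(32),
           FILETIME       INT,
           USERHEAD       VARCHAR(255),  -- cover image path
           TYPES          VARCHAR(64),   -- ',<type_id>,' list, as the script writes it
           STATUS         TINYINT,
           WORDS          VARCHAR(255),  -- lyric file path
           FILEPATH       VARCHAR(255)   -- mp3 path
       )""",
]

def create_tables(conn):
    # Create both tables if they are missing; the spider assumes they exist.
    cur = conn.cursor()
    for stmt in DDL:
        cur.execute(stmt)
    conn.commit()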
-------------------------------
# -*- coding: utf-8 -*-
'''
***
_author_= "fengshaungzi"
_time_='2018-4-10'
_python_version_ = 'python2.7'
_script_type_ = 'spider'
url = 'http://music.baidu.com/tag/类型?start=0&size=20&third_type=0'
***
'''
from os import path
from bs4 import BeautifulSoup
import urllib, urllib2, requests, cookielib
import sys, time, datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql, shutil
import os

reload(sys)
sys.setdefaultencoding('utf-8')
d = path.dirname(__file__)


class BadiuMusicSpider():
    def __init__(self):
        pass

    def login(self, cursor, type_id, type_q):
        # The headless options are built but not passed to webdriver.Chrome():
        # a visible window is needed to type a captcha by hand and to let
        # Chrome download the song/lyric files.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get("http://i.baidu.com/welcome/")
        time.sleep(5)
        driver.find_element_by_xpath('/html/body/header/div/div/a[2]').click()
        time.sleep(2)
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__userName"]').clear()
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__userName"]').send_keys('用户')
        time.sleep(2)
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__password"]').clear()
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__password"]').send_keys('密码')
        # If a captcha is shown, let the user type it in
        time.sleep(3)
        try:
            driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__verifyCodeChange"]').click()
            answer = raw_input(u'请输入验证码:')
            code = driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__verifyCode"]')
            code.clear()
            code.send_keys(answer)
        except:
            print u'没有验证码。'
        driver.find_element_by_xpath('//*[@id="TANGRAM__PSP_10__submit"]').submit()
        time.sleep(2)
        self.parse_html(driver, cursor, type_id, type_q)

    def parse_html(self, driver, cursor, type_id, type_q, page=1):
        start = (page - 1) * 20
        print u'---开始获取第{0}页的数据----'.format(page)
        url = 'http://music.baidu.com/tag/{0}?start={1}&size=20&third_type=0'.format(type_q, start)
        driver.get(url)
        time.sleep(2)
        response = driver.page_source
        obj = BeautifulSoup(response, 'html.parser')
        # Every song on the page sits in a <span class="song-title">
        span_list = obj.find_all('span', {"class": "song-title"})
        # Check whether there is a next page
        try:
            driver.find_element_by_class_name('page-navigator-next')
            next_page = 1
        except:
            next_page = 0
        # try:  (page-level error handling disabled; its except block is kept, commented out, below)
        for v in span_list:
            try:
                m_url = v.find('a')['href']
            except:
                continue
            # The href looks like '/song/123456'
            song_id = m_url.replace('/song/', '')
            m_url = 'http://music.baidu.com{0}'.format(m_url)
            # Scrape the song detail page
            data = self.save_music_info(m_url, type_id)
            # 'check' is only set when the song already exists in the database
            if data.has_key('check'):
                print u'---该歌曲已经存在---'
                continue
            singer_path = u"G:\\www\\music2\\" + data['singer']
            # Song info collected; now download the file and lyrics (needs song_id)
            music_lrc = self.save_music_lrc(driver, song_id, singer_path)
            if music_lrc.get('words') == '暂无' or 'lrc_name' not in music_lrc:
                data['words'] = ''
            else:
                print u"歌词:" + music_lrc['lrc_name']
                data['words'] = u'music2/LRC/' + music_lrc['lrc_name']
            data['filepath'] = u'music2/{0}/{1}.mp3'.format(data['singer'], data['name'])
            # Next ID = current max ID + 1 (IDs are stored as strings)
            cursor.execute('select id from network_music order by cast(id as SIGNED INTEGER) desc limit 0,1')
            old_id = cursor.fetchone()
            if old_id:
                id_n = str(int(old_id[0]) + 1)
            else:
                id_n = str(1)
            # Write the row to the database
            row = [(id_n, data['name'], data['singer'], data['album'], data['publishtime'],
                    data['publishcompany'], data['composer'], data['lyrics'],
                    data['filesize'], data['filetime'], data['userhead'], data['types'],
                    data['status'], data['words'], data['filepath'])]
            self.save_db(cursor, row)
        '''
        except:
            # log the failed page so it can be retried later
            try:
                datetime_now = datetime.datetime.now()
                datetime_str = '{0}-{1}-{2} {3}:{4}:{5}'.format(datetime_now.year, datetime_now.month,
                                                                datetime_now.day, datetime_now.hour,
                                                                datetime_now.minute, datetime_now.second)
                effect_row = cursor.executemany("insert into music_log(page,datetime)values(%s,%s)",
                                                [(page, datetime_str)])
                conn.commit()   # commit, otherwise the insert is not saved
            except:
                print 'Add log fault!'
        '''
        page = page + 1
        # raw_input('输入任意值继续执行:')  (debug pause, disabled)
        if next_page == 1:
            print u'------开始获取下一页的数据----'
            self.parse_html(driver, cursor, type_id, type_q, page=page)
        else:
            print u"-----爬虫程序即将结束-----"
            cursor.close()
            conn.close()

    def save_music_info(self, m_url, type_id):
        # NOTE: uses the module-level cursor/conn created in __main__
        data = {}
        music_info_response = urllib2.urlopen(m_url).read()
        music_info_obj = BeautifulSoup(music_info_response, 'html.parser')
        # Collect song info: name / singer / album / publish date / pic / company
        name = music_info_obj.find('span', {"class": "name"}).text.strip()
        name = name.replace('"', '')
        name = name.replace("'", '')
        singer = music_info_obj.find('span', {"class": "artist"}).find('a').text.strip()
        singer = singer.replace('"', '')
        singer = singer.replace("'", '')
        if os.path.exists(u"G:\\www\\music2\\" + singer) == False:
            os.mkdir(u"G:\\www\\music2\\" + singer)
        else:
            print u'歌手文件夹已经存在!'
        album = music_info_obj.find('p', {"class": "album"}).find('a').text.strip()
        # Publish date: a bare label means the value is blank, so mark it unknown
        if music_info_obj.find('p', {"class": "publish"}).text.strip() == u'发行时间:':
            publishtime = '未知'
        else:
            publishtime = music_info_obj.find('p', {"class": "publish"}).text.strip()
            publishtime = publishtime.replace(u'发行时间:', '')
        # Publisher: same treatment for a blank value
        if music_info_obj.find('p', {"class": "company"}).text.strip() == u'发行公司:':
            publishcompany = '未知'
        else:
            publishcompany = music_info_obj.find('p', {"class": "company"}).text.strip()
            publishcompany = publishcompany.replace(u'发行公司:', '')
        # Cover image
        pic_path = ''
        pic_url = music_info_obj.find('img', {"class": "music-song-ing"})['src']
        if pic_url:
            pic_path = self.save_pic(pic_url)
        data['name'] = name
        print u"歌名:" + name
        data['singer'] = singer
        print u"歌手:" + singer
        data['album'] = album
        data['publishtime'] = publishtime
        data['publishcompany'] = publishcompany
        data['composer'] = ''
        data['lyrics'] = ''
        data['filesize'] = ''
        data['filetime'] = 0
        data['userhead'] = pic_path
        data['types'] = ',' + str(type_id) + ','
        data['status'] = 0
        # Duplicate check: if the song already exists, just append the new type id
        cursor.execute('select id,TYPES from network_music where NAME="{0}" and SINGER="{1}"'.format(name, singer))
        result_types = cursor.fetchall()
        if result_types:
            if str(type_id) not in result_types[0][1]:
                types = result_types[0][1] + str(type_id) + ','
                cursor.execute("UPDATE network_music SET TYPES='{0}' WHERE id ={1}".format(types, result_types[0][0]))
                conn.commit()   # commit, otherwise the update is not saved
            data['check'] = 0   # marks the song as already present
        return data

    def save_music_lrc(self, driver, song_id, singer_path):
        music_lrc = {}
        # Hitting this API makes Chrome download the song file
        m_api = 'http://music.baidu.com/data/music/file?link=&song_id={0}'.format(song_id)
        driver.get(m_api)
        time.sleep(3)
        # Grab the newest file in the download folder and move it to the singer's folder
        path_d = u'C:\\Users\\hz\\Downloads'
        file_lists = os.listdir(path_d)
        try:
            file_lists.sort(key=lambda fn: os.path.getmtime(path_d + u"\\" + fn))
            filename = file_lists[-1]
            if filename:
                shutil.move(path_d + u'\\' + filename, singer_path)
        except:
            print u"移动失败,文件名字问题,手动修改"
        # Back to the song page to look for a lyric link
        driver.get('http://music.baidu.com/song/{0}'.format(song_id))
        time.sleep(2)
        try:
            l_api = driver.find_element_by_xpath('//*[@id="lyricCont"]').get_attribute('data-lrclink')
            driver.get(l_api)
            time.sleep(2)
            try:
                music_lrc['lrc_name'] = self.get_lrc_path()
            except:
                print u'获取歌词文件名错误'
        except:
            music_lrc['words'] = '暂无'
            print u'没有歌词'
        return music_lrc

    def save_db(self, cursor, rows):
        print rows
        try:
            effect_row = cursor.executemany("insert into network_music(ID,NAME,SINGER,ALBUM,PUBLISHTIME,PUBLISHCOMPANY,COMPOSER,LYRICS, \
                FILESIZE,FILETIME,USERHEAD,TYPES,STATUS,WORDS,FILEPATH)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", rows)
            conn.commit()   # commit, otherwise the insert is not saved
        except:
            print 'Add this db fault!'

    def save_pic(self, pic_url, save_path=''):
        # Work out the extension from the URL (Baidu appends '@...' after it)
        pic_list = ['.jpg@', '.png@', '.jpeg@', '.JPG@', '.PNG@', '.JPEG@']
        endname = '.errorpic'
        for v in pic_list:
            if v in pic_url:
                endname = v.replace('@', '')
                break
        save_path = path.join(d, 'music2/USERHEAD/')
        # Use a timestamp as the file name for now
        picName = int(time.time())
        savepic = save_path + str(picName) + endname
        try:
            urllib.urlretrieve(pic_url, savepic)
            return 'music2/USERHEAD/' + str(picName) + endname
        except:
            return 'no'

    def get_lrc_path(self):
        # The newest file in the download folder is the lyric file just fetched
        path_d = u'C:\\Users\\hz\\Downloads'
        file_lists = os.listdir(path_d)
        file_lists.sort(key=lambda fn: os.path.getmtime(path_d + u"\\" + fn))
        lrc_name = file_lists[-1]
        '''
        if lrc_name:
            shutil.move(u'C:\\Users\\hz\\Downloads\\' + lrc_name, u'G:\\www\\music2\\LRC\\')
        '''
        return lrc_name

    '''
    # Alternative download helpers, kept for reference:
    def auto_down1(self, url, filename):
        try:
            urllib.urlretrieve(url, filename)
        except urllib.ContentTooShortError:
            print 'Network conditions is not good. Reloading.'
            self.auto_down1(url, filename)

    def auto_down2(self, url, filename):
        # Load cookies copied out of the browser
        raw_cookies = "PSTM=1523331116; BIDUPSID=6598753517A81D738FD546C2D96EDAC5; BAIDUID=E5EE59A93C8788A953248CD76BEBD48D:FG=1; H_PS_PSSID=1425_18194_21127_26182_20928; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; PHPSESSID=bae76nl31pln7r47vi3i1o9jh7; Hm_lvt_4010fd5075fcfe46a16ec4cb65e02f04=1523420559,1523420572; PSINO=2; Hm_lpvt_4010fd5075fcfe46a16ec4cb65e02f04=1523425208"
        cookies = {}
        for line in raw_cookies.split(';'):
            key, value = line.split('=', 1)  # split once: the value may contain '='
            cookies[key] = value
        r = requests.get(url, stream=True, cookies=cookies)
        f = open(filename, "wb")
        for chunk in r.iter_content(chunk_size=512):
            if chunk:
                f.write(chunk)
        f.close()

    def auto_down3(self, url, filename):
        cookie = cookielib.MozillaCookieJar()
        cookie.load('c.txt', ignore_expires=True, ignore_discard=True)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        urllib2.install_opener(opener)
        music = urllib2.urlopen(url).read()
        f = open(filename, 'wb')
        f.write(music)
        f.close()
    '''


if __name__ == "__main__":
    print r'Starting....'
    for i in range(5):
        sys.stdout.write('>' * i + '\n')
        sys.stdout.flush()
        time.sleep(0.5)
    conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
    cursor = conn.cursor()
    music_type = raw_input(r'请输入歌曲的类型: ').strip()
    # Look the type up first; only insert it if it does not exist yet
    result = cursor.execute("select id from network_type where RESOURCETYPE='m' and TYPENAME='{0}'".format(music_type))
    if result == 0:
        print u'-----该类型不存在添加至数据库-------'
        effect_row = cursor.executemany("insert into network_type(PID,RESOURCETYPE,TYPENAME)values(%s,%s,%s)",
                                        [(-1, 'm', music_type)])
        type_id = int(cursor.lastrowid)
    else:
        print u'-----该类型存在不需要添加至数据库-------'
        type_val = cursor.fetchall()
        type_id = type_val[0][0]
    conn.commit()   # commit, otherwise the insert is not saved
    # Percent-encode the type name for use in the tag URL
    type_q = urllib2.quote(music_type)
    bmSpider = BadiuMusicSpider()
    bmSpider.login(cursor, type_id, type_q)
----How the code works
Step 1: Log in to Baidu with selenium. (I originally planned to log in with selenium once, export the cookies, and load them on later runs, but I hit a few problems and, with work getting in the way, dropped the idea; I'll try again when I have time. A sketch of that cookie approach follows below. Captcha handling is not automated: if one comes up, close the program and restart. Once you're logged in, you can crawl for quite a while.)
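Roughly what I had in mind for the cookie route, as a minimal sketch; the file name and both helper names are made up for illustration, not part of the script above.

# -*- coding: utf-8 -*-
# Sketch of the cookie idea: log in once with selenium, dump the cookies,
# then rebuild a requests session from them on later runs.
import pickle
import requests

COOKIE_FILE = 'baidu_cookies.pkl'  # hypothetical path

def dump_cookies(driver, cookie_file=COOKIE_FILE):
    # driver.get_cookies() returns a list of {'name': ..., 'value': ...} dicts
    with open(cookie_file, 'wb') as f:
        pickle.dump(driver.get_cookies(), f)

def load_session(cookie_file=COOKIE_FILE):
    # Replay the saved cookies into a plain requests session
    session = requests.Session()
    with open(cookie_file, 'rb') as f:
        for c in pickle.load(f):
            session.cookies.set(c['name'], c['value'])
    return session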
Step 2: Enter a song type. Crawling starts from page one by default; from there it's all loops: writing rows to the database, moving downloaded files, and so on. The paging scheme is sketched below.
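The paging itself is just arithmetic on the tag URL, as the script does it (the helper name here is mine):

# -*- coding: utf-8 -*-
# How each tag-page URL is assembled: the type name is percent-encoded
# and `start` advances 20 songs per page.
import urllib2

def tag_page_url(type_name, page):
    type_q = urllib2.quote(type_name)   # type_name is a UTF-8 byte string
    start = (page - 1) * 20             # page 1 -> start=0, page 2 -> start=20, ...
    return 'http://music.baidu.com/tag/{0}?start={1}&size=20&third_type=0'.format(type_q, start)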
All in all, it's a fairly simple spider; please go easy on the rough edges.