爱奇艺、腾讯电视剧解析采集
import re
import urllib
import urllib.request

import pymysql


def getHtml(url):
    """Fetch *url* and return the raw response body as bytes.

    The HTTP response is closed once the body has been read, even if
    reading raises.
    """
    with urllib.request.urlopen(url) as page:
        return page.read()


# Reference patterns (group 1 = episode URL, group 2 = episode label):
# iqiyi:   <a data-pb="" href="(http://www.iqiyi.com/v_[\s\S]*?html)"[\s\S]*?title="([\s\S]*?)"
# tencent: <a\s*href="(http://v\.qq\.com/x/cover.*?html)"\s*target="_blank[\s\S]*?<span[\s\S]*?episodeNumber">([\s\S]*?)</span>
def parse(url, regular):
    """Scrape an episode list from *url* using the regex *regular*.

    Args:
        url: Page to download; its body is decoded as UTF-8.
        regular: Pattern with at least two capture groups. Group 1 is
            used as the episode URL and group 2 as the episode label.
            Matching is case-insensitive.

    Returns:
        A ``(playlist, count)`` tuple. ``playlist`` joins one
        ``第<label>集$<url>`` entry per distinct label with ``#``;
        ``count`` is the number of distinct labels. When nothing matches
        the result is ``("", 0)``.
    """
    html = getHtml(url).decode('utf-8')

    # Keyed by label so a label that appears several times on the page
    # yields a single entry (the last URL seen wins).
    episodes = {}
    for match in re.findall(regular, html, re.I):
        episodes[str(match[1])] = str(match[0])

    playlist = "#".join(
        "第{v}集${k}".format(v=label, k=link)
        for label, link in episodes.items()
    )
    return playlist, len(episodes)


def exceDB(url, id, regular):
    """Scrape *url* and store the episode list on row *id* of ``mac_vod``.

    Args:
        url: Page to scrape (passed to :func:`parse`).
        id: ``d_id`` of the row to update. The name shadows the builtin
            but is kept so existing keyword callers continue to work.
        regular: Regex passed to :func:`parse`.

    The row's ``d_remarks`` and ``d_playurl`` columns are overwritten.
    If the pattern matches nothing, the database is left untouched so a
    markup change on the source site cannot wipe the stored play URLs.
    """
    playlist, count = parse(url, regular)
    if count == 0:
        print("no episodes matched for d_id={}; database left unchanged".format(id))
        return

    remarks = "同步更新至{len}集".format(len=count)

    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or environment variables.
    # Keyword arguments are required by PyMySQL 1.0+ and also accepted
    # by older releases.
    conn = pymysql.connect(
        host="localhost",
        user="root",
        password="sa",
        database="m8",
        use_unicode=True,
        charset="utf8",
    )
    try:
        sql = "update mac_vod set d_remarks=%s,d_playurl=%s where d_id=%s"
        with conn.cursor() as cur:
            sta = cur.execute(sql, (remarks, playlist, id))
        print(sta)
        conn.commit()
    finally:
        # Always release the connection, even when execute/commit fails.
        conn.close()


def main():
    """Refresh the two series this script tracks."""
    # "Gui Chui Deng: Mu Ye Gui Shi" (iQiyi): one new episode each
    # Monday and Tuesday at 20:00.
    exceDB(
        "http://www.iqiyi.com/lib/m_211070614.html",
        39097,
        r'<a data-pb="" href="(http://www.iqiyi.com/v_[\s\S]*?html)"[\s\S]*?title="([\s\S]*?)"',
    )

    # "Shuang Shi Chong Fei" (Tencent Video): two new episodes each
    # Monday and Tuesday at 20:00.
    exceDB(
        "http://v.qq.com/detail/4/47xswolfi4iamlx.html",
        21271,
        r'<a\s*href="(http://v\.qq\.com/x/cover.*?html)"\s*target="_blank[\s\S]*?<span[\s\S]*?episodeNumber">([\s\S]*?)</span>',
    )


if __name__ == "__main__":
    main()
爱奇艺的正则做了调整,可以改用新的写法(注意:新正则的第一个捕获组不再包含 http:// 前缀,采集到的播放地址也就不带该前缀):regular = '<a data-pb="[\s\S]*?" href="http://(www.iqiyi.com/v_[\s\S]*?.html)"[\s\S]*?title="([\s\S]*?)"'
使用python3.6.1+pymysql
pymysql 使用pip install pymysql 命令安装即可
采集后展示效果见:www.shurua.com
优酷的采集为了省事直接用了火车头,此处暂不介绍了
优酷采集用到的接口地址和匹配规则,先在这里做个备份:
http://list.youku.com/show/id_z24d56510933411e5b432.html
showid:"[参数]"
http://list.youku.com/show/module?id=302926&tab=showInfo&cname=1&callback=jQuery
http://list.youku.com/show/module?id=[参数1]&tab=showInfo&cname=%E7%94%B5%E8%A7%86%E5%89%A7&callback=jQuery111209912194847404647_1494915784794&_=1494915784795
http://list.youku.com/show/episode?id=[参数1]&stage=reload_41&callback=jQuery
<li><a class=\"c555\" href=\"\/\/v.youku.com\/v_show\/[参数]\" target=\"_blank\">[参数]<\/a>