斗鱼直播实时数据爬取 标签: django数据爬虫 2017-05-30 23:46 229人阅读 评论(0)
思路
1, 解析URL
2, 利用爬虫神器 bs4 和 正则表达式得到想要的信息;
3, 进库和本地保存
Django 后台展示和本地 CSV(卖相太差,不发了)
*存储到本地 CSV:直接运行 DySpyder().summary_data180() 即可*
直接上代码
# -*- coding: utf-8 -*-
import os
import re
import django
import urllib.request as ur
class DySpyder():
    """Scraper for the Douyu "all live rooms" directory page.

    The pipeline is: download the page (``open_url``), collect the room
    ``<li>`` elements (``tv_spyder``), flatten each element into a dict
    (``set_data``), dump all dicts to CSV (``summary_data180``) and
    optionally store them through the Django ``Info`` model (``main``).
    """

    def __init__(self):
        pass

    def open_url(self, url):
        """Download ``url`` and return the response body decoded as UTF-8.

        A desktop Firefox User-Agent header is sent with the request.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        # Python 3 API; the Python 2 equivalents were urllib2.Request / urllib2.urlopen.
        req = ur.Request(url=url, headers=headers)
        # The context manager closes the connection even if read/decode fails.
        with ur.urlopen(req) as response:
            return response.read().decode('utf-8')

    def tv_spyder(self):
        """Return the room ``<li>`` elements found on the Douyu directory page.

        Only ``<li>`` elements inside ``<ul id="live-list-contentbox">`` are
        considered, queried once per CSS class listed below.
        """
        from bs4 import BeautifulSoup

        url = "https://www.douyu.com/directory/all"
        page = BeautifulSoup(self.open_url(url), 'html.parser')
        container = page.find("ul", id='live-list-contentbox')
        # Class filters are passed through verbatim (including the 'serach'
        # spelling); presumably they mirror the class names used by the site.
        res = []
        for css_class in ('', 'serach_lastli', 'last', 'lastserach_lastli'):
            res.extend(container.findAll('li', css_class))
        return res

    def set_data(self, x):
        """Flatten one room ``<li>`` element into a dict.

        ``x`` must provide ``find``/``findAll`` (as a bs4 element does) and a
        ``repr`` that is the element's HTML, because the image URLs and the
        link attributes are extracted from that markup with regular
        expressions.

        The result holds the first seven attribute name/value pairs of the
        ``play-list-link`` anchor (values keep their surrounding quotes), plus
        the keys ``dt``, ``tag``, ``dy_name``, ``dy_num``, ``title0``, ``img``
        and ``gif``.

        Raises IndexError when either regular expression does not match and
        ValueError when fewer than five ``<span>`` elements are present.
        """
        import datetime

        title0 = str(x.find("h3").next_element)
        spans = x.findAll(["span"])
        # The 3rd..5th spans carry category tag, streamer name and viewer count.
        tag, dy_name, dy_num = [s.next_element for s in spans[2:5]]

        markup = repr(x)
        parterb = r'''.*<img data-original=(.*?) height="163" src=(.*?) width="283"/>.*'''
        # URLs of the static preview image and of the animated preview.
        img, gif = re.findall(parterb, markup)[0]
        p2 = r'''.*a class="play-list-link" (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?) (.*?)=(.*?)>.*'''
        link_attrs = re.findall(p2, markup)[0]

        res = {}
        # link_attrs alternates name, value, name, value, ...; the final pair
        # is skipped, exactly as the original loop bound did.
        for i in range(len(link_attrs) // 2 - 1):
            res.setdefault(link_attrs[2 * i], link_attrs[2 * i + 1])
        res.setdefault("dt", datetime.datetime.today())
        res.setdefault('tag', tag)
        res.setdefault('dy_name', dy_name)
        res.setdefault('dy_num', dy_num)
        res.setdefault('title0', title0)
        res.setdefault('img', img)
        res.setdefault('gif', gif)
        return res

    def summary_data180(self, csv_path="C:\\Users\\lenovo\\Desktop\\dy180.csv"):
        """Scrape all rooms, write them to ``csv_path`` and return the DataFrame.

        ``csv_path`` defaults to the location the script always used, so
        existing callers are unaffected; pass another path to run the scraper
        on a different machine.
        """
        import pandas as pd

        rows = [self.set_data(item) for item in self.tv_spyder()]
        df_tmp = pd.DataFrame(rows)
        df_tmp.to_csv(csv_path)
        return df_tmp

    def main(self):
        """Scrape the directory page and store every room as an ``Info`` row.

        Django is configured with the ``minicms.settings`` module unless
        ``DJANGO_SETTINGS_MODULE`` is already set in the environment.
        """
        os.environ.setdefault("DJANGO_SETTINGS_MODULE", "minicms.settings")
        django.setup()
        from tv.models import Info
        from django.utils import timezone

        df = self.summary_data180()
        print(df.columns)
        # The positional indices below follow alphabetical column order (note
        # the alphabetical keyword order). Recent pandas keeps dict insertion
        # order instead of sorting, so sort explicitly to keep every value in
        # the intended model field.
        df = df.reindex(columns=sorted(df.columns))
        for row in df.itertuples(index=False, name=None):
            Info.objects.create(data_rid=row[0],
                                data_rpos=row[1],
                                data_sid=row[2],
                                data_sub_rt=row[3],
                                data_tid=row[4],
                                # Column 5 (the scrape-time 'dt') is replaced
                                # by the timezone-aware insertion time.
                                dt=timezone.now(),
                                dy_name=row[6],
                                dy_num=row[7],
                                gif=row[8],
                                href=row[9],
                                img=row[10],
                                tag=row[11],
                                target=row[12],
                                title0=row[13]
                                )
        print("执行完毕")
# Module-level scraper instance. The main() call below is commented out, so
# loading this module only constructs the object (its __init__ does nothing).
dyspyder = DySpyder()
#dyspyder.main()
后续
- 随着时间更新, 每隔 10min 自动写入一次数据库——可以获取 Tag 或者用户的规律
- 增加 虎牙-战旗-龙珠 的数据
- 增加图片进库和自己定义的页面实时优化; 实现多直播平台的归一化推荐
Django 存库模板
from django.db import models
# Create your models here.
class Info(models.Model):
    """Database record for one scraped Douyu live-room entry.

    All scraped values are stored as text; only ``dt`` is a real datetime.
    """

    # Attributes taken from the room link (the first is labelled "room ID";
    # the next four carry empty admin labels).
    data_rid = models.CharField(verbose_name="房间ID", max_length=20)
    data_rpos = models.CharField(verbose_name="", max_length=20)
    data_sid = models.CharField(verbose_name="", max_length=20)
    data_sub_rt = models.CharField(verbose_name="", max_length=20)
    data_tid = models.CharField(verbose_name="", max_length=20)
    # Timestamp of the record.
    dt = models.DateTimeField(verbose_name="时间")
    # Account name and viewer count (the count is kept as text).
    dy_name = models.CharField(verbose_name="账号名字", max_length=50)
    dy_num = models.CharField(verbose_name="观看数", max_length=20)
    # Preview media, room link, category tag, link target and room title.
    gif = models.CharField(verbose_name="GIF", max_length=120)
    href = models.CharField(verbose_name="房间url", max_length=20)
    img = models.CharField(verbose_name="IMG_url", max_length=120)
    tag = models.CharField(verbose_name="标签", max_length=120)
    target = models.CharField(verbose_name="目标", max_length=20)
    title0 = models.CharField(verbose_name="标题", max_length=120)

    class Meta:
        verbose_name = '斗鱼时间信息'
        verbose_name_plural = '斗鱼时间信息180条'

    def __str__(self):
        """Return "<account name>_<title>" as the display name."""
        return "_".join((self.dy_name, self.title0))
class ImgTools(models.Model):
    """Database record pairing an online image URL with a local path."""

    # Online location of the image.
    img_url = models.URLField("线上路径")
    # Timestamp of the record.
    dt = models.DateTimeField(verbose_name="时间")
    # Room ID the image is associated with.
    data_rid = models.CharField(verbose_name="房间ID", max_length=20)
    # Local path of the image (stored in a URL field).
    upload_to = models.URLField("本地路径")
TXT 爬取更新
def find_min(nums):
    """Locate the first element that is followed by a larger one.

    Scans ``nums`` from the left and returns ``(index, value)`` for the first
    position whose successor is strictly greater, i.e. the point where the
    sequence starts to rise.

    If no element is followed by a larger one (including the single-element
    case), the last element is returned. For empty input the function returns
    ``None``.
    """
    count = len(nums)
    if count == 0:
        return None
    # Stop one short of the end so that nums[i + 1] always exists.
    for i in range(count - 1):
        if nums[i + 1] > nums[i]:
            return i, nums[i]
    # The sequence never rises, so its last element is its minimum.
    last = count - 1
    return last, nums[last]
def set_urls(book_id):
    """Return the chapter table of a book on biqudu.com.

    Downloads ``http://www.biqudu.com/<book_id>/`` and extracts every
    ``<dd> <a href=...>title</a></dd>`` entry into a DataFrame with the
    columns ``url``, ``title`` and ``num`` (the integer in the chapter file
    name). Rows before the position reported by ``find_min`` on the ``num``
    column are dropped.

    When the page contains no chapter links an empty DataFrame with the same
    three columns is returned.
    """
    import pandas as pd

    url = "http://www.biqudu.com/" + book_id + "/"
    partern = r".*<dd> <a href=(.*?)>(.*?)</a></dd>.*"
    # NOTE: volume divisions are not handled by this approach; to be improved.
    matches = re.findall(partern, open_url(url))
    # Building from the list of (url, title) tuples also works for zero
    # matches; an empty NumPy array would not fit the two column names.
    df1 = pd.DataFrame(matches, columns=["url", "title"])
    df1["num"] = [int(re.findall(r".*/(.*?).html", x)[0]) for x in df1["url"]]
    if df1.empty:
        return df1
    # Keep everything from the point where the chapter numbers start rising.
    start_index = find_min(df1["num"])[0]
    return df1[start_index:]
# 为单独一个小说页面爬取;txt 文档; content;
def detail(url="http://www.biqudu.com/21_21470/1394112.html"):
    """Fetch one chapter page and return its ``<div id="content">`` element.

    ``url`` defaults to the sample chapter this function was written
    against, so calling ``detail()`` without arguments behaves as before.

    Raises IndexError when the page has no ``div`` with id ``content``.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(open_url(url), 'html.parser')
    return soup.findAll('div', id="content")[0]
# print(detail())
def test(request):
    """Render ``base_test.html`` with the result of ``detail()`` as ``content``."""
    context = {"content": detail()}
    return render(request, "base_test.html", context)
* 近期会花精力弄微信小程序, 爬虫放置一段时间。 ==== END ====*