python 特征选择绘图 + mine

demo代码：

# _*_coding:UTF-8_*_
import numpy as np
import sys 
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import sys 
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
import os
from minepy import MINE

def iterbrowse(path):
    """Lazily yield the path of every file found under *path*.

    The tree is traversed with ``os.walk``; each yielded value is the
    walked directory joined with the file name. Directories themselves
    are never yielded.
    """
    for entry in os.walk(path):
        root, names = entry[0], entry[2]
        for full_path in (os.path.join(root, name) for name in names):
            yield full_path


def get_data(filename, expected_fields=78):
    """Parse a tab-separated feature file into rows of floats.

    Every line must split into exactly *expected_fields* tab-separated
    fields. The first three fields are skipped and the remaining ones are
    converted to ``float``.

    Args:
        filename: Path of the text file to read.
        expected_fields: Required number of tab-separated fields per line
            (defaults to 78, the value that was previously hard-coded).

    Returns:
        A list with one entry per line, each a list of floats built from
        the fields at index 3 onward.

    Raises:
        ValueError: If a line has a different number of fields, or a field
            from index 3 onward cannot be converted to ``float``.
    """
    rows = []
    with open(filename) as f:
        # Iterate the file object directly instead of loading every line
        # into memory first.
        for lineno, line in enumerate(f, 1):
            fields = line.split("\t")
            if len(fields) != expected_fields:
                raise ValueError(
                    "%s:%d: expected %d tab-separated fields, got %d: %r"
                    % (filename, lineno, expected_fields, len(fields), line)
                )
            rows.append([float(n) for n in fields[3:]])
    return rows


if __name__ == '__main__':
    # Demo driver: load labelled samples, rank the features with an
    # extra-trees classifier, plot every feature per label group, then
    # print the MIC between each feature and the label vector.
    neg_file = "cc_data/black_all.txt"
    pos_file = "cc_data/white_all.txt"
    X = []
    y = []
    # Rows from pos_file (the "white" set) are labelled 0.
    if os.path.isfile(pos_file):
        if pos_file.endswith('.txt'):
            pos_set = np.genfromtxt(pos_file)
        elif pos_file.endswith('.npy'):
            pos_set = np.load(pos_file)
        X.extend(pos_set)
        y += [0] * len(pos_set)
    # Rows from neg_file (the "black" set) are labelled 1.
    if os.path.isfile(neg_file):
        if neg_file.endswith('.txt'):
            neg_set = np.genfromtxt(neg_file)
        elif neg_file.endswith('.npy'):
            neg_set = np.load(neg_file)
        X.extend(neg_set)
        y += [1] * len(neg_set)

    print("len of X:", len(X))
    print("X sample:", X[:3])
    print("len of y:", len(y))
    print("y sample:", y[:3])
    # Drop the first three columns of every row; get_data() skips the same
    # three leading fields for the verification files below.
    X = [x[3:] for x in X]
    print("filtered X sample:", X[:3])

    # Column names are the strings "6", "7", ... - one per remaining feature.
    cols = [str(i + 6) for i in range(len(X[0]))]
    clf = ExtraTreesClassifier()
    clf.fit(X, y)
    print(clf.feature_importances_)
    print("Features sorted by their score:")
    print(sorted(zip(clf.feature_importances_, cols), reverse=True))

    # Extra "black" rows gathered from every file under todo/top (label 3).
    black_verify = []
    for f in iterbrowse("todo/top"):
        print(f)
        black_verify += get_data(f)
    # NOTE(review): an earlier run reportedly failed with
    # "ValueError: operands could not be broadcast together with shapes
    # (1,74) (75,) (1,74)" - presumably a column-count mismatch between
    # these rows and X; confirm the inputs have matching widths.
    print(black_verify)
    black_verify_labels = [3] * len(black_verify)

    # Extra "white" rows (label 2).
    white_verify = get_data("todo/white_verify.txt")
    print(white_verify)
    white_verify_labels = [2] * len(white_verify)

    # Loaded and printed only; not used by the plots or the MIC loop.
    unknown_verify = get_data("todo/pek_feature74.txt")
    print(unknown_verify)

    # extend data
    X = np.concatenate((X, black_verify))
    y += black_verify_labels
    X = np.concatenate((X, white_verify))
    y += white_verify_labels

    #################################### plot ####################################
    data_train = pd.DataFrame(X)
    data_train.columns = cols

    # add label column
    data_train["label"] = pd.Series(y)

    print(data_train.info())
    print(data_train.columns)

    import matplotlib.pyplot as plt

    for col in cols:
        # One figure per feature. The previous code first created a
        # 20x16 / dpi=8 figure and then immediately opened a second default
        # figure that received all the plots, leaving an empty extra window
        # per column; only the figure that is actually drawn on is kept.
        plt.figure()
        data_train[data_train.label == 0.0][col].plot()
        data_train[data_train.label == 1.0][col].plot()
        data_train[data_train.label == 2.0][col].plot()
        data_train[data_train.label == 3.0][col].plot()
        plt.xlabel(u"sample data id")
        plt.ylabel(u"value")
        plt.title(col)
        plt.legend((u'white', u'black', u"white-todo", u"black-todo"), loc='best')
        plt.show()

    print("calculate MINE mic value:")
    for col in cols:
        # Keep the column name and its MIC on the same output line.
        print(col, end=" ")
        mine = MINE(alpha=0.6, c=15,
                    est="mic_approx")  # http://minepy.readthedocs.io/en/latest/python.html#second-example
        mine.compute_score(data_train[col], y)
        print("MIC=", mine.mic())

    # NOTE(review): exits with a non-zero status even after a successful
    # run; kept as-is in case something relies on it - confirm intent.
    sys.exit(-1)

代码中 `# extend data` 注释之后拼接进 X 的样本（标签为 2 和 3）表示待预测的数据。

关于 MIC（Maximal Information Coefficient，最大信息系数）：

from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from minepy import MINE


# Shared random source for the demo, seeded with 0.
rs = np.random.RandomState(0)

def mysubplot(x, y, numRows, numCols, plotNum,
              xlim=(-4, 4), ylim=(-4, 4)):
    """Plot *x* against *y* in one subplot titled with Pearson r and MIC.

    Args:
        x, y: Samples to compare and plot.
        numRows, numCols, plotNum: Grid shape and slot passed to
            ``plt.subplot``.
        xlim, ylim: Axis limits of the subplot.

    Returns:
        The axes object created by ``plt.subplot``.
    """
    # MIC between the two samples, rounded to one decimal.
    estimator = MINE(alpha=0.6, c=15, est="mic_approx")
    estimator.compute_score(x, y)
    mic = np.around(estimator.mic(), 1)

    # Pearson correlation, rounded the same way.
    pearson = np.corrcoef(x, y)
    r = np.around(pearson[0, 1], 1)

    ax = plt.subplot(numRows, numCols, plotNum, xlim=xlim, ylim=ylim)
    title = 'Pearson r=%.1f\nMIC=%.1f' % (r, mic)
    ax.set_title(title, fontsize=10)

    # Strip the frame and both axes so only the point cloud is visible.
    ax.set_frame_on(False)
    for axis in (ax.axes.get_xaxis(), ax.axes.get_yaxis()):
        axis.set_visible(False)

    ax.plot(x, y, ',')
    ax.set_xticks([])
    ax.set_yticks([])
    return ax

def rotation(xy, t):
    """Return *xy* multiplied on the right by the 2x2 rotation matrix for *t*.

    Args:
        xy: Array-like whose last dimension has size 2 (one point per row).
        t: Angle in radians.

    Returns:
        ``np.dot(xy, [[cos t, -sin t], [sin t, cos t]])``.
    """
    cos_t = np.cos(t)
    sin_t = np.sin(t)
    matrix = [
        [cos_t, -sin_t],
        [sin_t, cos_t],
    ]
    return np.dot(xy, matrix)

def mvnormal(n=1000):
    """Fill subplot slots 1-7 of the 3x7 grid with bivariate normal samples.

    One sample of *n* points is drawn from ``rs`` for each correlation
    value, from 1.0 down to -1.0, and handed to ``mysubplot``.
    """
    correlations = (1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0)
    for slot, rho in enumerate(correlations, start=1):
        sample = rs.multivariate_normal([0, 0], [[1, rho], [rho, 1]], n)
        mysubplot(sample[:, 0], sample[:, 1], 3, 7, slot)

def rotnormal(n=1000):
    """Fill subplot slots 8-14 of the 3x7 grid with rotated normal samples.

    A single sample of *n* points is drawn from ``rs`` with covariance
    ``[[1, 1], [1, 1]]`` and then plotted once per rotation angle, from 0
    to pi/2.
    """
    angles = [0, np.pi/12, np.pi/6, np.pi/4, np.pi/2-np.pi/6,
              np.pi/2-np.pi/12, np.pi/2]
    base = rs.multivariate_normal([0, 0], [[1, 1], [1, 1]], n)
    for slot, angle in enumerate(angles, start=8):
        turned = rotation(base, angle)
        mysubplot(turned[:, 0], turned[:, 1], 3, 7, slot)

def others(n=1000):
    """Fill subplot slots 15-21 of the 3x7 grid with non-linear patterns.

    Each panel is built from *n* random points and passed to ``mysubplot``
    with panel-specific axis limits. All random draws come from the seeded
    module-level ``rs``, so the whole row is reproducible.

    Note: the ``1/3`` and ``1/8`` expressions rely on true division
    (``from __future__ import division`` on Python 2).
    """
    # Quartic curve with uniform noise.
    x = rs.uniform(-1, 1, n)
    y = 4*(x**2-0.5)**2 + rs.uniform(-1, 1, n)/3
    mysubplot(x, y, 3, 7, 15, (-1, 1), (-1/3, 1+1/3))

    # Uniform square rotated by -pi/8 ...
    y = rs.uniform(-1, 1, n)
    xy = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), axis=1)
    xy = rotation(xy, -np.pi/8)
    lim = np.sqrt(2+np.sqrt(2)) / np.sqrt(2)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 16, (-lim, lim), (-lim, lim))

    # ... and rotated by a further -pi/8.
    xy = rotation(xy, -np.pi/8)
    lim = np.sqrt(2)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 17, (-lim, lim), (-lim, lim))

    # Parabola with uniform noise.
    y = 2*x**2 + rs.uniform(-1, 1, n)
    mysubplot(x, y, 3, 7, 18, (-1, 1), (-1, 3))

    # Parabola mirrored at random: each point gets a sign of -1 or +1.
    # The noise is drawn before the signs, matching the original order of
    # draws from rs. randint(0, 2) replaces the deprecated
    # random_integers(0, 1); both pick from {0, 1}.
    noise = rs.uniform(0, 0.5, n)
    signs = np.array([-1, 1])[rs.randint(0, 2, size=n)]
    y = (x**2 + noise) * signs
    mysubplot(x, y, 3, 7, 19, (-1.5, 1.5), (-1.5, 1.5))

    # Noisy circle; y must be computed before x is overwritten.
    y = np.cos(x * np.pi) + rs.uniform(0, 1/8, n)
    x = np.sin(x * np.pi) + rs.uniform(0, 1/8, n)
    mysubplot(x, y, 3, 7, 20, (-1.5, 1.5), (-1.5, 1.5))

    # Four unit-variance clusters, one per quadrant. These previously used
    # the unseeded global np.random, which made this panel differ between
    # runs; they now draw from rs like every other panel.
    quarter = int(n/4)
    identity = [[1, 0], [0, 1]]
    centers = ([3, 3], [-3, 3], [-3, -3], [3, -3])
    clusters = [rs.multivariate_normal(center, identity, quarter)
                for center in centers]
    xy = np.concatenate(clusters, axis=0)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 21, (-7, 7), (-7, 7))

# Build the 3x7 demo grid on a white-background figure. The helpers draw
# from the shared module-level `rs`, so the order of these calls determines
# which random samples each panel receives.
plt.figure(facecolor='white')
mvnormal(n=800)   # subplot slots 1-7
rotnormal(n=200)  # subplot slots 8-14
others(n=800)     # subplot slots 15-21
plt.tight_layout()
plt.show()

posted @ 2018-05-24 10:05 bonelee 阅读(1736) 评论(2) 收藏举报

刷新页面返回顶部

将者，智、信、仁、勇、严也。

Hi，我是李智华，华为-安全AI算法专家，欢迎来到安全攻防对抗的有趣世界。

python 特征选择绘图 + mine

公告

将者，智、信、仁、勇、严也。

Hi，我是李智华，华为-安全AI算法专家，欢迎来到安全攻防对抗的有趣世界。

python 特征选择 绘图 + mine

公告

python 特征选择绘图 + mine