机器学习——数据预处理

基础

机器学习主要分为两类：监督学习和非监督学习。监督学习相当于“督促”计算机去学习——训练数据带有明确的目标（标签），直接告诉它要学的是什么；非监督学习则是让计算机“自学成才”——不预先设定目标，由它自己从数据中发现规律，学完之后再告诉我们它学到了什么。

 1 # encoding=utf-8
 2 
 3 from sklearn import linear_model
 4 import matplotlib.pyplot as plt
 5 import numpy as np
 6 
 7 # 房屋面积与价格历史数据（csv文件）
 8 data = np.array([[150, 6450], [200, 7450], [250, 8450], [300, 9450], [350, 11450], [400, 15450], [600, 18450]])
 9 # print data[:, 0].reshape(-1, 1)
10 # plt.scatter(data[:, 0], data[:, 1], color='blue')
11 # plt.show()
12 
13 # 线性模型
14 # regr = linear_model.LinearRegression()
15 # 拟合
16 # regr.fit(data[:, 0].reshape(-1, 1), data[:, 1])
17 # 直线的斜率、截距
18 # a, b = regr.coef_, regr.intercept_
19 # print a, b
20 # plt.plot(data[:,0],regr.predict(data[:,0].reshape(-1,1)),color='red',linewidth=4)
21 # plt.scatter(data[:, 0], regr.predict(data[:, 0].reshape(-1, 1)), color='red')
22 # 预测175天和800天房价数据
23 # print regr.predict(175)
24 # print regr.predict(800)
25 # plt.show()

数据预处理

导入类库

1 from sklearn.feature_extraction import DictVectorizer
2 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
3 import jieba
4 from sklearn.feature_selection import VarianceThreshold
5 from sklearn.preprocessing import StandardScaler, MinMaxScaler

数据处理

字典数据抽取

代码

 1 def dictvec():
 2     '''
 3     字典数据抽取：DictVectorizer
 4     sprase：为False时生成矩阵形式
 5     fit_transform：训练数据集
 6     get_feature_names：获取特征名，即列名或表头
 7     inverse_transform：得到每行数据中为1的数据（为1即为存在）
 8     :return:
 9     '''
10     dict = DictVectorizer(sparse=False)
11     data = dict.fit_transform(
12         [{'city': '北京', 'pos': '北方', 'temperature': 100},
13          {'city': '上海', 'pos': '南方', 'temperature': 60},
14          {'city': '深圳', 'pos': '南方', 'temperature': 30},
15          {'city': '重庆', 'pos': '南方', 'temperature': 70},
16          {'city': '北京', 'pos': '北方', 'temperature': 100}])
17 
18     print(dict.get_feature_names())
19     print(dict.inverse_transform(data))
20     print(data)
21     return None

结果

'''
['city=上海', 'city=北京', 'city=深圳', 'city=重庆', 'pos=北方', 'pos=南方', 'temperature']
[{'city=北京': 1.0, 'pos=北方': 1.0, 'temperature': 100.0}, {'city=上海': 1.0, 'pos=南方': 1.0, 'temperature': 60.0}, {'city=深圳': 1.0, 'pos=南方': 1.0, 'temperature': 30.0}, {'city=重庆': 1.0, 'pos=南方': 1.0, 'temperature': 70.0}, {'city=北京': 1.0, 'pos=北方': 1.0, 'temperature': 100.0}]
[[  0.   1.   0.   0.   1.   0. 100.]
 [  1.   0.   0.   0.   0.   1.  60.]
 [  0.   0.   1.   0.   0.   1.  30.]
 [  0.   0.   0.   1.   0.   1.  70.]
 [  0.   1.   0.   0.   1.   0. 100.]]
'''

英文特征值化

代码

 1 def countvec():
 2     '''
 3     对文本进行特征值化：CountVectorizer对文本中的词可进行统计
 4     排序：会按照英文常用性进行排序
 5     停用：a 等无显著特征的词会被停用
 6     :return: None
 7     '''
 8     cv = CountVectorizer()
 9     data = cv.fit_transform(['this is a test test', 'we have a test'])
10 
11     print(cv.get_feature_names())
12     print(data.toarray())
13     return None

结果

'''
['have', 'is', 'test', 'this', 'we']
[[0 1 2 1 0]
 [1 0 1 0 1]]
'''

中文特征值化

代码

def cutword():
    """Segment three fixed Chinese sentences with jieba.

    Every sentence is passed to ``jieba.cut`` and its tokens are joined with
    single spaces, giving whitespace-delimited text that the vectorizers in
    this file can split into words.

    :return: tuple of three space-joined token strings, in sentence order
    """
    sentences = ('天空灰得像哭过', '离开你以后', '并没有很自由')
    # ' '.join consumes the token stream from jieba.cut directly.
    c1, c2, c3 = (' '.join(jieba.cut(sentence)) for sentence in sentences)
    return c1, c2, c3

 1 def hanzivec():
 2     '''
 3     对文本进行特征值化：CountVectorizer对文本中的词可进行统计
 4     :return: None
 5     '''
 6     c1, c2, c3 = cutword()
 7     cv = CountVectorizer()
 8     print(c1, c2, c3)
 9     data = cv.fit_transform([c1, c2, c3])
10 
11     print(cv.get_feature_names())
12     print(data.toarray())
13     return None

结果

'''
天空 灰得 像 哭 过 离开 你 以后 并 没有 很 自由
['以后', '天空', '没有', '灰得', '离开', '自由']
[[0 1 0 1 0 0]
 [1 0 0 0 1 0]
 [0 0 1 0 0 1]]
'''

词频（TF-IDF）

代码

def tfidfvec():
    """Demonstrate TF-IDF weighting of Chinese text with TfidfVectorizer.

    The sentences are segmented by ``cutword()``; the segmented text, the
    learned vocabulary and the TF-IDF matrix are printed, in that order.

    Background:

    * TF (term frequency): occurrences of a term in a document divided by
      the total number of terms in that document.
    * IDF (inverse document frequency): log(total documents / documents
      containing the term).
    * The larger the TF and IDF values, the more distinctive the term.

    NOTE(review): these are the textbook definitions; the exact weighting
    scikit-learn applies (smoothing, normalisation) may differ -- see the
    TfidfVectorizer documentation.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    tf = TfidfVectorizer()
    weights = tf.fit_transform([c1, c2, c3])

    # get_feature_names() was removed in scikit-learn 1.2; its replacement
    # get_feature_names_out() exists from 1.0 on.  Use whichever is present.
    name_getter = getattr(tf, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = tf.get_feature_names
    # Plain ``str`` items keep the printed list identical across versions.
    print([str(name) for name in name_getter()])
    print(weights.toarray())
    return None

结果

'''
天空 灰得 像 哭 过 离开 你 以后 并 没有 很 自由
['以后', '天空', '没有', '灰得', '离开', '自由']
[[0.         0.70710678 0.         0.70710678 0.         0.        ]
 [0.70710678 0.         0.         0.         0.70710678 0.        ]
 [0.         0.         0.70710678 0.         0.         0.70710678]]
'''

标准化缩放

代码

 1 def stand():
 2     '''
 3     标准化缩放：特征列均值为0，标准差为1
 4     将数据差值很大，但变化率等相近的数据标准化，类似于横坐标是1000,2000,3000，纵坐标是1,2,3
 5     :return:
 6     '''
 7     std = StandardScaler()
 8     # data = std.fit_transform([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])
 9     data = std.fit_transform([[1., 2., 3.], [100., 200., 300.], [1000., 2000., 3000.]])
10     print(data)
11     return None

结果

'''
[[-0.81438366 -0.81438366 -0.81438366]
 [-0.59409956 -0.59409956 -0.59409956]
 [ 1.40848322  1.40848322  1.40848322]]
'''

归一化

代码

def mm():
    """Demonstrate min-max normalisation with MinMaxScaler.

    Similar in spirit to standardisation, but the target range of the scaled
    values can be chosen; here ``feature_range=(2, 3)`` is used, and the
    scaled sample is printed.

    :return: None
    """
    samples = [
        [90, 2, 10, 40],
        [60, 4, 15, 45],
        [75, 3, 13, 46],
    ]
    scaler = MinMaxScaler(feature_range=(2, 3))
    print(scaler.fit_transform(samples))

结果

'''
[[3.         2.         2.         2.        ]
 [2.         3.         3.         2.83333333]
 [2.5        2.5        2.6        3.        ]]
'''

特征选择

代码

 1 def var():
 2     '''
 3     特征选择-删除低方差的特征
 4     threshold：阈值，小于设定阈值方差的特征列将被剔除
 5     注：方差小的，特征不显著
 6     :return:
 7     '''
 8     var = VarianceThreshold(threshold=1.0)
 9     data = var.fit_transform([[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]])
10 
11     print(data)
12     return None

结果

'''
[[0]
 [4]
 [1]]
'''

posted @ 2018-10-04 09:51 BO00097 阅读(651) 评论(0) 编辑收藏举报

刷新页面返回顶部

抿嘴唇

机器学习——数据预处理

基础

数据预处理

导入类库

数据处理

字典数据抽取

代码

结果

英文特征值化

代码

结果

中文特征值化

代码

结果

词频

代码

结果

标准化缩放

代码

结果

归一化

代码

结果

特征选择

代码

结果

公告