Machine Learning Notes
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
import jieba
import pandas as pd


def datasets_demo():
    """
    Using the sklearn datasets API
    :return:
    """
    # Load the dataset
    iris = load_iris()
    print("Iris dataset:\n", iris)
    print("Dataset description:\n", iris["DESCR"])
    print("Feature names:\n", iris.feature_names)
    print("Feature values:\n", iris.data, iris.data.shape)

    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print("Training-set features:\n", x_train, x_train.shape)

    return None


def dict_demo():
    """
    Feature extraction from dicts
    :return:
    """
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}, {'city': '深圳', 'temperature': 30}]
    # 1. Instantiate a transformer
    transfer = DictVectorizer(sparse=True)

    # 2. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray(), type(data_new))
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


def count_demo():
    """
    Text feature extraction with CountVectorizer
    :return:
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # 1. Instantiate a transformer
    transfer = CountVectorizer(stop_words=["is", "too"])

    # 2. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


def count_chinese_demo():
    """
    Chinese text feature extraction with CountVectorizer;
    the tokens are separated by spaces beforehand, because
    CountVectorizer cannot segment Chinese text on its own
    :return:
    """
    data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]
    # 1. Instantiate a transformer
    transfer = CountVectorizer()

    # 2. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


def cut_word(text):
    """
    Chinese word segmentation: "我爱北京天安门" --> "我 爱 北京 天安门"
    :param text:
    :return:
    """
    return " ".join(list(jieba.cut(text)))


def count_chinese_demo2():
    """
    Chinese text feature extraction with automatic segmentation
    :return:
    """
    # Segment the Chinese text first
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # print(data_new)
    # 1. Instantiate a transformer
    transfer = CountVectorizer(stop_words=["一种", "所以"])

    # 2. Call fit_transform()
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


def tfidf_demo():
    """
    Text feature extraction with TF-IDF
    :return:
    """
    # Segment the Chinese text first
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # print(data_new)
    # 1. Instantiate a transformer
    transfer = TfidfVectorizer(stop_words=["一种", "所以"])

    # 2. Call fit_transform()
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None
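
# ----------------------------------------------------------------------
# A minimal sketch (my addition, not from the original notes) of what
# TfidfVectorizer computes under its defaults: tf is the raw term count,
# idf = ln((1 + n_docs) / (1 + df)) + 1 (smooth_idf=True), and each row
# is then L2-normalized. The function name tfidf_by_hand_demo and the
# tiny English corpus are made up for illustration.
def tfidf_by_hand_demo():
    """
    Recompute TfidfVectorizer's default output by hand to verify the formula.
    """
    import numpy as np

    corpus = ["life is short, i like python", "life is long, i dislike python"]
    counts = CountVectorizer().fit_transform(corpus).toarray()  # raw term counts (tf)
    n_docs = counts.shape[0]
    df = (counts > 0).sum(axis=0)              # document frequency of each term
    idf = np.log((1 + n_docs) / (1 + df)) + 1  # smoothed idf (sklearn default)
    tfidf = counts * idf
    tfidf = tfidf / np.linalg.norm(tfidf, axis=1, keepdims=True)  # L2-normalize rows
    print("Manual TF-IDF:\n", tfidf)
    print("Sklearn TF-IDF:\n", TfidfVectorizer().fit_transform(corpus).toarray())

    return None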

def minmax_demo():
    """
    Min-max normalization
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("dating.txt")
    data = data.iloc[:, :3]
    print("data:\n", data)

    # 2. Instantiate a transformer
    transfer = MinMaxScaler(feature_range=(2, 3))

    # 3. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)

    return None


def stand_demo():
    """
    Standardization
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("dating.txt")
    data = data.iloc[:, :3]
    print("data:\n", data)

    # 2. Instantiate a transformer
    transfer = StandardScaler()

    # 3. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)

    return None


def variance_demo():
    """
    Filter out low-variance features
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("factor_returns.csv")
    data = data.iloc[:, 1:-2]
    print("data:\n", data)

    # 2. Instantiate a transformer
    transfer = VarianceThreshold(threshold=10)

    # 3. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new, data_new.shape)

    # Pearson correlation between two chosen pairs of variables
    r1 = pearsonr(data["pe_ratio"], data["pb_ratio"])
    print("Correlation between pe_ratio and pb_ratio:\n", r1)
    r2 = pearsonr(data['revenue'], data['total_expense'])
    print("Correlation between revenue and total_expense:\n", r2)

    return None


def pca_demo():
    """
    PCA dimensionality reduction
    :return:
    """
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]

    # 1. Instantiate a transformer; n_components=0.95 keeps enough
    #    components to explain 95% of the variance
    transfer = PCA(n_components=0.95)

    # 2. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)

    return None


if __name__ == "__main__":
    # Demo 1: using the sklearn datasets API
    # datasets_demo()
    # Demo 2: feature extraction from dicts
    # dict_demo()
    # Demo 3: text feature extraction with CountVectorizer
    # count_demo()
    # Demo 4: Chinese text feature extraction with CountVectorizer
    # count_chinese_demo()
    # Demo 5: Chinese text feature extraction with automatic segmentation
    # count_chinese_demo2()
    # Demo 6: Chinese word segmentation
    # print(cut_word("我爱北京天安门"))
    # Demo 7: text feature extraction with TF-IDF
    # tfidf_demo()
    # Demo 8: min-max normalization
    # minmax_demo()
    # Demo 9: standardization
    # stand_demo()
    # Demo 10: low-variance feature filtering
    # variance_demo()
    # Demo 11: PCA dimensionality reduction
    pca_demo()
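
# ----------------------------------------------------------------------
# A minimal sketch (my addition, not from the original notes) of where
# PCA's 0.95 threshold comes from: the explained variance ratio is the
# eigenvalue spectrum of the sample covariance matrix, normalized to sum
# to 1, and n_components=0.95 keeps the fewest leading components whose
# ratios add up to at least 0.95. The function name pca_by_hand_demo is
# made up for illustration.
def pca_by_hand_demo():
    """
    Recover PCA's explained variance ratio from the covariance matrix.
    """
    import numpy as np

    data = np.array([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]], dtype=float)
    centered = data - data.mean(axis=0)      # PCA operates on centered data
    cov = np.cov(centered, rowvar=False)     # 4x4 sample covariance (ddof=1)
    eigvals = np.linalg.eigvalsh(cov)[::-1]  # eigenvalues, descending
    ratio = eigvals / eigvals.sum()          # explained variance ratio
    print("Manual explained variance ratio:\n", ratio)
    print("Sklearn (kept components only):\n",
          PCA(n_components=0.95).fit(data).explained_variance_ratio_)

    return None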