Machine Learning Notes

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
import jieba
import pandas as pd


def datasets_demo():
    """
    Basic usage of the sklearn built-in datasets
    :return:
    """
    # Load the dataset
    iris = load_iris()
    print("Iris dataset:\n", iris)
    print("Dataset description:\n", iris["DESCR"])
    print("Feature names:\n", iris.feature_names)
    print("Feature values:\n", iris.data, iris.data.shape)

    # Split into training and test sets: test_size=0.2 holds out 20% of the
    # 150 samples; random_state fixes the shuffle so the split is reproducible
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print("Training-set features:\n", x_train, x_train.shape)

    return None


def dict_demo():
    """
    Dictionary feature extraction
    :return:
    """
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}, {'city': '深圳', 'temperature': 30}]
    # 1. Instantiate a transformer
    transfer = DictVectorizer(sparse=True)

    # 2. Call fit_transform(); the result is a scipy sparse matrix,
    #    so .toarray() is needed to view it densely
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray(), type(data_new))
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


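# A small sketch of my own, not part of the original notes: with sparse=False
# the vectorizer returns a dense ndarray directly, so no .toarray() is needed.
# Each distinct 'city' value becomes its own one-hot column, while the numeric
# 'temperature' column passes through unchanged.
def dict_dense_demo():
    """Dictionary feature extraction with dense output (illustrative)."""
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}]
    transfer = DictVectorizer(sparse=False)
    print(transfer.fit_transform(data))   # e.g. [[0. 1. 100.], [1. 0. 60.]]
    print(transfer.get_feature_names_out())

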
def count_demo():
    """
    Text feature extraction: CountVectorizer
    :return:
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # 1. Instantiate a transformer; tokens listed in stop_words are
    #    excluded from the vocabulary
    transfer = CountVectorizer(stop_words=["is", "too"])

    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


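# A small sketch of my own, not from the original notes: vocabulary_ maps each
# token to its column index, which makes the count matrix easier to read. Note
# that the default token_pattern only keeps tokens of 2+ word characters, so
# the single letter "i" is dropped.
def count_vocab_demo():
    """Inspect the learned vocabulary of CountVectorizer (illustrative)."""
    data = ["life is short,i like like python"]
    transfer = CountVectorizer()
    data_new = transfer.fit_transform(data)
    print(transfer.vocabulary_)   # token -> column index, e.g. {'is': 0, 'life': 1, ...}
    print(data_new.toarray())     # the 'like' column holds 2

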
def count_chinese_demo():
    """
    Chinese text feature extraction: CountVectorizer
    :return:
    """
    data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]
    # 1. Instantiate a transformer. CountVectorizer splits on whitespace, so
    #    Chinese must be pre-segmented; single-character tokens such as
    #    "上" and "升" are dropped by the default token_pattern
    transfer = CountVectorizer()

    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


def cut_word(text):
    """
    Segment Chinese text: "我爱北京天安门" --> "我 爱 北京 天安门"
    :param text:
    :return:
    """
    # jieba.cut returns a generator of tokens; join them with spaces
    return " ".join(list(jieba.cut(text)))


def count_chinese_demo2():
    """
    Chinese text feature extraction with automatic word segmentation
    :return:
    """
    # Segment the Chinese text first
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # print(data_new)
    # 1. Instantiate a transformer; treat "一种" and "所以" as stop words
    transfer = CountVectorizer(stop_words=["一种", "所以"])

    # 2. Call fit_transform
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


def tfidf_demo():
    """
    Text feature extraction with TF-IDF
    :return:
    """
    # Segment the Chinese text first
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # print(data_new)
    # 1. Instantiate a transformer
    transfer = TfidfVectorizer(stop_words=["一种", "所以"])

    # 2. Call fit_transform
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None


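# A by-hand check of my own, not from the original notes. By default,
# TfidfVectorizer uses the smoothed idf(t) = ln((1 + n) / (1 + df(t))) + 1,
# multiplies it by the raw term count, and L2-normalizes each row.
def tfidf_by_hand_demo():
    """Reproduce TfidfVectorizer's output on a tiny corpus (illustrative)."""
    import numpy as np
    docs = ["a b a", "b c"]
    tv = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")  # keep 1-char tokens
    X = tv.fit_transform(docs).toarray()

    counts = np.array([[2, 1, 0], [0, 1, 1]], dtype=float)  # columns: a, b, c
    n = 2                                  # number of documents
    df = np.array([1, 2, 1])               # documents containing each term
    idf = np.log((1 + n) / (1 + df)) + 1   # smooth_idf=True default
    tfidf = counts * idf
    tfidf /= np.linalg.norm(tfidf, axis=1, keepdims=True)  # norm='l2' default
    print(np.allclose(X, tfidf))  # True

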
def minmax_demo():
    """
    Normalization (min-max scaling)
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("dating.txt")
    data = data.iloc[:, :3]
    print("data:\n", data)

    # 2. Instantiate a transformer; feature_range rescales each column into [2, 3]
    transfer = MinMaxScaler(feature_range=(2, 3))

    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)

    return None


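# A by-hand check of my own, not from the original notes: MinMaxScaler computes
# X_std = (X - X.min) / (X.max - X.min) per column, then stretches the result
# into feature_range via X_scaled = X_std * (mx - mn) + mn.
def minmax_by_hand_demo():
    """Verify MinMaxScaler against its formula on toy data (illustrative)."""
    import numpy as np
    X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
    scaled = MinMaxScaler(feature_range=(2, 3)).fit_transform(X)
    mn, mx = 2, 3
    by_hand = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) * (mx - mn) + mn
    print(np.allclose(scaled, by_hand))  # True

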
def stand_demo():
    """
    Standardization
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("dating.txt")
    data = data.iloc[:, :3]
    print("data:\n", data)

    # 2. Instantiate a transformer
    transfer = StandardScaler()

    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None


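# Another by-hand check of my own: StandardScaler z-scores each column,
# X' = (X - mean) / std with the population std (ddof=0), so every transformed
# column ends up with mean 0 and standard deviation 1.
def stand_by_hand_demo():
    """Verify StandardScaler against the z-score formula (illustrative)."""
    import numpy as np
    X = np.array([[1.0, 4.0], [2.0, 5.0], [3.0, 9.0]])
    scaled = StandardScaler().fit_transform(X)
    by_hand = (X - X.mean(axis=0)) / X.std(axis=0)  # np.std defaults to ddof=0
    print(np.allclose(scaled, by_hand))  # True

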
def variance_demo():
    """
    Filter out low-variance features
    :return:
    """
    # 1. Load the data
    data = pd.read_csv("factor_returns.csv")
    data = data.iloc[:, 1:-2]
    print("data:\n", data)

    # 2. Instantiate a transformer; columns with variance <= 10 are dropped
    transfer = VarianceThreshold(threshold=10)

    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new, data_new.shape)

    # Compute the correlation coefficient between two variables
    r1 = pearsonr(data["pe_ratio"], data["pb_ratio"])
    print("Correlation coefficient:\n", r1)
    r2 = pearsonr(data['revenue'], data['total_expense'])
    print("Correlation between revenue and total_expense:\n", r2)

    return None


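# My note, not from the original: pearsonr returns (r, p-value), where
# r = cov(x, y) / (std(x) * std(y)) lies in [-1, 1]; a feature pair with |r|
# near 1 is nearly redundant, which is why one of the two can be dropped.
def pearson_by_hand_demo():
    """Verify pearsonr's r against the covariance formula (illustrative)."""
    import numpy as np
    x = np.array([1.0, 2.0, 3.0, 4.0])
    y = np.array([2.0, 4.0, 5.0, 9.0])
    r, p = pearsonr(x, y)
    by_hand = np.cov(x, y, ddof=0)[0, 1] / (x.std() * y.std())
    print(np.isclose(r, by_hand))  # True

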
def pca_demo():
    """
    Dimensionality reduction with PCA
    :return:
    """
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]

    # 1. Instantiate a transformer; a float n_components in (0, 1) means
    #    "keep enough components to explain at least that share of the variance"
    transfer = PCA(n_components=0.95)

    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None


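# A companion sketch of my own: with an integer n_components, PCA keeps exactly
# that many components; explained_variance_ratio_ reports the share of variance
# each retained component captures.
def pca_variance_demo():
    """Inspect the variance explained by the retained components (illustrative)."""
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    transfer = PCA(n_components=2)
    print(transfer.fit_transform(data))
    print(transfer.explained_variance_ratio_)

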
if __name__ == "__main__":
    # Demo 1: using the sklearn built-in datasets
    # datasets_demo()
    # Demo 2: dictionary feature extraction
    # dict_demo()
    # Demo 3: text feature extraction with CountVectorizer
    # count_demo()
    # Demo 4: Chinese text feature extraction with CountVectorizer
    # count_chinese_demo()
    # Demo 5: Chinese text feature extraction with automatic segmentation
    # count_chinese_demo2()
    # Demo 6: Chinese word segmentation
    # print(cut_word("我爱北京天安门"))
    # Demo 7: text feature extraction with TF-IDF
    # tfidf_demo()
    # Demo 8: normalization
    # minmax_demo()
    # Demo 9: standardization
    # stand_demo()
    # Demo 10: low-variance feature filtering
    # variance_demo()
    # Demo 11: PCA dimensionality reduction
    pca_demo()