Pandas实战 Facebook 数据集预测签到位置

import pandas as pd

# Load the training data
data = pd.read_csv("./facebook/train.csv")

# Preview the first rows of the loaded frame.
data.head()

	row_id	x	y	accuracy	time	place_id
0	0	0.7941	9.0809	54	470702	8523065625
1	1	5.9567	4.7968	13	186555	1757726713
2	2	8.3078	7.0407	74	322648	1137537235
3	3	7.3665	2.5165	65	704587	6567393236
4	4	4.0961	1.1307	31	472130	7440663949

# Basic data processing
# 1. Narrow the data to a small x/y window.
# DataFrame.query returns a new frame instead of filtering in place, so the
# result must be bound back to `data`; otherwise every later step still runs
# on the full, un-narrowed data set. `.copy()` makes the narrowed frame
# independent of the original before new columns are added to it.
# NOTE: the outputs recorded further down in this write-up were captured
# without this re-assignment, so they still show the full data set.
data = data.query("x < 2.5 & x > 2 & y < 1.5 & y > 1.0").copy()
data

	row_id	x	y	accuracy	time	place_id
112	112	2.2360	1.3655	66	623174	7663031065
180	180	2.2003	1.2541	65	610195	2358558474
367	367	2.4108	1.3213	74	579667	6644108708
874	874	2.0822	1.1973	320	143566	3229876087
1022	1022	2.0160	1.1659	65	207993	3244363975
...	...	...	...	...	...	...
29115112	29115112	2.1889	1.2914	168	721885	4606837364
29115204	29115204	2.1193	1.4692	58	563389	2074133146
29115338	29115338	2.0007	1.4852	25	765986	6691588909
29115464	29115464	2.4132	1.4237	61	151918	7396159924
29117493	29117493	2.2948	1.0504	81	79569	1168869217

83197 rows × 6 columns

# Preview the first rows of `data`.
data.head()

	row_id	x	y	accuracy	time	place_id
0	0	0.7941	9.0809	54	470702	8523065625
1	1	5.9567	4.7968	13	186555	1757726713
2	2	8.3078	7.0407	74	322648	1137537235
3	3	7.3665	2.5165	65	704587	6567393236
4	4	4.0961	1.1307	31	472130	7440663949

# Inspect the raw `time` column before converting it.
data["time"]

0           470702
1           186555
2           322648
3           704587
4           472130
             ...  
29118016    399740
29118017    125480
29118018    737758
29118019    764975
29118020    102842
Name: time, Length: 29118021, dtype: int64

# 2. Derive calendar features from the `time` column.
# The raw integers are parsed as second offsets (unit="s") into timestamps.
time_value = pd.to_datetime(data["time"], unit="s")
date = pd.DatetimeIndex(time_value)

# Add day-of-month, day-of-week and hour-of-day as new columns on `data`.
for field in ("day", "weekday", "hour"):
    data[field] = getattr(date, field)

data.head()

	row_id	x	y	accuracy	time	place_id	day	weekday	hour
0	0	0.7941	9.0809	54	470702	8523065625	6	1	10
1	1	5.9567	4.7968	13	186555	1757726713	3	5	3
2	2	8.3078	7.0407	74	322648	1137537235	4	6	17
3	3	7.3665	2.5165	65	704587	6567393236	9	4	3
4	4	4.0961	1.1307	31	472130	7440663949	6	1	11

# 3. Filter out places with few check-ins.
# Count the check-ins per place. Only `row_id` is needed, so select it before
# counting instead of counting every column and discarding the rest.
place_count = data.groupby("place_id")["row_id"].count()

# Preview the places that have more than 3 check-ins.
place_count[place_count > 3].head()

place_id
1000015801     78
1000017288     95
1000025138    563
1000052096    961
1000063498     60
Name: row_id, dtype: int64

# Keep only the rows whose place has more than 3 check-ins.
frequent_places = place_count[place_count > 3].index.values
data_final = data[data["place_id"].isin(frequent_places)]

data_final.head()

	row_id	x	y	accuracy	time	place_id	day	weekday	hour
0	0	0.7941	9.0809	54	470702	8523065625	6	1	10
1	1	5.9567	4.7968	13	186555	1757726713	3	5	3
2	2	8.3078	7.0407	74	322648	1137537235	4	6	17
3	3	7.3665	2.5165	65	704587	6567393236	9	4	3
4	4	4.0961	1.1307	31	472130	7440663949	6	1	11

# Select the feature columns and the target column.
# Features: position, reported accuracy and the derived time fields.
x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
# Target: the place being checked in to.
y = data_final["place_id"]

x.head()

	x	y	accuracy	day	weekday	hour
0	0.7941	9.0809	54	6	1	10
1	5.9567	4.7968	13	3	5	3
2	8.3078	7.0407	74	4	6	17
3	7.3665	2.5165	65	9	4	3
4	4.0961	1.1307	31	6	1	11

# Preview the first target values.
y.head()

0    8523065625
1    1757726713
2    1137537235
3    6567393236
4    7440663949
Name: place_id, dtype: int64

# Split the data set, standardize the features, fit a grid-searched KNN
# classifier and report its scores.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Data set split (default train/test proportions, no fixed random seed).
x_train, x_test, y_train, y_test = train_test_split(x, y)

# 3. Feature engineering (standardization)
# The scaler is fitted on the training split only and then reused on the
# test split, so no test-set statistics are used during fitting.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. KNN estimator
estimator = KNeighborsClassifier()
# Add grid search with cross-validation over the number of neighbours.
param_dict = {"n_neighbors": [3, 5]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5. Model evaluation
#   (1) Compare the predictions with the true values
y_predict = estimator.predict(x_test)
print("y_predict:", y_predict)
print("比对真实值与预测值：", y_test == y_predict)

#   (2) Compute the accuracy on the test split
score = estimator.score(x_test, y_test)
print("准确率：\n", score)

# Grid-search results: best parameters, best cross-validation score,
# the refitted best estimator and the full cross-validation table.
print("最佳参数：\n", estimator.best_params_)
print("最佳结果：\n", estimator.best_score_)
print("最佳估计器：\n", estimator.best_estimator_)
print("最佳验证结果：\n", estimator.cv_results_)

y_predict: [3152373328 1911687695 3376568433 ... 5132208889 6142130159 6434055790]
比对真实值与预测值： 2824787     False
25584762    False
28945483    False
13511540    False
28128585    False
            ...
6067146     False
25971193    False
4086338     False
24043598    False
23926688    False
Name: place_id, Length: 7279238, dtype: bool
准确率：
 0.11855224406730484
最佳参数：
 {'n_neighbors': 3}
最佳结果：
 0.09723119370461579
最佳估计器：
 KNeighborsClassifier(n_neighbors=3)
最佳验证结果：
 {'mean_fit_time': array([113.03914762, 128.09856272]), 'std_fit_time': array([2.48930113, 6.34220784]), 'mean_score_time': array([ 788.99727678, 1027.41904926]), 'std_score_time': array([ 9.12906035, 73.60085472]), 'param_n_neighbors': masked_array(data=[3, 5],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}], 'split0_test_score': array([0.09731527, 0.09202337]), 'split1_test_score': array([0.09721594, 0.09201059]), 'split2_test_score': array([0.09716237, 0.09203766]), 'mean_test_score': array([0.09723119, 0.09202387]), 'std_test_score': array([6.33458518e-05, 1.10542744e-05]), 'rank_test_score': array([1, 2])}

posted @ 2022-05-28 17:01 嘿，抬头！阅读(78) 评论(0) 收藏举报

刷新页面返回顶部

Pandas实战 Facebook 数据集预测签到位置

公告