Pandas实战 Facebook 数据集预测签到位置

import pandas as pd
# Load the training data from the local CSV file and preview the first rows.
data = pd.read_csv(filepath_or_buffer="./facebook/train.csv")
data.head()
   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949
# Basic data processing
# 1. Narrow the data range to the window 2 < x < 2.5 and 1.0 < y < 1.5.
# NOTE(review): the filtered frame is only displayed here; it is not assigned
# back to `data`, so `data` itself still holds every row after this cell.
data.query(expr="x < 2.5 & x > 2 & y < 1.5 & y > 1.0")
            row_id       x       y  accuracy    time    place_id
112            112  2.2360  1.3655        66  623174  7663031065
180            180  2.2003  1.2541        65  610195  2358558474
367            367  2.4108  1.3213        74  579667  6644108708
874            874  2.0822  1.1973       320  143566  3229876087
1022          1022  2.0160  1.1659        65  207993  3244363975
...            ...     ...     ...       ...     ...         ...
29115112  29115112  2.1889  1.2914       168  721885  4606837364
29115204  29115204  2.1193  1.4692        58  563389  2074133146
29115338  29115338  2.0007  1.4852        25  765986  6691588909
29115464  29115464  2.4132  1.4237        61  151918  7396159924
29117493  29117493  2.2948  1.0504        81   79569  1168869217

83197 rows × 6 columns

# Preview the first rows of `data` again.
data.head()
   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949
# Display the raw `time` column before it is converted to datetimes below.
data["time"]
0           470702
1           186555
2           322648
3           704587
4           472130
             ...  
29118016    399740
29118017    125480
29118018    737758
29118019    764975
29118020    102842
Name: time, Length: 29118021, dtype: int64
# 2. Time features: interpret `time` as a count of seconds, convert it to
#    datetimes, then add day-of-month, weekday and hour as new columns.
time_value = pd.to_datetime(data["time"], unit="s")
date = pd.DatetimeIndex(time_value)
for part in ("day", "weekday", "hour"):
    # Each name is both the DatetimeIndex attribute and the new column name.
    data[part] = getattr(date, part)
data.head()
   row_id       x       y  accuracy    time    place_id  day  weekday  hour
0       0  0.7941  9.0809        54  470702  8523065625    6        1    10
1       1  5.9567  4.7968        13  186555  1757726713    3        5     3
2       2  8.3078  7.0407        74  322648  1137537235    4        6    17
3       3  7.3665  2.5165        65  704587  6567393236    9        4     3
4       4  4.0961  1.1307        31  472130  7440663949    6        1    11
# 3. Filter out places with few check-ins.
# Count the rows recorded for each place_id. Only `row_id` is counted, which
# yields the same Series as counting every column and then selecting `row_id`.
place_count = data.groupby("place_id")["row_id"].count()
# Preview the places that have more than 3 check-ins.
place_count[place_count > 3].head()
place_id
1000015801     78
1000017288     95
1000025138    563
1000052096    961
1000063498     60
Name: row_id, dtype: int64
# Keep only the rows whose place_id has more than 3 check-ins.
frequent_places = place_count[place_count > 3].index.values
keep_mask = data["place_id"].isin(frequent_places)
data_final = data[keep_mask]
data_final.head()
   row_id       x       y  accuracy    time    place_id  day  weekday  hour
0       0  0.7941  9.0809        54  470702  8523065625    6        1    10
1       1  5.9567  4.7968        13  186555  1757726713    3        5     3
2       2  8.3078  7.0407        74  322648  1137537235    4        6    17
3       3  7.3665  2.5165        65  704587  6567393236    9        4     3
4       4  4.0961  1.1307        31  472130  7440663949    6        1    11
# Select the feature columns and the target column.
# Features: position, accuracy and the derived time parts; target: place_id.
x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
y = data_final["place_id"]
x.head()
        x       y  accuracy  day  weekday  hour
0  0.7941  9.0809        54    6        1    10
1  5.9567  4.7968        13    3        5     3
2  8.3078  7.0407        74    4        6    17
3  7.3665  2.5165        65    9        4     3
4  4.0961  1.1307        31    6        1    11
# Preview the first target values (the place_id column selected as `y`).
y.head()
0    8523065625
1    1757726713
2    1137537235
3    6567393236
4    7440663949
Name: place_id, dtype: int64
# Dataset split, feature scaling, KNN grid search and evaluation.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Split features and target into training and test subsets.
x_train, x_test, y_train, y_test = train_test_split(x, y)

# 3. Feature engineering (standardization).
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
# The test set is transformed with the scaler fitted on the training set.
x_test = transfer.transform(x_test)

# 4. KNN estimator.
estimator = KNeighborsClassifier()
# Wrap the estimator in a grid search with 3-fold cross-validation.
param_dict = {"n_neighbors": [3, 5]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5. Model evaluation.
#   (1) Compare the true values with the predicted values.
y_predict = estimator.predict(x_test)
print("y_predict:", y_predict)
print("比对真实值与预测值:", y_test == y_predict)
#   (2) Compute the accuracy on the test set.
score = estimator.score(x_test, y_test)
print("准确率:\n", score)
# Report what the grid search selected and the per-candidate CV details.
print("最佳参数:\n", estimator.best_params_)
print("最佳结果:\n", estimator.best_score_)
print("最佳估计器:\n", estimator.best_estimator_)
print("最佳验证结果:\n", estimator.cv_results_)
y_predict: [3152373328 1911687695 3376568433 ... 5132208889 6142130159 6434055790]
比对真实值与预测值: 2824787     False
25584762    False
28945483    False
13511540    False
28128585    False
            ...  
6067146     False
25971193    False
4086338     False
24043598    False
23926688    False
Name: place_id, Length: 7279238, dtype: bool
准确率:
 0.11855224406730484
最佳参数:
 {'n_neighbors': 3}
最佳结果:
 0.09723119370461579
最佳估计器:
 KNeighborsClassifier(n_neighbors=3)
最佳验证结果:
 {'mean_fit_time': array([113.03914762, 128.09856272]), 'std_fit_time': array([2.48930113, 6.34220784]), 'mean_score_time': array([ 788.99727678, 1027.41904926]), 'std_score_time': array([ 9.12906035, 73.60085472]), 'param_n_neighbors': masked_array(data=[3, 5],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}], 'split0_test_score': array([0.09731527, 0.09202337]), 'split1_test_score': array([0.09721594, 0.09201059]), 'split2_test_score': array([0.09716237, 0.09203766]), 'mean_test_score': array([0.09723119, 0.09202387]), 'std_test_score': array([6.33458518e-05, 1.10542744e-05]), 'rank_test_score': array([1, 2])}
posted @   嘿,抬头!  阅读(54)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 单线程的Redis速度为什么快?
· 展开说说关于C#中ORM框架的用法!
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
· SQL Server 2025 AI相关能力初探
· 为什么 退出登录 或 修改密码 无法使 token 失效
点击右上角即可分享
微信分享提示