PA-Fitness(运用PCA和K-means进行不同人群的分类)
PA-Fitness
一、数据集/数据预处理
1、原始数据集:包含姓名、年龄、性别、运动频率(多久运动一次)、运动对您的重要性、您当前的健康水平、是否买过运动器材等问卷字段。
数据集可在 Kaggle 下载:https://www.kaggle.com/datasets/nithilaa/fitness-analysis
2、处理后的数据集:男女分开(对应代码如下)
# Importing Libraries and Reading Dataset
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import category_encoders as ce
# Print every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the raw survey export.
df = pd.read_csv("dataset/fitness analysis.csv")

# --- Data cleaning ---
df.info()
null_counts = df.isna().sum()
print("\n\nNull Values found in Dataframe: " + str(null_counts.sum()) + "\n")
# Original note: there are no missing values, so encoding proceeds directly;
# any nulls would have to be imputed first.
print(null_counts)

# Replace the long survey questions with short headings that are easier to
# read and to work with.
new_cols = [
    'Timestamp', 'Name', 'Gender', 'Age_range', 'Exercise_importance',
    'Fitness_level', 'Regularity', 'Barriers', 'Exercises', 'Do_you',
    'Time', 'Time_spent', 'Balanced_diet', 'Prevents_balanced_diet',
    'Health_level', 'Recommend_fitness', 'Equipment', 'Motivation',
]
# column_reference=pd.DataFrame(new_cols,df.columns)
# print(column_reference)
df.columns = new_cols

# Drop the columns that are irrelevant to the analysis.
df = df.drop(columns=['Timestamp', 'Name'])
print(df.head())
# --- Encoding data ---
# Print the distinct answers found in each column that is about to be
# encoded, one column after another.
unique_by_column = {}
for column in (
    'Gender', 'Age_range', 'Fitness_level', 'Regularity', 'Do_you',
    'Time', 'Time_spent', 'Balanced_diet', 'Recommend_fitness', 'Equipment',
):
    unique_by_column[column] = df[column].unique()
    print(unique_by_column[column])

# Keep the individual module-level names available.
gender_vals = unique_by_column['Gender']
age_vals = unique_by_column['Age_range']
fit_lev_vals = unique_by_column['Fitness_level']
reg_vals = unique_by_column['Regularity']
do_you_vals = unique_by_column['Do_you']
time_vals = unique_by_column['Time']
time_spent_vals = unique_by_column['Time_spent']
bal_diet_vals = unique_by_column['Balanced_diet']
rec_fit_vals = unique_by_column['Recommend_fitness']
equ_own_vals = unique_by_column['Equipment']
# Build the ordinal encoder: each listed categorical column is replaced by the
# integer code given in its mapping entry.
# `cols` is a flat list of column names (not a list nested in another list).
ordinal_cols = [
    'Gender', 'Age_range', 'Fitness_level', 'Regularity',
    'Do_you', 'Time', 'Time_spent', 'Balanced_diet',
    'Recommend_fitness', 'Equipment',
]
encoder = ce.OrdinalEncoder(
    cols=ordinal_cols,
    return_df=True,
    mapping=[
        # One entry per column: category label -> integer code.
        {'col': 'Gender',
         'mapping': {'Female': 1, 'Male': 2}},
        {'col': 'Age_range',
         'mapping': {'15 to 18': 1, '19 to 25': 2, '26 to 30': 3,
                     '30 to 40': 4, '40 and above': 5}},
        {'col': 'Fitness_level',
         'mapping': {'Unfit': 1, 'Average': 2, 'Good': 3,
                     'Very good': 4, 'Perfect': 5}},
        {'col': 'Regularity',
         'mapping': {'Never': 1, '1 to 2 times a week': 2, '2 to 3 times a week': 3,
                     '3 to 4 times a week': 4, '5 to 6 times a week': 5, 'Everyday': 6}},
        {'col': 'Do_you',
         'mapping': {"I don't really exercise": 1, 'Alone': 2, 'With a friend': 3,
                     'With a group': 4, 'Within a class environment': 5}},
        {'col': 'Time',
         'mapping': {'Early morning': 1, 'Afternoon': 2, 'Evening': 3}},
        {'col': 'Time_spent',
         'mapping': {"I don't really exercise": 0, '30 minutes': 1, '1 hour': 2,
                     '2 hours': 3, '3 hours and above': 4}},
        {'col': 'Balanced_diet',
         'mapping': {'No': 0, 'Not always': 1, 'Yes': 2}},
        {'col': 'Recommend_fitness',
         'mapping': {'No': 0, 'Yes': 1}},
        {'col': 'Equipment',
         'mapping': {'No': 0, 'Yes': 1}},
    ],
)
# Apply the ordinal mappings to the cleaned survey data.
df_encode = encoder.fit_transform(df)

# These four columns are not part of the encoder's column list and are left
# out of the encoded dataset.
unencoded_columns = ['Barriers', 'Exercises', 'Prevents_balanced_diet', 'Motivation']
df1 = df_encode.drop(columns=unencoded_columns)

# Write the encoded dataset with an explicit column order.
export_columns = [
    'Gender', 'Age_range', 'Exercise_importance', 'Fitness_level',
    'Regularity', 'Do_you', 'Time', 'Time_spent', 'Balanced_diet',
    'Health_level', 'Recommend_fitness', 'Equipment',
]
df1.to_csv('dataset/out_fitness_analysis.csv', index=False, columns=export_columns)

# print(df1.head())
# df1.info()
# Sample rows recorded by the author (presumably the output of df1.head()):
#
#    Gender  Age_range  Exercise_importance  Fitness_level  Regularity  Do_you
# 0       1          2                    2              3           1       1
# 1       1          2                    4              4           1       4
# 2       1          1                    3              3           2       2
# 3       1          1                    4              3           4       2
# 4       1          2                    3              1           1       1
#
#    Time  Time_spent  Balanced_diet  Health_level  Recommend_fitness  Equipment
# 0     1           0              1             3                  1          0
# 1     1           0              1             4                  1          0
# 2     1           1              1             4                  1          1
# 3     3           2              2             4                  1          0
# 4     3           0              2             4                  1          0
# Split the encoded survey by gender and write one CSV per group.
#
# The encoder maps 'Female' -> 1 and 'Male' -> 2, so rows coded 1 belong in
# the female file and rows coded 2 in the male file.
split_columns = [
    'Gender', 'Age_range', 'Exercise_importance', 'Fitness_level',
    'Regularity', 'Do_you', 'Time', 'Time_spent', 'Balanced_diet',
    'Health_level', 'Recommend_fitness', 'Equipment',
]
gender_exports = {
    1: "dataset/out_fitness_analysis_female.csv",
    2: "dataset/out_fitness_analysis_male.csv",
}
for gender_code, csv_path in gender_exports.items():
    # A boolean mask selects the group in one step; to_csv is given a path,
    # so no file handle is left open by this script.
    gender_rows = df1.loc[df1['Gender'] == gender_code, split_columns]
    gender_rows.to_csv(csv_path, index=False, encoding='utf-8')
此为女性的数据集
二、运用PCA和k-means
1、原始的 df(11 个属性)
2、数据标准化(StandardScaler,11 个属性)
3、计算各主成分的方差贡献率,以累计贡献率 0.8 为阈值,得出需要保留 7 个主成分(components)
4、运用 K-means,用肘部法则选取最佳簇数
5、得出 5 个簇心
6、分簇
7、测试:输入 3 个新用户的年龄范围、运动重要性等 11 个属性,得出其在 5 类人群中的分类
对应代码如下
import sys
import spotipy
import yaml
import spotipy.util as util
from pprint import pprint
import json
import argparse
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
from yellowbrick.cluster import KElbowVisualizer
from kneed import KneeLocator
import plotly.graph_objects as go
from plotly.subplots import make_subplots
sns.set()
pd.set_option('display.max_columns', None)

# Load the per-gender dataset produced by the preprocessing script.
df = pd.read_csv("dataset/out_fitness_analysis_female.csv")
# print(df.head())

# --- Principal Component Analysis (PCA) ---
# 'Gender' is kept aside and excluded from the feature matrix.
non_features = ['Gender']
track_info = df[non_features]
feature_columns = [name for name in df.columns if name not in non_features]
df_X = df[feature_columns]
print(df_X.head())

# Standardise the features; the fitted scaler is kept for later use.
scaler = StandardScaler()
scaler.fit(df_X)
X_std = scaler.transform(df_X)
print(X_std)
# Fit a full PCA first to see how much variance each component explains.
pca = PCA()
pca.fit(X_std)

# explained_variance_ratio_ has one entry per principal component.
evr = pca.explained_variance_ratio_
print(evr)
cumulative_evr = evr.cumsum()

# Plot cumulative explained variance against the number of components.
# The x-axis is sized from evr itself so both sequences always match.
fig = plt.figure(figsize=(10, 8))
plt.plot(range(1, len(evr) + 1), cumulative_evr, marker='o', linestyle='--')
plt.xlabel('Number of Components', fontsize=18)
plt.ylabel('Cumulative Explained Variance', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
fig.savefig("dataset/Number_of_Components_PCA.png")
plt.show()

# Keep the smallest number of components whose cumulative explained variance
# reaches the threshold.  cumulative_evr is non-decreasing, so searchsorted
# returns the first index at or above it; the min() keeps n_comps valid even
# if rounding leaves the final cumulative value just below the threshold.
variance_threshold = 0.8
n_comps = min(
    int(np.searchsorted(cumulative_evr, variance_threshold)) + 1,
    len(cumulative_evr),
)
print("Number of components:", n_comps)

# Refit with the selected number of components and project the data.
pca = PCA(n_components=n_comps)
pca.fit(X_std)
scores_pca = pca.transform(X_std)
# --- K-Means clustering ---
# Locate the elbow of the WCSS (within-cluster sum of squares) curve with the
# Yellowbrick KElbowVisualizer.
elbow_estimator = KMeans(init='k-means++', random_state=42)
visualizer = KElbowVisualizer(elbow_estimator, k=(1, 21), timings=False)
visualizer.fit(scores_pca)
visualizer.show()

n_clusters = visualizer.elbow_value_
print("Optimal number of clusters:", n_clusters)
# Locate the elbow of the WCSS (within-cluster sum of squares) curve with the
# kneed KneeLocator.
wcss = []
max_clusters = 21
cluster_counts = list(range(1, max_clusters))
for k in cluster_counts:
    kmeans_pca = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans_pca.fit(scores_pca)
    wcss.append(kmeans_pca.inertia_)

# Compute the knee once and reuse it for both the report and the plot.
knee = KneeLocator(cluster_counts, wcss, curve='convex', direction='decreasing').knee
if knee is None:
    # Without a knee there is no cluster count to continue with; fail here
    # with a clear message instead of later inside plotting or KMeans.
    raise ValueError("KneeLocator found no elbow in the WCSS curve")
n_clusters = knee
print("Optimal number of clusters", n_clusters)

# Plot the WCSS curve and mark the selected number of clusters.
fig = plt.figure(figsize=(10, 8))
plt.plot(cluster_counts, wcss, marker='o', linestyle='--')
plt.vlines(n_clusters, ymin=min(wcss), ymax=max(wcss), linestyles='dashed')
plt.xlabel('Number of Clusters', fontsize=18)
plt.ylabel('Within Cluster Sum of Squares (WCSS)', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
fig.savefig("dataset/num_clusters.png")
plt.show()
# Fit the final K-Means model on the PCA scores with the selected cluster count.
kmeans_pca = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
kmeans_pca.fit(scores_pca)
print(kmeans_pca.cluster_centers_)

# Name the score columns when the frame is created instead of overwriting
# `columns.values` afterwards (an Index is not meant to be mutated in place).
component_names = ["Component " + str(i + 1) for i in range(n_comps)]
df_scores = pd.DataFrame(scores_pca, columns=component_names)

# Original features, PCA scores and assigned cluster side by side, one row
# per respondent.
df_seg_pca_kmeans = pd.concat([df_X.reset_index(drop=True), df_scores], axis=1)
df_seg_pca_kmeans['Cluster'] = kmeans_pca.labels_
print(df_seg_pca_kmeans.head())
# Scatter plot of the first two principal components, coloured by cluster.
x = df_seg_pca_kmeans['Component 2']
y = df_seg_pca_kmeans['Component 1']
cluster_labels = df_seg_pca_kmeans['Cluster']

# One colour per cluster actually present.  The first five 'tab10' colours are
# blue, orange, green, red and purple.
cluster_palette = sns.color_palette('tab10', n_colors=cluster_labels.nunique())

fig = plt.figure(figsize=(10, 8))
# x and y are passed by keyword, as current seaborn releases require.
sns.scatterplot(x=x, y=y, hue=cluster_labels, palette=cluster_palette)
plt.title('Clusters by PCA Components', fontsize=20)
plt.xlabel("Component 2", fontsize=18)
plt.ylabel("Component 1", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Save before show(), like the other figures in this script.
fig.savefig("dataset/clusters-2d.png")
plt.show()
# --- Test: assign three new users to the fitted clusters ---
# Each key is one encoded feature and each list holds one value per user.
# 'Gender' is omitted because it is not part of the feature matrix df_X.
# The keys are listed in the column order of the CSV header with Gender removed.
data = {
    'Age_range': [1, 2, 4],
    'Exercise_importance': [1, 2, 3],
    'Fitness_level': [3, 4, 5],
    'Regularity': [4, 5, 6],
    'Do_you': [1, 2, 4],
    'Time': [1, 2, 3],
    'Time_spent': [0, 2, 3],
    'Balanced_diet': [2, 1, 0],
    'Health_level': [1, 2, 3],
    'Recommend_fitness': [0, 1, 0],
    'Equipment': [1, 0, 1],
}
frame = pd.DataFrame(data)
print(frame)

# Reuse the scaler fitted on the training data: transform() only.  Calling
# fit_transform() here would re-estimate mean and variance from these three
# rows, putting them on a different scale than the data the PCA and K-Means
# models were fitted on.
X_std_new = scaler.transform(frame)
scores_pca_new = pca.transform(X_std_new)
print(kmeans_pca.predict(scores_pca_new))
posted on 2022-07-31 08:33 monster-little 阅读(155) 评论(0) 编辑 收藏 举报