monster-little

PA-Fitness(运用PCA和K-means进行不同人群的分类)

PA-Fitness

一、数据集/数据预处理

1、原始数据集:姓名,年龄,性别,多久运动一次?运动对您的重要性?您当前的健康水平?买过运动器材吗?...

数据集可在此网站下载:https://www.kaggle.com/datasets/nithilaa/fitness-analysis

2、处理后的数据集:男女分开(对应代码如下)

# Importing Libraries and Reading Dataset

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import category_encoders as ce

# Show every column when printing DataFrames instead of truncating them.
pd.set_option('display.max_columns', None)

# Raw survey export.
df = pd.read_csv("dataset/fitness analysis.csv")

# Data Cleaning
df.info()
null_counts = df.isna().sum()
print(f"\n\nNull Values found in Dataframe: {null_counts.sum()}\n")
# Author's note: no missing values were found, so encoding can proceed
# directly; if there were any, they would have to be filled first.
print(null_counts)

# Short column names that are easier to read and work with than the
# original survey questions.
new_cols = [
    'Timestamp', 'Name', 'Gender', 'Age_range', 'Exercise_importance',
    'Fitness_level', 'Regularity', 'Barriers', 'Exercises', 'Do_you',
    'Time', 'Time_spent', 'Balanced_diet', 'Prevents_balanced_diet',
    'Health_level', 'Recommend_fitness', 'Equipment', 'Motivation',
]
df.columns = new_cols

# Drop the columns that are irrelevant to the analysis.
df = df.drop(columns=['Timestamp', 'Name'])
print(df.head())

# Encoding Data
# List the distinct answers of every column that gets an ordinal encoding,
# so the mapping tables can be written against the real answer strings.
inspected_cols = [
    'Gender', 'Age_range', 'Fitness_level', 'Regularity', 'Do_you',
    'Time', 'Time_spent', 'Balanced_diet', 'Recommend_fitness', 'Equipment',
]
inspected_vals = [df[col].unique() for col in inspected_cols]
for vals in inspected_vals:
    print(vals)

# Keep the per-column names available, in the same order as inspected_cols.
(gender_vals, age_vals, fit_lev_vals, reg_vals, do_you_vals,
 time_vals, time_spent_vals, bal_diet_vals, rec_fit_vals,
 equ_own_vals) = inspected_vals

# Ordinal encoding of the categorical survey answers.
# Each entry maps the answer strings of one column to integer codes.
ordinal_mappings = [
    {'col': 'Gender',
     'mapping': {'Female': 1, 'Male': 2}},

    {'col': 'Age_range',
     'mapping': {'15 to 18': 1, '19 to 25': 2, '26 to 30': 3,
                 '30 to 40': 4, '40 and above': 5}},

    {'col': 'Fitness_level',
     'mapping': {'Unfit': 1, 'Average': 2, 'Good': 3,
                 'Very good': 4, 'Perfect': 5}},

    {'col': 'Regularity',
     'mapping': {'Never': 1, '1 to 2 times a week': 2, '2 to 3 times a week': 3,
                 '3 to 4 times a week': 4, '5 to 6 times a week': 5, 'Everyday': 6}},

    {'col': 'Do_you',
     'mapping': {"I don't really exercise": 1, 'Alone': 2, 'With a friend': 3,
                 'With a group': 4, 'Within a class environment': 5}},

    {'col': 'Time',
     'mapping': {'Early morning': 1, 'Afternoon': 2, 'Evening': 3}},

    {'col': 'Time_spent',
     'mapping': {"I don't really exercise": 0, '30 minutes': 1, '1 hour': 2,
                 '2 hours': 3, '3 hours and above': 4}},

    {'col': 'Balanced_diet',
     'mapping': {'No': 0, 'Not always': 1, 'Yes': 2}},

    {'col': 'Recommend_fitness',
     'mapping': {'No': 0, 'Yes': 1}},

    {'col': 'Equipment',
     'mapping': {'No': 0, 'Yes': 1}},
]

# `cols` takes a flat list of column names; derive it from the mappings so
# the two can never drift apart.
encoder = ce.OrdinalEncoder(
    cols=[entry['col'] for entry in ordinal_mappings],
    return_df=True,
    mapping=ordinal_mappings,
)
df_encode = encoder.fit_transform(df)

# The free-text / multi-choice columns are not encoded and are left out of
# the analysis data set.
df1 = df_encode.drop(columns=['Barriers', 'Exercises', 'Prevents_balanced_diet', 'Motivation'])
df1.to_csv(
    'dataset/out_fitness_analysis.csv',
    index=False,
    columns=['Gender', 'Age_range', 'Exercise_importance', 'Fitness_level',
             'Regularity', 'Do_you', 'Time', 'Time_spent', 'Balanced_diet',
             'Health_level', 'Recommend_fitness', 'Equipment'],
)

# Sample of df1.head() recorded by the original author:
#
#    Gender  Age_range  Exercise_importance  Fitness_level  Regularity  Do_you
# 0       1          2                    2              3           1       1
# 1       1          2                    4              4           1       4
# 2       1          1                    3              3           2       2
# 3       1          1                    4              3           4       2
# 4       1          2                    3              1           1       1
#
#    Time  Time_spent  Balanced_diet  Health_level  Recommend_fitness  Equipment
# 0     1           0              1             3                  1          0
# 1     1           0              1             4                  1          0
# 2     1           1              1             4                  1          1
# 3     3           2              2             4                  1          0
# 4     3           0              2             4                  1          0


# Split the encoded data by gender and write one CSV file per group.
#
# Gender codes follow the ordinal mapping used for encoding:
# Female -> 1, Male -> 2. Each file therefore receives the rows whose code
# matches the gender in its file name.
output_columns = [
    'Gender', 'Age_range', 'Exercise_importance', 'Fitness_level',
    'Regularity', 'Do_you', 'Time', 'Time_spent', 'Balanced_diet',
    'Health_level', 'Recommend_fitness', 'Equipment',
]
gender_outputs = {
    1: "dataset/out_fitness_analysis_female.csv",
    2: "dataset/out_fitness_analysis_male.csv",
}

for gender_code, out_path in gender_outputs.items():
    # Boolean filtering replaces the row-by-row loop; to_csv opens, writes
    # and closes the file itself, so no handle is left open. A group with no
    # rows still produces a file containing only the header line.
    gender_rows = df1.loc[df1['Gender'] == gender_code, output_columns]
    gender_rows.to_csv(out_path, index=False, encoding='utf-8')

此为女性的数据集

二、运用PCA和k-means

1、原始的df(11个属性)

2、数据标准化(使用 StandardScaler,11个属性)

3、计算每个主成分的方差贡献率,以累计贡献率 0.8 为阈值,得出需要保留7个主成分(components)

4、运用k-means 选取最佳目标簇的数值

5、得出5个簇心

6、分簇

7、测试,3个新用户 输入年龄范围 运动重要性... 11个属性 得出其在5类人群中的分类

对应代码如下

import sys
import spotipy
import yaml
import spotipy.util as util
from pprint import pprint
import json
import argparse
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
from yellowbrick.cluster import KElbowVisualizer
from kneed import KneeLocator
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Apply seaborn's default plot styling to all following figures.
sns.set()

pd.set_option('display.max_columns', None)
df = pd.read_csv("dataset/out_fitness_analysis_female.csv")

# Principal Component Analysis (PCA)
# Gender is kept aside and not used as a clustering feature.
non_features = ['Gender']
track_info = df.loc[:, non_features]
feature_cols = [col for col in df.columns if col not in non_features]
df_X = df.loc[:, feature_cols]
print(df_X.head())

# Standardize the features; the fitted scaler is kept so that new samples
# can later be scaled with the same parameters.
scaler = StandardScaler()
scaler.fit(df_X)
X_std = scaler.transform(df_X)
print(X_std)

# Fit a PCA that keeps every component first, only to inspect how the
# variance is distributed across components.
pca = PCA()
pca.fit(X_std)

# Fraction of the total variance explained by each principal component.
evr = pca.explained_variance_ratio_
print(evr)
cumulative_evr = evr.cumsum()

# Cumulative explained variance against the number of components kept.
# The x axis is sized from the components PCA actually produced, which can
# be fewer than the number of feature columns when there are few samples.
fig = plt.figure(figsize=(10, 8))
plt.plot(range(1, len(cumulative_evr) + 1), cumulative_evr, marker='o', linestyle='--')
plt.xlabel('Number of Components', fontsize=18)
plt.ylabel('Cumulative Explained Variance', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
fig.savefig("dataset/Number_of_Components_PCA.png")
plt.show()

# Keep the smallest number of components whose cumulative explained variance
# reaches the threshold; fall back to all components so n_comps is always set.
variance_threshold = 0.8
n_comps = next(
    (count for count, exp_var in enumerate(cumulative_evr, start=1)
     if exp_var >= variance_threshold),
    len(cumulative_evr),
)
print("Number of components:", n_comps)

# Refit with the selected number of components and project the data.
pca = PCA(n_components=n_comps)
pca.fit(X_std)
scores_pca = pca.transform(X_std)

# K-Means Clustering
# Candidate cluster counts are 1 .. max_clusters - 1 for both elbow searches.
max_clusters = 21

# Elbow point of the WCSS (within cluster sum of squares) curve using the
# YellowBrick KElbowVisualizer.
visualizer = KElbowVisualizer(
    KMeans(init='k-means++', random_state=42),
    k=(1, max_clusters),
    timings=False,
)
visualizer.fit(scores_pca)
visualizer.show()
elbow_k = visualizer.elbow_value_
print("Optimal number of clusters:", elbow_k)

# Elbow point of the same curve using the kneed KneeLocator.
candidate_ks = list(range(1, max_clusters))
wcss = [
    KMeans(k, init='k-means++', random_state=42).fit(scores_pca).inertia_
    for k in candidate_ks
]
n_clusters = KneeLocator(candidate_ks, wcss, curve='convex', direction='decreasing').knee
if n_clusters is None:
    # KneeLocator found no knee; use the YellowBrick result rather than
    # passing None on as the number of clusters.
    n_clusters = elbow_k
print("Optimal number of clusters", n_clusters)

# WCSS curve with a dashed vertical line at the selected number of clusters.
# The x range and the marker reuse max_clusters / n_clusters computed for the
# elbow search instead of repeating the constant and the knee detection.
fig = plt.figure(figsize=(10, 8))
plt.plot(range(1, max_clusters), wcss, marker='o', linestyle='--')
plt.vlines(n_clusters, ymin=min(wcss), ymax=max(wcss), linestyles='dashed')
plt.xlabel('Number of Clusters', fontsize=18)
plt.ylabel('Within Cluster Sum of Squares (WCSS)', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
fig.savefig("dataset/num_clusters.png")
plt.show()

# Final K-Means model on the PCA scores, using the selected cluster count.
kmeans_pca = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
kmeans_pca.fit(scores_pca)
print(kmeans_pca.cluster_centers_)


# Original features, followed by the PCA scores and the assigned cluster.
# The component columns are named when the frame is built, rather than by
# writing into the column index's underlying array afterwards.
component_names = ["Component " + str(i + 1) for i in range(n_comps)]
df_scores = pd.DataFrame(scores_pca, columns=component_names)
df_seg_pca_kmeans = pd.concat([df_X.reset_index(drop=True), df_scores], axis=1)
df_seg_pca_kmeans['Cluster'] = kmeans_pca.labels_
print(df_seg_pca_kmeans.head())

# 2-D view of the clusters on the first two principal components
# (Component 2 on the x axis, Component 1 on the y axis).
x = df_seg_pca_kmeans['Component 2']
y = df_seg_pca_kmeans['Component 1']
cluster_labels = df_seg_pca_kmeans['Cluster']

# One colour per cluster actually present, so the plot also works when the
# selected number of clusters is not five. tab10 starts with blue, orange,
# green, red, purple.
palette = sns.color_palette('tab10', n_colors=cluster_labels.nunique())

fig = plt.figure(figsize=(10, 8))
# seaborn requires x and y to be passed by keyword in current releases.
sns.scatterplot(x=x, y=y, hue=cluster_labels, palette=palette)
plt.title('Clusters by PCA Components', fontsize=20)
plt.xlabel("Component 2", fontsize=18)
plt.ylabel("Component 1", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Save before showing: some backends release the figure once it is shown.
fig.savefig("dataset/clusters-2d.png")
plt.show()

# Test: assign three new users to the learned clusters.
# Columns must be in the same order as the training features in df_X
# (every encoded column except Gender):
# Age_range, Exercise_importance, Fitness_level, Regularity, Do_you, Time,
# Time_spent, Balanced_diet, Health_level, Recommend_fitness, Equipment
data = {'Age_range': [1, 2, 4],
        'Exercise_importance': [1, 2, 3],
        'Fitness_level': [3, 4, 5],
        'Regularity': [4, 5, 6],
        'Do_you': [1, 2, 4],
        'Time': [1, 2, 3],
        'Time_spent': [0, 2, 3],
        'Balanced_diet': [2, 1, 0],
        'Health_level': [1, 2, 3],
        'Recommend_fitness': [0, 1, 0],
        'Equipment': [1, 0, 1],
        }
frame = pd.DataFrame(data)
print(frame)
# Scale with the scaler already fitted on the training data. Refitting it on
# these three rows would use their own mean/variance and place the new users
# in a different space than the one PCA and K-Means were trained on.
X_std_new = scaler.transform(frame)
scores_pca_new = pca.transform(X_std_new)
print(kmeans_pca.predict(scores_pca_new))

posted on 2022-07-31 08:33  monster-little  阅读(155)  评论(0编辑  收藏  举报

导航