3.大数据分析过程及采用的算法
由于网站上的数据集字段未作详细说明,故本次实验数据分析仅使用 id、age、job、marital、education、default、housing、loan、subscribe 等字段,其中 subscribe 为标签(预测目标),其余字段(id 除外)均当作特征
import warnings

import pandas as pd

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Only these columns are analysed; the other columns of the data set are not
# documented by its source.
USED_COLUMNS = ['id', 'age', 'job', 'marital', 'education',
                'default', 'housing', 'loan', 'subscribe']

# A plain relative path works on every OS; '.\\train.csv' only resolves on Windows.
df_train = pd.read_csv('train.csv')[USED_COLUMNS]
df_train

检测空值情况

检查是否存在重复行
# Show every row that is an exact duplicate of an earlier row.
duplicate_mask = df_train.duplicated()
df_train[duplicate_mask]

使用箱型图检测数据是否存在异常值
import matplotlib.pyplot as plt

# SimHei is selected so Chinese text renders correctly; base font size is 10.
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'font.size': 10})

# Box plot of customer age, used to look for outliers.
age_values = df_train['age']
plt.boxplot(age_values)
plt.title('数据分析箱型图')
plt.show()

统计性分析
# Bar chart of the ten most frequent job categories.
plt.figure(figsize=(10, 4))
job_counts = df_train['job'].value_counts().iloc[:10]
job_ax = job_counts.plot.bar(color='skyblue')
job_ax.set(title='银行客户前10职业统计', xlabel='职业', ylabel='频率')
plt.xticks(rotation=45)
plt.show()

婚姻分布图
# Pie chart of marital status; slice labels are the category names.
marital_status_counts = df_train['marital'].value_counts()
labels = list(marital_status_counts.index)

plt.figure(figsize=(8, 8))
plt.pie(
    marital_status_counts,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('银行客户婚姻状况分布')
plt.show()

客户教育水平
| import seaborn as sns |
# Count plot of education levels, drawn in slot 4 of a 5x2 grid.
plt.figure(figsize=(20, 15))
education_ax = plt.subplot(5, 2, 4)
sns.countplot(x='education', data=df_train, ax=education_ax)
education_ax.set(title='客户教育水平分布', xlabel='教育水平', ylabel='数量')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Stacked bars: subscription outcome per job, drawn in slot 5 of a 2x3 grid.
plt.figure(figsize=(20, 15))
job_ax = plt.subplot(2, 3, 5)
job_subscribe = pd.crosstab(df_train['job'], df_train['subscribe'])
job_subscribe.plot(
    kind='bar',
    stacked=True,
    color=['skyblue', 'orange'],
    ax=job_ax,
)
job_ax.set(title='职业与是否为银行客户的关系', xlabel='职业', ylabel='人数')
plt.xticks(rotation=45)
plt.show()

| |
# Count plot of the `default` column.
# `df` (the numerically encoded copy) is only created later in the script, so
# the raw frame `df_train` is plotted here, like every other chart in this part.
# NOTE(review): `default` presumably means "credit in default" rather than
# "default value"; the Chinese title/label may need rewording - confirm.
plt.figure(figsize=(20, 15))
plt.subplot(5, 2, 5)
sns.countplot(x='default', data=df_train)
plt.title('客户默认信用情况')
plt.xlabel('默认信用')
plt.ylabel('数量')
plt.show()

# One pie chart per yes/no style column: (column, slot in a 2x3 grid, title).
pie_specs = [
    ('housing', 1, '是否有住房贷款'),
    ('loan', 2, '是否有个人贷款'),
    ('subscribe', 3, '是否为银行客户'),
]
for pie_column, grid_slot, pie_title in pie_specs:
    plt.figure(figsize=(20, 15))
    plt.subplot(2, 3, grid_slot)
    df_train[pie_column].value_counts().plot.pie(autopct='%1.1f%%')
    plt.title(pie_title)
    plt.ylabel('')  # hide the automatic y-axis label
    plt.show()

| |
# Box plot of age grouped by subscription outcome.
# DataFrame.boxplot(by=...) creates its own figure when no `ax` is given, so
# the size is passed through `figsize`; a preceding plt.figure() call would
# only leave an extra empty figure behind.
df_train.boxplot(column='age', by='subscribe', figsize=(20, 15))
plt.title('年龄与是否为银行客户的关系')
plt.suptitle('')  # remove the automatic "Boxplot grouped by ..." super-title
plt.xlabel('是否为银行客户')
plt.ylabel('年龄')
plt.show()

机器学习预测分析
| import copy |
def convert_yes_no(value):
    """Map a 'yes'/'no' answer to 1/0.

    Args:
        value: Raw cell value.

    Returns:
        1 for 'yes', 0 for 'no', and None for any other value.
    """
    if value == 'yes':
        return 1
    if value == 'no':
        return 0
    return None
| |
# Work on an independent copy so df_train keeps its original text values.
df = df_train.copy(deep=True)

# Encode the yes/no columns as 1/0; any other value becomes missing.
binary_columns = ['default', 'housing', 'loan', 'subscribe']
for column in binary_columns:
    df[column] = df[column].map(convert_yes_no)

# Drop rows where any of those columns could not be encoded, then renumber.
df = df.dropna(subset=binary_columns).reset_index(drop=True)

df

from sklearn.preprocessing import OneHotEncoder

# Select features and target by name rather than by position, so extra columns
# added to the frame can never leak into the features or replace the target.
feature_columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan']
x = df[feature_columns]
y = df[['subscribe']]  # kept as a one-column DataFrame, as before

# One-hot encode the text columns.
non_numeric_columns = x.select_dtypes(include=['object']).columns
# The encoder's default sparse output is densified with .toarray(); this avoids
# the `sparse` keyword, which newer scikit-learn releases replaced with
# `sparse_output`.
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(x[non_numeric_columns]).toarray()
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(non_numeric_columns),
    index=x.index,  # keep rows aligned for the concat below
)
x = pd.concat([x.drop(columns=non_numeric_columns), encoded_df], axis=1)
x.head()


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split first, then fit the scaler on the training part only; fitting it on
# the full data set would leak test-set statistics into training.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.365, random_state=42
)

scaler = StandardScaler()  # zero-mean / unit-variance standardisation
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

cgr = LogisticRegression()
# .values.ravel() passes a 1-D target instead of a column vector.
cgr.fit(x_train, y_train.values.ravel())
cgr.score(x_test, y_test)
其他模型
from sklearn.metrics import auc, roc_curve

# Score each test row with the second predict_proba column.
y_score = cgr.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, 'k--', lw=2, label=f'ROC (area = {roc_auc:.2f})')
# Pad both axes slightly beyond [0, 1].
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')  # Chinese labels would need a CJK-capable font
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

随机森林
| from sklearn.ensemble import RandomForestClassifier |
# Random forest with default hyper-parameters; accuracy on the held-out split.
rf_model = RandomForestClassifier().fit(x_train, y_train)
accuracy = rf_model.score(X=x_test, y=y_test)
accuracy

| |
# Chinese font plus an ASCII minus sign so axis text renders correctly.
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})

# Age distribution per marital status.
plt.figure(figsize=(12, 6))
marital_ax = sns.boxplot(data=df_train, x='marital', y='age')
marital_ax.set(title='年龄与婚姻状况分布', xlabel='婚姻状况', ylabel='年龄')
plt.show()

# Job categories, split by education level.
plt.figure(figsize=(12, 6))
job_edu_ax = sns.countplot(data=df_train, x='job', hue='education')
job_edu_ax.set(title='工作类型与教育水平分布', xlabel='工作类型', ylabel='数量')
plt.xticks(rotation=45)
plt.show()

# Combined "housing,loan" label, e.g. 'yes,no'.
df_train['loan_status'] = df_train['housing'] + ',' + df_train['loan']

# Subscription outcome per combined loan status.
plt.figure(figsize=(12, 6))
loan_ax = sns.countplot(data=df_train, x='loan_status', hue='subscribe')
loan_ax.set(title='信贷情况与订阅关系', xlabel='信贷情况', ylabel='数量')
plt.show()

| |
# Education-level counts for subscribers and for non-subscribers.
subscribed_df = df_train[df_train['subscribe'] == 'yes']
non_subscribed_df = df_train[df_train['subscribe'] == 'no']
education_counts = subscribed_df['education'].value_counts()
non_education_counts = non_subscribed_df['education'].value_counts()

# Two side-by-side bar charts: (grid slot, counts, bar colour, title).
plt.figure(figsize=(12, 6))
panels = [
    (1, education_counts, 'skyblue', '银行客户的教育水平'),
    (2, non_education_counts, 'orange', '非银行客户的教育水平'),
]
for grid_slot, panel_counts, bar_color, panel_title in panels:
    plt.subplot(1, 2, grid_slot)
    panel_counts.plot(kind='bar', color=bar_color)
    plt.title(panel_title)
    plt.xlabel('教育水平')
    plt.ylabel('客户数量')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Heatmap of how many customers fall into each job / education combination.
heatmap_data = pd.crosstab(df_train['job'], df_train['education'])

plt.figure(figsize=(10, 8))
# fmt='d' prints the integer counts in full; the default annotation format
# would abbreviate large counts in scientific notation.
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='coolwarm')
plt.title('工作类型与教育水平的相关性热力图')
plt.xlabel('教育水平')
plt.ylabel('工作类型')

plt.show()

| |
# Loan-status counts for every (job, marital status) pair.
# Relies on the 'loan_status' column having been added to df_train beforehand.
loan_marital_job = pd.crosstab(
    index=[df_train['job'], df_train['marital']],
    columns=df_train['loan_status'],
)

stacked_ax = loan_marital_job.plot.bar(stacked=True, figsize=(14, 8))
stacked_ax.set(title='各职业婚姻状况下的贷款状态分布', xlabel='职业和婚姻状况', ylabel='数量')
plt.xticks(rotation=45)
plt.show()

# Ten-year age bands, left-closed: [20, 30), [30, 40), [40, 50), [50, 60).
# NOTE: ages outside [20, 60) match no bin and are therefore left out of the chart.
bins = [20, 30, 40, 50, 60]
df_train['age_group'] = pd.cut(
    df_train['age'], bins, right=False,
    labels=["20-29", "30-39", "40-49", "50-59"],
)

# Number of customers per (age group, job); observed=False keeps every age
# band in the table even when it is empty.
age_job_counts = (
    df_train.groupby(['age_group', 'job'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# DataFrame.plot opens its own figure when no `ax` is given, so the size is
# passed via `figsize`; a preceding plt.figure() would only leave an empty figure.
age_job_counts.plot(kind='bar', stacked=True, colormap='viridis', figsize=(15, 8))
plt.title('不同年龄组的工作类型分布', fontsize=14)
plt.xlabel('年龄组', fontsize=12)
plt.ylabel('工作数量', fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
# Place the legend outside the axes so it does not cover the bars.
plt.legend(title='工作类型', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()

plt.show()

| |
# Split customers into married and everyone else.
is_married = df_train['marital'] == 'married'
married_df = df_train[is_married]
unmarried_df = df_train[~is_married]

median_age_married = married_df['age'].median()
median_age_unmarried = unmarried_df['age'].median()

# Shared 5-year bins spanning the whole age range, so both histograms line up.
age_bins = range(min(df_train['age']), max(df_train['age']) + 5, 5)

plt.figure(figsize=(14, 7))
plt.hist(married_df['age'], bins=age_bins, alpha=0.5, label='已婚')
plt.hist(unmarried_df['age'], bins=age_bins, alpha=0.5, label='未婚')

# Dashed vertical lines mark each group's median age.
median_lines = [
    (median_age_married, 'blue', '已婚中位年龄'),
    (median_age_unmarried, 'orange', '未婚中位年龄'),
]
for median_age, line_color, line_label in median_lines:
    plt.axvline(median_age, color=line_color, linestyle='dashed', linewidth=1, label=line_label)

plt.title('已婚与未婚人群的年龄分布', fontsize=16)
plt.xlabel('年龄', fontsize=14)
plt.ylabel('人数', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.tight_layout()

plt.show()

完整代码:
| import warnings |
| warnings.filterwarnings('ignore') |
| |
| import pandas as pd |
# Only these columns are analysed.
USED_COLUMNS = ['id', 'age', 'job', 'marital', 'education',
                'default', 'housing', 'loan', 'subscribe']

# A plain relative path works on every OS; '.\\train.csv' only resolves on Windows.
df_train = pd.read_csv('train.csv')[USED_COLUMNS]
df_train

# Column dtypes and non-null counts.
df_train.info()

# Rows that are exact duplicates of an earlier row.
df_train[df_train.duplicated()]
| |
| import matplotlib.pyplot as plt |
# SimHei is selected so Chinese text renders correctly; base font size is 10.
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'font.size': 10})

# Box plot of customer age, used to look for outliers.
plt.boxplot(df_train['age'])
plt.title('数据分析箱型图')
plt.show()

# Bar chart of the ten most frequent job categories.
plt.figure(figsize=(10, 4))
job_counts = df_train['job'].value_counts().iloc[:10]
job_ax = job_counts.plot.bar(color='skyblue')
job_ax.set(title='银行客户前10职业统计', xlabel='职业', ylabel='频率')
plt.xticks(rotation=45)
plt.show()

# Pie chart of marital status; slice labels are the category names.
marital_status_counts = df_train['marital'].value_counts()
labels = list(marital_status_counts.index)
plt.figure(figsize=(8, 8))
plt.pie(
    marital_status_counts,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('银行客户婚姻状况分布')
plt.show()
| |
| import seaborn as sns |
# Count plot of education levels, drawn in slot 4 of a 5x2 grid.
plt.figure(figsize=(20, 15))
education_ax = plt.subplot(5, 2, 4)
sns.countplot(x='education', data=df_train, ax=education_ax)
education_ax.set(title='客户教育水平分布', xlabel='教育水平', ylabel='数量')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Stacked bars: subscription outcome per job, drawn in slot 5 of a 2x3 grid.
plt.figure(figsize=(20, 15))
job_ax = plt.subplot(2, 3, 5)
job_subscribe = pd.crosstab(df_train['job'], df_train['subscribe'])
job_subscribe.plot(
    kind='bar',
    stacked=True,
    color=['skyblue', 'orange'],
    ax=job_ax,
)
job_ax.set(title='职业与是否为银行客户的关系', xlabel='职业', ylabel='人数')
plt.xticks(rotation=45)
plt.show()
| |
| |
# Count plot of the `default` column.
# `df` (the numerically encoded copy) is only created later in the script, so
# the raw frame `df_train` is plotted here, like every other chart in this part.
# NOTE(review): `default` presumably means "credit in default" rather than
# "default value"; the Chinese title/label may need rewording - confirm.
plt.figure(figsize=(20, 15))
plt.subplot(5, 2, 5)
sns.countplot(x='default', data=df_train)
plt.title('客户默认信用情况')
plt.xlabel('默认信用')
plt.ylabel('数量')
plt.show()
| |
# One pie chart per column: housing loan, personal loan, subscription outcome.
# Each entry is (column, slot in a 2x3 grid, title).
pie_specs = [
    ('housing', 1, '是否有住房贷款'),
    ('loan', 2, '是否有个人贷款'),
    ('subscribe', 3, '是否为银行客户'),
]
for pie_column, grid_slot, pie_title in pie_specs:
    plt.figure(figsize=(20, 15))
    plt.subplot(2, 3, grid_slot)
    df_train[pie_column].value_counts().plot.pie(autopct='%1.1f%%')
    plt.title(pie_title)
    plt.ylabel('')  # hide the automatic y-axis label
    plt.show()
| |
# Age versus subscription outcome (box plot).
# DataFrame.boxplot(by=...) creates its own figure when no `ax` is given, so
# the size is passed through `figsize`; a preceding plt.figure() call would
# only leave an extra empty figure behind.
df_train.boxplot(column='age', by='subscribe', figsize=(20, 15))
plt.title('年龄与是否为银行客户的关系')
plt.suptitle('')  # remove the automatic "Boxplot grouped by ..." super-title
plt.xlabel('是否为银行客户')
plt.ylabel('年龄')
plt.show()
| |
| import copy |
def convert_yes_no(value):
    """Map a 'yes'/'no' answer to 1/0.

    Args:
        value: Raw cell value.

    Returns:
        1 for 'yes', 0 for 'no', and None for any other value.
    """
    if value == 'yes':
        return 1
    if value == 'no':
        return 0
    return None
| |
# Work on an independent copy so df_train keeps its original text values.
df = df_train.copy(deep=True)

# Encode the yes/no columns as 1/0; any other value becomes missing.
binary_columns = ['default', 'housing', 'loan', 'subscribe']
for column in binary_columns:
    df[column] = df[column].map(convert_yes_no)

# Drop rows where any of those columns could not be encoded, then renumber.
df = df.dropna(subset=binary_columns).reset_index(drop=True)

df
| |
from sklearn.preprocessing import OneHotEncoder

# Select features and target by name rather than by position, so extra columns
# added to the frame can never leak into the features or replace the target.
feature_columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan']
x = df[feature_columns]
y = df[['subscribe']]  # kept as a one-column DataFrame, as before

# One-hot encode the text columns.
non_numeric_columns = x.select_dtypes(include=['object']).columns
# The encoder's default sparse output is densified with .toarray(); this avoids
# the `sparse` keyword, which newer scikit-learn releases replaced with
# `sparse_output`.
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(x[non_numeric_columns]).toarray()
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(non_numeric_columns),
    index=x.index,  # keep rows aligned for the concat below
)
x = pd.concat([x.drop(columns=non_numeric_columns), encoded_df], axis=1)
x.head()
| |
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split first, then fit the scaler on the training part only; fitting it on
# the full data set would leak test-set statistics into training.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.365, random_state=42
)

scaler = StandardScaler()  # zero-mean / unit-variance standardisation
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

cgr = LogisticRegression()
# .values.ravel() passes a 1-D target instead of a column vector.
cgr.fit(x_train, y_train.values.ravel())
cgr.score(x_test, y_test)
| |
from sklearn.metrics import auc, roc_curve

# Score each test row with the second predict_proba column.
y_score = cgr.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, 'k--', lw=2, label=f'ROC (area = {roc_auc:.2f})')
# Pad both axes slightly beyond [0, 1].
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')  # Chinese labels would need a CJK-capable font
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
| |
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Each model below uses default hyper-parameters, is fitted on the training
# split and scored by accuracy on the test split. `accuracy` is overwritten by
# every model in turn.

# Decision tree: predict explicitly, then compare with the true labels.
dtc = DecisionTreeClassifier().fit(x_train, y_train)
y_pred = dtc.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

# Support vector machine.
svm_model = SVC().fit(x_train, y_train)
accuracy = svm_model.score(x_test, y_test)
accuracy

# Random forest.
rf_model = RandomForestClassifier().fit(x_train, y_train)
accuracy = rf_model.score(x_test, y_test)
accuracy

# Gradient boosting.
gb_model = GradientBoostingClassifier().fit(x_train, y_train)
accuracy = gb_model.score(x_test, y_test)
accuracy

# k-nearest neighbours.
knn_model = KNeighborsClassifier().fit(x_train, y_train)
accuracy = knn_model.score(x_test, y_test)
accuracy

# Gaussian naive Bayes.
nb_model = GaussianNB().fit(x_train, y_train)
accuracy = nb_model.score(x_test, y_test)
accuracy
| |
# Configure matplotlib for Chinese text (font and ASCII minus sign).
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})

# First chart: age distribution per marital status.
plt.figure(figsize=(12, 6))
marital_ax = sns.boxplot(data=df_train, x='marital', y='age')
marital_ax.set(title='年龄与婚姻状况分布', xlabel='婚姻状况', ylabel='年龄')
plt.show()

# Job categories, split by education level.
plt.figure(figsize=(12, 6))
job_edu_ax = sns.countplot(data=df_train, x='job', hue='education')
job_edu_ax.set(title='工作类型与教育水平分布', xlabel='工作类型', ylabel='数量')
plt.xticks(rotation=45)
plt.show()

# Combined "housing,loan" label, e.g. 'yes,no'.
df_train['loan_status'] = df_train['housing'] + ',' + df_train['loan']

# Subscription outcome per combined loan status.
plt.figure(figsize=(12, 6))
loan_ax = sns.countplot(data=df_train, x='loan_status', hue='subscribe')
loan_ax.set(title='信贷情况与订阅关系', xlabel='信贷情况', ylabel='数量')
plt.show()
| |
# Compare the education levels of bank customers and non-customers.
# Customers are rows with subscribe == 'yes', non-customers subscribe == 'no'.
subscribed_df = df_train[df_train['subscribe'] == 'yes']
non_subscribed_df = df_train[df_train['subscribe'] == 'no']

# Frequency of each education level within the two groups.
education_counts = subscribed_df['education'].value_counts()
non_education_counts = non_subscribed_df['education'].value_counts()

# Two side-by-side bar charts: (grid slot, counts, bar colour, title).
plt.figure(figsize=(12, 6))
panels = [
    (1, education_counts, 'skyblue', '银行客户的教育水平'),
    (2, non_education_counts, 'orange', '非银行客户的教育水平'),
]
for grid_slot, panel_counts, bar_color, panel_title in panels:
    plt.subplot(1, 2, grid_slot)
    panel_counts.plot(kind='bar', color=bar_color)
    plt.title(panel_title)
    plt.xlabel('教育水平')
    plt.ylabel('客户数量')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()
| |
# Heatmap of how many customers fall into each job / education combination.
heatmap_data = pd.crosstab(df_train['job'], df_train['education'])

plt.figure(figsize=(10, 8))
# fmt='d' prints the integer counts in full; the default annotation format
# would abbreviate large counts in scientific notation.
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='coolwarm')
plt.title('工作类型与教育水平的相关性热力图')
plt.xlabel('教育水平')
plt.ylabel('工作类型')

plt.show()
| |
# Build the table behind the compound bar chart: loan-status counts for every
# (job, marital status) pair. Relies on the 'loan_status' column added earlier.
loan_marital_job = pd.crosstab(
    index=[df_train['job'], df_train['marital']],
    columns=df_train['loan_status'],
)

# Draw the compound (stacked) bar chart.
stacked_ax = loan_marital_job.plot.bar(stacked=True, figsize=(14, 8))
stacked_ax.set(title='各职业婚姻状况下的贷款状态分布', xlabel='职业和婚姻状况', ylabel='数量')
plt.xticks(rotation=45)
plt.show()
| |
# Ten-year age bands, left-closed: [20, 30), [30, 40), [40, 50), [50, 60).
# NOTE: ages outside [20, 60) match no bin and are therefore left out of the chart.
bins = [20, 30, 40, 50, 60]

# Add a column holding each customer's age band.
df_train['age_group'] = pd.cut(
    df_train['age'], bins, right=False,
    labels=["20-29", "30-39", "40-49", "50-59"],
)

# Number of customers per (age group, job); observed=False keeps every age
# band in the table even when it is empty.
age_job_counts = (
    df_train.groupby(['age_group', 'job'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Draw the stacked bar chart. DataFrame.plot opens its own figure when no `ax`
# is given, so the size is passed via `figsize`; a preceding plt.figure()
# would only leave an empty figure.
age_job_counts.plot(kind='bar', stacked=True, colormap='viridis', figsize=(15, 8))
plt.title('不同年龄组的工作类型分布', fontsize=14)
plt.xlabel('年龄组', fontsize=12)
plt.ylabel('工作数量', fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
# Place the legend outside the axes so it does not cover the bars.
plt.legend(title='工作类型', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()

plt.show()
| |
# Split customers into married and everyone else.
is_married = df_train['marital'] == 'married'
married_df = df_train[is_married]
unmarried_df = df_train[~is_married]

# Median age of each group.
median_age_married = married_df['age'].median()
median_age_unmarried = unmarried_df['age'].median()

# Shared 5-year bins spanning the whole age range, so both histograms line up.
age_bins = range(min(df_train['age']), max(df_train['age']) + 5, 5)

# Overlaid age histograms of the two groups.
plt.figure(figsize=(14, 7))
plt.hist(married_df['age'], bins=age_bins, alpha=0.5, label='已婚')
plt.hist(unmarried_df['age'], bins=age_bins, alpha=0.5, label='未婚')

# Dashed vertical lines mark each group's median age.
median_lines = [
    (median_age_married, 'blue', '已婚中位年龄'),
    (median_age_unmarried, 'orange', '未婚中位年龄'),
]
for median_age, line_color, line_label in median_lines:
    plt.axvline(median_age, color=line_color, linestyle='dashed', linewidth=1, label=line_label)

plt.title('已婚与未婚人群的年龄分布', fontsize=16)
plt.xlabel('年龄', fontsize=14)
plt.ylabel('人数', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.tight_layout()

plt.show()
四、总结
通过对银行客户的大数据分析和挖掘,我得到了以下有益的结论:
客户画像:通过数据分析,可以得出客户的基本信息、交易记录、产品持有情况等特征,从而构建出客户画像。这有助于银行更好地了解客户需求和行为,为精准营销和服务提供支持。
认购产品预测:通过训练预测模型,可以预测客户未来认购产品的可能性。这有助于银行提前了解客户需求,提供更加精准的产品推荐和服务,提高客户满意度和忠诚度。
在完成此设计的过程中,我得到的收获如下:
深入了解银行客户数据的特点和结构,掌握了数据清洗、整合和特征提取等数据处理技术,熟悉了多种机器学习算法,并能够根据数据特点选择合适的算法进行模型训练和优化,了解了大数据在金融行业中的应用场景和价值,加深了对大数据分析的认识和理解。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通