1 导入必要的库
- import pandas as pd
- import numpy as np
- import missingno as msno
- import matplotlib.pyplot as plt
- from matplotlib import rcParams
- import seaborn as sns
- from sklearn.metrics import roc_curve, auc
- from sklearn.linear_model import LogisticRegression
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.svm import SVC
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
- from xgboost import XGBClassifier
- from lightgbm import LGBMClassifier
- from sklearn.ensemble import StackingClassifier
- from sklearn.metrics import confusion_matrix
# Silence library warnings (optional) so the notebook output stays readable.
import warnings

warnings.filterwarnings("ignore")

# Use a font that can render Chinese labels, and keep the minus sign
# displayable when that font is active.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
复制代码 2 导入数据
# Load the customer-experience dataset from the Excel workbook.
df = pd.read_excel('目标客户体验数据.xlsx')

# Bare expression: displays the frame when run as a notebook cell.
df
复制代码 idstylea1a2a3a4a5a6a7a8...B9B10B11B12B13B14B15B16B17isorno012753.03539987.65379181.55817185.61159385.62967885.80757682.34645384.769555...6142319.01110305012388.92279082.94626285.16608185.18972477.76249883.59557982.15236788.872546...654418.0108030023395.04829493.33313177.66037593.03427488.86999894.16996295.60265595.877373...495224.0101700034371.15232876.78576766.69170181.92612566.65499877.77367477.58024776.989502...6104727.010101825045370.57396271.64594970.44455474.02983266.65499866.33609262.09302174.436962...6134225.01515000..................................................................19591960271.35766375.37369070.44455477.76704468.43027675.02696575.48226170.333970...563525.0112000019601961299.03688899.03017299.03255099.98334299.97749899.99186799.99271399.980365...5147929.0993020019611962290.77128191.92105592.67178796.71974394.96189997.24515895.60265595.877373...684539.02017030019621963282.42732788.51148974.05246493.03427476.46171194.70467691.16357295.877373...684421.01272020019631964277.80869277.803468702.99611477.76704461.80365377.77367473.19018958.796528...674325.010122500 1964 rows × 28 columns
3 数据预处理
# Visualise where values are missing across the whole frame.
msno.matrix(df)
plt.axis('off')
plt.show()

# Replace missing values in column "B7" with the integer 0.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the column selection: that chained in-place form may operate on a temporary
# copy and leave `df` unchanged on recent pandas versions.
df['B7'] = df['B7'].fillna(0)

# Report how many fully duplicated rows exist, then remove them.
print("重复值数量:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
复制代码 运行结果如图3-1所示:
图3-1 缺失值可视化与重复值检测
4 数据分布
4.1 箱线图
# Use the clean "whitegrid" theme for the plots.
sns.set_style("whitegrid")

# One box plot per column, laid out on a 4-column grid.
features_to_plot_features = list(df.columns)
n_cols = 4
# Ceiling division gives exactly enough rows for every column. The previous
# `len // 2 + 1` row count reserved about twice the needed slots for a
# 4-wide grid, leaving the lower half of the figure empty.
n_rows = max(1, -(-len(features_to_plot_features) // n_cols))

# Scale the figure height with the number of rows actually used.
plt.figure(figsize=(15, 3 * n_rows))
for i, feature in enumerate(features_to_plot_features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(df[feature], palette="Set3")
    plt.title(feature)
    plt.xlabel('')  # the title already names the variable

plt.tight_layout()  # avoid overlapping titles and tick labels
plt.show()
复制代码 运行结果如图4-1所示:
图4-1 箱线图
4.2 pair plot
# Pairwise scatter plots (with per-column distributions on the diagonal) for
# every column of the frame. With many columns this grid is large and slow.
sns.pairplot(df)
plt.show()
复制代码 运行结果如图4-2所示:
图4-2 散点图
4.3 hist plot
# One histogram (with a KDE overlay) per column, laid out on a 4-column grid.
features_to_plot_features = list(df.columns)
n_cols = 4
# Ceiling division gives exactly enough rows for every column. The previous
# `len // 2 + 1` row count reserved about twice the needed slots for a
# 4-wide grid, leaving the lower half of the figure empty.
n_rows = max(1, -(-len(features_to_plot_features) // n_cols))

# Scale the figure height with the number of rows actually used.
plt.figure(figsize=(15, 3 * n_rows))
for i, feature in enumerate(features_to_plot_features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(df[feature], palette="Set3", kde=True)
    plt.title(feature)
    plt.xlabel('')  # the title already names the variable

plt.tight_layout()  # avoid overlapping titles and tick labels
plt.show()
复制代码 运行结果如图4-3所示:
图4-3 直方图
4.4 小提琴图
# Use the clean "whitegrid" theme for the plots.
sns.set_style("whitegrid")

# One violin plot per column, laid out on a 4-column grid.
features_to_plot_features = list(df.columns)
n_cols = 4
# Ceiling division gives exactly enough rows for every column. The previous
# `len // 2 + 1` row count reserved about twice the needed slots for a
# 4-wide grid, leaving the lower half of the figure empty.
n_rows = max(1, -(-len(features_to_plot_features) // n_cols))

# A different seaborn palette name for each panel.
palette = [
    'deep', 'muted', 'pastel', 'muted', 'pastel', 'viridis', 'dark', 'rocket',
    'crest', 'mako', 'flare', 'magma', 'viridis', 'vlag', 'icefire',
    'deep', 'muted', 'pastel', 'viridis', 'dark', 'colorblind', 'rocket',
    'crest', 'mako', 'flare', 'magma', 'bright', 'vlag', 'icefire',
]

# Scale the figure height with the number of rows actually used.
plt.figure(figsize=(15, 3 * n_rows))
for i, feature in enumerate(features_to_plot_features):
    plt.subplot(n_rows, n_cols, i + 1)
    # Cycle through the palette names so a frame with more columns than
    # palette entries does not raise IndexError.
    sns.violinplot(df[feature], palette=palette[i % len(palette)])
    plt.title(feature)
    plt.xlabel('')  # only one variable per panel, so no x label is needed

plt.tight_layout()  # avoid overlapping titles and tick labels
plt.show()
复制代码 运行结果如图4-4所示:
图4-4 小提琴图
5 相关性
5.1 heatmap
# Pairwise correlation between all columns.
corr_matrix = df.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(25, 25))
sns.heatmap(corr_matrix, annot=True, cmap='PuOr', linewidths=.5)
plt.title('Heatmap of Correlation Matrix')
plt.show()
复制代码 运行结果如图5-1所示:
图5-1 heatmap
5.2 clustermap
# Cluster the raw data hierarchically and draw it as a heatmap.
# standard_scale=1 rescales each column to the [0, 1] range first.
cluster_grid = sns.clustermap(df, standard_scale=1, cmap='PuBuGn', annot=False, fmt=".2f")

# clustermap builds its own multi-axes figure, so the title is set on that
# figure; plt.title would only title whichever axes happens to be current.
cluster_grid.fig.suptitle('Clustermap of DataFrame')
plt.show()
复制代码 运行结果如图5-2所示:
图5-2 clustermap
6 划分数据集
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is "isorno".
X = df.drop('isorno', axis=1)
y = df['isorno']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
复制代码 7 Stacking分类模型建立
# Base-layer classifiers as (name, estimator) pairs.
# SVC needs probability=True so that predict_proba is available later.
base_learners = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('SVM', SVC(probability=True)),
    ('KNN', KNeighborsClassifier(3)),
    ('Random Forest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('XGBoost', XGBClassifier()),
    ('LightGBM', LGBMClassifier()),
]

# Stacking ensemble: a logistic regression combines the base learners' outputs.
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
)
复制代码 8 训练模型
# Train the stacking ensemble on the training split.
stacking_clf.fit(X_train, y_train)
复制代码 模型结构如图8-1所示:
图8-1 Stacking分类模型结构
9 预测
# Predict class labels for the held-out test split.
y_pred_stacking = stacking_clf.predict(X_test)
复制代码 10 预测结果
# Plot against positions 0..n-1 rather than the test rows' original index labels.
indices = range(len(y_test))

# Overlay true labels and predicted labels as two line series.
plt.figure(figsize=(10, 6))
plt.plot(
    indices, y_test,
    color='g', marker='o', markerfacecolor='none', markeredgecolor='black',
    label='True Values', linestyle='-',
)
plt.plot(
    indices, y_pred_stacking,
    color='black', marker='*', markerfacecolor='none', markeredgecolor='r',
    label='Predicted Values', linestyle='--',
)
plt.title('True vs Predicted Values')
plt.xlabel('Index in Test Set')
plt.ylabel('isorno Value')
plt.legend()
plt.grid(True)
plt.show()
复制代码 运行结果如图10-1所示:
图10-1 预测结果
11 模型评估
11.1 混淆矩阵
# 3x3 grid of panels: base classifiers first, the stacking model in the last one.
# (plt.subplots creates the figure itself, so no separate plt.figure call is
# made; a preceding one would only leave an empty extra figure behind.)
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 12))
axes = axes.flatten()

# Reserve the final panel for the stacking model so it can never overwrite a
# base classifier's matrix.
max_classifiers_to_show = len(axes) - 1

# One matplotlib colormap name per panel.
cmaps = ['Blues', 'plasma', 'Spectral', 'Purples', 'gist_stern', 'gist_ncar', 'inferno', 'BuGn', 'binary']

shown_learners = base_learners[:max_classifiers_to_show]
for ax_idx, (name, clf) in enumerate(shown_learners):
    # Fit each base classifier on its own and evaluate it on the test split.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmaps[ax_idx], ax=axes[ax_idx])
    axes[ax_idx].set_xlabel('Predicted')
    axes[ax_idx].set_ylabel('True')
    axes[ax_idx].set_title(f'Confusion Matrix for {name}')

# Hide any panels between the last base classifier and the stacking panel.
for unused_ax in axes[len(shown_learners):-1]:
    unused_ax.set_visible(False)

# Confusion matrix of the stacking ensemble, drawn in the last panel.
stacking_cm = confusion_matrix(y_test, y_pred_stacking)
sns.heatmap(stacking_cm, annot=True, fmt='d', cmap=cmaps[-1], ax=axes[-1])
axes[-1].set_xlabel('Predicted')
axes[-1].set_ylabel('True')
axes[-1].set_title('Confusion Matrix for Stacking')

plt.tight_layout()  # keep the panels from overlapping
plt.show()
复制代码 运行结果如图11-1所示:
图11-1 混淆矩阵
11.2 ROC曲线
plt.figure(figsize=(6, 6))

# Per-curve styling; each list is cycled with modulo indexing below.
markers = ['o', '.', '2', '^', '*', '>', '+', '1', 'p', '_', '8']
linestyles = ['-', '--', ':', '-.', 'solid', 'dashed', '-.', '-.', ':', '-', '--']
colors = ['b', 'g', 'r', 'c', 'r', 'y', 'k', 'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']

# ROC curve of every base classifier, each fitted on its own.
for i, (name, clf) in enumerate(base_learners):
    clf.fit(X_train, y_train)
    # Score = predicted probability of the second class (column 1).
    y_score = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(
        fpr, tpr,
        color=colors[i % len(colors)],
        label=f'{name} (AUC = {roc_auc:.2f})',
        marker=markers[i % len(markers)],
        linestyle=linestyles[i % len(linestyles)],
    )

# ROC curve of the already-fitted stacking ensemble.
stacking_y_score = stacking_clf.predict_proba(X_test)[:, 1]
stacking_fpr, stacking_tpr, _ = roc_curve(y_test, stacking_y_score)
stacking_roc_auc = auc(stacking_fpr, stacking_tpr)
plt.plot(stacking_fpr, stacking_tpr, color='black', label=f'Stacking (AUC = {stacking_roc_auc:.2f})', linestyle='--', marker='s')

# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
复制代码 运行结果如图11-2所示:
图11-2 ROC曲线对比
12 预测新数据
# Load the unlabeled records to be classified.
new_data = pd.read_excel('待判定的数据.xlsx')

# Same preprocessing as the training data: missing "B7" becomes 0.
# The result is assigned back because a chained in-place fillna on a column
# selection may not update the frame on recent pandas versions.
new_data['B7'] = new_data['B7'].fillna(0)

# Report and drop duplicated rows of the NEW data. The previous code counted
# duplicates in new_data but then dropped them from the training frame `df`.
print("重复值数量:", new_data.duplicated().sum())
new_data = new_data.drop_duplicates()

# Remove the target column if the workbook happens to include it.
new_data = new_data.drop(columns='isorno', errors='ignore')

# Predict with the fitted stacking ensemble and plot the predicted labels.
new_pred_stacking = stacking_clf.predict(new_data)
plt.plot(
    new_pred_stacking,
    color='black', marker='*', markerfacecolor='none', markeredgecolor='r',
    label='Predicted Values', linestyle='--',
)
plt.legend()  # the series has a label, so show it
plt.show()
复制代码 新数据预测结果如图12-1所示:
图12-1 新数据预测结果
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。 |