基于R 4.2.2版本演示
一、写在前面
有不少大佬问做机器学习分类能不能用R语言,不想学Python咯。
答曰:可!用GPT或者Kimi转一下就得了呗。
加上最近也没啥内容写了,就帮各位搬运一下吧。
二、R代码实现Catboost分类
(1)导入数据
我习惯用RStudio自带的导入功能:
(2)创建Catboost模型(默认参数)
# CatBoost binary classifier with fixed hyper-parameters: split the data,
# train, and score both splits.
# Expects a data frame `data` in the workspace whose binary target column is
# named "X"; every other column is used as a feature.
library(caret)
library(pROC)
library(ggplot2)
library(catboost)

# Fix the RNG so the train/validation split is reproducible.
set.seed(123)

# Recode the target as a double 0/1 vector (first sorted level -> 0).
# NOTE(review): catboost.load_pool() is expected to require a double label, so
# integer or factor targets (e.g. from read.csv) would otherwise fail - confirm.
stopifnot(is.data.frame(data), "X" %in% names(data))
data$X <- as.numeric(as.factor(data$X)) - 1
stopifnot(all(data$X %in% c(0, 1)))

# Stratified 80/20 split. The target is passed as a factor because
# createDataPartition() bins numeric outcomes by percentile instead of
# sampling within each class.
trainIndex <- createDataPartition(factor(data$X), p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
validData <- data[-trainIndex, ]

# Select features by name: `-which(names == "X")` would drop every column if
# "X" were missing, and `drop = FALSE` keeps a data frame even when only one
# feature remains.
feature_cols <- setdiff(names(data), "X")
trainPool <- catboost.load_pool(
  data = trainData[, feature_cols, drop = FALSE],
  label = trainData$X
)
validPool <- catboost.load_pool(
  data = validData[, feature_cols, drop = FALSE],
  label = validData$X
)

# Hyper-parameters for this baseline run.
params <- list(
  iterations = 250,
  depth = 6,
  learning_rate = 0.1,
  l2_leaf_reg = 10,
  loss_function = "Logloss",
  eval_metric = "AUC"
)

# Fit on the training pool only.
model <- catboost.train(learn_pool = trainPool, params = params)

# Positive-class probabilities for both splits.
trainPredict <- catboost.predict(model, trainPool, prediction_type = "Probability")
validPredict <- catboost.predict(model, validPool, prediction_type = "Probability")

# Hard class labels at the 0.5 probability threshold.
trainPredictBinary <- ifelse(trainPredict > 0.5, 1, 0)
validPredictBinary <- ifelse(validPredict > 0.5, 1, 0)
# ROC objects for both splits.
# The labels are used as-is (subtracting 1 from a numeric 0/1 target would
# turn it into -1/0). `levels` and `direction` are pinned so pROC cannot
# auto-flip the comparison and report an AUC below 0.5 as one above it.
# Assumes X is coded 0 (negative) / 1 (positive).
trainRoc <- roc(
  response = trainData$X, predictor = trainPredict,
  levels = c(0, 1), direction = "<", quiet = TRUE
)
validRoc <- roc(
  response = validData$X, predictor = validPredict,
  levels = c(0, 1), direction = "<", quiet = TRUE
)

# Build one ROC plot (curve, shaded area, chance diagonal, AUC annotation).
#   roc_obj    pROC roc object
#   title      plot title
#   auc_prefix text placed before the rounded AUC in the annotation
#   colour     colour for the curve, shading and annotation
#   label_y    y position of the AUC annotation
# Returns the ggplot object without printing it.
build_roc_plot <- function(roc_obj, title, auc_prefix, colour, label_y) {
  curve <- data.frame(
    fpr = 1 - roc_obj$specificities,
    tpr = roc_obj$sensitivities
  )
  ggplot(data = curve, aes(x = fpr, y = tpr)) +
    geom_line(color = colour) +
    # fpr never exceeds 1, so no clipping of the shaded area is needed.
    geom_area(fill = colour, alpha = 0.2) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
    ggtitle(title) +
    xlab("False Positive Rate") +
    ylab("True Positive Rate") +
    annotate(
      "text",
      x = 0.5, y = label_y,
      label = paste(auc_prefix, round(auc(roc_obj), 2)),
      hjust = 0.5, color = colour
    )
}

trainRocPlot <- build_roc_plot(
  trainRoc, "Training ROC Curve", "Training AUC =", "blue", 0.1
)
validRocPlot <- build_roc_plot(
  validRoc, "Validation ROC Curve", "Validation AUC =", "red", 0.2
)

# Render both plots.
print(trainRocPlot)
print(validRocPlot)
# Confusion matrices at the 0.5 probability cut-off: rows are the actual
# class, columns the predicted class (FALSE, TRUE).
# The predicted factor levels are pinned so each table always has both
# columns, even when the model predicts a single class; otherwise the
# [1, 2] / [2, 2] look-ups used for the metrics would go out of bounds.
confMatTrain <- table(
  trainData$X,
  factor(trainPredict >= 0.5, levels = c(FALSE, TRUE))
)
confMatValid <- table(
  validData$X,
  factor(validPredict >= 0.5, levels = c(FALSE, TRUE))
)
# Plot a confusion matrix as a heat map and print it.
#   pred          numeric vector of positive-class probabilities
#   actual        vector of true labels, same length as `pred`
#   dataset_name  label inserted into the plot title
# Predictions are thresholded at 0.5. Returns the ggplot object invisibly.
plot_confusion_matrix <- function(pred, actual, dataset_name) {
  # Pin both levels so the grid stays 2 wide when only one class is predicted.
  predicted <- factor(pred >= 0.5, levels = c(FALSE, TRUE))

  # Name the dimensions in the order they are supplied. The data frame then
  # gets correctly labelled "Actual" / "Predicted" columns; renaming by
  # position afterwards is what previously swapped the two axes.
  conf_mat <- table(Actual = actual, Predicted = predicted)
  conf_mat_df <- as.data.frame(conf_mat)

  p <- ggplot(data = conf_mat_df, aes(x = Predicted, y = Actual, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(
      title = paste("Confusion Matrix -", dataset_name, "Set"),
      x = "Predicted Class",
      y = "Actual Class"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title = element_text(hjust = 0.5)
    )

  print(p)
}
# Draw the confusion matrices for both splits.
plot_confusion_matrix(trainPredict, trainData$X, "Training")
plot_confusion_matrix(validPredict, validData$X, "Validation")

# Threshold metrics from a 2 x 2 confusion matrix laid out as
# rows = actual (negative, positive), columns = predicted (negative, positive).
# Returns a named numeric vector; a zero denominator yields NaN.
confusion_metrics <- function(conf_mat) {
  stopifnot(identical(dim(conf_mat), c(2L, 2L)))
  # table() stores integer counts. Work in doubles so the four-way product in
  # the MCC denominator cannot overflow to NA on larger data sets.
  counts <- matrix(as.numeric(conf_mat), nrow = 2)
  tn <- counts[1, 1]
  fp <- counts[1, 2]
  fn <- counts[2, 1]
  tp <- counts[2, 2]

  accuracy <- (tn + tp) / sum(counts)
  sensitivity <- tp / (tp + fn)
  precision <- tp / (tp + fp)
  c(
    "Accuracy" = accuracy,
    "Error Rate" = 1 - accuracy,
    "Sensitivity" = sensitivity,
    "Specificity" = tn / (tn + fp),
    "Precision" = precision,
    "F1 Score" = (2 * precision * sensitivity) / (precision + sensitivity),
    "MCC" = (tp * tn - fp * fn) /
      sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
  )
}

# AUC with the class order and comparison direction pinned, so a model that
# ranks worse than chance is not silently reported as better than chance.
# Assumes labels are coded 0 (negative) / 1 (positive).
split_auc <- function(actual, prob) {
  roc_obj <- roc(
    response = actual, predictor = prob,
    levels = c(0, 1), direction = "<", quiet = TRUE
  )
  as.numeric(auc(roc_obj))
}

# Print one block of metrics; `trailer` is the text written after the AUC.
print_metrics <- function(heading, metrics, auc_value, trailer = "\n") {
  cat(heading, "\n", sep = "")
  for (metric_name in names(metrics)) {
    cat(paste0(metric_name, ":"), metrics[[metric_name]], "\n")
  }
  cat("AUC:", auc_value, trailer)
}

# Training and validation metrics.
metrics_train <- confusion_metrics(confMatTrain)
metrics_valid <- confusion_metrics(confMatValid)
auc_train <- split_auc(trainData$X, trainPredict)
auc_valid <- split_auc(validData$X, validPredict)

# Report.
print_metrics("Training Metrics", metrics_train, auc_train, trailer = "\n\n")
print_metrics("Validation Metrics", metrics_valid, auc_valid)
在R语言中,Catboost模型得单独安装,下面是一些可以调整的关键参数:
①学习率 (learning_rate):控制每步模型更新的幅度。较小的学习率可以提高模型的训练稳定性和准确性,但可能需要更多的时间和更多的树来收敛。
②树的深度 (depth):决定了每棵树的最大深度。较深的树可以更好地捕捉数据中的复杂关系,但也可能导致过拟合。
③树的数量 (iterations):模型中树的总数。更多的树可以增加模型的复杂度和能力,但同样可能导致过拟合。
④L2 正则化系数 (l2_leaf_reg):在模型的损失函数中增加一个正则项,以减少模型复杂度和过拟合风险。
⑤边界计数 (border_count):用于数值特征分箱的边界数量,影响模型在连续特征上的决策边界。
⑥类别特征 (cat_features):CatBoost 优化了对类别特征的处理,可以指定在模型中使用的类别特征。
⑦子采样 (subsample):指定每棵树训练时从训练数据集中随机抽取的比例,有助于防止模型过拟合。
⑧列采样 (rsm,别名 colsample_bylevel):控制每次选择分裂时随机使用的特征比例,可以增加模型的多样性,降低过拟合风险。
⑨最小数据在叶节点 (min_data_in_leaf):叶节点必需的最小样本数量,增大这个参数的值可以防止模型学习过于具体的模式,从而降低过拟合风险。
⑩评估指标 (eval_metric):用于训练过程中模型评估的性能指标。
结果输出(随便挑的):
从AUC来看,Catboost随便一跑,就跑出过拟合了,跟Xgboost差不多。
三、Catboost调参
随便设置了一下,效果不明显,给各位自行嗨皮:
# Grid search over CatBoost hyper-parameters, scored by validation AUC.
# Expects a data frame `data` in the workspace whose binary target column is
# named "X"; every other column is used as a feature.
library(caret)
library(pROC)
library(ggplot2)
library(catboost)

# Fix the RNG so the train/validation split is reproducible.
set.seed(123)

# Recode the target as a double 0/1 vector (first sorted level -> 0).
stopifnot(is.data.frame(data), "X" %in% names(data))
data$X <- as.numeric(as.factor(data$X)) - 1
stopifnot(all(data$X %in% c(0, 1)))

# Stratified 80/20 split. The target is passed as a factor because
# createDataPartition() bins numeric outcomes by percentile instead of
# sampling within each class.
trainIndex <- createDataPartition(factor(data$X), p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
validData <- data[-trainIndex, ]

# Select features by name: `-which(names == "X")` would drop every column if
# "X" were missing, and `drop = FALSE` keeps a data frame even when only one
# feature remains.
feature_cols <- setdiff(names(data), "X")
trainPool <- catboost.load_pool(
  data = trainData[, feature_cols, drop = FALSE],
  label = trainData$X
)
validPool <- catboost.load_pool(
  data = validData[, feature_cols, drop = FALSE],
  label = validData$X
)

# Candidate values for each hyper-parameter.
depths <- c(2, 4, 6)
l2_leaf_regs <- c(1, 3, 5, 10, 20, 25)
iterations <- c(500, 1000)
learning_rates <- c(0.05, 0.1)
# Not passed to catboost.train() below; 1.0 would mean no row subsampling.
subsample <- 1.0

# One row per combination. expand.grid() varies its first argument fastest,
# so this ordering visits combinations with depth outermost and learning
# rate innermost.
param_grid <- expand.grid(
  learning_rate = learning_rates,
  iterations = iterations,
  l2_leaf_reg = l2_leaf_regs,
  depth = depths,
  KEEP.OUT.ATTRS = FALSE
)

best_auc <- -Inf
best_params <- list()

# NOTE: the validation split is used both to pick the hyper-parameters and,
# later, to report performance, so the reported validation scores are
# optimistic. Hold out a separate test set for an unbiased estimate.
for (i in seq_len(nrow(param_grid))) {
  params <- list(
    iterations = param_grid$iterations[i],
    depth = param_grid$depth[i],
    learning_rate = param_grid$learning_rate[i],
    l2_leaf_reg = param_grid$l2_leaf_reg[i],
    loss_function = "Logloss",
    eval_metric = "AUC"
  )

  model <- catboost.train(
    learn_pool = trainPool,
    test_pool = validPool,
    params = params
  )

  # Request probabilities explicitly and score the continuous output:
  # binarising the predictions first would collapse the ROC curve to a
  # single operating point.
  validPredict <- catboost.predict(
    model, validPool,
    prediction_type = "Probability"
  )
  validRoc <- roc(
    response = validData$X, predictor = validPredict,
    levels = c(0, 1), direction = "<", quiet = TRUE
  )
  auc_score <- as.numeric(auc(validRoc))

  # Keep the first combination that reaches the highest AUC.
  if (auc_score > best_auc) {
    best_auc <- auc_score
    best_params <- params
  }
}

# Report the winning combination.
print(paste("Best AUC:", best_auc))
print("Best Parameters:")
print(best_params)
# Refit on the training pool with the best hyper-parameters from the search.
# An empty list means the search selected nothing; stop rather than train
# with whatever catboost.train() would use for missing parameters.
stopifnot(length(best_params) > 0)
model <- catboost.train(learn_pool = trainPool, params = best_params)

# Positive-class probabilities for both splits.
trainPredict <- catboost.predict(model, trainPool, prediction_type = "Probability")
validPredict <- catboost.predict(model, validPool, prediction_type = "Probability")

# Hard class labels at the 0.5 probability threshold.
trainPredictBinary <- ifelse(trainPredict > 0.5, 1, 0)
validPredictBinary <- ifelse(validPredict > 0.5, 1, 0)

# ROC objects for both splits.
# X is already numeric 0/1 at this point, so it is used as-is (subtracting 1
# again would turn it into -1/0). `levels` and `direction` are pinned so pROC
# cannot auto-flip the comparison and report an AUC below 0.5 as one above it.
trainRoc <- roc(
  response = trainData$X, predictor = trainPredict,
  levels = c(0, 1), direction = "<", quiet = TRUE
)
validRoc <- roc(
  response = validData$X, predictor = validPredict,
  levels = c(0, 1), direction = "<", quiet = TRUE
)

# Build one ROC plot (curve, shaded area, chance diagonal, AUC annotation).
#   roc_obj    pROC roc object
#   title      plot title
#   auc_prefix text placed before the rounded AUC in the annotation
#   colour     colour for the curve, shading and annotation
#   label_y    y position of the AUC annotation
# Returns the ggplot object without printing it.
build_roc_plot <- function(roc_obj, title, auc_prefix, colour, label_y) {
  curve <- data.frame(
    fpr = 1 - roc_obj$specificities,
    tpr = roc_obj$sensitivities
  )
  ggplot(data = curve, aes(x = fpr, y = tpr)) +
    geom_line(color = colour) +
    # fpr never exceeds 1, so no clipping of the shaded area is needed.
    geom_area(fill = colour, alpha = 0.2) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
    ggtitle(title) +
    xlab("False Positive Rate") +
    ylab("True Positive Rate") +
    annotate(
      "text",
      x = 0.5, y = label_y,
      label = paste(auc_prefix, round(auc(roc_obj), 2)),
      hjust = 0.5, color = colour
    )
}

trainRocPlot <- build_roc_plot(
  trainRoc, "Training ROC Curve", "Training AUC =", "blue", 0.1
)
validRocPlot <- build_roc_plot(
  validRoc, "Validation ROC Curve", "Validation AUC =", "red", 0.2
)

# Render both plots.
print(trainRocPlot)
print(validRocPlot)
# Confusion matrices at the 0.5 probability cut-off: rows are the actual
# class, columns the predicted class (FALSE, TRUE).
# The predicted factor levels are pinned so each table always has both
# columns, even when the model predicts a single class; otherwise the
# [1, 2] / [2, 2] look-ups used for the metrics would go out of bounds.
confMatTrain <- table(
  trainData$X,
  factor(trainPredict >= 0.5, levels = c(FALSE, TRUE))
)
confMatValid <- table(
  validData$X,
  factor(validPredict >= 0.5, levels = c(FALSE, TRUE))
)
# Render a confusion matrix as a ggplot2 heat map and print it.
#   conf_mat      two-way table (or matrix) with actual classes in rows and
#                 predicted classes in columns
#   dataset_name  label inserted into the plot title
# Returns the ggplot object invisibly (the value of print()).
plot_confusion_matrix <- function(conf_mat, dataset_name) {
  # Long format: one row per (actual, predicted) cell with its count.
  cells <- setNames(
    as.data.frame(as.table(conf_mat)),
    c("Actual", "Predicted", "Freq")
  )

  heading <- paste("Confusion Matrix -", dataset_name, "Set")
  text_theme <- theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

  heatmap <- ggplot(data = cells, aes(x = Predicted, y = Actual, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = heading, x = "Predicted Class", y = "Actual Class") +
    theme_minimal() +
    text_theme

  print(heatmap)
}
# Draw the confusion matrices for both splits.
plot_confusion_matrix(confMatTrain, "Training")
plot_confusion_matrix(confMatValid, "Validation")

# Threshold metrics from a 2 x 2 confusion matrix laid out as
# rows = actual (negative, positive), columns = predicted (negative, positive).
# Returns a named numeric vector; a zero denominator yields NaN.
confusion_metrics <- function(conf_mat) {
  stopifnot(identical(dim(conf_mat), c(2L, 2L)))
  # table() stores integer counts. Work in doubles so the four-way product in
  # the MCC denominator cannot overflow to NA on larger data sets.
  counts <- matrix(as.numeric(conf_mat), nrow = 2)
  tn <- counts[1, 1]
  fp <- counts[1, 2]
  fn <- counts[2, 1]
  tp <- counts[2, 2]

  accuracy <- (tn + tp) / sum(counts)
  sensitivity <- tp / (tp + fn)
  precision <- tp / (tp + fp)
  c(
    "Accuracy" = accuracy,
    "Error Rate" = 1 - accuracy,
    "Sensitivity" = sensitivity,
    "Specificity" = tn / (tn + fp),
    "Precision" = precision,
    "F1 Score" = (2 * precision * sensitivity) / (precision + sensitivity),
    "MCC" = (tp * tn - fp * fn) /
      sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
  )
}

# AUC with the class order and comparison direction pinned, so a model that
# ranks worse than chance is not silently reported as better than chance.
# Assumes labels are coded 0 (negative) / 1 (positive).
split_auc <- function(actual, prob) {
  roc_obj <- roc(
    response = actual, predictor = prob,
    levels = c(0, 1), direction = "<", quiet = TRUE
  )
  as.numeric(auc(roc_obj))
}

# Print one block of metrics; `trailer` is the text written after the AUC.
print_metrics <- function(heading, metrics, auc_value, trailer = "\n") {
  cat(heading, "\n", sep = "")
  for (metric_name in names(metrics)) {
    cat(paste0(metric_name, ":"), metrics[[metric_name]], "\n")
  }
  cat("AUC:", auc_value, trailer)
}

# Training and validation metrics.
metrics_train <- confusion_metrics(confMatTrain)
metrics_valid <- confusion_metrics(confMatValid)
auc_train <- split_auc(trainData$X, trainPredict)
auc_valid <- split_auc(validData$X, validPredict)

# Report.
print_metrics("Training Metrics", metrics_train, auc_train, trailer = "\n\n")
print_metrics("Validation Metrics", metrics_valid, auc_valid)
结果输出:
提供个样本代码吧,我不调了。
四、最后
至于怎么安装,自学了哈。
数据嘛:
链接:https://pan.baidu.com/s/1rEf6JZyzA1ia5exoq5OF7g?pwd=x8xm
提取码:x8xm
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息请访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。