竞赛实战--天池金融风控分类题目

打印 上一主题 下一主题

主题 583|帖子 583|积分 1749

背景

1、金融风控分类题目,作为机器学习竞赛是一个比较好的选择
2、怎样进行数据处理
代码

数据分析部分

  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. import os
  4. import gc
  5. import numpy as np
  6. import pandas as pd
  7. import warnings
  8. import lightgbm as lgb
  9. import catboost as cbt
  10. import xgboost as xgb
  11. from sklearn.metrics import roc_auc_score
  12. from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, GridSearchCV
  13. from sklearn.preprocessing import LabelEncoder, StandardScaler
  14. from tqdm import tqdm
  15. import matplotlib.pyplot as plt
  16. import seaborn as sns
  17. from scipy.stats import kstest
  18. warnings.filterwarnings("ignore")
  19. pd.set_option('display.max_columns', None)
  20. # plt.ion()
复制代码
  1. # ## 导入数据
  2. BASE_DIR = os.path.dirname(os.path.abspath(__file__))
  3. DATA_PATH = os.path.join(BASE_DIR, 'data')
  4. train_data_file = os.path.join(DATA_PATH, "train.csv")
  5. train_data = pd.read_csv(train_data_file)
  6. test_data_file = os.path.join(DATA_PATH, "testA.csv")
  7. test_data = pd.read_csv(test_data_file)
  8. target = train_data['isDefault']
  9. train_data = train_data.drop(['isDefault'], axis=1)
  10. data = pd.concat([train_data, test_data])
  11. objectList = [i for i in train_data.columns if train_data[i].dtype == 'O']
  12. classList = [i for i in train_data.select_dtypes(exclude=['object']).columns if len(train_data[i].unique()) <= 10]
  13. numericalList = [i for i in train_data.select_dtypes(exclude=['object']).columns if i not in classList]
复制代码
对不同类型变量进行分类分组处理
  1. # ## 变量分类和缺失值处理
  2. info = pd.DataFrame(data.isnull().sum())
  3. info = info[info[0] != 0]
  4. miss_fea = info.index
  5. miss_objectList = [i for i in miss_fea if i in objectList]
  6. miss_classList = [i for i in miss_fea if i in classList]
  7. miss_numericalList = [i for i in miss_fea if i in numericalList]
  8. # 填补缺失值
  9. data['employmentLength'] = data['employmentLength'].fillna(0)
  10. data['n11'] = data['n11'].fillna(0)
  11. data['n12'] = data['n12'].fillna(0)
  12. data['employmentTitle'] = data['employmentTitle'].fillna(data['employmentTitle'].mode()[0])
  13. data['postCode'] = data['postCode'].fillna(data['postCode'].mode()[0])
  14. data['dti'] = data['dti'].fillna(data['postCode'].mean())
  15. data['pubRecBankruptcies'] = data['pubRecBankruptcies'].fillna(data['pubRecBankruptcies'].mean())
  16. data['revolUtil'] = data['revolUtil'].fillna(data['revolUtil'].mean())
  17. data['title'] = data['title'].fillna(data['title'].mode()[0])
  18. NoNameList = [i for i in miss_numericalList if i.startswith("n")]
  19. for i in NoNameList:
  20.     data[i] = data[i].fillna(data[i].mode()[0])
  21. # ## object 变量处理
  22. data['employmentLength'].replace({'10+ years': '10 years', '< 1 year': '0 years', '0': '0 years'}, inplace=True)
  23. data['employmentLength'] = data['employmentLength'].apply(lambda s: int(str(s).split()[0]) if pd.notnull(s) else s)
  24. data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
  25. data = data.drop(['issueDate'], axis=1)
  26. le = LabelEncoder()
  27. data['grade'] = le.fit_transform(data['grade'])
  28. data['subGrade'] = le.fit_transform(data['subGrade'])
  29. # 删除不需要的列
  30. dropList = ['id', 'ficoRangeHigh', 'applicationType', 'policyCode', 'n3', 'n11', 'n12', 'n13']
  31. data.drop(dropList, axis=1, inplace=True)
  32. train_data = data[:800000]
  33. # 将target和train_data进行重新拼接
  34. train_data['isDefault']=target
  35. test_data = data[800000:]
  36. print("Divide data.")
复制代码
  1. # # ## 异常值处理
  2. # percentile = pd.DataFrame()
  3. # numList = [i for i in train_data.columns if i not in classList]
  4. # # 正态分布检测
  5. # for i in numList:
  6. #     print(kstest(data[i], 'norm', (data[i].mean(), data[i].std())))
  7. # # 异常值处理
  8. # stdsc = StandardScaler()
  9. # for i in numList:
  10. #     new_i = "zheng_" + i
  11. #     train_data[new_i] = stdsc.fit_transform(train_data[i].values.reshape(-1, 1))
  12. #     data_std = np.std(train_data[new_i])
  13. #     data_mean = np.mean(train_data[new_i])
  14. #     outliers_cut_off = data_std * 3
  15. #     lower_rule = data_mean - outliers_cut_off
  16. #     upper_rule = data_mean + outliers_cut_off
  17. #     train_data = train_data[(train_data[new_i] < upper_rule) & (train_data[new_i] > lower_rule)]
  18. # train_data = train_data.iloc[:, :38]
复制代码
保存数据。在部分情况下由于数据体量过大,保存中间数据有助于后续处理。
  1. FEATURE_PATH = os.path.join(BASE_DIR, 'feature')
  2. feature_train_data = os.path.join(FEATURE_PATH, 'train_data.csv')
  3. feature_test_data = os.path.join(FEATURE_PATH, 'test_data.csv')
  4. train_data.to_csv(feature_train_data,index=0)
  5. test_data.to_csv(feature_test_data,index=0)
复制代码
模型搭建部分

  1. # 定义模型训练函数
  2. def train_model(x_train, y_train, test_data, params, n_splits=5):
  3.     skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
  4.     oof = np.zeros(len(x_train))
  5.     predictions = np.zeros((len(test_data), n_splits))
  6.     for fold_, (train_idx, valid_idx) in enumerate(skf.split(x_train, y_train)):
  7.         print(f"\nFold {fold_ + 1}")
  8.         x_tr, x_val = x_train.iloc[train_idx], x_train.iloc[valid_idx]
  9.         y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
  10.         train_set = lgb.Dataset(x_tr, label=y_tr)
  11.         val_set = lgb.Dataset(x_val, label=y_val)
  12.         clf = lgb.train(params, train_set, 5000, valid_sets=[val_set],
  13.                         verbose_eval=250, early_stopping_rounds=50)
  14.         oof[valid_idx] = clf.predict(x_val, num_iteration=clf.best_iteration)
  15.         predictions[:, fold_] = clf.predict(test_data, num_iteration=clf.best_iteration)
  16.    
  17.     print("\n\nCV AUC: {:<0.4f}".format(roc_auc_score(y_train, oof)))
  18.     return oof, predictions
  19. # 训练模型并生成预测
  20. oof, predictions = train_model(x_train_gbdt, y_train_gbdt, x_test_bgdt, default_params)
复制代码
参考资料


免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。
回复

使用道具 举报

0 个回复

倒序浏览

快速回复

您需要登录后才可以回帖 登录 or 立即注册

本版积分规则

大连密封材料

金牌会员
这个人很懒什么都没写!

标签云

快速回复 返回顶部 返回列表