Background
1. A financial risk-control (loan default) classification problem is a fairly good choice as a machine learning competition task.
2. How to carry out the data processing.
Code
Data analysis
#!/usr/bin/env python
# coding: utf-8
import os
import gc
import numpy as np
import pandas as pd
import warnings
import lightgbm as lgb
import catboost as cbt
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kstest
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
# plt.ion()
# ## Load the data
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, 'data')
train_data_file = os.path.join(DATA_PATH, "train.csv")
train_data = pd.read_csv(train_data_file)
test_data_file = os.path.join(DATA_PATH, "testA.csv")
test_data = pd.read_csv(test_data_file)
target = train_data['isDefault']
train_data = train_data.drop(['isDefault'], axis=1)
data = pd.concat([train_data, test_data])
# Group columns: object dtype, low-cardinality "class" features (<= 10 distinct values), and continuous numerical features
objectList = [i for i in train_data.columns if train_data[i].dtype == 'O']
classList = [i for i in train_data.select_dtypes(exclude=['object']).columns if len(train_data[i].unique()) <= 10]
numericalList = [i for i in train_data.select_dtypes(exclude=['object']).columns if i not in classList]
Variables are grouped by type (object, low-cardinality, continuous) and each group is handled separately below.
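As a quick sanity check, not part of the original post, the three groups built above can be inspected before any imputation. The snippet below only prints group sizes and the object column names.
# Quick look at the variable groups defined above (sketch, not from the original post)
print(f"{len(objectList)} object columns: {objectList}")
print(f"{len(classList)} low-cardinality columns, {len(numericalList)} continuous columns")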
# ## Variable grouping and missing-value handling
info = pd.DataFrame(data.isnull().sum())
info = info[info[0] != 0]
miss_fea = info.index
miss_objectList = [i for i in miss_fea if i in objectList]
miss_classList = [i for i in miss_fea if i in classList]
miss_numericalList = [i for i in miss_fea if i in numericalList]
# Fill missing values
data['employmentLength'] = data['employmentLength'].fillna(0)
data['n11'] = data['n11'].fillna(0)
data['n12'] = data['n12'].fillna(0)
data['employmentTitle'] = data['employmentTitle'].fillna(data['employmentTitle'].mode()[0])
data['postCode'] = data['postCode'].fillna(data['postCode'].mode()[0])
data['dti'] = data['dti'].fillna(data['dti'].mean())
data['pubRecBankruptcies'] = data['pubRecBankruptcies'].fillna(data['pubRecBankruptcies'].mean())
data['revolUtil'] = data['revolUtil'].fillna(data['revolUtil'].mean())
data['title'] = data['title'].fillna(data['title'].mode()[0])
# Fill the anonymous n* numerical features with their mode
NoNameList = [i for i in miss_numericalList if i.startswith("n")]
for i in NoNameList:
    data[i] = data[i].fillna(data[i].mode()[0])
# ## Handle object-type variables
data['employmentLength'].replace({'10+ years': '10 years', '< 1 year': '0 years', '0': '0 years'}, inplace=True)
data['employmentLength'] = data['employmentLength'].apply(lambda s: int(str(s).split()[0]) if pd.notnull(s) else s)
# Keep only the year of the earliest credit line
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data = data.drop(['issueDate'], axis=1)
le = LabelEncoder()
data['grade'] = le.fit_transform(data['grade'])
data['subGrade'] = le.fit_transform(data['subGrade'])
# Drop columns that are not needed
dropList = ['id', 'ficoRangeHigh', 'applicationType', 'policyCode', 'n3', 'n11', 'n12', 'n13']
data.drop(dropList, axis=1, inplace=True)
# Split back into train/test and re-attach the target to train_data
train_data = data[:800000].copy()
train_data['isDefault'] = target
test_data = data[800000:].copy()
print("Divide data.")
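One extra check that could be added at this point (it is not in the original post): after the encoding above, no raw object columns should remain, since the gradient-boosting model used later is fed these frames directly and expects numeric input.
# Sanity check (sketch): all remaining columns should be numeric after encoding
remaining_object_cols = train_data.select_dtypes(include=['object']).columns.tolist()
print("Remaining object columns:", remaining_object_cols)  # should be empty if every object column was handled above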
# ## Outlier handling (optional, kept commented out)
# percentile = pd.DataFrame()
# numList = [i for i in train_data.columns if i not in classList]
# # Normality test for the continuous features
# for i in numList:
#     print(kstest(data[i], 'norm', (data[i].mean(), data[i].std())))
# # Standardize each feature, then drop rows outside mean +/- 3 std
# stdsc = StandardScaler()
# for i in numList:
#     new_i = "zheng_" + i
#     train_data[new_i] = stdsc.fit_transform(train_data[i].values.reshape(-1, 1))
#     data_std = np.std(train_data[new_i])
#     data_mean = np.mean(train_data[new_i])
#     outliers_cut_off = data_std * 3
#     lower_rule = data_mean - outliers_cut_off
#     upper_rule = data_mean + outliers_cut_off
#     train_data = train_data[(train_data[new_i] < upper_rule) & (train_data[new_i] > lower_rule)]
# train_data = train_data.iloc[:, :38]
Save the data. In some cases, because the dataset is large, saving intermediate results helps with the subsequent processing.
FEATURE_PATH = os.path.join(BASE_DIR, 'feature')
os.makedirs(FEATURE_PATH, exist_ok=True)  # make sure the feature directory exists
feature_train_data = os.path.join(FEATURE_PATH, 'train_data.csv')
feature_test_data = os.path.join(FEATURE_PATH, 'test_data.csv')
train_data.to_csv(feature_train_data, index=False)
test_data.to_csv(feature_test_data, index=False)
Model building
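The original post does not show how the training matrices and the LightGBM parameters passed to train_model are prepared. The sketch below is one plausible way to build them from the feature files saved above; the contents of default_params are an assumed baseline, not the author's tuned values.
# Sketch of the setup assumed by the training call further below (paths reuse the feature files saved above)
train_feat = pd.read_csv(feature_train_data)
test_feat = pd.read_csv(feature_test_data)

x_train_gbdt = train_feat.drop(['isDefault'], axis=1)
y_train_gbdt = train_feat['isDefault']
x_test_gbdt = test_feat

# Assumed baseline LightGBM parameters for a binary task evaluated by AUC (not taken from the original post)
default_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 2019,
    'verbose': -1,
}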
# Define the model training function
def train_model(x_train, y_train, test_data, params, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
    oof = np.zeros(len(x_train))
    predictions = np.zeros((len(test_data), n_splits))
    for fold_, (train_idx, valid_idx) in enumerate(skf.split(x_train, y_train)):
        print(f"\nFold {fold_ + 1}")
        x_tr, x_val = x_train.iloc[train_idx], x_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        train_set = lgb.Dataset(x_tr, label=y_tr)
        val_set = lgb.Dataset(x_val, label=y_val)
        # note: verbose_eval / early_stopping_rounds work with LightGBM 3.x; LightGBM 4.x uses the log_evaluation / early_stopping callbacks instead
        clf = lgb.train(params, train_set, num_boost_round=5000, valid_sets=[val_set],
                        verbose_eval=250, early_stopping_rounds=50)
        oof[valid_idx] = clf.predict(x_val, num_iteration=clf.best_iteration)
        predictions[:, fold_] = clf.predict(test_data, num_iteration=clf.best_iteration)

    print("\n\nCV AUC: {:<0.4f}".format(roc_auc_score(y_train, oof)))
    return oof, predictions

# Train the model and generate out-of-fold and test predictions
oof, predictions = train_model(x_train_gbdt, y_train_gbdt, x_test_gbdt, default_params)
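The post stops after cross-validation. One way the per-fold test predictions could be turned into a submission file is sketched below; it assumes the competition expects an id column plus an isDefault probability column, and the output file name is just an example.
# Sketch: average the fold predictions and write a submission file (format and file name are assumptions)
raw_test = pd.read_csv(os.path.join(DATA_PATH, "testA.csv"))  # reload testA.csv only for its id column
submission = pd.DataFrame({
    'id': raw_test['id'],
    'isDefault': predictions.mean(axis=1),  # mean over the n_splits fold predictions
})
submission.to_csv(os.path.join(BASE_DIR, 'submission.csv'), index=False)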