vif-方差膨胀因子
- 使用 statsmodels 中的 variance_inflation_factor 计算各特征的方差膨胀因子(VIF),示例数据集为 scikit-learn 自带的乳腺癌数据集。
- import pandas as pd
- import numpy as np
- from sklearn.datasets import load_breast_cancer
- from tqdm import notebook
- from statsmodels.stats.outliers_influence import variance_inflation_factor
- from statsmodels.tools import add_constant
# Load the breast cancer dataset and wrap its feature matrix in a DataFrame.
# Whitespace inside each feature name is collapsed to single underscores so
# the names can be used as plain identifiers.
cancer = load_breast_cancer()
df = pd.DataFrame(
    data=cancer.data,
    columns=["_".join(name.split()) for name in cancer.feature_names],
)
# Append the dataset's target array as column "y".
df["y"] = cancer.target
def calc_vif(df, desc_dict=None):
    """Compute the variance inflation factor (VIF) for every column of ``df``.

    ``add_constant`` is applied to ``df`` first and the VIF is computed for
    each column of the resulting design matrix, so any constant column it
    adds also gets a row in the output.

    Args:
        df: DataFrame holding the explanatory variables. The callers in this
            file pass only non-object columns with missing values already
            filled.
        desc_dict: Optional mapping from feature name to a human-readable
            description. When truthy, the descriptions are added to the
            result in a column named ``中文释义``.

    Returns:
        DataFrame with one row per design-matrix column and the columns
        ``feature`` and ``vif`` (with ``中文释义`` between them when
        ``desc_dict`` is given).
    """
    X = add_constant(df)
    # Materialise the design matrix once; it does not change between
    # iterations, so there is no reason to rebuild it for every column.
    exog = X.values
    vif_list = [
        variance_inflation_factor(exog, col_idx)
        for col_idx in notebook.tqdm(range(exog.shape[1]))
    ]
    s = pd.DataFrame({'feature': X.columns, 'vif': vif_list})
    if desc_dict:
        s['中文释义'] = s.feature.map(desc_dict)
        s = s[['feature', '中文释义', 'vif']]
    return s
-
str_list = df.select_dtypes(include='object').columns.tolist()  # object-dtype (string) columns
flo_list = df.select_dtypes(exclude='object').columns.tolist()  # every other column, treated as continuous
# NOTE(review): flo_list also contains the target column "y" that was added
# to df, so the target is treated as a regressor in the VIF computation --
# confirm that this is intended.
#
# VIF cannot be computed on string columns, so only the non-object columns
# are kept. It cannot handle missing values either, so NaN is replaced by 0.
# Filling the freshly selected frame in a single call (instead of assigning
# column by column into a selection of df) avoids pandas'
# SettingWithCopyWarning.
df_vif = df[flo_list].fillna(0)
vif_data = calc_vif(df_vif)
# Bare expression: shows the table when run as the last line of a notebook cell.
vif_data
复制代码
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。 |