A question: when doing data binning, does the data need to be sorted first? Could someone explain this kind of thing?
No. Sorting is not required: both the decision tree used to find the optimal boundaries and pd.cut work directly on the raw, unsorted feature column, so any ordering they need is handled internally.
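A minimal sketch to show this, using a made-up unsorted Series (the values and cut points below are purely illustrative): pd.cut assigns each value to its interval regardless of row order, so binning the raw column and binning a pre-sorted copy give the same result.

import pandas as pd

x = pd.Series([7.2, 1.5, 9.8, 3.3, 5.0])    # deliberately unsorted values
bins = [0, 4, 8, 10]                        # illustrative cut points
print(pd.cut(x, bins=bins, right=False))                             # bins assigned without sorting
print(pd.cut(x.sort_values(), bins=bins, right=False).sort_index())  # same assignment after sorting first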
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier


def optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.) -> list:
    '''
    Use a decision tree to obtain the list of optimal binning boundaries.
    '''
    boundary = []  # list of bin boundaries to return

    x = x.fillna(nan).values  # fill missing values
    y = y.values

    clf = DecisionTreeClassifier(criterion='entropy',     # split by minimizing information entropy
                                 max_leaf_nodes=20,       # max number of leaf nodes, i.e. the max number of bins
                                 min_samples_leaf=0.015)  # min fraction of samples per leaf, i.e. the min size of each bin

    clf.fit(x.reshape(-1, 1), y)  # fit the decision tree

    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    threshold = clf.tree_.threshold

    for i in range(n_nodes):
        if children_left[i] != children_right[i]:  # an internal (split) node carries a boundary value
            boundary.append(threshold[i])

    boundary.sort()

    min_x = x.min()
    max_x = x.max() + 0.1  # +0.1 so the later cut/groupby step still covers the sample with the maximum feature value
    boundary = [min_x] + boundary + [max_x]
    print("Boundary list:", boundary)
    return boundary
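A hedged usage sketch on a purely synthetic DataFrame (the column names 'age' and 'label' and the data itself are made up for illustration):

rng = np.random.default_rng(0)
df = pd.DataFrame({'age': rng.integers(18, 70, size=1000),      # synthetic feature
                   'label': rng.integers(0, 2, size=1000)})     # synthetic 0/1 target

cutoffs = optimal_binning_boundary(df['age'], df['label'])      # boundaries found by the decision tree
df['age_bin'] = pd.cut(df['age'], bins=cutoffs, right=False)    # bin the feature with those boundaries
print(df['age_bin'].value_counts().sort_index())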
def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
    '''
    Compute the WOE and IV of each bin of a feature and return them as a DataFrame.
    '''
    x = x.fillna(nan)
    boundary = optimal_binning_boundary(x, y, nan)  # get the list of optimal bin boundaries
    # boundary = get_chimerge_cutoff(x, y)

    df = pd.concat([x, y], axis=1)  # combine x and y into one DataFrame for the later calculations
    df.columns = ['x', 'y']         # rename the feature and target columns
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)  # bin interval each x value falls into

    grouped = df['y'].groupby(df['bins'])  # count good, bad and total customers per bin
    # per-bin counts
    df_result = grouped.agg([('good', lambda y: (y == 1).sum()),  # 1 = good
                             ('bad', lambda y: (y == 0).sum()),   # 0 = bad
                             ('total', 'count')])

    # good/bad distributions
    df_result['good_pct'] = df_result['good'] / df_result['good'].sum()     # share of goods
    df_result['bad_pct'] = df_result['bad'] / df_result['bad'].sum()        # share of bads
    df_result['total_pct'] = df_result['total'] / df_result['total'].sum()  # share of total
    df_result['bad_rate'] = df_result['bad'] / df_result['total']           # bad rate within the bin

    # WOE per bin
    df_result['woe'] = np.log(df_result['good_pct'] / df_result['bad_pct'])
    # IV contribution per bin
    df_result['iv'] = (df_result['good_pct'] - df_result['bad_pct']) * df_result['woe']
    print(f"Feature IV = {df_result['iv'].sum()}")
    return df_result
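For reference, each bin's WOE is ln(good_pct / bad_pct) and its IV contribution is (good_pct - bad_pct) * woe; the feature-level IV printed at the end is simply the sum over bins. A hedged usage sketch, reusing the synthetic df built in the sketch above (names and data are illustrative; note this code treats 1 as good and 0 as bad):

result = feature_woe_iv(df['age'], df['label'])   # prints the boundary list and the feature IV
print(result[['good', 'bad', 'good_pct', 'bad_pct', 'woe', 'iv']])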