数据分箱(分桶)需要先对数据进行排序吗?

有个疑问:数据分箱需要先对数据进行排序吗?求科普一下这类知识。

不需要手动排序:分箱边界由算法根据数据取值自行确定,下面基于决策树的分箱代码直接传入未排序的数据即可。

def optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.) -> list:
    """
    Derive bin edges for a single feature from a fitted decision tree.

    Missing values in ``x`` are replaced by ``nan`` before fitting. The split
    thresholds of the tree's split nodes, sorted ascending, are bracketed by
    the feature minimum on the left and the feature maximum plus 0.1 on the
    right. The resulting list is printed and returned.
    """
    feature = x.fillna(nan).values  # impute missing values with the sentinel
    target = y.values

    tree_model = DecisionTreeClassifier(
        criterion='entropy',     # split by minimizing information entropy
        max_leaf_nodes=20,       # cap on leaves, i.e. on the number of bins
        min_samples_leaf=0.015,  # minimum share of samples per leaf, i.e. per bin
    )
    tree_model.fit(feature.reshape(-1, 1), target)

    fitted = tree_model.tree_
    # A node whose left and right child ids differ is a split node; only those
    # carry a meaningful threshold.
    cut_points = sorted(
        cut
        for left, right, cut in zip(fitted.children_left,
                                    fitted.children_right,
                                    fitted.threshold)
        if left != right
    )

    lower = feature.min()
    # Padded by 0.1 so a later right-open grouping still contains the maximum.
    upper = feature.max() + 0.1
    boundary = [lower] + cut_points + [upper]
    print("分界值列表:", boundary)

    return boundary


def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
    """
    Compute per-bin WOE and IV statistics for one feature.

    Missing values in ``x`` are replaced by ``nan``, bin edges are obtained
    from ``optimal_binning_boundary``, and the target is aggregated per bin.
    The total IV is printed.

    Args:
        x: Feature values.
        y: Binary target, where 1 marks a "good" sample and 0 a "bad" one.
        nan: Sentinel used to fill missing feature values.

    Returns:
        A DataFrame indexed by bin interval with the columns ``good``,
        ``bad``, ``total``, ``good_pct``, ``bad_pct``, ``total_pct``,
        ``bad_rate``, ``woe`` and ``iv``.
    """
    x = x.fillna(nan)
    boundary = optimal_binning_boundary(x, y, nan)  # bin edges from the decision tree
    # (An alternative boundary source, get_chimerge_cutoff(x, y), was left disabled here.)
    df = pd.concat([x, y], axis=1)  # put x and y side by side for grouping
    df.columns = ['x', 'y']
    # Left-closed, right-open intervals: [edge_i, edge_{i+1})
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)

    grouped = df['y'].groupby(df['bins'])

    # Per-bin counts of good (label 1), bad (label 0) and all samples.
    df_result = grouped.agg([('good', lambda labels: (labels == 1).sum()),
                             ('bad', lambda labels: (labels == 0).sum()),
                             ('total', 'count')])

    # Share of each class (and of all samples) that falls into each bin.
    df_result['good_pct'] = df_result['good'] / df_result['good'].sum()
    df_result['bad_pct'] = df_result['bad'] / df_result['bad'].sum()
    df_result['total_pct'] = df_result['total'] / df_result['total'].sum()

    # Fraction of bad samples within each bin.
    df_result['bad_rate'] = df_result['bad'] / df_result['total']

    # WOE = ln(good_pct / bad_pct); a bin with no good or no bad samples
    # yields +/-inf here.
    df_result['woe'] = np.log(df_result['good_pct'] / df_result['bad_pct'])

    # Per-bin IV contribution; the feature's IV is the sum over bins.
    df_result['iv'] = (df_result['good_pct'] - df_result['bad_pct']) * df_result['woe']

    print(f"变量IV = {df_result['iv'].sum()}")

    return df_result