想请教一下大家,这段代码该如何理解?谢谢!
# Basic class balancing: downsample every class to the size of the rarest
# label that occurs in step_labels, then filter step_labels and step_data
# down to the selected positions.
Label_num = Counter(step_labels)
# Count of the least frequent label that actually occurs.
min_Label_num = min(Label_num.values())
formal_id = []
output = np.array(step_labels)
# For each label i in 0..class_nb-1, draw min_Label_num positions of that
# label at random, without replacement.
# NOTE(review): Counter only sees labels that occur, so if some label in
# range(class_nb) has no samples, idx is empty while min_Label_num >= 1 and
# np.random.choice is expected to raise -- confirm all classes are present.
for i in range(class_nb):
    # np.where returns a tuple of index arrays; [0] takes the first axis.
    idx = np.where(output == i)[0]
    balanced_idx = np.random.choice(idx, size=min_Label_num, replace=False)
    # extend() appends in place instead of rebuilding the list each pass.
    formal_id.extend(balanced_idx)
# The same index list is applied to labels and data, so they stay aligned.
step_labels = output[np.array(formal_id)]
step_data = np.array(step_data)[np.array(formal_id)]
个人感觉大概是这样(写了下注释):
from collections import Counter
import numpy as np
# Balance the classes by downsampling: every label in 0..class_nb-1 keeps
# as many samples as the rarest label that occurs in step_labels.

# Count how many times each label occurs in step_labels.
Label_num = Counter(step_labels)
# Count of the least frequent label that actually occurs.
min_Label_num = min(Label_num.values())
# Accumulates the positions of the samples that are kept.
format_id = []
# Array view of step_labels so it can be compared and fancy-indexed.
output = np.array(step_labels)
# NOTE(review): if some label in range(class_nb) never occurs, idx below is
# empty while min_Label_num >= 1 and np.random.choice is expected to raise
# -- confirm every class is present.
for i in range(class_nb):
    # Positions where label i occurs. np.where returns a tuple of index
    # arrays, (np.ndarray,), hence the [0].
    idx = np.where(output == i)[0]
    # Randomly pick min_Label_num of those positions without replacement.
    balanced_idx = np.random.choice(idx, size=min_Label_num, replace=False)
    # extend() appends in place instead of rebuilding the list each pass.
    format_id.extend(balanced_idx)
# New step_labels: each label in range(class_nb) now contributes exactly
# min_Label_num samples.
step_labels = output[np.array(format_id)]
# Apply the same positions to step_data so data and labels stay aligned.
step_data = np.array(step_data)[np.array(format_id)]