Looking for MPI and OpenMP code (C/C++) for a support vector machine or a random forest; a worked example to verify it would be even better!
Resources found online are fine, or a port from another language by one of the experts here; it just has to run under VS2012. Thanks!
Has anyone answered? I don't see a reply.
Code:
#include "SVM.h"
#include <math.h>
using namespace std;
#define eps 1e-2 // error tolerance
const int C = 100; // penalty parameter
SVM::~SVM()
{
    if (data)
    {
        for (int i = 0; i < sampleNum; i++) delete[] data[i];
        delete[] data;
    }
    if (label) delete[] label;
    if (alpha) delete[] alpha;
    if (gx) delete[] gx;
}
// d holds the samples, one per row, flattened row-major; l holds the labels (1 or -1);
// sn is the number of samples; fn is the number of features
void SVM::initialize(double *d, double *l, int sn, int fn)
{
    this->sampleNum = sn;
    this->featureNum = fn;
    this->label = new double[sampleNum];
    this->data = new double*[sampleNum];
    for (int i = 0; i < sampleNum; i++)
    {
        this->label[i] = l[i];
    }
    for (int i = 0; i < sampleNum; i++)
    {
        this->data[i] = new double[featureNum];
        for (int j = 0; j < featureNum; j++)
        {
            data[i][j] = d[i * featureNum + j]; // row-major indexing
        }
    }
    alpha = new double[sampleNum](); // value-initialized to 0 (portable to VS2012)
    gx = new double[sampleNum]();
}
double SVM::s_max(double a, double b)
{
return a > b ? a : b;
}
double SVM::s_min(double a, double b)
{
return a < b ? a : b;
}
double SVM::objFun(int x)
{
int j = 0;
// pick an index j with 0 < alpha[j] < C
for (int i = 0; i < sampleNum; i++)
{
if (alpha[i]>0 && alpha[i] < C)
{
j = i;
break;
}
}
// compute the bias b
double b = label[j];
for (int i = 0; i < sampleNum; i++)
{
b -= alpha[i] * label[i] * kernel(i, j);
}
// evaluate the decision function at sample x
double objf = b;
for (int i = 0; i < sampleNum; i++)
{
objf += alpha[i] * label[i] * kernel(x, i);
}
return objf;
}
// check whether the KKT conditions hold (convergence test)
bool SVM::isConvergence()
{
// the KKT conditions: the sum of alpha[i] * y[i] equals 0,
// 0 <= alpha[i] <= C,
// and y[i] * g(x[i]) satisfies the case conditions checked below
double sum = 0;
for (int i = 0; i < sampleNum; i++)
{
if (alpha[i] < -eps || alpha[i] > C + eps) return false;
else
{
// alpha[i] = 0
if (fabs(alpha[i]) < eps && label[i] * gx[i] < 1 - eps) return false;
// 0 < alpha[i] < C
if (alpha[i] > eps && alpha[i] < C - eps && fabs(label[i] * gx[i] - 1) > eps) return false;
// alpha[i] = C
if (fabs(alpha[i] - C) < eps && label[i] * gx[i] > 1 + eps) return false;
}
sum += alpha[i] * label[i];
}
if (fabs(sum) > eps) return false;
return true;
}
// the kernel function: here just the dot product of two samples,
// i.e. a linear kernel
double SVM::kernel(int i, int j)
{
double res = 0;
for (int k = 0; k < featureNum; k++)
{
res += data[i][k] * data[j][k];
}
return res;
}
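// A possible variant (an addition for illustration, not in the original post):
// a Gaussian RBF kernel could be swapped in here at the cost of choosing a
// width parameter sigma, e.g.
//     double d2 = 0;
//     for (int k = 0; k < featureNum; k++)
//     {
//         double diff = data[i][k] - data[j][k];
//         d2 += diff * diff;
//     }
//     return exp(-d2 / (2 * sigma * sigma)); // sigma is a tuning parameter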
// compute g(x[i]), the raw prediction for sample i
void SVM::computeGx()
{
for (int i = 0; i < sampleNum; i++)
{
gx[i] = 0;
for(int j=0; j < sampleNum; j++)
{
gx[i] += alpha[j] * label[j] * kernel(i, j);
}
gx[i] += b;
}
}
// update the threshold b and the g(x) cache
void SVM::update(int a1, int a2, double x1, double x2)
{
// update the threshold b
double b1_new = -(gx[a1] - label[a1]) - label[a1] * kernel(a1, a1)*(alpha[a1] - x1)
- label[a2] * kernel(a2, a1)*(alpha[a2] - x2) + b;
double b2_new = -(gx[a2] - label[a2]) - label[a1] * kernel(a1, a2)*(alpha[a1] - x1)
- label[a2] * kernel(a2, a2)*(alpha[a2] - x2) + b;
if (fabs(alpha[a1]) < eps || fabs(alpha[a1] - C) < eps || fabs(alpha[a2]) < eps || fabs(alpha[a2] - C) < eps)
b = (b1_new + b2_new) / 2;
else
b = b1_new;
// refresh the g(x) cache
computeGx();
}
// choose the second variable
/*
First pick the alpha2 that maximizes |E1 - E2|.
If that fails, fall back to a heuristic: pick an alpha2 that gives a sufficient decrease in the objective.
If that also fails, move on to a new alpha1.
*/
int SVM::secondAlpha(int a1)
{
// compute E_i = g(x_i) - y_i, the gap between prediction and true output, for every sample;
// if E1 is positive pick the smallest E_i as E2, otherwise the largest
bool pos = (gx[a1] - label[a1] > 0);
double tmp = pos ? 100000000 : -100000000;
double ei = 0; int a2 = -1;
for (int i = 0; i < sampleNum; i++)
{
ei = gx[i] - label[i];
if ((pos && ei < tmp) || (!pos && ei > tmp))
{
tmp = ei;
a2 = i;
}
}
// in the special cases, the caller iterates directly over the support vectors on the margin boundary and picks the one with the largest decrease
return a2;
}
// with a1 and a2 chosen, perform one optimization step
bool SVM::takeStep(int a1, int a2)
{
if (a1 < 0 || a2 < 0 || a1 == a2) return false; // secondAlpha may fail to produce a distinct candidate
double x1, x2; //old alpha
x2 = alpha[a2];
x1 = alpha[a1];
// compute the clipping bounds L and H
double L, H;
double s = label[a1] * label[a2]; // +1 if the two labels agree, -1 otherwise
L = s < 0 ? s_max(0, alpha[a2] - alpha[a1]) : s_max(0, alpha[a2] + alpha[a1] - C);
H = s < 0 ? s_min(C, C + alpha[a2] - alpha[a1]) : s_min(C, alpha[a2] + alpha[a1]);
if (L >= H) return false;
double eta = kernel(a1, a1) + kernel(a2, a2) - 2 * kernel(a1, a2);
// update alpha[a2]
if (eta > 0)
{
alpha[a2] = x2 + label[a2] * (gx[a1] - label[a1] - gx[a2] + label[a2]) / eta;
if (alpha[a2] < L) alpha[a2] = L;
else if (alpha[a2] > H) alpha[a2] = H;
}
else // from Platt's paper: when eta <= 0, evaluate the objective at both ends of the feasible segment and move to the better one
{
alpha[a2] = L;
double Lobj = objFun(a2);
alpha[a2] = H;
double Hobj = objFun(a2);
if (Lobj < Hobj - eps)
alpha[a2] = L;
else if (Lobj > Hobj + eps)
alpha[a2] = H;
else
alpha[a2] = x2;
}
// the change is too small to count as progress
if (fabs(alpha[a2] - x2) < eps*(alpha[a2] + x2 + eps))
{
alpha[a2] = x2;
return false;
}
// update alpha[a1]
alpha[a1] = x1 + s*(x2 - alpha[a2]);
update(a1, a2, x1, x2);
return true;
}
// solve for alpha via the dual of the SVM classification problem
/*
Sequential minimal optimization (SMO):
step 1: pick a pair of variables alpha[i] and alpha[j] to update
step 2: fix all parameters other than alpha[i] and alpha[j], and solve the
        two-variable dual subproblem analytically to get the updated pair
References: Li Hang, "Statistical Learning Methods" (《统计学习方法》)
JC Platt, "Sequential Minimal Optimization: A Fast Algorithm for Training Support Vector Machines"
*/
void SVM::SMO()
{
int a1, a2;
bool Changed = true;                  // did the last step make progress
int numChanged = 0;                   // number of successful updates in this pass
int *eligSample = new int[sampleNum]; // cache of samples with 0 < alpha < C
int cnt = 0;                          // size of the cache
computeGx();
do
{
numChanged = 0; cnt = 0;
// choose the first variable: a sample violating the KKT conditions,
// preferring those with 0 < alpha < C (alpha changes as the iterations proceed)
for (int i = 0; i < sampleNum; i++)
{
// rebuild the cache of interior samples (0 < alpha < C)
if (Changed)
{
cnt = 0;
for (int j = 0; j < sampleNum; j++)
{
if (alpha[j] > eps && alpha[j] < C - eps)
{
eligSample[cnt++] = j;
}
}
Changed = false;
}
if (alpha[i] > eps && alpha[i] < C-eps)
{
a1 = i;
// the KKT condition is violated
if (fabs(label[i] * gx[i] - 1) > eps)
{
// choose the second variable, preferring the largest decrease
a2 = secondAlpha(i);
Changed = takeStep(a1, a2);
numChanged += Changed;
if(Changed) continue;
else // the objective did not decrease
{
// first scan the cached margin-boundary (interior) samples
for(int j=0; j<cnt;j++)
{
if (eligSample[j] == i) continue;
a2 = eligSample[j];
Changed = takeStep(a1, a2);
numChanged += Changed;
if (Changed) break;
}
if (Changed) continue;
// then scan the whole data set
int k = 0;
for (int j = 0; j < sampleNum; j++)
{
// skip the margin points already tried above
if (eligSample[k] == j)
{
k++;
continue;
}
a2 = j;
Changed = takeStep(a1, a2);
numChanged += Changed;
if (Changed) break;
}
// no suitable alpha2 found; move on to another alpha1
}
}
}
}
if (numChanged) // this pass made progress
{
Changed = false;
continue;
}
// otherwise scan the remaining KKT violators (alpha at the bounds)
for (int i = 0; i < sampleNum; i++)
{
a1 = i;
if ((fabs(alpha[i]) < eps && label[i] * gx[i] < 1) ||
    (fabs(alpha[i] - C) < eps && label[i] * gx[i] > 1))
{
// choose the second variable, same procedure as above
a2 = secondAlpha(i);
Changed = takeStep(a1, a2);
numChanged += Changed;
if (Changed) continue;
else // the objective did not decrease
{
// first scan the margin-boundary samples,
// which are already cached in eligSample
for(int j=0; j<cnt; j++)
{
if (eligSample[j] == i) continue;
a2 = eligSample[j];
Changed = takeStep(a1, a2);
numChanged += Changed;
if (Changed) break;
}
if (Changed) continue;
// then scan the whole data set
int k = 0;
for (int j = 0; j < sampleNum; j++)
{
if (j == eligSample[k])
{
k++;
continue;
}
a2 = j;
Changed = takeStep(a1, a2);
numChanged += Changed;
if (Changed) break;
}
// no suitable alpha2 found; move on to another alpha1
}
}
}
// Note: the convergence test from "Statistical Learning Methods" (isConvergence above)
// proved unreliable here, so the loop instead follows the structure of Platt's
// pseudocode (not an exact copy) and stops once a full pass makes no progress.
}while (numChanged);
delete[] eligSample;
}
void SVM::show()
{
cout << "支持向量为:" << endl;
for (int i = 0; i < sampleNum; i++)
{
if(alpha[i]>eps)
cout <<i<<" 对应的alpha为:"<<alpha[i]<< endl;
}
cout << endl;
}
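Since the request asked for OpenMP: computeGx() above costs O(n^2) kernel evaluations per call and dominates the running time, and each gx[i] is written independently, so a single parallel-for is enough. A minimal sketch, assuming the member layout used above (data, label, alpha, gx, b, sampleNum); build with /openmp, which VS2012 supports (OpenMP 2.0):

#include <omp.h>

void SVM::computeGx()
{
    // each iteration writes only gx[i], so no synchronization is needed;
    // OpenMP 2.0 requires a signed loop counter, which 'int i' satisfies
    #pragma omp parallel for
    for (int i = 0; i < sampleNum; i++)
    {
        double s = 0;
        for (int j = 0; j < sampleNum; j++)
        {
            s += alpha[j] * label[j] * kernel(i, j);
        }
        gx[i] = s + b;
    }
}

And for the requested example verification, a toy driver along these lines should work, assuming SVM.h declares initialize, SMO and show as used above and that the pointer members start out null:

#include "SVM.h"

int main()
{
    // four 2-D points, linearly separable by x + y = 3 (row-major layout)
    double d[8] = { 0, 0,  1, 0,  3, 3,  4, 2 };
    double l[4] = { -1, -1, 1, 1 };
    SVM svm;
    svm.initialize(d, l, 4, 2); // 4 samples, 2 features each
    svm.SMO();
    svm.show();                 // expect nonzero alphas on the boundary points
    return 0;
}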
And here is the random forest (the code above was the support vector machine).
Random forest code:
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <set>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include "random_forest.h"
using namespace std;
vector<decision_tree*> alltrees;          // the forest (collection of decision trees)
vector<TupleData> trainAll, train, test;  // sample sets
vector<int> attributes;                   // attribute set (elements are attribute indices)
int trainAllNum = 0;
int testAllNum = 0;
int MaxAttr;                // total number of attributes
int *ArrtNum;               // per-attribute branch counts (element = attribute's maximum value + 1)
unsigned int F;             // number of attributes randomly tried at each split
int tree_num = 100;         // number of decision trees
const int leafattrnum = -1; // attribute index used to mark leaf nodes
// confusion-matrix counters and test-set class counts
int TP = 0, FN = 0, FP = 0, TN = 0, TestP = 0, TestN = 0;
// read the input data
void init(char *trainname, char *testname)
{
    trainAllNum = readData(trainAll, trainname);
    testAllNum = readData(test, testname);
    calculate_attributes();
    // number of attributes tried per split: roughly log2(N) + 1, capped at the attribute count
    double temp = (double)trainAllNum;
    temp = log(temp) / log(2.0);
    F = (unsigned int)floor(temp + 0.5) + 1;
    if (F > (unsigned int)MaxAttr) F = MaxAttr;
}
// prepare the training subset for one tree
void sub_init()
{
    // draw this tree's training sample set (bootstrap resampling)
    RandomSelectData(trainAll, train);
    // recompute the attribute value ranges on the subset
    calculate_ArrtNum();
}
// read a data file
int readData(vector<TupleData> &data, const char *fileName)
{
ifstream fin;
fin.open(fileName);
string line;
int datanum=0;
// each line of the file is one sample
while(getline(fin,line))
{
TupleData d;
istringstream stream(line);
string str;
// set the sample's label and attribute values
while(stream>>str)
{
if(str.find('+')==0)
{
d.label='+';
}
else if(str.find('-')==0)
{
d.label='-';
}
else
{
int j=stringtoint(str);
d.A.push_back(j);
}
}
data.push_back(d);
datanum++;
}
fin.close();
return datanum;
}
// draw the bootstrap training set for one tree's root node
void RandomSelectData(vector<TupleData> &data, vector<TupleData> &subdata)
{
int index;
subdata.clear();
int d = 0;
// draw trainAllNum samples with replacement (bootstrap)
while (d < trainAllNum)
{
index = rand() % trainAllNum;
subdata.push_back(data.at(index));
d++;
}
}
// build the attribute index list
void calculate_attributes()
{
    // every sample must have the same number of attributes
    TupleData d = trainAll.at(0);
    MaxAttr = d.A.size();
    attributes.clear();
    // the attribute set holds the attribute indices 0..MaxAttr-1
for (int i = 0; i < MaxAttr; i++)
{
attributes.push_back(i);
}
// allocate the per-attribute maximum-value array (filled in calculate_ArrtNum)
ArrtNum = new int[MaxAttr];
}
// convert a decimal string to an int
int stringtoint(string s)
{
int sum=0;
for(int i=0; s[i]!='\0';i++)
{
int j=int(s[i])-48;
sum=sum*10+j;
}
return sum;
}
// compute ArrtNum over the current training subset
void calculate_ArrtNum()
{
    for (int i = 0; i < MaxAttr; i++) ArrtNum[i] = 0;
    // each ArrtNum element is the attribute's maximum value + 1 (the branch count)
    for (vector<TupleData>::const_iterator it = train.begin(); it != train.end(); it++)
    {
        int i = 0;
        for (vector<int>::const_iterator intt = (*it).A.begin(); intt != (*it).A.end(); intt++)
{
int valuemax=(*intt)+1;
if(valuemax>ArrtNum[i]) ArrtNum[i]=valuemax;
i++;
}
}
}
// binary entropy (in bits) of a set with p positives out of s samples
double Entropy(double p, double s)
{
double n = s - p;
double result = 0;
if (n != 0)
result += - double(n) / s * log(double(n) / s) / log(2.0);
if (p != 0)
result += double(-p) / s * log(double(p) / s) / log(2.0);
return result;
}
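// Worked example (added for illustration): Entropy(5, 10) = 1.0 bit, since an
// even split is maximally impure; Entropy(0, 10) and Entropy(10, 10) are both 0.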
// train one decision tree recursively
int creat_classifier(decision_tree *&p, const vector<TupleData> &samples, vector<int> &attributes)
{
if (p == NULL)
p = new decision_tree();
// if all samples share one class, emit a leaf of that class
if (Allthesame(samples, '+'))
{
p->node.label = '+';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, '-'))
{
p->node.label = '-';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
// if the attribute list is exhausted, emit a majority-vote leaf
if (attributes.size() == 0)
{
p->node.label = Majorityclass(samples);
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
// pick the best split attribute for this node
p->node.attrNum = BestGainArrt(samples, attributes);
// internal nodes carry no label
p->node.label = ' ';
// candidate attributes for the children: the chosen attribute is removed,
// so the candidate set shrinks as the tree deepens
vector<int> newAttributes;
for (vector<int>::iterator it = attributes.begin(); it != attributes.end(); it++)
if ((*it) != p->node.attrNum)
newAttributes.push_back((*it));
// build maxvalue sample subsets: the node gets one child per value of the split attribute
// (a multiway split; an alternative design would threshold the value into a binary split)
int maxvalue = ArrtNum[p->node.attrNum];
vector<TupleData> *subSamples = new vector<TupleData>[maxvalue];
for (int i = 0; i < maxvalue; i++)
subSamples[i].clear();
// partition the samples among the children
for (vector<TupleData>::const_iterator it = samples.begin(); it != samples.end(); it++)
{
    // p->node.attrNum is the split attribute of this node, so the sample's value
    // (*it).A.at(p->node.attrNum) is exactly the index of the child it falls into
    subSamples[(*it).A.at(p->node.attrNum)].push_back((*it));
}
decision_tree *child;
for (int i = 0; i < maxvalue; i++)
{
child = new decision_tree;
if (subSamples[i].size() == 0)
child->node.label = Majorityclass(samples);
else
creat_classifier(child, subSamples[i], newAttributes);
p->childs.push_back(child);
}
delete[] subSamples;
return 0;
}
// pick the best split attribute at a node by information gain ratio
int BestGainArrt(const vector<TupleData> &samples, vector<int> &attributes)
{
int attr,
bestAttr = 0,
p = 0,
s = (int)samples.size();
// count the positive samples
for (vector<TupleData>::const_iterator it = samples.begin(); it != samples.end(); it++)
{
if ((*it).label == '+')
p++;
}
double infoD;
double bestResult = 0;
// entropy before the split
infoD = Entropy(p, s);
vector<int> m_attributes;
// randomly pick the candidate attribute subset (the random-subspace step)
RandomSelectAttr(attributes, m_attributes);
// scan the candidates and keep the attribute with the best gain ratio
for (vector<int>::iterator it = m_attributes.begin(); it != m_attributes.end(); it++)
{
attr = (*it);
double result = infoD;
// number of branches of attribute attr
int maxvalue = ArrtNum[attr];
// per-branch negative/positive/total counters
int *subN = new int[maxvalue];
int *subP = new int[maxvalue];
int *sub = new int[maxvalue];
for (int i = 0; i < maxvalue; i++)
{
subN[i] = 0;
subP[i] = 0;
sub[i] = 0;
}
// bucket the current samples by their value of attribute attr
// (attribute values are stored directly in each sample, so no extra computation is needed)
for (vector<TupleData>::const_iterator jt = samples.begin(); jt != samples.end(); jt++)
{
if ((*jt).label == '+')
subP[(*jt).A.at(attr)] ++;
else
subN[(*jt).A.at(attr)] ++;
sub[(*jt).A.at(attr)]++;
}
// split information of this attribute (empty branches are skipped to avoid log(0))
double SplitInfo = 0;
for (int i = 0; i < maxvalue; i++)
{
    if (sub[i] == 0) continue;
    double partsplitinfo;
    partsplitinfo = -double(sub[i]) / s * log(double(sub[i]) / s) / log(2.0);
    SplitInfo = SplitInfo + partsplitinfo;
}
// weighted entropy after the split
double infoattr = 0;
for (int i = 0; i < maxvalue; i++)
{
    double partentropy;
    partentropy = Entropy(subP[i], subP[i] + subN[i]);
    infoattr = infoattr + ((double)(subP[i] + subN[i]) / (double)(s)) * partentropy;
}
result = result - infoattr;                      // information gain
if (SplitInfo > 0) result = result / SplitInfo;  // gain ratio (guards a zero split info)
// keep the best attribute so far
if (result > bestResult)
{
bestResult = result;
bestAttr = attr;
}
delete[] subN;
delete[] subP;
delete[] sub;
}
if (bestResult == 0)
{
bestAttr=attributes.at(0);
}
return bestAttr;
}
// randomly pick F candidate attributes (without replacement)
void RandomSelectAttr(vector<int> &data, vector<int> &subdata)
{
    int index;
    unsigned int dataNum = data.size();
    subdata.clear();
    // F or fewer attributes left: take them all
    if (dataNum <= F)
{
for (vector<int>::iterator it = data.begin(); it != data.end();it++)
{
int attr = (*it);
subdata.push_back(attr);
}
}
else
{
set<int> AttrSet;
AttrSet.clear();
while (AttrSet.size() < F)
{
index = rand() % dataNum;
if (AttrSet.count(index) == 0)
{
AttrSet.insert(index);
subdata.push_back(data.at(index));
}
}
}
}
// do all samples carry the label ch?
bool Allthesame(const vector<TupleData> &samples, char ch)
{
    for (vector<TupleData>::const_iterator it = samples.begin(); it != samples.end(); it++)
if ((*it).label != ch)
return false;
return true;
}
// the majority class among a node's samples
char Majorityclass(const vector<TupleData> &samples)
{
    int p = 0, n = 0;
    for (vector<TupleData>::const_iterator it = samples.begin(); it != samples.end(); it++)
if ((*it).label == '+')
p++;
else
n++;
if (p >= n)
return '+';
else
return '-';
}
// test phase: classify one sample with one tree
char testClassifier(decision_tree *p, TupleData d)
{
    // reached a leaf
    if (p->node.label != ' ')
        return p->node.label;
    // split attribute at this node
    int attrNum = p->node.attrNum;
    // bad sample (attribute value out of range)
    if (d.A.at(attrNum) < 0)
        return ' ';
    // follow the branch matching the sample's attribute value
    return testClassifier(p->childs.at(d.A.at(attrNum)), d);
}
// test phase: each test sample is classified by majority vote over all trees
void testData()
{
    for (vector<TupleData>::iterator it = test.begin(); it != test.end(); it++)
    {
        printf("testing a sample\n");
        if ((*it).label == '+') TestP++;
        else TestN++;
int p = 0, n = 0;
for(int i = 0; i < tree_num; i++)
{
if(testClassifier(alltrees.at(i), (*it))=='+') p++;
else n++;
}
if(p>n)
{
if((*it).label=='+') TP++;
else FP++;
}
else
{
if((*it).label=='+') FN++;
else TN++;
}
}
}
// recursively free one tree
void freeClassifier(decision_tree *p)
{
if (p == NULL)
return;
for (vector<decision_tree*>::iterator it = p->childs.begin();it != p->childs.end(); it++)
{
freeClassifier(*it);
}
delete p;
}
void freeArrtNum()
{
delete[] ArrtNum;
}
void showResult()
{
cout << "Train size: "<<trainAllNum<<endl;
cout << "Test size: "<<testAllNum<<endl;
cout << "True positive: "<< TP << endl;
cout << "False negative: "<<FN<<endl;
cout << "False positive: "<<FP<<endl;
cout << "True negative: "<<TN<<endl;
}
int main(int argc, char **argv)
{
    // training and test files from the command line, defaulting to "1.txt" and "2.txt"
    char *trainfile = (char *)"1.txt";
    char *testfile = (char *)"2.txt";
    if (argc >= 3)
    {
        trainfile = argv[1];
        testfile = argv[2];
    }
    srand((unsigned)time(NULL));
    // load the samples
    init(trainfile, testfile);
    // training phase
    for (int i = 0; i < tree_num; i++)
    {
        printf("training tree %d\n", i);
        // bootstrap training subset for this tree
        sub_init();
        // train one decision tree
        decision_tree *root = NULL;
        creat_classifier(root, train, attributes);
        // add it to the forest
        alltrees.push_back(root);
        printf("tree %d done\n", i);
    }
    // test phase
testData();
for (vector<decision_tree *>::const_iterator it =alltrees.begin(); it != alltrees.end(); it++)
{
freeClassifier((*it));
}
freeArrtNum();
showResult();
system("pause");
return 0;
}
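Finally, the MPI half of the request. The trees are independent once every process has the input files, so the natural decomposition is to give each rank its own share of the tree_num trees and combine only the votes. Below is a minimal sketch of a replacement main() under those assumptions (the file names and the vote-reduction scheme are mine, not from the post above); it builds with MS-MPI, which integrates with VS2012:

#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    srand((unsigned)time(NULL) + rank); // decorrelate the ranks' bootstrap draws
    init("1.txt", "2.txt");             // every rank loads the same train/test files

    // each rank grows its share of the tree_num trees
    int myTrees = tree_num / size + (rank < tree_num % size ? 1 : 0);
    for (int i = 0; i < myTrees; i++)
    {
        sub_init();
        decision_tree *root = NULL;
        creat_classifier(root, train, attributes);
        alltrees.push_back(root);
    }

    // count this rank's '+' votes per test sample, then sum the votes across ranks
    int nTest = (int)test.size();
    vector<int> localVotes(nTest, 0), globalVotes(nTest, 0);
    for (int t = 0; t < nTest; t++)
        for (unsigned int i = 0; i < alltrees.size(); i++)
            if (testClassifier(alltrees.at(i), test.at(t)) == '+') localVotes[t]++;
    MPI_Reduce(&localVotes[0], &globalVotes[0], nTest, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    // rank 0 applies the majority rule over the whole forest and reports
    if (rank == 0)
    {
        for (int t = 0; t < nTest; t++)
        {
            bool votedPos = 2 * globalVotes[t] > tree_num; // '+' wins a strict majority
            if (test.at(t).label == '+') { if (votedPos) TP++; else FN++; }
            else                         { if (votedPos) FP++; else TN++; }
        }
        showResult();
    }

    for (unsigned int i = 0; i < alltrees.size(); i++)
        freeClassifier(alltrees.at(i));
    freeArrtNum();
    MPI_Finalize();
    return 0;
}

Only nTest integers cross the network, so communication stays negligible next to tree construction; seeding rand with the rank keeps the bootstrap samples distinct across processes.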