悬赏!求【支持向量机 或 随机森林】 的 MPI和OpenMP代码(C/C++)

求一份【支持向量机或者随机森林】 的 MPI和OpenMP代码(C/C++语言) ,如果有实例验证就更好了!

可以网上找资源,也可以大佬根据其他语言改写,在VS2012环境能运行就可以了!谢谢

有人回答了吗?我怎么没有看见?

代码:

#include "SVM.h"
#include <math.h>
using namespace std;

#define eps 1e-2 //误差精度
const int C = 100; //惩罚参数

SVM::~SVM()
{
if (data) delete[]data;
if (label) delete label;
if (alpha) delete alpha;
if (gx) delete gx;
}

//d中为样本,每个样本按行存储; l标签(1或-1); sn样本个数; fn特征个数
void SVM::initialize(double d, double l, int sn, int fn)
{
this->sampleNum = sn;
this->featureNum = fn;
this->label = new double[sampleNum];
this->data = new double
[sampleNum];
for (int i = 0; i < sampleNum; i++)
{
this->label[i] = l[i];
}
for (int i = 0; i < sampleNum; i++)
{
this->data[i] = new double[featureNum];
for (int j = 0; j < featureNum; j++)
{
data[i][j] = d[i
featureNum + j];
}
}
alpha = new double[sampleNum] {0};
gx = new double[sampleNum] {0};

}

double SVM::s_max(double a, double b)
{
return a > b ? a : b;
}

double SVM::s_min(double a, double b)
{
return a < b ? a : b;
}

double SVM::objFun(int x)
{
int j = 0;
//选择一个0 < alpha[j] < C
for (int i = 0; i < sampleNum; i++)
{
if (alpha[i]>0 && alpha[i] < C)
{
j = i;
break;
}
}
//计算b
double b = label[j];
for (int i = 0; i < sampleNum; i++)
{
b -= alpha[i] * label[i] * kernel(i, j);
}
//构造决策函数
double objf = b;
for (int i = 0; i < sampleNum; i++)
{
objf += alpha[i] * label[i] * kernel(x, i);
}
return objf;
}

//判断有无收敛
bool SVM::isConvergence()
{
//alpah[i] * y[i]求和等于0
//0 <= alpha[i] <= C
//y[i] * gx[i]满足一定条件
double sum = 0;
for (int i = 0; i < sampleNum; i++)
{
if (alpha[i] < -eps || alpha[i] > C + eps) return false;
else
{
// alpha[i] = 0
if (fabs(alpha[i]) < eps && label[i] * gx[i] < 1 - eps) return false;
// 0 < alpha[i] < C
if (alpha[i] > -eps && alpha[i] < C + eps && fabs(label[i] * gx[i] - 1)>eps) return false;
// alpha[i] = C
if (fabs(alpha[i] - C) < eps && label[i] * gx[i] > 1 + eps) return false;
}
sum += alpha[i] * label[i];
}
if (fabs(sum) > eps) return false;

return true;

}

//假装是个核函数
//两个向量做内积
double SVM::kernel(int i, int j)
{
double res = 0;
for (int k = 0; k < featureNum; k++)
{
res += data[i][k] * data[j][k];
}
return res;
}

//计算g(xi),也就是对样本i的预测值
void SVM::computeGx()
{
for (int i = 0; i < sampleNum; i++)
{
gx[i] = 0;
for(int j=0; j < sampleNum; j++)
{
gx[i] += alpha[j] * label[j] * kernel(i, j);
}
gx[i] += b;
}
}

//更新很多东西
void SVM::update(int a1, int a2, double x1, double x2)
{
//更新阈值b

double b1_new = -(gx[a1] - label[a1]) - label[a1] * kernel(a1, a1)*(alpha[a1] - x1)
    - label[a2] * kernel(a2, a1)*(alpha[a2] - x2) + b;
double b2_new = -(gx[a2] - label[a2]) - label[a1] * kernel(a1, a2)*(alpha[a1] - x1)
    - label[a2] * kernel(a2, a2)*(alpha[a2] - x2) + b;
if (fabs(alpha[a1]) < eps || fabs(alpha[a1] - C) < eps || fabs(alpha[a2]) < eps || fabs(alpha[a2] - C) < eps)
    b = (b1_new + b2_new) / 2;
else 
    b = b1_new; 
/*
int j = 0;
//选择一个0 < alpha[j] < C
for (int i = 0; i < sampleNum; i++)
{
    if (alpha[i]>0 && alpha[i] < C)
    {
        j = i;
        break;
    }
}
//计算b
double b = label[j];
for (int i = 0; i < sampleNum; i++)
{
    b -= alpha[i] * label[i] * kernel(i, j);
}
*/
//更新gx
computeGx();

}

//选取第二个变量
/*
先选择是对应E1-E2最大的
若没有,用启发式规则,选目标函数有足够下降的alpha2
还没有,选择新的alpha1
*/
int SVM::secondAlpha(int a1)
{
//先计算出所有的E,也就是样本xi的预测值与真实输出之差Ei=g(xi)-yi
//若E1为正,选最小的Ei作为E2,反正选最大
bool pos = (gx[a1] - label[a1] > 0);
double tmp = pos ? 100000000 : -100000000;
double ei = 0; int a2 = -1;
for (int i = 0; i < sampleNum; i++)
{
ei = gx[i] - label[i];
if (pos && ei < tmp || !pos && ei > tmp)
{
tmp = ei;
a2 = i;
}
}
//对于特殊情况,直接遍历间隔边界上的支持向量点,选择具有最大下降的值
return a2;
}

//选定a1和a2,进行更新
bool SVM::takeStep(int a1, int a2)
{
if (a1 < -eps) return false;

double x1, x2;        //old alpha
x2 = alpha[a2];
x1 = alpha[a1];
//计算剪辑的边界
double L, H;
double s = label[a1] * label[a2];//a1 与 a2同号或异号
L = s < 0 ? s_max(0, alpha[a2] - alpha[a1]) : s_max(0, alpha[a2] + alpha[a1] - C);
H = s < 0 ? s_min(C, C + alpha[a2] - alpha[a1]) : s_min(C, alpha[a2] + alpha[a1]);
if (L >= H) return false;
double eta = kernel(a1, a1) + kernel(a2, a2) - 2 * kernel(a1, a2);
//更新alpah[a2]
if (eta > 0)
{
    alpha[a2] = x2 + label[a2] * (gx[a1] - label[a1] - gx[a2] + label[a2]) / eta;
    if (alpha[a2] < L) alpha[a2] = L; 
    else if (alpha[a2] > H) alpha[a2] = H;
}
else//我也不知道为什么这么算,我抄的论文里的,意思是选到超平面距离近的边界
{
    alpha[a2] = L;
    double Lobj = objFun(a2);
    alpha[a2] = H;
    double Hobj = objFun(a2);
    if (Lobj < Hobj - eps)
        alpha[a2] = L;
    else if (Lobj > Hobj + eps)
        alpha[a2] = H;
    else
        alpha[a2] = x2;
}
//下降太少,忽略不计
if (fabs(alpha[a2] - x2) < eps*(alpha[a2] + x2 + eps))
{
    alpha[a2] = x2;
    return false;
}
//更新alpha[a1]
alpha[a1] = x1 + s*(x2 - alpha[a2]);


update(a1, a2, x1, x2);
/*
for (int ii = 0; ii < sampleNum; ii++)
{
    cout << gx[ii] << endl;
}
*/
return true;

}

//由SVM分类决策的对偶最优化问题求解alpha
/*
用序列最小最优化算法(SMO)求解alpha
step1:选取一对需要更新的变量alpha[i]和alpha[j]
step2:固定alpha[i]和alpha[j]以外的参数,求解对偶问题的最优化解获得更新后的alpha[i]和alpha[j]
参考:李航《统计学习方法》
JC Platt《Sequential Minimal Optimization: A Fast Algorithm for Training Support Vector Machines》
*/
void SVM::SMO()
{
//bool convergence = false; //判断有没有收敛
int a1, a2;
bool Changed = true; //有没有更新
int numChanged = 0; //更新了多少次
int *eligSample = new int[sampleNum]; // 记录访问过的样本
int cnt = 0; //样本个数
computeGx();

do
{
    numChanged = 0; cnt = 0; 
    //选择第一个变量(最不满足KKT条件的样本点)
    //优先选 0 < alpha < C 的样本 , alpha会随着后面的迭代发生变化
    for (int i = 0; i < sampleNum; i++)
    {
        //记录下不满足KKT条件的样本,做个缓存
        if (Changed)
        {
            cnt = 0;
            for (int j = 0; j < sampleNum; j++)
            {
                if (alpha[j] > eps && alpha[j] < C - eps)
                {
                    eligSample[cnt++] = j;
                }
            }
            Changed = false;
        }
        if (alpha[i] > eps && alpha[i] < C-eps)
        {
            a1 = i;
            //不满足KKT条件
            if (fabs(label[i] * gx[i] - 1) > eps)
            {
                //选择第二个变量,优先选下降最多的
                a2 = secondAlpha(i);
                Changed = takeStep(a1, a2);
                numChanged += Changed;
                if(Changed) continue;
                else //目标函数没有下降
                {
                    //先依次遍历间隔边界上的
                    for(int j=0; j<cnt;j++)
                    {
                        
                        if (eligSample[j] == i) continue;
                        a2 = eligSample[j];
                        Changed = takeStep(a1, a2);
                        numChanged += Changed;

                        if (Changed) break;
                    }
                    if (Changed) continue;
                    //再遍历整个数据集
                    int k = 0;
                    for (int j = 0; j < sampleNum; j++)
                    {
                        //这是上面已经试过的间隔上的点
                        if (eligSample[k] == j)
                        {
                            k++;
                            continue;
                        }
                        a2 = j;
                        Changed = takeStep(a1, a2);
                        numChanged += Changed;

                        if (Changed) break;
                    }
                    //找不到合适的alpha2, 换一个alpha1
                }
            }
        }

    }
    if(numChanged)//已经有改变了
    {
        Changed = false;
        continue;
    }
    //选其他不满足KKT条件的样本
    for (int i = 0; i < sampleNum; i++)
    {
        a1 = i;
        if (fabs(alpha[i]) < eps && label[i] * gx[i] < 1 ||
            fabs(alpha[i] - C) < eps && label[i] * gx[i] > 1)
        {
            //选择第二个变量,步骤同上
            a2 = secondAlpha(i);
            Changed = takeStep(a1, a2);
            numChanged += Changed;

            if (Changed) continue;
            else //目标函数没有下降
            {
                //先依次遍历间隔边界上的
                //间隔边界上的点已经记录在eligSample中了
                for(int j=0; j<cnt; j++)
                {

                    if (eligSample[j] == i) continue;
                    a2 = eligSample[j];
                    Changed = takeStep(a1, a2);
                    numChanged += Changed;

                    if (Changed) break;
                }
                if (Changed) continue;
                //再遍历整个数据集
                int k = 0;
                for (int j = 0; j < sampleNum; j++)
                {
                    if (j == eligSample[k])
                    {
                        k++;
                        continue;
                    }
                    a2 = j;
                    Changed = takeStep(a1, a2);
                    numChanged += Changed;

                    if (Changed) break;
                }
                //找不到合适的alpha2, 换一个alpha1
            }
        }
    }
    /*

// if (!Changed)
{
cout<<"num"<<numChanged<<endl;
show();
}

    //《统计学习方法》里说的收敛条件是这个,但不管用
    //所以改用JC Platt论文伪代码所提方法(也不是完全一样)
    convergence = isConvergence();
    //show();
    cnt++;
    if (cnt == 10000)
    {
        cout << "num" << numChanged << endl;
        show();
    }
    */
}while (numChanged);

delete eligSample;

}

void SVM::show()
{
cout << "支持向量为:" << endl;
for (int i = 0; i < sampleNum; i++)
{
if(alpha[i]>eps)
cout <<i<<" 对应的alpha为:"<<alpha[i]<< endl;
}
cout << endl;
}

还有随机森林的(刚才的是支持向量机)
随机森林代码如下:
#include

#include

#include

#include "random_forest.h"

using namespace std;

vector<decision_tree*> alltrees; // 森林(决策树集合)

vector trainAll,train,test; // 样本集

vector attributes; // 属性集(元素为属性序号)

int trainAllNum = 0;

int testAllNum = 0;

int MaxAttr; // 属性总数

int *ArrtNum; // 属性个数集(元素为属性最大值)

unsigned int F;

int tree_num = 100; // 决策树个数

const int leafattrnum = -1; // 叶子节点的属性序号

int TP = 0,

                    FN          = 0,

                    FP          = 0,

                    TN          = 0,

                    TestP       = 0,

                    TestN       = 0;

// 读入数据

void init(char * trainname, char * testname)

{

trainAllNum =readData(trainAll, trainname);

testAllNum = readData(test,testname);

calculate_attributes();

double temp =(double)trainAllNum;

temp =log(temp)/log(2.0);

F = (unsigned int)floor(temp+0.5)+1;

if(F>MaxAttr) F = MaxAttr;

}

// 初始化训练样本子集

void sub_init()

{

// 选取决策树的训练样本集合

RandomSelectData(trainAll, train);

// 计算样本属性个数

calculate_ArrtNum();

}

// 读数据

int readData(vector&data, const char* fileName)

{

ifstream fin;

fin.open(fileName);

string line;

int datanum=0;

// 每行数据作为一个样本

while(getline(fin,line))

{

   TupleData d;

   istringstream stream(line);

   string str;



   // 设置每个样本的标签和内容

   while(stream>>str)

   {

       if(str.find('+')==0)

       {

            d.label='+';

       }

       else if(str.find('-')==0)

       {

            d.label='-';

       }

       else

       {

            int j=stringtoint(str);

            d.A.push_back(j);

       }

   }



   data.push_back(d);

   datanum++;

}

fin.close();

return datanum;

}

// 生成根节点的训练样本子集

voidRandomSelectData(vector &data, vector&subdata)

{

int index;

subdata.clear();

int d = 0;

while (d < trainAllNum)

{

   index = rand() % trainAllNum;

   subdata.push_back(data.at(index));

   d++;

}

}

// 计算属性序列

void calculate_attributes()

{

// 每个样本必须具有相同的属性个数

TupleData d = trainAll.at(0);

MaxAttr = d.A.size();

attributes.clear();

// 建立属性集合attributes,元素为属性序号

for (int i = 0; i < MaxAttr; i++)

{

   attributes.push_back(i);

}

// 初始化属性最大值序列,元素为属性最大值

ArrtNum = new int[MaxAttr];

}

// 字符串转化为int

int stringtoint(string s)

{

int sum=0;

for(int i=0; s[i]!='\0';i++)

{

   int j=int(s[i])-48;

   sum=sum*10+j;

}

return sum;

}

// 计算ArrtNum元素值

void calculate_ArrtNum()

{

for(int i = 0; i < MaxAttr; i++) ArrtNum[i] = 0;

// ArrtNum元素值为属性最大值

for (vector::const_iterator it = train.begin(); it !=train.end(); it++)

{

   int i = 0;



   for (vector<int>::const_iterator intt=(*it).A.begin();intt!=(*it).A.end();intt++)

   {

       int valuemax=(*intt)+1;

       if(valuemax>ArrtNum[i]) ArrtNum[i]=valuemax;

       i++;

   }

}

}

// 计算熵

double Entropy(double p, double s)

{

double n = s - p;

double result = 0;

if (n != 0)

   result += - double(n) / s * log(double(n) / s) / log(2.0);

if (p != 0)

   result += double(-p) / s * log(double(p) / s) / log(2.0);

return result;

}

// 训练一棵决策树

int creat_classifier(decision_tree*&p, const vector &samples, vector&attributes)

{

if (p == NULL)

   p = new decision_tree();

// 根据样本真实类别,输出叶子节点类别

if (Allthesame(samples, '+'))

{

   p->node.label = '+';

   p->node.attrNum = leafattrnum;

   p->childs.clear();

   return 1;

}

if (Allthesame(samples, '-'))

{

   p->node.label = '-';

   p->node.attrNum = leafattrnum;

   p->childs.clear();

   return 1;

}

// 如果属性序列为空,当前节点就为叶子节点

if (attributes.size() == 0)

{

   p->node.label = Majorityclass(samples);

   p->node.attrNum = leafattrnum;

   p->childs.clear();

   return 1;

}

// 计算当前节点的最优属性

p->node.attrNum = BestGainArrt(samples, attributes);

// 中间节点无标签

p->node.label = ' ';

// 计算子节点候选属性集合,候选集合元素越来越少

vector newAttributes;

for (vector::iterator it = attributes.begin(); it !=attributes.end(); it++)

   if ((*it) != p->node.attrNum)

       newAttributes.push_back((*it));

// 初始化样本子集,建立maxvalue个样本子集,也就说明该节点有maxvalue个子节点

// 为什么不建立一个阈值,进行二分类?

int maxvalue = ArrtNum[p->node.attrNum];

vector* subSamples = newvector[maxvalue];

for (int i = 0; i < maxvalue; i++)

   subSamples[i].clear();

// 将样本集合分为样本子集

for (vector::const_iterator it = samples.begin(); it !=samples.end(); it++)

{

   // 对样本进行分类,分别分到maxvalue个子节点中

   // p->node.attrNum是当前节点的最优属性序号

   // (*it).A.at(p->node.attrNum)正是子节点的序号

   // 基于当前节点最优属性,计算当前样本的归类

   subSamples[(*it).A.at(p->node.attrNum)].push_back((*it));

}

decision_tree *child;

for (int i = 0; i < maxvalue; i++)

{

   child = new decision_tree;



   if (subSamples[i].size() == 0)

       child->node.label = Majorityclass(samples);

   else

       creat_classifier(child, subSamples[i], newAttributes);



   p->childs.push_back(child);

}

delete[] subSamples;

return 0;

}

// 计算节点处的信息增益

int BestGainArrt(constvector &samples, vector &attributes)

{

int attr,

   bestAttr = 0,

   p = 0,

   s = (int)samples.size();

// 计算正样本个数

for (vector::const_iterator it = samples.begin(); it !=samples.end(); it++)

{

   if ((*it).label == '+')

       p++;

}

double infoD;

double bestResult = 0;

// 计算初始熵

infoD = Entropy(p, s);

vector m_attributes;

// 随机确定候选属性集

RandomSelectAttr(attributes, m_attributes);

// 遍历属性(即主题),通过信息增益筛选最优属性

for (vector::iterator it = m_attributes.begin(); it !=m_attributes.end(); it++)

{

   attr            = (*it);

   double result   = infoD;



   // 第attr个属性的最大属性值

   int maxvalue    = ArrtNum[attr];



   // 正负样本集

   int* subN       = newint[maxvalue];

   int* subP       = newint[maxvalue];

   int* sub        = newint[maxvalue];



   for (int i = 0; i < maxvalue; i++)

   {

       subN[i] = 0;

       subP[i] = 0;

       sub[i]  = 0;

   }



   // 基于特定属性,对当前训练样本进行分类

   // 属性计算这一步的确没有,属性值直接存储在样本中

   for (vector<TupleData>::const_iterator jt = samples.begin(); jt !=samples.end(); jt++)

   {

       if ((*jt).label == '+')

            subP[(*jt).A.at(attr)] ++;

       else

            subN[(*jt).A.at(attr)] ++;

       sub[(*jt).A.at(attr)]++;

   }



   // 计算特定属性下信息增益(相对熵)

   double SplitInfo = 0;

   for(int i = 0; i < maxvalue; i++)

   {

       double partsplitinfo;

       partsplitinfo   =-double(sub[i])/s*log(double(sub[i])/s)/log(2.0);

       SplitInfo       =SplitInfo+partsplitinfo;

   }



   double infoattr = 0;

   for (int i = 0; i < maxvalue; i++)

   {

       double partentropy;

       partentropy     = Entropy(subP[i],subP[i] + subN[i]);

       infoattr        =infoattr+((double)(subP[i] + subN[i])/(double)(s))*partentropy;

   }

   result = result - infoattr;

   result = result / SplitInfo;



   // 寻找最优属性

   if (result > bestResult)

   {

       bestResult      = result;

       bestAttr        = attr;

   }

   delete[] subN;

   delete[] subP;

   delete[] sub;

}

if (bestResult == 0)

{

   bestAttr=attributes.at(0);

}

return bestAttr;

}

void RandomSelectAttr(vector&data, vector &subdata)

{

int index;

unsigned int dataNum=data.size();

subdata.clear();

if(dataNum<=F)

{

   for (vector<int>::iterator it = data.begin(); it != data.end();it++)

   {

       int attr = (*it);

       subdata.push_back(attr);

   }

}

else

{

   set<int> AttrSet;

   AttrSet.clear();

   while (AttrSet.size() < F)

   {

       index = rand() % dataNum;

       if (AttrSet.count(index) == 0)

       {

            AttrSet.insert(index);

           subdata.push_back(data.at(index));

       }

   }

}

}

bool Allthesame(constvector &samples, char ch)

{

for (vector::const_iterator it = samples.begin(); it !=samples.end(); it++)

   if ((*it).label != ch)

       return false;

return true;

}

// 确定节点中哪个类别样本个数最多

char Majorityclass(constvector &samples)

{

int p = 0, n = 0;

for (vector::const_iterator it = samples.begin(); it !=samples.end(); it++)

   if ((*it).label == '+')

       p++;

   else

       n++;

if (p >= n)

   return '+';

else

   return '-';

}

// 测试阶段

char testClassifier(decision_tree *p,TupleData d)

{

// 抵达叶子节点

if (p->node.label != ' ')

   return p->node.label;

// 节点处最优属性

int attrNum = p->node.attrNum;

// 错误样本

if (d.A.at(attrNum) < 0)

   return ' ';

// 确定分支

return testClassifier(p->childs.at(d.A.at(attrNum)), d);

}

void testData()

{

for (vector::iterator it = test.begin(); it !=test.end(); it++)

{

   printf("新样本\n");

   if((*it).label=='+') TestP++;

   else TestN++;



   int p = 0, n = 0;



   for(int i = 0; i < tree_num; i++)

   {

       if(testClassifier(alltrees.at(i), (*it))=='+')  p++;

       else n++;

   }



   if(p>n)

   {

       if((*it).label=='+') TP++;

       else FP++;

   }

   else

   {

       if((*it).label=='+') FN++;

       else TN++;

   }

}

}

void freeClassifier(decision_tree *p)

{

if (p == NULL)

   return;

for (vector<decision_tree*>::iterator it = p->childs.begin();it != p->childs.end(); it++)

{

   freeClassifier(*it);

}

delete p;

}

void freeArrtNum()

{

delete[] ArrtNum;

}

void showResult()

{

cout << "Train size: "<<trainAllNum<<endl;

cout << "Test size: "<<testAllNum<<endl;

cout << "True positive: "<< TP << endl;

cout << "False negative: "<<FN<<endl;

cout << "False positive: "<<FP<<endl;

cout << "True negative: "<<TN<<endl;

}

int main(int argc, char **argv)

{

char * trainfile=argv[1];

char* testfile=argv[2];

srand((unsigned)time(NULL));

// 初始化样本

init("1.txt", "2.txt");

// 训练阶段

for(int i = 0; i < tree_num; i++)

{

   printf("第 %d 棵决策树训练开始\n", i);



   // 每棵树的训练样本子集

   sub_init();



   // 训练每棵决策树

   decision_tree * root=NULL;

   creat_classifier(root, train, attributes);



   // 建立森林

   alltrees.push_back(root);



   printf("第 %d 棵决策树训练完毕\n", i);

}

// 测试阶段

testData();

for (vector<decision_tree *>::const_iterator it =alltrees.begin(); it != alltrees.end(); it++)

{

   freeClassifier((*it));

}

freeArrtNum();

showResult();

system("pause");

return 0;

}