/* Ozan Irsoy, Bogazici University, 2012.
 *
 * SoftTree.cpp implements soft regression trees.
 * Internal nodes have linear sigmoid (gating) splits;
 * leaf nodes have constant values.
 *
 * See
 * "Soft Decision Trees", O. Irsoy, O. T. Yildiz, E. Alpaydin,
 * 21st International Conference on Pattern Recognition (ICPR 2012),
 * Tsukuba, Japan,
 * for details.
 */
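/* Model: each internal node computes a gating value
 *   v = sigmoid(w . x + w0)
 * and responds with
 *   y = v * y_left + (1 - v) * y_right,
 * while each leaf responds with its constant w0
 * (see Node::evaluate below).
 */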
#include <iostream>
#include <fstream>
#include <algorithm>
#include <cmath>
#include <string>
#include <vector>
#include <cassert>
#include "common.cpp"
#define HARDINIT true // if true, optimization starts from hard-tree parameters; else, from random ones
#define MINALPHA 1 // lower end of the learning-rate range
#define MAXALPHA 10 // upper end of the learning-rate range
#define MAXEPOCH 25 // number of epochs in training
#define MAXRETRY 10 // number of restarts of the optimization, each from a new starting point
#define PRETH 1e-3 // prepruning threshold
typedef unsigned int uint;
using namespace std;
class SoftTree;
class Node;
class Node
{
public:
Node();
private:
void learnParams(const vector<vector<double> > &, const vector<double> &,
const vector<vector<double> > &, const vector<double> &,
double, SoftTree*);
void splitNode(const vector<vector<double> > &, const vector<double> &,
const vector<vector<double> > &, const vector<double> &,
SoftTree*);
double evaluate(const vector<double> &);
uint size();
void print(uint);
void hardInit(const vector< vector <double> > &, const vector<double> &);
// pointers & flags
Node* parent;
Node* left;
Node* right;
bool isLeaf;
bool isLeft;
// params
vector<double> w; // split weights (unused if leaf)
double w0; // if leaf: response value; if internal: bias of the split
double y; // last computed response
double v; // last computed gating value
friend class SoftTree;
};
class SoftTree
{
public:
SoftTree(const vector< vector<double> > &X, const vector<double> &Y,
const vector< vector<double> > &V, const vector<double> &R,
char type = 'r');
double evaluate(const vector<double> &);
double meanSqErr(const vector<vector<double> > &, const vector<double> &);
double errRate(const vector<vector<double> > &, const vector<double> &);
uint size();
void print();
private:
Node* root;
char type; // 'r'egression or 'c'lassification
friend class Node;
};
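/* A minimal usage sketch (hypothetical variable names; X/Y are training
 * inputs and targets, V/R a held-out validation set used for prepruning):
 *
 *   SoftTree tree(X, Y, V, R, 'r'); // grows and fits the tree
 *   double yhat = tree.evaluate(x); // predict one example
 *   cout << tree.meanSqErr(V, R) << " " << tree.size() << endl;
 *   tree.print();
 */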
Node::Node()
{
isLeaf = true;
isLeft = false;
parent = NULL;
left = NULL;
right = NULL;
w0 = y = v = 0;
}
SoftTree::SoftTree(const vector< vector<double> > &X, const vector<double> &Y,
const vector< vector<double> > &V, const vector<double> &R,
char type) {
assert(type == 'r' || type == 'c');
this->type = type;
root = new Node;
root->w = vector<double>(X[0].size());
root->w0 = 0;
for (uint i=0; i<Y.size(); i++)
root->w0 += Y[i];
root->w0 /= Y.size();
root->splitNode(X, Y, V, R, this);
}
void Node::hardInit(const vector< vector< double > >& X, const vector< double >& Y)
{
vector<double> sv(X.size()); // soft membership
double total=0;
assert(w.size() != 0);
// (1) compute soft memberships
for (uint j=0; j<X.size(); j++) {
double t = 1;
Node* m = this;
Node* p;
while(m->parent != NULL) {
p = m->parent;
if (m->isLeft)
t *= sigmoid(dot(p->w, X[j]) + p->w0);
else
t *= (1-sigmoid(dot(p->w, X[j]) + p->w0));
m = m->parent;
}
sv[j] = t;
total += t;
}
if (total <= 1) { // too little soft membership reaches this node; init randomly
w = vector<double>(X[0].size());
for (uint i=0; i<w.size(); i++)
w[i] = urand(-0.005, 0.005);
w0 = urand(-0.005, 0.005);
left->w0 = urand(-0.005, 0.005);
right->w0 = urand(-0.005, 0.005);
return;
}
uint dim, bestDim = 0;
double errBest = -1; // -1: no valid split found yet
double bestSplit = 0;
double bestw10 = 0, bestw20 = 0;
// (2) look for the best hard split
for (dim=0; dim<X[0].size(); dim++)
{
vector<pair<double,uint> > f;
for (uint i=0; i<X.size(); i++)
f.push_back(make_pair(X[i][dim],i));
sort(f.begin(), f.end());
double sp;
for (uint i=0; i<f.size()-1; i++) {
if (f[i].first == f[i+1].first) continue;
sp = 0.5*(f[i].first + f[i+1].first);
double w10, w20, lsum, rsum;
w10 = w20 = lsum = rsum = 0;
for (uint j=0; j<=i; j++) {
w10 += Y[f[j].second]*sv[f[j].second];
lsum += sv[f[j].second];
}
w10 /= lsum;
for (uint j=i+1; j<f.size(); j++) {
w20 += Y[f[j].second]*sv[f[j].second];
rsum += sv[f[j].second];
}
w20 /= rsum;
// weighted squared-error impurity (for 0/1 targets this is
// proportional to the Gini impurity, so it also serves classification)
double errl = 0, errr = 0;
for (uint j=0; j<=i; j++)
errl += (w10 - Y[f[j].second])*(w10 - Y[f[j].second])*sv[f[j].second];
errl /= lsum;
for (uint j=i+1; j<f.size(); j++)
errr += (w20 - Y[f[j].second])*(w20 - Y[f[j].second])*sv[f[j].second];
errr /= rsum;
double a = lsum/(lsum+rsum+0.0);
double b = rsum/(lsum+rsum+0.0);
if (a*errl + b*errr < errBest || errBest == -1) {
bestSplit = sp;
bestDim = dim;
errBest = a*errl + b*errr;
bestw10 = w10;
bestw20 = w20;
}
}
}
// (3) init params according to the best hard split
if (errBest == -1) { // every feature was constant; no split found, init randomly
for (uint i=0; i<w.size(); i++)
w[i] = urand(-0.005, 0.005);
w0 = urand(-0.005, 0.005);
left->w0 = urand(-0.005, 0.005);
right->w0 = urand(-0.005, 0.005);
return;
}
w = vector<double>(X[0].size());
for (uint i=0; i<w.size(); i++)
w[i] = urand(-0.005, 0.005);
// w[bestDim] = -0.5, w0 = 0.5*bestSplit makes the gate
// sigmoid(0.5*(bestSplit - x[bestDim])): a soft version of the
// hard test x[bestDim] < bestSplit (small values go left)
w[bestDim] = -0.5;
w0 = bestSplit*0.5;
left->w0 = bestw10;
right->w0 = bestw20;
assert(w.size() != 0);
}
double SoftTree::evaluate(const vector<double> &x) {
if (type == 'c')
return sigmoid(root->evaluate(x));
return root->evaluate(x); // type == 'r'
}
uint SoftTree::size() {
return root->size();
}
uint Node::size() {
if (isLeaf)
return 1;
else
return 1 + left->size() + right->size();
}
void Node::print(uint depth) {
for (uint i=0; i<depth; i++) cout << "__";
if (!isLeaf)
for (uint i=0; i<w.size(); i++) cout << w[i] << ", ";
cout << w0 << endl;
if (!isLeaf) {
left->print(depth+1);
right->print(depth+1);
}
}
void SoftTree::print() {
root->print(1);
}
double Node::evaluate(const vector<double> &x) {
if (isLeaf)
y = w0;
else {
v = sigmoid(dot(w,x)+w0);
y = v*(left->evaluate(x)) + (1-v)*(right->evaluate(x));
}
return y;
}
// Learns the parameters of the current node and its newly created children
// by stochastic gradient descent with momentum on the whole tree's output.
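/* Gradient sketch, matching the code below: with per-example error
 * E = 0.5*(y - r)^2 and d = y - r, the tree output y depends on this node
 * through the product g of the gatings along the path from the root
 * (the parent's v if a node is a left child, 1 - v otherwise):
 *   dE/dw = d * g * (y_left - y_right) * v*(1-v) * x
 *   dE/dw0 = d * g * (y_left - y_right) * v*(1-v)
 *   dE/d(left->w0) = d * g * v
 *   dE/d(right->w0) = d * g * (1-v)
 * The loop accumulates t = alpha*d*g, then steps by minus these amounts.
 */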
void Node::learnParams(const vector< vector<double> > &X, const vector<double> &Y,
const vector< vector<double> > &V, const vector<double> &R,
double alpha, SoftTree* tree) {
double u = 0.1; // momentum weight
uint e, i, j;
vector<uint> ix; // shuffled example indices
vector<double> dw(X[0].size()); // current step for w (includes the -alpha factor)
vector<double> dwp(X[0].size()); // previous step for w, zero-initialized
double dw10, dw20, dw0; // current steps for the bias terms
double dw10p = 0, dw20p = 0, dw0p = 0; // previous steps (no momentum at start)
for (i=0; i<Y.size(); i++) ix.push_back(i);
for (e=0; e<MAXEPOCH; e++) {
shuffle(ix);
for (i=0; i<X.size(); i++) {
j = ix[i];
vector<double> x = X[j];
double r = Y[j];
double y = tree->evaluate(x);
double d = y - r;
double t = alpha*d;
Node* m = this;
Node* p;
// accumulate the ancestor gatings into t, then form the negative gradients
while(m->parent != NULL) {
p = m->parent;
if (m->isLeft)
t *= p->v;
else
t *= (1 - p->v);
m = m->parent;
}
dw = (-t*(left->y - right->y)*(v)*(1-v))*x;
dw0 = -t*(left->y - right->y)*(v)*(1-v);
dw10 = -t*(v);
dw20 = -t*(1-v);
// update params: the dw terms already hold -alpha*gradient;
// u*(previous step) adds momentum
w += dw + u*dwp;
w0 += dw0 + u*dw0p;
left->w0 += dw10 + u*dw10p;
right->w0 += dw20 + u*dw20p;
// update previous values
dwp = dw;
dw0p = dw0;
dw10p = dw10;
dw20p = dw20;
alpha *= 0.9999; // slowly anneal the learning rate
}
}
}
double SoftTree::meanSqErr(const vector< vector<double> > &X,
const vector<double> &Y) {
assert(type == 'r');
double err=0, y;
for (uint i=0; i<Y.size(); i++) {
y = evaluate(X[i]);
err += (Y[i]-y)*(Y[i]-y);
}
err /= Y.size();
return err;
}
double SoftTree::errRate(const vector< vector<double> > &X,
const vector<double> &Y) {
assert(type == 'c');
double err=0, y;
for (uint i=0; i<Y.size(); i++) {
y = evaluate(X[i]);
err += (int)(Y[i] != (y > 0.5)); // labels assumed to be 0/1
}
err /= Y.size();
return err;
}
/* Splits the current node into two children, recursively:
 * (1) Create two children, then learn the parameters of the current node
 *     and its children.
 * (2) Measure the error on the validation set (V, R). If it improved,
 *     keep the split and recursively split the children; if not, revert
 *     the split (so the current node stays a leaf) and stop the recursion.
 */
void Node::splitNode(const vector< vector<double> > &X, const vector<double> &Y,
const vector< vector<double> > &V, const vector<double> &R,
SoftTree* tree) {
double err = 0;
char type = tree->type;
if (type == 'r')
err = tree->meanSqErr(V, R);
else if (type == 'c')
err = tree->errRate(V, R);
Node temp = *this; // keep a copy so the split can be reverted
isLeaf = false; // make this an internal node
w = vector<double>(X[0].size());
// Create left child
left = new Node;
left->parent = this;
left->isLeft = true;
// Create right child
right = new Node;
right->parent = this;
right->isLeft = false;
vector<double> bestw;
double bestw0 = 0, bestw0l = 0, bestw0r = 0;
double bestErr = 1e10;
double newErr;
// make MAXRETRY re-initializations and choose best
double alpha;
for (uint t=0; t<MAXRETRY; t++) {
if (HARDINIT) hardInit(X, Y);
else {
for (uint i=0; i<w.size(); i++)
w[i] = urand(-0.005, 0.005);
w0 = urand(-0.005, 0.005);
left->w0 = urand(-0.005, 0.005);
right->w0 = urand(-0.005, 0.005);
}
//alpha = MINALPHA + (t*(MAXALPHA-MINALPHA))/(float)MAXRETRY;
alpha = (MAXALPHA+0.0) / pow(2,t+1); // halve the starting learning rate at each retry
learnParams(X, Y, V, R, alpha, tree);
if (type == 'r')
newErr = tree->meanSqErr(V, R);
else if (type == 'c')
newErr = tree->errRate(V, R);
if (newErr < bestErr) {
bestw = vector<double>(w);
bestw0 = w0;
bestw0l = left->w0;
bestw0r = right->w0;
bestErr = newErr;
}
}
w = vector<double>(bestw);
w0 = bestw0;
left->w0 = bestw0l;
right->w0 = bestw0r;
//cout << "besterr: " << bestErr << " err: " << err << endl;
// keep the split only if it improves validation error by more
// than the prepruning threshold; then keep splitting the children
if (bestErr + PRETH < err) {
left->splitNode(X, Y, V, R, tree);
right->splitNode(X, Y, V, R, tree);
} else { // revert the split and stop recursion
delete left;
delete right;
*this = temp;
}
}
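/* A minimal end-to-end sketch (hypothetical demo, not part of the original
 * program): compile with -DSOFTTREE_DEMO; assumes common.cpp supplies
 * urand(). Fits a soft regression tree to noisy samples of a step function,
 * so a single split near x = 0 should suffice.
 */
#ifdef SOFTTREE_DEMO
int main() {
  vector<vector<double> > X, V; // training / validation inputs
  vector<double> Y, R;          // training / validation targets
  for (uint i = 0; i < 400; i++) {
    double x = urand(-1, 1);
    double y = (x < 0 ? 0.0 : 1.0) + urand(-0.1, 0.1);
    if (i % 2) { X.push_back(vector<double>(1, x)); Y.push_back(y); }
    else       { V.push_back(vector<double>(1, x)); R.push_back(y); }
  }
  SoftTree tree(X, Y, V, R, 'r');
  cout << "validation MSE: " << tree.meanSqErr(V, R)
       << ", tree size: " << tree.size() << endl;
  return 0;
}
#endif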