source: orange/source/orange/discretize.cpp @ 11703:9b8d8ab7820c

Revision 11703:9b8d8ab7820c, 25.3 KB checked in by janezd <janez.demsar@…>, 7 months ago (diff)

Removed the GPL copyright notice from all files except orangeqt.

Line 
1#include <math.h>
2
3#include "vars.hpp"
4#include "domain.hpp"
5#include "examples.hpp"
6#include "examplegen.hpp"
7#include "getarg.hpp"
8
9#include "classify.hpp"
10#include "random.hpp"
11#include "distvars.hpp"
12#include "basstat.hpp"
13#include "contingency.hpp"
14#include "transval.hpp"
15#include "classfromvar.hpp"
16
17#include "discretize.ppp"
18
19
20TEquiDistDiscretizer::TEquiDistDiscretizer(const int noi, const float fv, const float st)
21: numberOfIntervals(noi),
22  firstCut(fv),
23  step(st)
24{}
25
26
27// Transforms the value; results is 1+floor((val.floatV-firstCut)/step); 0 if below firstCut, numberOfIntervals if above range
28void TEquiDistDiscretizer::transform(TValue &val)
29{ if (val.varType!=TValue::FLOATVAR)
30    raiseError("discrete value expected");
31 
32  if (!val.isSpecial()) {
33    if (step<0)
34      raiseError("'step' not set");
35    if (numberOfIntervals<1)
36      raiseError("invalid number of intervals (%i)", numberOfIntervals);
37
38    if ((step==0) || (numberOfIntervals==1))
39      val.intV = 0;
40
41    else {
42      val.intV = (val.floatV<firstCut) ? 0 : 1+int(floor((val.floatV-firstCut)/step));
43      if (val.intV>=numberOfIntervals)
44        val.intV = numberOfIntervals-1;
45    }
46  }
47 
48  val.varType = TValue::INTVAR;
49}
50
51
// Chooses the number of decimals needed to distinguish values that differ by
// 'diff' and stores the matching rounding factor (10^decimals) in 'factor'.
// At least two decimals are always used.  A 'diff' that is zero, negative or
// NaN (for instance a zero step) has no usable logarithm; it falls back to
// two decimals instead of casting an infinite or NaN value to int.
// Returns the number of decimals.
inline int numDecs(const float &diff, float &factor)
{
  // !(diff > 0) is also true for NaN, which a plain diff <= 0 would miss.
  if (!(diff > 0.0) || (diff >= 1.0)) {
    factor = 100.0;
    return 2;
  }

  int decs = (int)ceil(-log10(diff));
  if (decs < 2)
    decs = 2;
  factor = exp(decs*log(10.0));
  return decs;
}
65
66
// Returns the rounding factor 10^decs for a positive number of decimals;
// for zero or negative 'decs' the factor is 100.
inline float roundFromDecs(const int &decs)
{
  if (decs > 0)
    return exp(decs*log(10.0));

  return 100.0;
}
71
// Rounds 'f' in place to the nearest multiple of 1/factor
// (computed as floor(f*factor + 0.5) / factor).
inline void roundToFactor(float &f, const float &factor)
{
  const double shifted = f*factor + 0.5;
  f = floor(shifted)/factor;
}
74
75
// Formats 'f' in fixed notation ("%.*f") with 'decs' digits after the
// decimal point and returns the text.
// The output of "%f" can be hundreds of characters long for large
// magnitudes, so the required length is obtained from snprintf and a
// heap buffer is used whenever the small stack buffer would not suffice.
// Returns an empty string if formatting fails.
string mcvt(double f, int decs)
{
  char stackBuf[64];
  const int needed = snprintf(stackBuf, sizeof(stackBuf), "%.*f", decs, f);
  if (needed < 0)
    return string();

  if (needed < int(sizeof(stackBuf)))
    return string(stackBuf, needed);

  // The text was truncated: format again into a buffer of the exact size
  // (plus one character for the terminating zero).
  vector<char> heapBuf(needed + 1);
  snprintf(&heapBuf[0], heapBuf.size(), "%.*f", decs, f);
  return string(&heapBuf[0], needed);
}
82
83/*  Constructs a new TEnumVariable. Its values represent the intervals for values of passed variable var;
84    getValueFrom points to a classifier which gets a value of the original variable (var) and transforms it using
85    'this' transformer. */
86PVariable TEquiDistDiscretizer::constructVar(PVariable var, float mindiff)
87{ 
88  mindiff = 1.0; // Ignores the given mindiff; see http://www.ailab.si/orange/trac/ticket/576
89  TFloatVariable *fvar = var.AS(TFloatVariable);
90  if (!fvar)
91    raiseError("invalid attribute type (continuous attribute expected)");
92
93  TEnumVariable *evar=mlnew TEnumVariable("D_"+var->get_name());
94  PVariable revar(evar);
95
96  evar->ordered = true;
97
98  if (numberOfIntervals<2)
99    evar->addValue("C");
100
101  else {
102    float roundfactor;
103    int decs = numDecs(step<mindiff ? step : mindiff, roundfactor);
104
105    if ((fvar->adjustDecimals != 2) && (decs < fvar->numberOfDecimals)) {
106      decs = fvar->numberOfDecimals;
107      roundfactor = roundFromDecs(fvar->numberOfDecimals);
108    }
109
110    roundToFactor(firstCut, roundfactor);
111    roundToFactor(step, roundfactor);
112
113    float f = firstCut;
114    string pval;
115
116    pval = mcvt(f, decs);
117    evar->addValue(string("<") + pval);
118
119    int steps = numberOfIntervals-2;
120    while (steps--) {
121      string s("[");
122      s += pval;
123      f += step;
124      s += ", ";
125      pval = mcvt(f, decs);
126      s += pval;
127      s += ")";
128      evar->addValue(s);
129    }
130
131    evar->addValue(string(">") + pval);
132  }
133 
134  TClassifierFromVar *tcfv = mlnew TClassifierFromVar(revar, var);
135  tcfv->transformUnknowns = true;
136  tcfv->transformer = this; // rewrapping
137  revar->getValueFrom = tcfv;
138  return revar;
139}
140
141
142void TEquiDistDiscretizer::getCutoffs(vector<float> &cutoffs) const
143{
144  cutoffs.clear();
145  for(int i = 0; i < numberOfIntervals-1; i++)
146    cutoffs.push_back(firstCut+step*i);
147}
148
149
150TThresholdDiscretizer::TThresholdDiscretizer(const float &athreshold)
151: threshold(athreshold)
152{}
153
154
155void TThresholdDiscretizer::transform(TValue &val)
156{ if (!val.isSpecial())
157    val.intV = (val.floatV<=threshold) ? 0 : 1;
158  val.varType = TValue::INTVAR;
159}
160
161
162PVariable TThresholdDiscretizer::constructVar(PVariable var, float mindiff)
163{ 
164  mindiff = 1.0; // Ignores the given mindiff; see http://www.ailab.si/orange/trac/ticket/576
165  TEnumVariable *evar = mlnew TEnumVariable("D_"+var->get_name());
166  PVariable revar(evar);
167
168  evar->ordered = true;
169
170  char s[10];
171  sprintf(s, "<= %5.3f", threshold);
172  evar->values->push_back(s);
173  sprintf(s, "> %5.3f", threshold);
174  evar->values->push_back(s);
175
176  TClassifierFromVar *tcfv = mlnew TClassifierFromVar(revar, var);
177  tcfv->transformUnknowns = true;
178  tcfv->transformer = this; // rewrapping
179  revar->getValueFrom = tcfv;
180  return revar;
181}
182
183
184void TThresholdDiscretizer::getCutoffs(vector<float> &cutoffs) const
185{
186  cutoffs.clear();
187  cutoffs.push_back(threshold);
188}
189
190
191TBiModalDiscretizer::TBiModalDiscretizer(const float &al, const float &ah)
192: low(al),
193  high(ah)
194{}
195
196
197void TBiModalDiscretizer::transform(TValue &val)
198{ 
199  if (val.varType != TValue::FLOATVAR)
200    raiseError("continuous value expected");
201
202  if (!val.isSpecial())
203    val.intV = ((val.intV > low) && (val.intV > high)) ? 1 : 0;
204
205  val.varType = TValue::INTVAR;
206}
207
208
209PVariable TBiModalDiscretizer::constructVar(PVariable var, float mindiff)
210{ 
211  mindiff = 1.0; // Ignores the given mindiff; see http://www.ailab.si/orange/trac/ticket/576
212  TFloatVariable *fvar = var.AS(TFloatVariable);
213  if (!fvar)
214    raiseError("invalid attribute type (continuous attribute expected)");
215
216  TEnumVariable *evar = mlnew TEnumVariable("D_"+var->get_name());
217  PVariable revar(evar);
218
219  evar->ordered = true;
220
221  if (high<=low)
222    raiseError("invalid interval: (%5.3f, %5.3f]", low, high);
223
224  float roundfactor;
225  if (high-low < mindiff) {
226    mindiff = high-low;
227  }
228  int decs = numDecs(mindiff, roundfactor);
229
230  if ((fvar->adjustDecimals != 2) && (decs < fvar->numberOfDecimals)) {
231    decs = fvar->numberOfDecimals;
232    roundfactor = roundFromDecs(fvar->numberOfDecimals);
233  }
234
235  roundToFactor(low, roundfactor);
236  roundToFactor(high, roundfactor);
237  string lstr = mcvt(low, decs);
238  string hstr = mcvt(high, decs);
239
240  evar->values->push_back("<=" + lstr + " or >" + hstr);
241  evar->values->push_back("between "+lstr+" and "+hstr);
242
243  TClassifierFromVar *tcfv = mlnew TClassifierFromVar(revar, var);
244  tcfv->transformUnknowns = true;
245  tcfv->transformer = this; // rewrapping
246  revar->getValueFrom = tcfv;
247  return revar;
248}
249
250
251void TBiModalDiscretizer::getCutoffs(vector<float> &cutoffs) const
252{
253  cutoffs.clear();
254  cutoffs.push_back(low);
255  cutoffs.push_back(high);
256}
257
258
259TIntervalDiscretizer::TIntervalDiscretizer()
260: points(mlnew TFloatList())
261{}
262
263
264TIntervalDiscretizer::TIntervalDiscretizer(PFloatList apoints)
265: points(apoints)
266{};
267
268
269
270void TIntervalDiscretizer::transform(TValue &val)
271{ checkProperty(points);
272  if (val.varType!=TValue::FLOATVAR)
273    raiseError("continuous value expected");
274
275  if (!val.isSpecial()) {
276    val.intV = 0;
277    for(TFloatList::iterator ri(points->begin()), re(points->end()); (ri!=re) && (*ri<val.floatV); ri++, val.intV++);
278  }
279
280  val.varType = TValue::INTVAR;
281}
282
283
284/*  Constructs a new TEnumVariable. Its values represent the intervals for
285    values of passed variable var; getValueFrom points to a classifier which
286    gets a value of the original variable (var) and transforms it using
287    'this' transformer. */
288PVariable TIntervalDiscretizer::constructVar(PVariable var, float mindiff )
289{
290  mindiff = 1.0; // Ignores the given mindiff; see http://www.ailab.si/orange/trac/ticket/576
291  TFloatVariable *fvar = var.AS(TFloatVariable);
292  if (!fvar)
293    raiseError("invalid attribute type (continuous attribute expected)");
294
295  TEnumVariable *evar=mlnew TEnumVariable("D_"+var->get_name());
296  PVariable revar(evar);
297
298  TEnumVariable *cl_evar=mlnew TEnumVariable("D_"+var->get_name());
299  PVariable cl_revar(cl_evar);
300
301  evar->ordered = true;
302
303  if (!points->size())
304    evar->addValue("C");
305
306  else {
307    TFloatList::iterator vb(points->begin()), ve(points->end()), vi;
308    for(vi=vb+1; vi!=ve; vi++) {
309      float ndiff = *vi - *(vi-1);
310      if (ndiff<mindiff)
311        mindiff = ndiff;
312    }
313
314    float roundfactor;
315    int decs = numDecs(mindiff, roundfactor);
316
317    if ((fvar->adjustDecimals != 2) && (decs < fvar->numberOfDecimals)) {
318      decs = fvar->numberOfDecimals;
319      roundfactor = roundFromDecs(fvar->numberOfDecimals);
320    }
321
322    vi=points->begin();
323    string ostr;
324
325    roundToFactor(*vi, roundfactor);   
326    ostr = mcvt(*vi, decs);
327    evar->addValue(string("<=") + ostr);
328
329    while(++vi!=ve) {
330      string s = "(";
331      s += ostr;
332      s += ", ";
333      roundToFactor(*vi, roundfactor);
334      ostr = mcvt(*vi, decs);
335      s += ostr;
336      s += "]";
337      evar->addValue(s);
338    }
339
340    evar->addValue(string(">")+ostr);
341  } 
342
343  TClassifierFromVar *tcfv = mlnew TClassifierFromVar(cl_revar, var);
344  tcfv->transformUnknowns = true;
345  tcfv->transformer = this; // rewrapping
346  revar->getValueFrom = tcfv; 
347  return revar;
348}
349
350
351
352void TIntervalDiscretizer::getCutoffs(vector<float> &cutoffs) const
353{
354  cutoffs = points.getReference();
355}
356
357
358// Sets the number of intervals (default is 4)
359TEquiDistDiscretization::TEquiDistDiscretization(const int anumber)
360: TDiscretization(),
361  numberOfIntervals(anumber)
362{}
363
364
365// Sets the firstCut and step according to the min and max fields of valStat.
366PVariable TEquiDistDiscretization::operator()(PBasicAttrStat valStat, PVariable var) const
367{ float step = (valStat->max-valStat->min)/numberOfIntervals;
368  PEquiDistDiscretizer discretizer = mlnew TEquiDistDiscretizer(numberOfIntervals, valStat->min+step, step);
369  return discretizer->constructVar(var);
370}
371
372
373// Sets the firstCut and step according to the range of values that occur in gen for variable var.
374PVariable TEquiDistDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &)
375{ if (var->varType!=TValue::FLOATVAR)
376    raiseError("attribute '%s' is not continuous", var->get_name().c_str());
377
378  if (numberOfIntervals<=0)
379    raiseError("invalid number of intervals (%i)", numberOfIntervals);
380
381  int varPos=gen->domain->getVarNum(var);
382
383  TExampleIterator first(gen->begin());
384  while( first && (*first)[varPos].isSpecial() )
385    ++first;
386  if (!first)
387    raiseError("attribute '%s' has no known values", var->get_name().c_str());
388
389  float max, min;
390  max = min = (*first)[varPos].floatV;
391  while (++first)
392    if (!(*first)[varPos].isSpecial()) {
393      float val = (*first)[varPos].floatV;
394      if (val>max)
395        max = val;
396      if (val<min)
397        min = val;
398    };
399
400  float step = (max-min)/numberOfIntervals;
401  PEquiDistDiscretizer discretizer = mlnew TEquiDistDiscretizer(numberOfIntervals, min+step, step);
402  return discretizer->constructVar(var);
403}
404
405
406
407TFixedDiscretization::TFixedDiscretization(TFloatList &pts)
408: points(mlnew TFloatList(pts))
409{}
410
411
412TFixedDiscretization::TFixedDiscretization(const string &boundaries)
413: points()
414{ vector<string> atoms;
415  string2atoms(boundaries, atoms);
416  points = mlnew TFloatList(atoms.size());
417  TFloatList::iterator pi(points->begin());
418  ITERATE(vector<string>, ai, atoms) {
419    sscanf((*ai).c_str(), "%f", &*pi);
420    if ((pi!=points->begin()) && (*pi<=pi[-1]))
421      raiseError("mismatch in cut-off points");
422    pi++;
423  }
424}
425
426
427PVariable TFixedDiscretization::operator ()(PExampleGenerator, PVariable var, const long &)
428{ PIntervalDiscretizer discretizer = mlnew TIntervalDiscretizer (mlnew TFloatList(points));
429  return discretizer->constructVar(var);
430}
431
432
433
434TEquiNDiscretization::TEquiNDiscretization(int anumber)
435: numberOfIntervals(anumber),
436  recursiveDivision(true)
437{}
438
439
440PVariable TEquiNDiscretization::operator()(const TContDistribution &distr, PVariable var) const
441{ 
442  PIntervalDiscretizer discretizer=mlnew TIntervalDiscretizer;
443  float mindiff;
444 
445  if (distr.size() <= numberOfIntervals) {
446    cutoffsByMidpoints(discretizer, distr, mindiff);
447  }
448  else if (recursiveDivision && false) { // XXX remove when the routine is finished
449    cutoffsByDivision(discretizer, distr, mindiff);
450  }
451  else {
452    cutoffsByCounting(discretizer, distr, mindiff);
453  }
454
455  return discretizer->constructVar(var, mindiff);
456}
457
458void TEquiNDiscretization::cutoffsByMidpoints(PIntervalDiscretizer discretizer, const TContDistribution &distr, float &mindiff) const
459{
460  mindiff = 1.0;
461  TContDistribution::const_iterator cdi(distr.begin()), cde(distr.end());
462  if (cdi!=cde) {
463    float prev = (*cdi).first;
464    while (++cdi != cde) {
465      discretizer->points->push_back((prev+(*cdi).first)/2.0);
466      if (((*cdi).first - prev) < mindiff) {
467          mindiff = (*cdi).first - prev;
468      }
469    }
470  }
471}
472
473void TEquiNDiscretization::cutoffsByCounting(PIntervalDiscretizer discretizer, const TContDistribution &distr, float &mindiff) const
474{
475  if (numberOfIntervals<=0)
476    raiseError("invalid number of intervals (%i)", numberOfIntervals);
477
478  mindiff = 1.0;
479  float N = distr.abs;
480  int toGo = numberOfIntervals;
481  float inthis = 0, prevel = -1; // initialized to avoid warnings
482  float inone = N/toGo;
483
484  for(map<float, float>::const_iterator db(distr.begin()), di(db), de(distr.end()), ni; (toGo>1) && (di!=de); di++) {
485    inthis += (*di).second;
486    if ((inthis<inone) || (di==db))
487      prevel = (*di).first;
488    else {
489      ni = di; ni++;
490      if ((ni!=de) && (inthis - inone < (*di).second / 2)) {
491        discretizer->points->push_back( ((*ni).first + (*di).first) /2);
492        if ((*ni).first - (*di).first < mindiff) {
493          mindiff = (*ni).first - (*di).first;
494        }
495        N -= inthis;
496        inthis = 0;
497        prevel = (*ni).first;
498      }
499      else {
500        discretizer->points->push_back( (prevel + (*di).first) / 2);
501        if ((*di).first - prevel < mindiff) {
502          mindiff = (*di).first - prevel;
503        }
504        N -= (inthis - ((*di).second));
505        inthis = (*di).second;
506        prevel = (*di).first;
507      }
508      if (--toGo) 
509        inone = N/toGo;
510    }
511  }
512}
513
514
515void TEquiNDiscretization::cutoffsByDivision(PIntervalDiscretizer discretizer, const TContDistribution &distr, float &mindiff) const
516{ cutoffsByDivision(numberOfIntervals, discretizer->points.getReference(), distr.begin(), distr.end(), distr.abs, mindiff); }
517
518
519void TEquiNDiscretization::cutoffsByDivision(const int &, TFloatList &, 
520                                            map<float, float>::const_iterator, map<float, float>::const_iterator,
521                                            const float &, float &) const
522{ /*XXX to be finished
523
524  if (noInt & 1) {
525    if (noInt & 2) {
526      noIntLeft = (noInt-1)/2;
527      noIntRight = (noInt+1)/2;
528    }
529    else {
530      noIntLeft = (noInt+1)/2;
531      noIntRight = (noInt+1)/2;
532    }
533
534    float Nleft = N * noIntLeft / (noIntLeft + noIntRight);
535    float Nright = N - Nleft;
536
537    if ((Nleft<1) || (Nright<1))
538      return; // should set a cut-off, but couldn't -- N=1...
539
540    map<float, float>::const_iterator fii = fbeg;
541    while ((Nn<Nleft) && (fii!=fend))
542      Nn += (*fii).second;
543    Nn -= (*fii).second;
544
545    if (fii==fend) {
546    }
547
548  }
549  else {
550    float N2 = N/2, Nn = 0.0;
551    if (N2<1)
552      return; // should set a cut-off, but couldn't -- N=1...
553
554    map<float, float>::const_iterator fii = fbeg;
555    while ((Nn<N2) && (fii!=fend))
556      Nn += (*fii).second;
557    Nn -= (*fii).second;
558
559    if (fii==fend) {
560      fii--;
561      if (fii==fbeg)
562        return; // should set a cut-off, but there's only one value
563      else {
564        map<float, float>::const_iterator fjj = fii;
565        fjj--;
566        points.push_back(((*fjj).first + (*fii).first) / 2.0);
567        return;
568      }
569    }
570
571    if (noInt>2) {
572      cutoffsByDivision(noInt/2, points, fbeg, fii, Nn);
573
574      map<float, float>::const_iterator fjj = fii;
575      fjj--;
576      points.push_back(((*fjj).first + (*fii).first) / 2.0);
577     
578      cutoffsByDivision(noInt/2, points, fii, fend, N-Nn);
579    }
580  }*/
581}
582
583PVariable TEquiNDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
584{ if (var->varType!=TValue::FLOATVAR)
585    raiseError("attribute '%s' is not continuous", var->get_name().c_str());
586
587  int varPos=gen->domain->getVarNum(var);
588
589  TExampleIterator first(gen->begin());
590  while(first && (*first)[varPos].isSpecial() )
591    ++first;
592
593  if (!first)
594    raiseError("attribute '%s' has no known values.", var->get_name().c_str());
595
596  TContDistribution distr(var);
597  do {
598    TValue &val=(*first)[varPos];
599    if (!val.isSpecial())
600      distr.addfloat(float(val), WEIGHT(*first));
601  } while (++first);
602
603  return operator()(distr, var);
604}
605
606
607
608// Defined in measures.cpp
609float getEntropy(const vector<float> &);
610
611
612TEntropyDiscretization::TEntropyDiscretization()
613: maxNumberOfIntervals(0),
614  forceAttribute(false)
615{}
616
617
618PVariable TEntropyDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
619{ if (!gen->domain->classVar)
620    raiseError("class-less domain");
621
622  if (gen->domain->classVar!=TValue::INTVAR)
623    raiseError("class '%s' is not discrete", gen->domain->classVar->get_name().c_str());
624
625  if (var->varType!=TValue::FLOATVAR)
626    raiseError("attribute '%s' is not continuous", var->get_name().c_str());
627
628  int varPos=gen->domain->getVarNum(var);
629
630  TS S;
631  TDiscDistribution all;
632
633  PEITERATE(ei, gen) {
634    TValue &val = (*ei)[varPos];
635    if (!val.isSpecial()) {
636        const TValue &eclass = (*ei).getClass();
637      if (!eclass.isSpecial()) {
638        float weight = WEIGHT(*ei);
639        S[float(val)].addint(int(eclass), weight);
640          all.addint(int(eclass), weight);
641      }
642    }
643  }
644
645  /* No need to initialize seed by number of examples.
646     Different number will obviously result in different decisions. */
647  TSimpleRandomGenerator rgen;
648  return operator()(S, all, var, weightID, rgen);
649}
650
651
652PVariable TEntropyDiscretization::operator()(const TS &S, const TDiscDistribution &all, PVariable var, const long &, TSimpleRandomGenerator &rgen) const
653{
654  int k=0;
655  const_ITERATE(TDiscDistribution, ci, all)
656    if (*ci>0)
657      k++;
658
659  if (!k)
660    raiseError("no examples or all values of attribute '%s' are unknown", var->get_name().c_str());
661
662  float mindiff = 1.0;
663
664  vector<pair<float, float> > points;
665  divide(S.begin(), S.end(), all, float(getEntropy(all)), k, points, rgen, mindiff);
666
667  /* This is not correct: if, for instance, we have two cut-off points we should always remove
668     the one that was added later... */
669  if ((maxNumberOfIntervals>0) && (int(points.size())+1>maxNumberOfIntervals)) {
670    random_sort(points.begin(), points.end(), predOn2nd<pair<float, float>, less<float> >(), predOn2nd<pair<float, float>, equal_to<float> >(), rgen);
671    points.erase(points.begin()+maxNumberOfIntervals-1, points.end());
672    sort(points.begin(), points.end(), predOn1st<pair<float, float>, less<float> >());
673  }
674   
675  PIntervalDiscretizer discretizer = mlnew TIntervalDiscretizer();
676  TFloatList &dpoints = dynamic_cast<TFloatList &>(discretizer->points.getReference());
677  if (points.size()) {
678    vector<pair<float, float> >::const_iterator fi(points.begin()), fe(points.end());
679    discretizer->points->push_back((*(fi++)).first);
680    for(; fi!=fe; fi++)
681      if ((*fi).first != dpoints.back())
682        discretizer->points->push_back((*fi).first);
683  }
684
685  return discretizer->constructVar(var, mindiff);
686}
687
688
689void TEntropyDiscretization::divide(
690  const TS::const_iterator &first, const TS::const_iterator &last,
691    const TDiscDistribution &distr, float entropy, int k,
692  vector<pair<float, float> > &points,
693  TSimpleRandomGenerator &rgen,
694  float &mindiff) const
695{
696  TDiscDistribution S1dist, S2dist = distr, bestS1, bestS2;
697  float bestE = -1.0;
698  float N = distr.abs;
699  int wins = 0;
700  TS::const_iterator Ti = first, bestT;
701  for(; Ti!=last; Ti++) {
702    S1dist += (*Ti).second;
703    S2dist -= (*Ti).second;
704    if (S2dist.abs==0)
705      break;
706
707    float entro1 = S1dist.abs*float(getEntropy(S1dist))/N;
708    float entro2 = S2dist.abs*float(getEntropy(S2dist))/N;
709    float E = entro1+entro2;
710    if (   (!wins || (E<bestE)) && ((wins=1)==1)
711        || (E==bestE) && rgen.randbool(++wins)) {
712      bestS1 = S1dist;
713      bestS2 = S2dist;
714      bestE = E;
715      bestT = Ti;
716    }
717  }
718
719  if (!wins)
720    return;
721
722  int k1 = 0, k2 = 0;
723  ITERATE(TDiscDistribution, ci1, bestS1)
724    if (*ci1>0)
725      k1++;
726  ITERATE(TDiscDistribution, ci2, bestS2)
727    if (*ci2>0)
728      k2++;
729
730  float entropy1 = float(getEntropy(bestS1));
731  float entropy2 = float(getEntropy(bestS2));
732
733  float MDL =  log(float(N-1))/log(2.0)/N
734             + (log(exp(k*log(3.0))-2)/log(2.0) - (k*entropy - k1*entropy1 - k2*entropy2))/N;
735  float gain = entropy-bestE;
736
737  float cutoff = (*bestT).first;
738  bestT++;
739
740  if ((*bestT).first - cutoff < mindiff) {
741     mindiff = (*bestT).first - cutoff;
742  }
743
744//  cout << cutoff << ", info gain=" << gain << ", MDL=" << MDL << endl;
745  if (gain>MDL) {
746    if ((k1>1) && (first!=bestT))
747      divide(first, bestT, bestS1, entropy1, k1, points, rgen, mindiff);
748
749    points.push_back(pair<float, float>(cutoff, gain-MDL));
750
751    if ((k2>1) && (bestT!=last))
752      divide(bestT, last, bestS2, entropy2, k2, points, rgen, mindiff);
753  }
754  else if (forceAttribute && !points.size())
755    points.push_back(pair<float, float>(cutoff, gain-MDL));
756}
757
758
// Returns the square of the argument.
template<class T>
inline T sqr(const T &t)
{
  return t * t;
}
761
762
763TBiModalDiscretization::TBiModalDiscretization(const bool sit)
764: splitInTwo(sit)
765{}
766
767
768PVariable TBiModalDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
769{ if (var->varType!=TValue::FLOATVAR)
770    raiseError("attribute '%s' is not continuous", var->get_name().c_str());
771  if (gen->domain->classVar!=TValue::INTVAR)
772    raiseError("class '%s' is not discrete", gen->domain->classVar->get_name().c_str());
773 
774  TContingencyAttrClass ccont(gen, var, weightID);
775  int nClasses = gen->domain->classVar->noOfValues();
776  float best1, best2;
777  float bestEval = -99999;
778
779  PDistribution classDist = getClassDistribution(gen, weightID);
780  TDiscDistribution &totDist = dynamic_cast<TDiscDistribution &>(classDist.getReference());
781  totDist.normalize();
782
783  // middle will contain sum of distributions from cut1 (exclusive) to cut2 (inclusive)
784  for(TDistributionMap::iterator cut1(ccont.continuous->begin()), cute(ccont.continuous->end()); cut1!=cute; cut1++) {
785    TDiscDistribution middle(nClasses);
786
787    TDistributionMap::iterator cut2 = cut1;
788    for(cut2++; cut2!=cute; cut2++) {
789      middle += (*cut2).second;
790
791      float chisq = 0.0;
792      float tabs = middle.abs;
793      int N = nClasses;
794      for(TDiscDistribution::const_iterator toti = totDist.begin(), midi = middle.begin();  N--; toti++, midi++) {
795        const float E = tabs**toti;
796        const float &n = *midi;
797        chisq += sqr( fabs(E - n) - 0.5 ) / E;
798      }
799
800      if (chisq > bestEval) {
801        bestEval = chisq;
802        best1 = (*cut1).first;
803        best2 = (*cut2).first;
804      }
805    }
806  }
807
808  PDiscretizer discretizer;
809
810  if (splitInTwo)
811    discretizer = mlnew TBiModalDiscretizer(best1, best2);
812
813  else {
814    TIntervalDiscretizer *idisc = mlnew TIntervalDiscretizer;
815    discretizer = idisc;
816    idisc->points->push_back(best1);
817    idisc->points->push_back(best2);
818  }
819
820  return discretizer->constructVar(var);
821}
822 
823 
824
825TDomainDiscretization::TDomainDiscretization(PDiscretization adisc)
826: discretization(adisc)
827{}
828
829
830PDomain TDomainDiscretization::equiDistDomain(PExampleGenerator gen)
831{
832  PDomain newDomain = mlnew TDomain();
833  newDomain->metas = gen->domain->metas;
834
835  TDomainBasicAttrStat valStats(gen);
836  const TEquiDistDiscretization &discs = dynamic_cast<TEquiDistDiscretization &>(discretization.getReference());
837
838  TVarList::iterator vi=gen->domain->variables->begin();
839  ITERATE(TDomainBasicAttrStat, si, valStats)
840    if (*si) {
841      PVariable evar=discs(*si, *vi);
842
843      newDomain->variables->push_back(evar);
844      newDomain->attributes->push_back(evar);
845      vi++;
846    }
847    else {
848      newDomain->variables->push_back(*vi);
849      newDomain->attributes->push_back(*vi);
850      vi++;
851    }
852
853  if (gen->domain->classVar) {
854    newDomain->classVar=newDomain->variables->back();
855    newDomain->attributes->erase(newDomain->attributes->end()-1);
856  }
857
858  return newDomain;
859}
860
861
862PDomain TDomainDiscretization::equiNDomain(PExampleGenerator gen, const long &weightID)
863{
864  PDomain newDomain = mlnew TDomain();
865  newDomain->metas = gen->domain->metas;
866  TDomainDistributions valDs(gen, weightID);
867
868  const TEquiNDiscretization &discs = dynamic_cast<TEquiNDiscretization &>(discretization.getReference());
869
870  TVarList::iterator vi=gen->domain->variables->begin();
871  ITERATE(TDomainDistributions, si, valDs)
872    if ((*si)->variable->varType==TValue::FLOATVAR) {
873      PVariable evar = discs(CAST_TO_CONTDISTRIBUTION(*si), *vi);
874
875      newDomain->variables->push_back(evar);
876      newDomain->attributes->push_back(evar);
877      vi++;
878    }
879    else {
880      newDomain->variables->push_back(*vi);
881      newDomain->attributes->push_back(*vi);
882      vi++;
883    }
884
885  if (gen->domain->classVar) {
886    newDomain->classVar = newDomain->variables->back();
887    newDomain->attributes->erase(newDomain->attributes->end()-1);
888  }
889
890  return newDomain;
891}
892
893
894PDomain TDomainDiscretization::otherDomain(PExampleGenerator gen, const long &weightID)
895{
896  PDomain newDomain = mlnew TDomain();
897  newDomain->metas = gen->domain->metas;
898
899  PITERATE(TVarList, vi, gen->domain->variables)
900    if ((*vi)->varType==TValue::FLOATVAR) {
901      PVariable evar=discretization->operator()(gen, *vi, weightID);
902
903      newDomain->variables->push_back(evar);
904      newDomain->attributes->push_back(evar);
905    }
906    else {
907      newDomain->variables->push_back(*vi);
908      newDomain->attributes->push_back(*vi);
909    }
910
911  if (gen->domain->classVar) {
912    newDomain->classVar=newDomain->variables->back();
913    newDomain->attributes->erase(newDomain->attributes->end()-1);
914  }
915
916  return newDomain;
917}
918
919
920PDomain TDomainDiscretization::operator()(PExampleGenerator gen, const long &weightID)
921{ checkProperty(discretization);
922
923  if (discretization.is_derived_from(TEquiDistDiscretization))
924    return equiDistDomain(gen);
925  if (discretization.is_derived_from(TEquiNDiscretization))
926    return equiNDomain(gen, weightID);
927
928  return otherDomain(gen, weightID);
929}
930
Note: See TracBrowser for help on using the repository browser.