source: orange/source/orange/discretize.cpp @ 11583:2e6ab05e7fed

Revision 11583:2e6ab05e7fed, 26.1 KB checked in by markotoplak, 11 months ago (diff)

Code indentation.

RevLine 
[34]1/*
2    This file is part of Orange.
[6531]3   
4    Copyright 1996-2010 Faculty of Computer and Information Science, University of Ljubljana
5    Contact: janez.demsar@fri.uni-lj.si
[34]6
[6531]7    Orange is free software: you can redistribute it and/or modify
[34]8    it under the terms of the GNU General Public License as published by
[6531]9    the Free Software Foundation, either version 3 of the License, or
[34]10    (at your option) any later version.
11
12    Orange is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
[6531]18    along with Orange.  If not, see <http://www.gnu.org/licenses/>.
[34]19*/
20
21
22#include <math.h>
23
24#include "vars.hpp"
25#include "domain.hpp"
26#include "examples.hpp"
27#include "examplegen.hpp"
[1337]28#include "getarg.hpp"
[34]29
30#include "classify.hpp"
31#include "random.hpp"
32#include "distvars.hpp"
33#include "basstat.hpp"
34#include "contingency.hpp"
35#include "transval.hpp"
36#include "classfromvar.hpp"
37
38#include "discretize.ppp"
39
40
41TEquiDistDiscretizer::TEquiDistDiscretizer(const int noi, const float fv, const float st)
42: numberOfIntervals(noi),
[281]43  firstCut(fv),
[34]44  step(st)
45{}
46
47
[281]48// Transforms the value; results is 1+floor((val.floatV-firstCut)/step); 0 if below firstCut, numberOfIntervals if above range
[34]49void TEquiDistDiscretizer::transform(TValue &val)
50{ if (val.varType!=TValue::FLOATVAR)
51    raiseError("discrete value expected");
52 
53  if (!val.isSpecial()) {
54    if (step<0)
55      raiseError("'step' not set");
[281]56    if (numberOfIntervals<1)
57      raiseError("invalid number of intervals (%i)", numberOfIntervals);
[34]58
[281]59    if ((step==0) || (numberOfIntervals==1))
[34]60      val.intV = 0;
[281]61
[34]62    else {
[281]63      val.intV = (val.floatV<firstCut) ? 0 : 1+int(floor((val.floatV-firstCut)/step));
[34]64      if (val.intV>=numberOfIntervals)
65        val.intV = numberOfIntervals-1;
66    }
67  }
68 
69  val.varType = TValue::INTVAR;
70}
71
72
73inline int numDecs(const float &diff, float &factor)
74{ if (diff>= 1.0) {
75    factor = 100.0;
76    return 2;
77  }
78  else {
79    int decs = (int)ceil(-log10(diff));
80    if (decs<2)
81      decs = 2;
82    factor = exp(decs*log(10.0));
83    return decs;
84  }
85}
86
87
[470]88inline float roundFromDecs(const int &decs)
89{ 
90  return decs <= 0 ? 100.0 : exp(decs*log(10.0));
91}
92
[34]93inline void roundToFactor(float &f, const float &factor)
94{ f = floor(f*factor+0.5)/factor; }
95
96
97string mcvt(double f, int decs)
98{ 
[49]99  char buf[64];
100  sprintf(buf, "%.*f", decs, f);
101  return buf;
[34]102}
103
104/*  Constructs a new TEnumVariable. Its values represent the intervals for values of passed variable var;
105    getValueFrom points to a classifier which gets a value of the original variable (var) and transforms it using
106    'this' transformer. */
[6782]107PVariable TEquiDistDiscretizer::constructVar(PVariable var, float mindiff)
[281]108{ 
[6782]109  mindiff = 1.0; // Ignores the given mindiff; see http://www.ailab.si/orange/trac/ticket/576
[470]110  TFloatVariable *fvar = var.AS(TFloatVariable);
111  if (!fvar)
112    raiseError("invalid attribute type (continuous attribute expected)");
113
[7665]114  TEnumVariable *evar=mlnew TEnumVariable("D_"+var->get_name());
[281]115  PVariable revar(evar);
[34]116
[841]117  evar->ordered = true;
118
[281]119  if (numberOfIntervals<2)
120    evar->addValue("C");
[34]121
[281]122  else {
123    float roundfactor;
[6782]124    int decs = numDecs(step<mindiff ? step : mindiff, roundfactor);
[34]125
[470]126    if ((fvar->adjustDecimals != 2) && (decs < fvar->numberOfDecimals)) {
127      decs = fvar->numberOfDecimals;
128      roundfactor = roundFromDecs(fvar->numberOfDecimals);
129    }
130
[281]131    roundToFactor(firstCut, roundfactor);
132    roundToFactor(step, roundfactor);
[34]133
[281]134    float f = firstCut;
135    string pval;
[34]136
[281]137    pval = mcvt(f, decs);
138    evar->addValue(string("<") + pval);
[34]139
[281]140    int steps = numberOfIntervals-2;
141    while (steps--) {
142      string s("[");
143      s += pval;
144      f += step;
145      s += ", ";
146      pval = mcvt(f, decs);
147      s += pval;
148      s += ")";
149      evar->addValue(s);
150    }
151
152    evar->addValue(string(">") + pval);
[34]153  }
154 
[281]155  TClassifierFromVar *tcfv = mlnew TClassifierFromVar(revar, var);
[3337]156  tcfv->transformUnknowns = true;
[281]157  tcfv->transformer = this; // rewrapping
158  revar->getValueFrom = tcfv;
[34]159  return revar;
160}
161
162
[1247]163void TEquiDistDiscretizer::getCutoffs(vector<float> &cutoffs) const
164{
165  cutoffs.clear();
166  for(int i = 0; i < numberOfIntervals-1; i++)
167    cutoffs.push_back(firstCut+step*i);
168}
169
[34]170
171TThresholdDiscretizer::TThresholdDiscretizer(const float &athreshold)
172: threshold(athreshold)
173{}
174
175
176void TThresholdDiscretizer::transform(TValue &val)
177{ if (!val.isSpecial())
178    val.intV = (val.floatV<=threshold) ? 0 : 1;
179  val.varType = TValue::INTVAR;
180}
181
182
[6782]183PVariable TThresholdDiscretizer::constructVar(PVariable var, float mindiff)
[281]184{ 
[6782]185  mindiff = 1.0; // Ignores the given mindiff; see http://www.ailab.si/orange/trac/ticket/576
[7665]186  TEnumVariable *evar = mlnew TEnumVariable("D_"+var->get_name());
[281]187  PVariable revar(evar);
188
[841]189  evar->ordered = true;
190
[281]191  char s[10];
192  sprintf(s, "<= %5.3f", threshold);
193  evar->values->push_back(s);
194  sprintf(s, "> %5.3f", threshold);
195  evar->values->push_back(s);
196
197  TClassifierFromVar *tcfv = mlnew TClassifierFromVar(revar, var);
[3337]198  tcfv->transformUnknowns = true;
[281]199  tcfv->transformer = this; // rewrapping
200  revar->getValueFrom = tcfv;
201  return revar;
202}
203
204
[1247]205void TThresholdDiscretizer::getCutoffs(vector<float> &cutoffs) const
206{
207  cutoffs.clear();
208  cutoffs.push_back(threshold);
209}
210
211
[281]212TBiModalDiscretizer::TBiModalDiscretizer(const float &al, const float &ah)
213: low(al),
214  high(ah)
215{}
216
217
218void TBiModalDiscretizer::transform(TValue &val)
219{ 
220  if (val.varType != TValue::FLOATVAR)
221    raiseError("continuous value expected");
222
223  if (!val.isSpecial())
224    val.intV = ((val.intV > low) && (val.intV > high)) ? 1 : 0;
225
226  val.varType = TValue::INTVAR;
227}
228
229
[6782]230PVariable TBiModalDiscretizer::constructVar(PVariable var, float mindiff)
[281]231{ 
[6782]232  mindiff = 1.0; // Ignores the given mindiff; see http://www.ailab.si/orange/trac/ticket/576
[470]233  TFloatVariable *fvar = var.AS(TFloatVariable);
234  if (!fvar)
235    raiseError("invalid attribute type (continuous attribute expected)");
236
[7665]237  TEnumVariable *evar = mlnew TEnumVariable("D_"+var->get_name());
[281]238  PVariable revar(evar);
239
[841]240  evar->ordered = true;
241
[281]242  if (high<=low)
243    raiseError("invalid interval: (%5.3f, %5.3f]", low, high);
244
245  float roundfactor;
[6782]246  if (high-low < mindiff) {
247    mindiff = high-low;
248  }
249  int decs = numDecs(mindiff, roundfactor);
[470]250
251  if ((fvar->adjustDecimals != 2) && (decs < fvar->numberOfDecimals)) {
252    decs = fvar->numberOfDecimals;
253    roundfactor = roundFromDecs(fvar->numberOfDecimals);
254  }
255
[281]256  roundToFactor(low, roundfactor);
257  roundToFactor(high, roundfactor);
258  string lstr = mcvt(low, decs);
259  string hstr = mcvt(high, decs);
260
261  evar->values->push_back("<=" + lstr + " or >" + hstr);
262  evar->values->push_back("between "+lstr+" and "+hstr);
263
264  TClassifierFromVar *tcfv = mlnew TClassifierFromVar(revar, var);
[3337]265  tcfv->transformUnknowns = true;
[281]266  tcfv->transformer = this; // rewrapping
267  revar->getValueFrom = tcfv;
268  return revar;
269}
270
271
[1247]272void TBiModalDiscretizer::getCutoffs(vector<float> &cutoffs) const
273{
274  cutoffs.clear();
275  cutoffs.push_back(low);
276  cutoffs.push_back(high);
277}
278
279
[34]280TIntervalDiscretizer::TIntervalDiscretizer()
281: points(mlnew TFloatList())
282{}
283
284
285TIntervalDiscretizer::TIntervalDiscretizer(PFloatList apoints)
286: points(apoints)
287{};
288
289
[281]290
[34]291void TIntervalDiscretizer::transform(TValue &val)
292{ checkProperty(points);
293  if (val.varType!=TValue::FLOATVAR)
294    raiseError("continuous value expected");
295
296  if (!val.isSpecial()) {
297    val.intV = 0;
298    for(TFloatList::iterator ri(points->begin()), re(points->end()); (ri!=re) && (*ri<val.floatV); ri++, val.intV++);
299  }
300
301  val.varType = TValue::INTVAR;
302}
303
304
305/*  Constructs a new TEnumVariable. Its values represent the intervals for
306    values of passed variable var; getValueFrom points to a classifier which
307    gets a value of the original variable (var) and transforms it using
308    'this' transformer. */
[6782]309PVariable TIntervalDiscretizer::constructVar(PVariable var, float mindiff )
[281]310{
[6782]311  mindiff = 1.0; // Ignores the given mindiff; see http://www.ailab.si/orange/trac/ticket/576
[470]312  TFloatVariable *fvar = var.AS(TFloatVariable);
313  if (!fvar)
314    raiseError("invalid attribute type (continuous attribute expected)");
315
[7665]316  TEnumVariable *evar=mlnew TEnumVariable("D_"+var->get_name());
[281]317  PVariable revar(evar);
[34]318
[7665]319  TEnumVariable *cl_evar=mlnew TEnumVariable("D_"+var->get_name());
[1743]320  PVariable cl_revar(cl_evar);
321
[841]322  evar->ordered = true;
323
[34]324  if (!points->size())
325    evar->addValue("C");
326
327  else {
[1337]328    TFloatList::iterator vb(points->begin()), ve(points->end()), vi;
[34]329    for(vi=vb+1; vi!=ve; vi++) {
330      float ndiff = *vi - *(vi-1);
331      if (ndiff<mindiff)
332        mindiff = ndiff;
333    }
334
335    float roundfactor;
336    int decs = numDecs(mindiff, roundfactor);
337
[470]338    if ((fvar->adjustDecimals != 2) && (decs < fvar->numberOfDecimals)) {
339      decs = fvar->numberOfDecimals;
340      roundfactor = roundFromDecs(fvar->numberOfDecimals);
341    }
342
[34]343    vi=points->begin();
344    string ostr;
345
346    roundToFactor(*vi, roundfactor);   
347    ostr = mcvt(*vi, decs);
348    evar->addValue(string("<=") + ostr);
349
350    while(++vi!=ve) {
351      string s = "(";
352      s += ostr;
353      s += ", ";
354      roundToFactor(*vi, roundfactor);
355      ostr = mcvt(*vi, decs);
356      s += ostr;
357      s += "]";
358      evar->addValue(s);
359    }
360
361    evar->addValue(string(">")+ostr);
[1743]362  } 
[34]363
[1743]364  TClassifierFromVar *tcfv = mlnew TClassifierFromVar(cl_revar, var);
[3337]365  tcfv->transformUnknowns = true;
[281]366  tcfv->transformer = this; // rewrapping
[1743]367  revar->getValueFrom = tcfv; 
[34]368  return revar;
369}
370
371
372
[1247]373void TIntervalDiscretizer::getCutoffs(vector<float> &cutoffs) const
374{
375  cutoffs = points.getReference();
376}
377
378
[34]379// Sets the number of intervals (default is 4)
380TEquiDistDiscretization::TEquiDistDiscretization(const int anumber)
381: TDiscretization(),
382  numberOfIntervals(anumber)
383{}
384
385
[281]386// Sets the firstCut and step according to the min and max fields of valStat.
[34]387PVariable TEquiDistDiscretization::operator()(PBasicAttrStat valStat, PVariable var) const
[281]388{ float step = (valStat->max-valStat->min)/numberOfIntervals;
389  PEquiDistDiscretizer discretizer = mlnew TEquiDistDiscretizer(numberOfIntervals, valStat->min+step, step);
390  return discretizer->constructVar(var);
[34]391}
392
393
[281]394// Sets the firstCut and step according to the range of values that occur in gen for variable var.
[34]395PVariable TEquiDistDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &)
396{ if (var->varType!=TValue::FLOATVAR)
[7665]397    raiseError("attribute '%s' is not continuous", var->get_name().c_str());
[34]398
[281]399  if (numberOfIntervals<=0)
400    raiseError("invalid number of intervals (%i)", numberOfIntervals);
401
[34]402  int varPos=gen->domain->getVarNum(var);
403
404  TExampleIterator first(gen->begin());
405  while( first && (*first)[varPos].isSpecial() )
406    ++first;
407  if (!first)
[7665]408    raiseError("attribute '%s' has no known values", var->get_name().c_str());
[34]409
410  float max, min;
411  max = min = (*first)[varPos].floatV;
412  while (++first)
413    if (!(*first)[varPos].isSpecial()) {
414      float val = (*first)[varPos].floatV;
415      if (val>max)
416        max = val;
417      if (val<min)
418        min = val;
419    };
420
[281]421  float step = (max-min)/numberOfIntervals;
422  PEquiDistDiscretizer discretizer = mlnew TEquiDistDiscretizer(numberOfIntervals, min+step, step);
423  return discretizer->constructVar(var);
[34]424}
425
426
427
428TFixedDiscretization::TFixedDiscretization(TFloatList &pts)
429: points(mlnew TFloatList(pts))
430{}
431
432
433TFixedDiscretization::TFixedDiscretization(const string &boundaries)
434: points()
[1337]435{ vector<string> atoms;
[34]436  string2atoms(boundaries, atoms);
437  points = mlnew TFloatList(atoms.size());
438  TFloatList::iterator pi(points->begin());
439  ITERATE(vector<string>, ai, atoms) {
440    sscanf((*ai).c_str(), "%f", &*pi);
441    if ((pi!=points->begin()) && (*pi<=pi[-1]))
442      raiseError("mismatch in cut-off points");
443    pi++;
444  }
445}
446
447
448PVariable TFixedDiscretization::operator ()(PExampleGenerator, PVariable var, const long &)
449{ PIntervalDiscretizer discretizer = mlnew TIntervalDiscretizer (mlnew TFloatList(points));
[281]450  return discretizer->constructVar(var);
[34]451}
452
453
454
455TEquiNDiscretization::TEquiNDiscretization(int anumber)
456: numberOfIntervals(anumber),
457  recursiveDivision(true)
458{}
459
460
461PVariable TEquiNDiscretization::operator()(const TContDistribution &distr, PVariable var) const
[5074]462{ 
463  PIntervalDiscretizer discretizer=mlnew TIntervalDiscretizer;
[6782]464  float mindiff;
[5074]465 
466  if (distr.size() <= numberOfIntervals) {
[6782]467    cutoffsByMidpoints(discretizer, distr, mindiff);
[5074]468  }
469  else if (recursiveDivision && false) { // XXX remove when the routine is finished
[6782]470    cutoffsByDivision(discretizer, distr, mindiff);
[5074]471  }
472  else {
[6782]473    cutoffsByCounting(discretizer, distr, mindiff);
[5074]474  }
[34]475
[6782]476  return discretizer->constructVar(var, mindiff);
[34]477}
478
[6782]479void TEquiNDiscretization::cutoffsByMidpoints(PIntervalDiscretizer discretizer, const TContDistribution &distr, float &mindiff) const
[5074]480{
[6782]481  mindiff = 1.0;
[5074]482  TContDistribution::const_iterator cdi(distr.begin()), cde(distr.end());
483  if (cdi!=cde) {
484    float prev = (*cdi).first;
485    while (++cdi != cde) {
486      discretizer->points->push_back((prev+(*cdi).first)/2.0);
[6782]487      if (((*cdi).first - prev) < mindiff) {
488          mindiff = (*cdi).first - prev;
489      }
[5074]490    }
491  }
492}
493
[6782]494void TEquiNDiscretization::cutoffsByCounting(PIntervalDiscretizer discretizer, const TContDistribution &distr, float &mindiff) const
[34]495{
496  if (numberOfIntervals<=0)
497    raiseError("invalid number of intervals (%i)", numberOfIntervals);
498
[6782]499  mindiff = 1.0;
[34]500  float N = distr.abs;
501  int toGo = numberOfIntervals;
502  float inthis = 0, prevel = -1; // initialized to avoid warnings
503  float inone = N/toGo;
504
[4449]505  for(map<float, float>::const_iterator db(distr.begin()), di(db), de(distr.end()), ni; (toGo>1) && (di!=de); di++) {
[34]506    inthis += (*di).second;
507    if ((inthis<inone) || (di==db))
508      prevel = (*di).first;
509    else {
[4449]510      ni = di; ni++;
511      if ((ni!=de) && (inthis - inone < (*di).second / 2)) {
512        discretizer->points->push_back( ((*ni).first + (*di).first) /2);
[6782]513        if ((*ni).first - (*di).first < mindiff) {
514          mindiff = (*ni).first - (*di).first;
515        }
[4449]516        N -= inthis;
517        inthis = 0;
[4655]518        prevel = (*ni).first;
[4449]519      }
520      else {
521        discretizer->points->push_back( (prevel + (*di).first) / 2);
[6782]522        if ((*di).first - prevel < mindiff) {
523          mindiff = (*di).first - prevel;
524        }
[4449]525        N -= (inthis - ((*di).second));
526        inthis = (*di).second;
[4655]527        prevel = (*di).first;
[4449]528      }
[34]529      if (--toGo) 
530        inone = N/toGo;
531    }
532  }
533}
534
535
[6782]536void TEquiNDiscretization::cutoffsByDivision(PIntervalDiscretizer discretizer, const TContDistribution &distr, float &mindiff) const
537{ cutoffsByDivision(numberOfIntervals, discretizer->points.getReference(), distr.begin(), distr.end(), distr.abs, mindiff); }
[34]538
539
540void TEquiNDiscretization::cutoffsByDivision(const int &, TFloatList &, 
541                                            map<float, float>::const_iterator, map<float, float>::const_iterator,
[6782]542                                            const float &, float &) const
[34]543{ /*XXX to be finished
544
545  if (noInt & 1) {
546    if (noInt & 2) {
547      noIntLeft = (noInt-1)/2;
548      noIntRight = (noInt+1)/2;
549    }
550    else {
551      noIntLeft = (noInt+1)/2;
552      noIntRight = (noInt+1)/2;
553    }
554
555    float Nleft = N * noIntLeft / (noIntLeft + noIntRight);
556    float Nright = N - Nleft;
557
558    if ((Nleft<1) || (Nright<1))
559      return; // should set a cut-off, but couldn't -- N=1...
560
561    map<float, float>::const_iterator fii = fbeg;
562    while ((Nn<Nleft) && (fii!=fend))
563      Nn += (*fii).second;
564    Nn -= (*fii).second;
565
566    if (fii==fend) {
567    }
568
569  }
570  else {
571    float N2 = N/2, Nn = 0.0;
572    if (N2<1)
573      return; // should set a cut-off, but couldn't -- N=1...
574
575    map<float, float>::const_iterator fii = fbeg;
576    while ((Nn<N2) && (fii!=fend))
577      Nn += (*fii).second;
578    Nn -= (*fii).second;
579
580    if (fii==fend) {
581      fii--;
582      if (fii==fbeg)
583        return; // should set a cut-off, but there's only one value
584      else {
585        map<float, float>::const_iterator fjj = fii;
586        fjj--;
587        points.push_back(((*fjj).first + (*fii).first) / 2.0);
588        return;
589      }
590    }
591
592    if (noInt>2) {
593      cutoffsByDivision(noInt/2, points, fbeg, fii, Nn);
594
595      map<float, float>::const_iterator fjj = fii;
596      fjj--;
597      points.push_back(((*fjj).first + (*fii).first) / 2.0);
598     
599      cutoffsByDivision(noInt/2, points, fii, fend, N-Nn);
600    }
601  }*/
602}
603
604PVariable TEquiNDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
605{ if (var->varType!=TValue::FLOATVAR)
[7665]606    raiseError("attribute '%s' is not continuous", var->get_name().c_str());
[34]607
608  int varPos=gen->domain->getVarNum(var);
609
610  TExampleIterator first(gen->begin());
611  while(first && (*first)[varPos].isSpecial() )
612    ++first;
613
614  if (!first)
[7665]615    raiseError("attribute '%s' has no known values.", var->get_name().c_str());
[34]616
617  TContDistribution distr(var);
618  do {
619    TValue &val=(*first)[varPos];
620    if (!val.isSpecial())
621      distr.addfloat(float(val), WEIGHT(*first));
622  } while (++first);
623
624  return operator()(distr, var);
625}
626
627
628
[308]629// Defined in measures.cpp
[34]630float getEntropy(const vector<float> &);
631
632
633TEntropyDiscretization::TEntropyDiscretization()
[281]634: maxNumberOfIntervals(0),
635  forceAttribute(false)
[34]636{}
637
638
639PVariable TEntropyDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
640{ if (!gen->domain->classVar)
641    raiseError("class-less domain");
642
643  if (gen->domain->classVar!=TValue::INTVAR)
[7665]644    raiseError("class '%s' is not discrete", gen->domain->classVar->get_name().c_str());
[34]645
646  if (var->varType!=TValue::FLOATVAR)
[7665]647    raiseError("attribute '%s' is not continuous", var->get_name().c_str());
[34]648
649  int varPos=gen->domain->getVarNum(var);
650
651  TS S;
652  TDiscDistribution all;
653
654  PEITERATE(ei, gen) {
655    TValue &val = (*ei)[varPos];
656    if (!val.isSpecial()) {
[131]657        const TValue &eclass = (*ei).getClass();
[34]658      if (!eclass.isSpecial()) {
659        float weight = WEIGHT(*ei);
660        S[float(val)].addint(int(eclass), weight);
661          all.addint(int(eclass), weight);
662      }
663    }
664  }
665
[281]666  /* No need to initialize seed by number of examples.
667     Different number will obviously result in different decisions. */
668  TSimpleRandomGenerator rgen;
[59]669  return operator()(S, all, var, weightID, rgen);
[34]670}
671
672
[59]673PVariable TEntropyDiscretization::operator()(const TS &S, const TDiscDistribution &all, PVariable var, const long &, TSimpleRandomGenerator &rgen) const
[34]674{
675  int k=0;
676  const_ITERATE(TDiscDistribution, ci, all)
677    if (*ci>0)
678      k++;
679
680  if (!k)
[7665]681    raiseError("no examples or all values of attribute '%s' are unknown", var->get_name().c_str());
[34]682
[6782]683  float mindiff = 1.0;
[34]684
685  vector<pair<float, float> > points;
[6782]686  divide(S.begin(), S.end(), all, float(getEntropy(all)), k, points, rgen, mindiff);
[34]687
[281]688  /* This is not correct: if, for instance, we have two cut-off points we should always remove
689     the one that was added later... */
[34]690  if ((maxNumberOfIntervals>0) && (int(points.size())+1>maxNumberOfIntervals)) {
[281]691    random_sort(points.begin(), points.end(), predOn2nd<pair<float, float>, less<float> >(), predOn2nd<pair<float, float>, equal_to<float> >(), rgen);
[34]692    points.erase(points.begin()+maxNumberOfIntervals-1, points.end());
693    sort(points.begin(), points.end(), predOn1st<pair<float, float>, less<float> >());
694  }
695   
696  PIntervalDiscretizer discretizer = mlnew TIntervalDiscretizer();
697  TFloatList &dpoints = dynamic_cast<TFloatList &>(discretizer->points.getReference());
698  if (points.size()) {
699    vector<pair<float, float> >::const_iterator fi(points.begin()), fe(points.end());
700    discretizer->points->push_back((*(fi++)).first);
701    for(; fi!=fe; fi++)
702      if ((*fi).first != dpoints.back())
703        discretizer->points->push_back((*fi).first);
704  }
705
[6782]706  return discretizer->constructVar(var, mindiff);
[34]707}
708
709
710void TEntropyDiscretization::divide(
711  const TS::const_iterator &first, const TS::const_iterator &last,
712    const TDiscDistribution &distr, float entropy, int k,
[59]713  vector<pair<float, float> > &points,
[6782]714  TSimpleRandomGenerator &rgen,
715  float &mindiff) const
[34]716{
717  TDiscDistribution S1dist, S2dist = distr, bestS1, bestS2;
718  float bestE = -1.0;
719  float N = distr.abs;
720  int wins = 0;
721  TS::const_iterator Ti = first, bestT;
722  for(; Ti!=last; Ti++) {
723    S1dist += (*Ti).second;
724    S2dist -= (*Ti).second;
725    if (S2dist.abs==0)
726      break;
727
[11583]728    float entro1 = S1dist.abs*float(getEntropy(S1dist))/N;
729    float entro2 = S2dist.abs*float(getEntropy(S2dist))/N;
730    float E = entro1+entro2;
[34]731    if (   (!wins || (E<bestE)) && ((wins=1)==1)
[59]732        || (E==bestE) && rgen.randbool(++wins)) {
[34]733      bestS1 = S1dist;
734      bestS2 = S2dist;
735      bestE = E;
736      bestT = Ti;
737    }
738  }
739
740  if (!wins)
741    return;
742
743  int k1 = 0, k2 = 0;
744  ITERATE(TDiscDistribution, ci1, bestS1)
745    if (*ci1>0)
746      k1++;
747  ITERATE(TDiscDistribution, ci2, bestS2)
748    if (*ci2>0)
749      k2++;
750
751  float entropy1 = float(getEntropy(bestS1));
752  float entropy2 = float(getEntropy(bestS2));
753
754  float MDL =  log(float(N-1))/log(2.0)/N
755             + (log(exp(k*log(3.0))-2)/log(2.0) - (k*entropy - k1*entropy1 - k2*entropy2))/N;
756  float gain = entropy-bestE;
757
758  float cutoff = (*bestT).first;
759  bestT++;
760
[6782]761  if ((*bestT).first - cutoff < mindiff) {
762     mindiff = (*bestT).first - cutoff;
763  }
764
[34]765//  cout << cutoff << ", info gain=" << gain << ", MDL=" << MDL << endl;
766  if (gain>MDL) {
767    if ((k1>1) && (first!=bestT))
[6782]768      divide(first, bestT, bestS1, entropy1, k1, points, rgen, mindiff);
[34]769
770    points.push_back(pair<float, float>(cutoff, gain-MDL));
771
772    if ((k2>1) && (bestT!=last))
[6782]773      divide(bestT, last, bestS2, entropy2, k2, points, rgen, mindiff);
[34]774  }
[281]775  else if (forceAttribute && !points.size())
776    points.push_back(pair<float, float>(cutoff, gain-MDL));
[34]777}
778
779
780template<class T> inline T sqr(const T &t)
781{ return t*t; }
782
[281]783
784TBiModalDiscretization::TBiModalDiscretization(const bool sit)
785: splitInTwo(sit)
786{}
787
788
[34]789PVariable TBiModalDiscretization::operator()(PExampleGenerator gen, PVariable var, const long &weightID)
790{ if (var->varType!=TValue::FLOATVAR)
[7665]791    raiseError("attribute '%s' is not continuous", var->get_name().c_str());
[34]792  if (gen->domain->classVar!=TValue::INTVAR)
[7665]793    raiseError("class '%s' is not discrete", gen->domain->classVar->get_name().c_str());
[34]794 
795  TContingencyAttrClass ccont(gen, var, weightID);
796  int nClasses = gen->domain->classVar->noOfValues();
797  float best1, best2;
[281]798  float bestEval = -99999;
[34]799
800  PDistribution classDist = getClassDistribution(gen, weightID);
801  TDiscDistribution &totDist = dynamic_cast<TDiscDistribution &>(classDist.getReference());
802  totDist.normalize();
803
[281]804  // middle will contain sum of distributions from cut1 (exclusive) to cut2 (inclusive)
805  for(TDistributionMap::iterator cut1(ccont.continuous->begin()), cute(ccont.continuous->end()); cut1!=cute; cut1++) {
806    TDiscDistribution middle(nClasses);
[34]807
[281]808    TDistributionMap::iterator cut2 = cut1;
809    for(cut2++; cut2!=cute; cut2++) {
810      middle += (*cut2).second;
[34]811
[281]812      float chisq = 0.0;
813      float tabs = middle.abs;
814      int N = nClasses;
815      for(TDiscDistribution::const_iterator toti = totDist.begin(), midi = middle.begin();  N--; toti++, midi++) {
816        const float E = tabs**toti;
817        const float &n = *midi;
818        chisq += sqr( fabs(E - n) - 0.5 ) / E;
[34]819      }
820
[281]821      if (chisq > bestEval) {
822        bestEval = chisq;
823        best1 = (*cut1).first;
824        best2 = (*cut2).first;
825      }
[34]826    }
827  }
828
[281]829  PDiscretizer discretizer;
830
831  if (splitInTwo)
832    discretizer = mlnew TBiModalDiscretizer(best1, best2);
833
834  else {
835    TIntervalDiscretizer *idisc = mlnew TIntervalDiscretizer;
836    discretizer = idisc;
837    idisc->points->push_back(best1);
838    idisc->points->push_back(best2);
839  }
840
841  return discretizer->constructVar(var);
[34]842}
843 
844 
845
846TDomainDiscretization::TDomainDiscretization(PDiscretization adisc)
847: discretization(adisc)
848{}
849
850
851PDomain TDomainDiscretization::equiDistDomain(PExampleGenerator gen)
852{
853  PDomain newDomain = mlnew TDomain();
[131]854  newDomain->metas = gen->domain->metas;
[34]855
856  TDomainBasicAttrStat valStats(gen);
857  const TEquiDistDiscretization &discs = dynamic_cast<TEquiDistDiscretization &>(discretization.getReference());
858
859  TVarList::iterator vi=gen->domain->variables->begin();
860  ITERATE(TDomainBasicAttrStat, si, valStats)
861    if (*si) {
862      PVariable evar=discs(*si, *vi);
863
864      newDomain->variables->push_back(evar);
865      newDomain->attributes->push_back(evar);
866      vi++;
867    }
868    else {
869      newDomain->variables->push_back(*vi);
870      newDomain->attributes->push_back(*vi);
871      vi++;
872    }
873
874  if (gen->domain->classVar) {
875    newDomain->classVar=newDomain->variables->back();
876    newDomain->attributes->erase(newDomain->attributes->end()-1);
877  }
878
879  return newDomain;
880}
881
882
883PDomain TDomainDiscretization::equiNDomain(PExampleGenerator gen, const long &weightID)
884{
885  PDomain newDomain = mlnew TDomain();
[131]886  newDomain->metas = gen->domain->metas;
[34]887  TDomainDistributions valDs(gen, weightID);
888
889  const TEquiNDiscretization &discs = dynamic_cast<TEquiNDiscretization &>(discretization.getReference());
890
891  TVarList::iterator vi=gen->domain->variables->begin();
892  ITERATE(TDomainDistributions, si, valDs)
893    if ((*si)->variable->varType==TValue::FLOATVAR) {
894      PVariable evar = discs(CAST_TO_CONTDISTRIBUTION(*si), *vi);
895
896      newDomain->variables->push_back(evar);
897      newDomain->attributes->push_back(evar);
898      vi++;
899    }
900    else {
901      newDomain->variables->push_back(*vi);
902      newDomain->attributes->push_back(*vi);
903      vi++;
904    }
905
906  if (gen->domain->classVar) {
907    newDomain->classVar = newDomain->variables->back();
908    newDomain->attributes->erase(newDomain->attributes->end()-1);
909  }
910
911  return newDomain;
912}
913
914
915PDomain TDomainDiscretization::otherDomain(PExampleGenerator gen, const long &weightID)
916{
917  PDomain newDomain = mlnew TDomain();
[131]918  newDomain->metas = gen->domain->metas;
[34]919
920  PITERATE(TVarList, vi, gen->domain->variables)
921    if ((*vi)->varType==TValue::FLOATVAR) {
922      PVariable evar=discretization->operator()(gen, *vi, weightID);
923
924      newDomain->variables->push_back(evar);
925      newDomain->attributes->push_back(evar);
926    }
927    else {
928      newDomain->variables->push_back(*vi);
929      newDomain->attributes->push_back(*vi);
930    }
931
932  if (gen->domain->classVar) {
933    newDomain->classVar=newDomain->variables->back();
934    newDomain->attributes->erase(newDomain->attributes->end()-1);
935  }
936
937  return newDomain;
938}
939
940
941PDomain TDomainDiscretization::operator()(PExampleGenerator gen, const long &weightID)
942{ checkProperty(discretization);
943
944  if (discretization.is_derived_from(TEquiDistDiscretization))
945    return equiDistDomain(gen);
946  if (discretization.is_derived_from(TEquiNDiscretization))
947    return equiNDomain(gen, weightID);
948
949  return otherDomain(gen, weightID);
950}
951
Note: See TracBrowser for help on using the repository browser.