Changeset 5056:ee2abc44151d in orange


Timestamp:
08/02/08 23:39:45
Author:
janezd <janez.demsar@…>
Branch:
default
Convert:
31726d78d798786173d827bf6bb694d848d82623
Message:
  • Added one-against-others binarization to classification trees
Location:
source/orange
Files:
6 edited

Legend:
Unchanged lines show their line number in both the old revision and r5056 (old first); lines added in r5056 are prefixed with + and show only the new line number; removed lines are prefixed with - and show only the old line number.
  • source/orange/lib_learner.cpp

    r5030 r5056
    552 552  C_CALL(TreeSplitConstructor_Attribute, TreeSplitConstructor_Measure, "([measure=, worstAcceptable=, minSubset=])")
    553 553  C_CALL(TreeSplitConstructor_ExhaustiveBinary, TreeSplitConstructor_Measure, "([measure=, worstAcceptable=, minSubset=])")
    + 554    C_CALL(TreeSplitConstructor_OneAgainstOthers, TreeSplitConstructor_Measure, "([measure=, worstAcceptable=, minSubset=])")
    554 555  C_CALL(TreeSplitConstructor_Threshold, TreeSplitConstructor_Measure, "([measure=, worstAcceptable=, minSubset=])")
    555 556  PYXTRACT_IGNORE C_CALL(TreeSplitConstructor_LR, TreeSplitConstructor, "([minSubset=])")
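
    The C_CALL above exposes the new split constructor to Python with the same constructor signature as the existing binarization constructors. A minimal sketch of constructing it from a script, assuming the class is exported under the registered name (the measure and the parameter values are only illustrative):

        import orange

        split = orange.TreeSplitConstructor_OneAgainstOthers(
            measure=orange.MeasureAttribute_gainRatio(),   # any attribute measure
            worstAcceptable=0.0,                           # keyword names follow the registered signature
            minSubset=2.0)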
  • source/orange/measures.cpp

    r4654 r5056
    111 111
    112 112    if (classDistribution->variable->varType == TValue::INTVAR) {
    - 113        dis0 = cont->discrete->front().AS(TDiscDistribution);
    - 114        dis1 = cont->discrete->back().AS(TDiscDistribution);
    + 113        dis0 = CLONE(TDiscDistribution, cont->discrete->front().AS(TDiscDistribution));
    + 114        dis1 = CLONE(TDiscDistribution, cont->discrete->back().AS(TDiscDistribution));
    115 115      con0 = con1 = NULL;
    116 116    }
    117 117    else {
    - 118        con0 = cont->discrete->front().AS(TContDistribution);
    - 119        con1 = cont->discrete->back().AS(TContDistribution);
    + 118        con0 = CLONE(TContDistribution, cont->discrete->front().AS(TContDistribution));
    + 119        con1 = CLONE(TContDistribution, cont->discrete->back().AS(TContDistribution));
    120 120      dis0 = dis1 = NULL;
    121 121    }

    437 437  }
    438 438
    + 439    int TMeasureAttribute::bestValue(PDistribution &, float &score, PContingency origContingency, PDistribution classDistribution, PDistribution apriorClass, const float &minSubset)
    + 440    {
    + 441      raiseError("bestValue is not supported by the selected attribute measure");
    + 442      return 0;
    + 443    }
    + 444
    + 445    int TMeasureAttribute::bestValue(PDistribution &, float &score, PVariable, PExampleGenerator, PDistribution apriorClass, int weightID, const float &minSubset)
    + 446    {
    + 447      raiseError("bestValue is not supported by the selected attribute measure");
    + 448      return 0;
    + 449    }
    439 450
    440 451  bool TMeasureAttribute::checkClassType(const int &varType)

    1812 1823    }
    1813 1824  }
    + 1825
    + 1826
    + 1827
    + 1828
    + 1829  int TMeasureAttribute_relief::bestValue(PDistribution &subsetSizes, float &bestScore, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID, const float &minSubset)
    + 1830  {
    + 1831    TEnumVariable *evar = var.AS(TEnumVariable);
    + 1832    if (!evar)
    + 1833      raiseError("cannot discretly binarize a continuous attribute");
    + 1834
    + 1835    const int noVal = evar->noOfValues();
    + 1836
    + 1837    float *attrDistr = NULL;
    + 1838    PSymMatrix wgain = gainMatrix(var, gen, apriorClass, weightID, NULL, &attrDistr);
    + 1839    TSymMatrix &gain = wgain.getReference();
    + 1840
    + 1841    float *gains = new float[noVal * noVal], *gi = gains, *ge;
    + 1842
    + 1843    int wins = 0;
    + 1844
    + 1845    try {
    + 1846      float thisScore = 0.0;
    + 1847      int i, j;
    + 1848      for(i = 0; i < noVal; i++)
    + 1849        for(j = 0; j < noVal; j++)
    + 1850          *gi++ = gain.getitem(i, j);
    + 1851
    + 1852      float *ai, *ae;
    + 1853      float nExamples;
    + 1854      if (!attrDistr) {
    + 1855        TDiscDistribution dd(gen, var, weightID);
    + 1856        attrDistr = new float[noVal];
    + 1857        ai = attrDistr;
    + 1858        ae = attrDistr + noVal;
    + 1859        for(vector<float>::const_iterator di(dd.distribution.begin()); ai != ae; *ai++ = *di++);
    + 1860        nExamples = dd.abs;
    + 1861      }
    + 1862      else {
    + 1863        nExamples = 0;
    + 1864        for(ai = attrDistr, ae = attrDistr + noVal; ai != ae; nExamples += *ai++);
    + 1865      }
    + 1866
    + 1867      float maxSubset = nExamples - minSubset;
    + 1868      if (maxSubset < minSubset)
    + 1869        return -1;
    + 1870
    + 1871      int bestVal = -1;
    + 1872      wins = 0;
    + 1873      bestScore = 0;
    + 1874      TRandomGenerator rgen(gen->numberOfExamples());
    + 1875      float *gi = gains;
    + 1876      ai = attrDistr;
    + 1877      for(int thisValue = 0; thisValue < noVal; thisValue++, ai++) {
    + 1878        if ((*ai < minSubset) || (*ai > maxSubset)) {
    + 1879          gi += noVal;
    + 1880          continue;
    + 1881        }
    + 1882
    + 1883        float thisScore = -2*gi[thisValue]; // have to subtract this, we'll add it once below
    + 1884        for(ge = gi + noVal; gi != ge; thisScore += *gi++);
    + 1885        if (    (!wins || (thisScore > bestScore)) && ((wins=1) == 1)
    + 1886            || (thisScore == bestScore) && rgen.randbool(++wins)) {
    + 1887          bestScore = thisScore;
    + 1888          bestVal = thisValue;
    + 1889        }
    + 1890      }
    + 1891
    + 1892      delete gains;
    + 1893      gains = NULL;
    + 1894
    + 1895      if (!wins) {
    + 1896        delete attrDistr;
    + 1897        return -1;
    + 1898      }
    + 1899
    + 1900      subsetSizes = new TDiscDistribution(2);
    + 1901      subsetSizes->addint(0, nExamples - attrDistr[bestVal]);
    + 1902      subsetSizes->addint(1, attrDistr[bestVal]);
    + 1903
    + 1904      delete attrDistr;
    + 1905      attrDistr = NULL;
    + 1906
    + 1907      return bestVal;
    + 1908    }
    + 1909    catch (...) {
    + 1910      if (gains)
    + 1911        delete gains;
    + 1912      if (attrDistr)
    + 1913        delete attrDistr;
    + 1914      throw;
    + 1915    }
    + 1916  }
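
    TMeasureAttribute_relief::bestValue scores each value of the attribute against the pairwise ReliefF gain matrix, so ReliefF should be able to drive the new split constructor through the example-based (Generator) branch added to tdidt_split.cpp below. A sketch of that pairing, assuming a data set with discrete attributes is at hand (file name and parameter values are illustrative):

        import orange

        data = orange.ExampleTable("monks-1")        # hypothetical data file with discrete attributes
        learner = orange.TreeLearner()
        learner.split = orange.TreeSplitConstructor_OneAgainstOthers(
            measure=orange.MeasureAttribute_relief(k=11, m=100),
            minSubset=5.0)
        tree = learner(data)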
  • source/orange/measures.hpp

    r5053 r5056
    100 100    virtual PIntList bestBinarization(PDistribution &, float &score, PVariable, PExampleGenerator, PDistribution apriorClass=PDistribution(), int weightID = 0, const float &minSubset = -1);
    101 101
    + 102      virtual int bestValue(PDistribution &, float &score, PContingency, PDistribution classDistribution, PDistribution apriorClass=PDistribution(), const float &minSubset = -1);
    + 103      virtual int bestValue(PDistribution &, float &score, PVariable, PExampleGenerator, PDistribution apriorClass=PDistribution(), int weightID = 0, const float &minSubset = -1);
    + 104
    102 105    virtual bool checkClassType(const int &varType);
    103 106    virtual void checkClassTypeExc(const int &varType);
  • source/orange/relief.hpp

    r4629 r5056
    104 104      PSymMatrix gainMatrix(PVariable var, PExampleGenerator gen, PDistribution, int weightID, int **attrVals, float **attrDistr);
    105 105      PIntList bestBinarization(PDistribution &subsets, float &score, PVariable var, PExampleGenerator gen, PDistribution apriorClass = PDistribution(), int weightID = 0, const float &minSubset = -1);
    + 106        int bestValue(PDistribution &subsetSizes, float &bestScore, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID, const float &minSubset);
    106 107
    107 108      void reset();
  • source/orange/tdidt_split.cpp

    r4632 r5056
    551 551
    552 552
    + 553
    + 554
    + 555
    + 556
    + 557
    + 558
    + 559  PClassifier TTreeSplitConstructor_OneAgainstOthers::operator()(
    + 560                               PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,
    + 561
    + 562                               PExampleGenerator gen, const int &weightID ,
    + 563                               PDomainContingency dcont, PDistribution apriorClass,
    + 564                               const vector<bool> &candidates,
    + 565                               PClassifier
    + 566                              )
    + 567  {
    + 568    checkProperty(measure);
    + 569    measure->checkClassTypeExc(gen->domain->classVar->varType);
    + 570
    + 571    int bestValue, wins, bestAttr;
    + 572    PVariable bvar;
    + 573
    + 574    if (measure->needs==TMeasureAttribute::Generator) {
    + 575      bool cse = candidates.size()==0;
    + 576      bool haveCandidates = false;
    + 577      vector<bool> myCandidates;
    + 578      myCandidates.reserve(gen->domain->attributes->size());
    + 579      vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
    + 580      TVarList::const_iterator vi, ve(gen->domain->attributes->end());
    + 581      for(vi = gen->domain->attributes->begin(); vi != ve; vi++) {
    + 582        bool co = (*vi)->varType == TValue::INTVAR && (!cse || (ci!=ce) && *ci);
    + 583        myCandidates.push_back(co);
    + 584        haveCandidates = haveCandidates || co;
    + 585      }
    + 586      if (!haveCandidates)
    + 587        return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
    + 588
    + 589      PDistribution thisSubsets;
    + 590      float thisQuality;
    + 591      wins = 0;
    + 592      int thisAttr = 0;
    + 593
    + 594      int N = gen->numberOfExamples();
    + 595      TSimpleRandomGenerator rgen(N);
    + 596
    + 597      ci = myCandidates.begin();
    + 598      for(vi = gen->domain->attributes->begin(); vi != ve; ci++, vi++, thisAttr++) {
    + 599        if (*ci) {
    + 600          thisSubsets = NULL;
    + 601          int thisValue = measure->bestValue(thisSubsets, thisQuality, *vi, gen, apriorClass, weightID, minSubset);
    + 602          if ((thisValue >=0)
    + 603                  && (   (!wins || (thisQuality>quality)) && ((wins=1)==1)
    + 604                      || (thisQuality==quality) && rgen.randbool(++wins))) {
    + 605              bestAttr = thisAttr;
    + 606              quality = thisQuality;
    + 607              subsetSizes = thisSubsets;
    + 608              bestValue = thisValue;
    + 609            }
    + 610        }
    + 611      }
    + 612
    + 613      if (!wins)
    + 614        return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
    + 615
    + 616      if (quality<worstAcceptable)
    + 617        return returnNothing(descriptions, subsetSizes, spentAttribute);
    + 618
    + 619      if (subsetSizes && subsetSizes->variable)
    + 620        bvar = subsetSizes->variable;
    + 621      else {
    + 622        TEnumVariable *evar = mlnew TEnumVariable("");
    + 623        const string &value = gen->domain->attributes->at(bestAttr).AS(TEnumVariable)->values->at(bestValue);
    + 624        evar->addValue(string("not ")+value);
    + 625        evar->addValue(value);
    + 626        bvar = evar;
    + 627      }
    + 628    }
    + 629
    + 630    else {
    + 631      bool cse = candidates.size()==0;
    + 632      if (!cse && noCandidates(candidates))
    + 633        return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
    + 634
    + 635      if (!dcont || dcont->classIsOuter) {
    + 636        dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));
    + 637      }
    + 638
    + 639      int N = gen ? gen->numberOfExamples() : -1;
    + 640      if (N<0)
    + 641        N = dcont->classes->cases;
    + 642      TSimpleRandomGenerator rgen(N);
    + 643
    + 644      PDistribution classDistribution = dcont->classes;
    + 645
    + 646      vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
    + 647
    + 648      TDiscDistribution *dis0, *dis1;
    + 649      TContDistribution *con0, *con1;
    + 650
    + 651      int thisAttr = 0;
    + 652      bestAttr = -1;
    + 653      wins = 0;
    + 654      quality = 0.0;
    + 655      float leftExamples, rightExamples;
    + 656
    + 657      TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    + 658      for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++) {
    + 659
    + 660        // We consider the attribute only if it is a candidate, discrete and has at least two values
    + 661        if ((cse || *(ci++)) && ((*dci)->outerVariable->varType==TValue::INTVAR) && ((*dci)->discrete->size()>=2)) {
    + 662
    + 663          const TDistributionVector &distr = *(*dci)->discrete;
    + 664
    + 665          // If the attribute is binary, we check subsetSizes and assess the quality if they are OK
    + 666          if (distr.size() == 2) {
    + 667            if ((distr.front()->abs < minSubset) || (distr.back()->abs < minSubset))
    + 668              continue; // next attribute
    + 669            else {
    + 670              float thisMeas = measure->call(thisAttr, dcont, apriorClass);
    + 671              if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
    + 672                  || ((thisMeas==quality) && rgen.randbool(++wins))) {
    + 673                bestAttr = thisAttr;
    + 674                quality = thisMeas;
    + 675                leftExamples = distr.front()->abs;
    + 676                rightExamples = distr.back()->abs;
    + 677                bestValue = 1;
    + 678              }
    + 679              continue;
    + 680            }
    + 681          }
    + 682
    + 683          int binWins = 0, binBestValue = -1;
    + 684          float binQuality = -1.0;
    + 685          float binLeftExamples = -1.0, binRightExamples = -1.0;
    + 686
    + 687          PContingency cont = prepareBinaryCheat(classDistribution, *dci, bvar, dis0, dis1, con0, con1);
    + 688          int thisValue = 0;
    + 689          const float maxSubset = (*dci)->innerDistribution->abs - minSubset;
    + 690          for(TDistributionVector::const_iterator dvi(distr.begin()), dve(distr.end()); (dvi!=dve); dvi++, thisValue++) {
    + 691            if (((*dvi)->abs < minSubset) || ((*dvi)->abs > maxSubset))
    + 692              continue;
    + 693
    + 694            float thisMeas;
    + 695
    + 696            // First for discrete classes
    + 697            if (dis0) {
    + 698              *dis0 = CAST_TO_DISCDISTRIBUTION(*dvi);
    + 699              *dis1 = CAST_TO_DISCDISTRIBUTION((*dci)->innerDistribution);
    + 700              *dis1 -= *dis0;
    + 701              thisMeas = measure->operator()(cont, classDistribution, apriorClass);
    + 702            }
    + 703            else {
    + 704              *con0 = CAST_TO_CONTDISTRIBUTION(*dvi);
    + 705              *con1 = CAST_TO_CONTDISTRIBUTION((*dci)->innerDistribution);
    + 706              *con0 -= *con1;
    + 707              thisMeas = measure->operator()(cont, classDistribution, apriorClass);
    + 708            }
    + 709
    + 710            if (   ((!binWins) || (thisMeas>binQuality)) && ((binWins=1) ==1)
    + 711                || (thisMeas==binQuality) && rgen.randbool(++binWins)) {
    + 712              binBestValue = thisValue;
    + 713              binQuality = thisMeas;
    + 714              binLeftExamples = dis0->abs;
    + 715              binRightExamples = dis1->abs;
    + 716            }
    + 717          }
    + 718
    + 719          if (       binWins
    + 720              && (   (!wins || (binQuality>quality)) && ((wins=1)==1)
    + 721                  || (binQuality==quality) && rgen.randbool(++wins))) {
    + 722            bestAttr = thisAttr;
    + 723            quality = binQuality;
    + 724            leftExamples = binLeftExamples;
    + 725            rightExamples = binRightExamples;
    + 726            bestValue = binBestValue;
    + 727          }
    + 728        }
    + 729      }
    + 730
    + 731
    + 732      if (!wins)
    + 733        return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
    + 734
    + 735      subsetSizes = mlnew TDiscDistribution();
    + 736      subsetSizes->addint(0, leftExamples);
    + 737      subsetSizes->addint(1, rightExamples);
    + 738    }
    + 739
    + 740    PVariable attribute = gen->domain->attributes->at(bestAttr);
    + 741
    + 742    if (attribute->noOfValues() == 2) {
    + 743      spentAttribute = bestAttr;
    + 744      descriptions = mlnew TStringList(attribute.AS(TEnumVariable)->values.getReference());
    + 745      return mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
    + 746    }
    + 747
    + 748    const string &bestValueS = attribute.AS(TEnumVariable)->values->at(bestValue);
    + 749    descriptions = mlnew TStringList();
    + 750    descriptions->push_back(string("not ") + bestValueS);
    + 751    descriptions->push_back(bestValueS);
    + 752
    + 753    bvar->name = gen->domain->attributes->at(bestAttr)->name;
    + 754
    + 755    TIntList *bestMapping = mlnew TIntList(attribute.AS(TEnumVariable)->values->size(), 0);
    + 756    PIntList wb = bestMapping;
    + 757    bestMapping->at(bestValue) = 1;
    + 758    spentAttribute = -1;
    + 759    return mlnew TClassifierFromVarFD(bvar, gen->domain, bestAttr, subsetSizes, mlnew TMapIntValue(bestMapping));
    + 760  }
    + 761
    + 762
    + 763
    + 764
    + 765
    + 766
    + 767
    + 768
    + 769
    + 770
    553 771  TTreeSplitConstructor_Threshold::TTreeSplitConstructor_Threshold(PMeasureAttribute meas, const float &worst, const float &aml)
    554 772  : TTreeSplitConstructor_Measure(meas, worst, aml)
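
    The new operator() labels the two branches "not <value>" and "<value>" and maps the chosen value against all the others through a TMapIntValue. A sketch of inspecting the resulting split from Python, assuming the usual TreeClassifier/TreeNode attributes (the data file name is illustrative):

        import orange

        data = orange.ExampleTable("zoo")            # hypothetical discrete data set
        learner = orange.TreeLearner()
        learner.split = orange.TreeSplitConstructor_OneAgainstOthers(
            measure=orange.MeasureAttribute_gainRatio())
        tree = learner(data)

        node = tree.tree                             # root node
        if node.branchSelector:
            print node.branchSelector.classVar.name  # attribute chosen for the split
            print node.branchDescriptions            # e.g. ['not <value>', '<value>']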
  • source/orange/tdidt_split.hpp

    r1337 r5056
    152 152  };
    153 153
    + 154    class ORANGE_API TTreeSplitConstructor_OneAgainstOthers: public TTreeSplitConstructor_Measure {
    + 155    public:
    + 156      __REGISTER_CLASS
    + 157      virtual PClassifier operator()(PStringList &descriptions,
    + 158                                    PDiscDistribution &subsetSizes,
    + 159                                    float &quality, int &spentAttribute,
    + 160
    + 161                                    PExampleGenerator, const int &weightID = 0,
    + 162                                    PDomainContingency = PDomainContingency(),
    + 163                                    PDistribution apriorClass = PDistribution(),
    + 164                                    const vector<bool> &candidates = vector<bool>(),
    + 165                                    PClassifier nodeClassifier = PClassifier()
    + 166                                   );
    + 167    };
    + 168
    154 169
    155 170  class ORANGE_API TTreeSplitConstructor_Threshold: public TTreeSplitConstructor_Measure {