Changeset 9312:1dd9b1794297 in orange


Ignore:
Timestamp:
12/06/11 13:20:11 (2 years ago)
Author:
markotoplak
Branch:
default
Convert:
e2350830f4445b129977f86a151f35fac558e856
Message:

Random forest uses SimpleTreeLearner by default.

Location:
orange
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • orange/Orange/ensemble/forest.py

    r9311 r9312  
    5151    defaults. 
    5252         
     53    :param trees: number of trees in the forest. 
     54    :type trees: int 
     55    :param attributes: number of randomly drawn features among 
     56            which to select the best to split the nodes in tree 
     57            induction. The default, None, means the square root of 
     58            the number of features in the training data. Ignored if 
     59            :obj:`learner` is specified. 
     60    :type attributes: int 
     61    :param base_learner: A base tree learner. The base learner will be 
     62        randomized with Random Forest's random 
     63        feature subset selection.  If None (default), 
     64        :class:`~Orange.classification.tree.TreeLearner` with Gini index 
     65        or MSE for attribute scoring will be used, and it will not split 
     66        nodes with less than 5 data instances. 
     67    :type base_learner: None or :class:`Orange.classification.tree.TreeLearner` 
     68    :param rand: random generator used in bootstrap sampling. If None (default),  
     69        then ``random.Random(0)`` is used. 
     70    :param learner: Tree induction learner. If `"fast"` (default), 
     71        :obj:`~Orange.classification.tree.SimpleTreeLearner` with 
     72        random feature subset selection will be used.  If `None`,  
     73        the :obj:`base_learner` will be used (and randomized). If 
     74        :obj:`learner` is specified, it will be used as such 
     75        with no additional transformations. 
     76    :type learner: None or :class:`Orange.core.Learner` 
     77    :param callback: a function to be called after every iteration of 
     78            induction of classifier. This is called with parameter  
     79            (from 0.0 to 1.0) that gives estimates on learning progress. 
     80    :param name: name of the learner. 
     81    :type name: string 
     82    :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or  
     83            :class:`~Orange.ensemble.forest.RandomForestLearner` 
     84    """ 
     85 
     86    def __new__(cls, instances=None, weight = 0, **kwds): 
     87        self = orange.Learner.__new__(cls, **kwds) 
     88        if instances: 
     89            self.__init__(**kwds) 
     90            return self.__call__(instances, weight) 
     91        else: 
     92            return self 
     93 
     94    def __init__(self, trees=100, attributes=None,\ 
     95                    name='Random Forest', rand=None, callback=None, base_learner=None, learner="fast"): 
     96        self.trees = trees 
     97        self.name = name 
     98        self.attributes = attributes 
     99        self.callback = callback 
     100        self.rand = rand 
     101 
     102        self.base_learner = base_learner 
     103 
     104        if base_learner != None and learner not in [ None, "fast" ]: 
     105            wrongSpecification() 
     106        elif base_learner != None: 
     107            learner = None   #build with base_learner 
     108 
     109        if not self.rand: 
     110            self.rand = random.Random(0) 
     111 
     112        self.randorange = Orange.core.RandomGenerator(self.rand.randint(0,2**31-1)) 
     113 
     114        if learner == "fast": 
     115            self.learner = SimpleTreeLearnerSetProb(min_instances=5, random_generator=self.randorange) 
     116        elif learner == None: 
     117            self.learner = _default_small_learner(self.attributes, self.rand, base=self.base_learner) 
     118        else: 
     119            self.learner = learner 
     120             
     121        self.randstate = self.rand.getstate() #original state 
     122 
     123    def __call__(self, instances, weight=0): 
     124        """ 
     125        Learn from the given table of data instances. 
     126         
     127        :param instances: learning data. 
     128        :type instances: :class:`Orange.data.Table` 
     129        :param weight: weight. 
     130        :type weight: int 
     131        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier` 
     132        """ 
     133        self.rand.setstate(self.randstate) #when learning again, set the same state 
     134        self.randorange.reset()         
     135 
     136        if "attributes" in self.learner.__dict__: 
     137            self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes == None else self.attributes 
     138 
     139        learner = self.learner 
     140 
     141        n = len(instances) 
     142        # build the forest 
     143        classifiers = [] 
     144        for i in range(self.trees): 
     145            # draw bootstrap sample 
     146            selection = [] 
     147            for j in range(n): 
     148                selection.append(self.rand.randrange(n)) 
     149            data = instances.get_items_ref(selection) 
     150            # build the model from the bootstrap sample 
     151            classifiers.append(learner(data, weight)) 
     152            if self.callback: 
     153                self.callback() 
     154            # if self.callback: self.callback((i+1.)/self.trees) 
     155 
     156        return RandomForestClassifier(classifiers = classifiers, name=self.name,\ 
     157                    domain=instances.domain, class_var=instances.domain.class_var) 
     158  
     159 
     160class RandomForestClassifier(orange.Classifier): 
     161    """ 
     162    Uses the trees induced by the :obj:`RandomForestLearner`. An input 
     163    instance is classified into the class with the most frequent vote. 
     164    However, this implementation returns the averaged probabilities from 
     165    each of the trees if class probability is requested. 
     166 
     167    When constructed manually, the following parameters have to 
     168    be passed: 
     169 
     170    :param classifiers: a list of classifiers to be used. 
     171    :type classifiers: list 
     172     
     173    :param name: name of the resulting classifier. 
     174    :type name: str 
     175     
     176    :param domain: the domain of the learning set. 
     177    :type domain: :class:`Orange.data.Domain` 
     178     
     179    :param class_var: the class feature. 
     180    :type class_var: :class:`Orange.data.variable.Variable` 
     181 
     182    """ 
     183    def __init__(self, classifiers, name, domain, class_var, **kwds): 
     184        self.classifiers = classifiers 
     185        self.name = name 
     186        self.domain = domain 
     187        self.class_var = class_var 
     188        self.__dict__.update(kwds) 
     189 
     190    def __call__(self, instance, resultType = orange.GetValue): 
     191        """ 
     192        :param instance: instance to be classified. 
     193        :type instance: :class:`Orange.data.Instance` 
     194         
     195        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \ 
     196              :class:`Orange.classification.Classifier.GetProbabilities` or 
     197              :class:`Orange.classification.Classifier.GetBoth` 
     198         
     199        :rtype: :class:`Orange.data.Value`,  
     200              :class:`Orange.statistics.Distribution` or a tuple with both 
     201        """ 
     202        from operator import add 
     203         
     204        # handle discrete class 
     205         
     206        if self.class_var.var_type == Orange.data.variable.Discrete.Discrete: 
     207         
     208            # voting for class probabilities 
     209            if resultType == orange.GetProbabilities or resultType == orange.GetBoth: 
     210                prob = [0.] * len(self.domain.class_var.values) 
     211                for c in self.classifiers: 
     212                    a = [x for x in c(instance, orange.GetProbabilities)] 
     213                    prob = map(add, prob, a) 
     214                norm = sum(prob) 
     215                cprob = Orange.statistics.distribution.Discrete(self.class_var) 
     216                for i in range(len(prob)): 
     217                    cprob[i] = prob[i]/norm 
     218                 
     219            # voting for crisp class membership, notice that 
     220            # this may not be the same class as one obtaining the 
     221            # highest probability through probability voting 
     222            if resultType == orange.GetValue or resultType == orange.GetBoth: 
     223                cfreq = [0] * len(self.domain.class_var.values) 
     224                for c in self.classifiers: 
     225                    cfreq[int(c(instance))] += 1 
     226                index = cfreq.index(max(cfreq)) 
     227                cvalue = Orange.data.Value(self.domain.class_var, index) 
     228     
     229            if resultType == orange.GetValue: return cvalue 
     230            elif resultType == orange.GetProbabilities: return cprob 
     231            else: return (cvalue, cprob) 
     232         
     233        else: 
     234            # Handle continuous class 
     235         
     236            # voting for class probabilities 
     237            if resultType == orange.GetProbabilities or resultType == orange.GetBoth: 
     238                probs = [c(instance, orange.GetBoth) for c in self.classifiers] 
     239                cprob = dict() 
     240                for val,prob in probs: 
     241                    if prob != None: #no probability output 
     242                        a = dict(prob.items()) 
     243                    else: 
     244                        a = { val.value : 1. } 
     245                    cprob = dict( (n, a.get(n, 0)+cprob.get(n, 0)) for n in set(a)|set(cprob) ) 
     246                cprob = Orange.statistics.distribution.Continuous(cprob) 
     247                cprob.normalize() 
     248                 
     249            # gather average class value 
     250            if resultType == orange.GetValue or resultType == orange.GetBoth: 
     251                values = [c(instance).value for c in self.classifiers] 
     252                cvalue = Orange.data.Value(self.domain.class_var, sum(values) / len(self.classifiers)) 
     253             
     254            if resultType == orange.GetValue: return cvalue 
     255            elif resultType == orange.GetProbabilities: return cprob 
     256            else: return (cvalue, cprob) 
     257             
     258    def __reduce__(self): 
     259        return type(self), (self.classifiers, self.name, self.domain, self.class_var), dict(self.__dict__) 
     260 
     261### MeasureAttribute_randomForests 
     262 
     263class ScoreFeature(orange.MeasureAttribute): 
     264    """ 
    53265    :param trees: number of trees in the forest. 
    54266    :type trees: int 
     
    73285        with no additional transformations. 
    74286    :type learner: None or :class:`Orange.core.Learner` 
    75     :param callback: a function to be called after every iteration of 
    76             induction of classifier. This is called with parameter  
    77             (from 0.0 to 1.0) that gives estimates on learning progress. 
    78     :param name: name of the learner. 
    79     :type name: string 
    80     :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or  
    81             :class:`~Orange.ensemble.forest.RandomForestLearner` 
    82     """ 
    83  
    84     def __new__(cls, instances=None, weight = 0, **kwds): 
    85         self = orange.Learner.__new__(cls, **kwds) 
    86         if instances: 
    87             self.__init__(**kwds) 
    88             return self.__call__(instances, weight) 
    89         else: 
    90             return self 
    91  
    92     def __init__(self, trees=100, attributes=None,\ 
    93                     name='Random Forest', rand=None, callback=None, base_learner=None, learner="fast"): 
    94         self.trees = trees 
    95         self.name = name 
    96         self.attributes = attributes 
    97         self.callback = callback 
    98         self.rand = rand 
    99  
    100         self.base_learner = base_learner 
    101  
    102         if base_learner != None and learner not in [ None, "orig", "fast" ]: 
    103             wrongSpecification() 
    104         elif base_learner != None or learner == "orig": 
    105             learner = None   #build with base_learner 
    106  
    107         if not self.rand: 
    108             self.rand = random.Random(0) 
    109  
    110         if learner == "fast": 
    111             self.learner = SimpleTreeLearnerSetProb(min_instances=5) 
    112         elif learner == None: 
    113             self.learner = _default_small_learner(self.attributes, self.rand, base=self.base_learner) 
    114         else: 
    115             self.learner = learner 
    116              
    117         self.randstate = self.rand.getstate() #original state 
    118  
    119     def __call__(self, instances, weight=0): 
    120         """ 
    121         Learn from the given table of data instances. 
    122          
    123         :param instances: learning data. 
    124         :type instances: :class:`Orange.data.Table` 
    125         :param weight: weight. 
    126         :type weight: int 
    127         :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier` 
    128         """ 
    129         self.rand.setstate(self.randstate) #when learning again, set the same state 
    130  
    131         if "attributes" in self.learner.__dict__: 
    132             self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes == None else self.attributes 
    133  
    134         learner = self.learner 
    135  
    136         n = len(instances) 
    137         # build the forest 
    138         classifiers = [] 
    139         for i in range(self.trees): 
    140             # draw bootstrap sample 
    141             selection = [] 
    142             for j in range(n): 
    143                 selection.append(self.rand.randrange(n)) 
    144             data = instances.get_items_ref(selection) 
    145             # build the model from the bootstrap sample 
    146             classifiers.append(learner(data, weight)) 
    147             if self.callback: 
    148                 self.callback() 
    149             # if self.callback: self.callback((i+1.)/self.trees) 
    150  
    151         return RandomForestClassifier(classifiers = classifiers, name=self.name,\ 
    152                     domain=instances.domain, class_var=instances.domain.class_var) 
    153   
    154  
    155 class RandomForestClassifier(orange.Classifier): 
    156     """ 
    157     Uses the trees induced by the :obj:`RandomForestLearner`. An input 
    158     instance is classified into the class with the most frequent vote. 
    159     However, this implementation returns the averaged probabilities from 
    160     each of the trees if class probability is requested. 
    161  
    162     When constructed manually, the following parameters have to 
    163     be passed: 
    164  
    165     :param classifiers: a list of classifiers to be used. 
    166     :type classifiers: list 
    167      
    168     :param name: name of the resulting classifier. 
    169     :type name: str 
    170      
    171     :param domain: the domain of the learning set. 
    172     :type domain: :class:`Orange.data.Domain` 
    173      
    174     :param class_var: the class feature. 
    175     :type class_var: :class:`Orange.data.variable.Variable` 
    176  
    177     """ 
    178     def __init__(self, classifiers, name, domain, class_var, **kwds): 
    179         self.classifiers = classifiers 
    180         self.name = name 
    181         self.domain = domain 
    182         self.class_var = class_var 
    183         self.__dict__.update(kwds) 
    184  
    185     def __call__(self, instance, resultType = orange.GetValue): 
    186         """ 
    187         :param instance: instance to be classified. 
    188         :type instance: :class:`Orange.data.Instance` 
    189          
    190         :param result_type: :class:`Orange.classification.Classifier.GetValue` or \ 
    191               :class:`Orange.classification.Classifier.GetProbabilities` or 
    192               :class:`Orange.classification.Classifier.GetBoth` 
    193          
    194         :rtype: :class:`Orange.data.Value`,  
    195               :class:`Orange.statistics.Distribution` or a tuple with both 
    196         """ 
    197         from operator import add 
    198          
    199         # handle discrete class 
    200          
    201         if self.class_var.var_type == Orange.data.variable.Discrete.Discrete: 
    202          
    203             # voting for class probabilities 
    204             if resultType == orange.GetProbabilities or resultType == orange.GetBoth: 
    205                 prob = [0.] * len(self.domain.class_var.values) 
    206                 for c in self.classifiers: 
    207                     a = [x for x in c(instance, orange.GetProbabilities)] 
    208                     prob = map(add, prob, a) 
    209                 norm = sum(prob) 
    210                 cprob = Orange.statistics.distribution.Discrete(self.class_var) 
    211                 for i in range(len(prob)): 
    212                     cprob[i] = prob[i]/norm 
    213                  
    214             # voting for crisp class membership, notice that 
    215             # this may not be the same class as one obtaining the 
    216             # highest probability through probability voting 
    217             if resultType == orange.GetValue or resultType == orange.GetBoth: 
    218                 cfreq = [0] * len(self.domain.class_var.values) 
    219                 for c in self.classifiers: 
    220                     cfreq[int(c(instance))] += 1 
    221                 index = cfreq.index(max(cfreq)) 
    222                 cvalue = Orange.data.Value(self.domain.class_var, index) 
    223      
    224             if resultType == orange.GetValue: return cvalue 
    225             elif resultType == orange.GetProbabilities: return cprob 
    226             else: return (cvalue, cprob) 
    227          
    228         else: 
    229             # Handle continuous class 
    230          
    231             # voting for class probabilities 
    232             if resultType == orange.GetProbabilities or resultType == orange.GetBoth: 
    233                 probs = [c(instance, orange.GetBoth) for c in self.classifiers] 
    234                 cprob = dict() 
    235                 for val,prob in probs: 
    236                     if prob != None: #no probability output 
    237                         a = dict(prob.items()) 
    238                     else: 
    239                         a = { val.value : 1. } 
    240                     cprob = dict( (n, a.get(n, 0)+cprob.get(n, 0)) for n in set(a)|set(cprob) ) 
    241                 cprob = Orange.statistics.distribution.Continuous(cprob) 
    242                 cprob.normalize() 
    243                  
    244             # gather average class value 
    245             if resultType == orange.GetValue or resultType == orange.GetBoth: 
    246                 values = [c(instance).value for c in self.classifiers] 
    247                 cvalue = Orange.data.Value(self.domain.class_var, sum(values) / len(self.classifiers)) 
    248              
    249             if resultType == orange.GetValue: return cvalue 
    250             elif resultType == orange.GetProbabilities: return cprob 
    251             else: return (cvalue, cprob) 
    252              
    253     def __reduce__(self): 
    254         return type(self), (self.classifiers, self.name, self.domain, self.class_var), dict(self.__dict__) 
    255  
    256 ### MeasureAttribute_randomForests 
    257  
    258 class ScoreFeature(orange.MeasureAttribute): 
    259     """ 
    260     :param trees: number of trees in the forest. 
    261     :type trees: int 
    262     :param attributes: number of randomly drawn features among 
    263             which to select the best to split the nodes in tree 
    264             induction. The default, None, means the square root of 
    265             the number of features in the training data. Ignored if 
    266             :obj:`learner` is specified. 
    267     :type attributes: int 
    268     :param base_learner: A base tree learner. The base learner will be 
    269         randomized with Random Forest's random 
    270         feature subset selection.  If None (default), 
    271         :class:`~Orange.classification.tree.TreeLearner` with Gini index 
    272         or MSE for attribute scoring will be used, and it will not split 
    273         nodes with less than 5 data instances. 
    274     :type base_learner: None or :class:`Orange.classification.tree.TreeLearner` 
    275     :param rand: random generator used in bootstrap sampling. If None (default),  
    276         then ``random.Random(0)`` is used. 
    277     :param learner: Tree induction learner. If None (default),  
    278         the :obj:`~ScoreFeature.base_learner` will be used (and randomized). If 
    279         :obj:`~ScoreFeature.learner` is specified, it will be used as such 
    280         with no additional transformations. 
    281     :type learner: None or :class:`Orange.core.Learner` 
    282287 
    283288    """ 
  • orange/doc/modules/ensemble2.py

    r6538 r9312  
    88 
    99data = orange.ExampleTable('bupa.tab') 
     10import random 
    1011forest = orngEnsemble.RandomForestLearner(trees=50, name="forest") 
    11 tree = orngTree.TreeLearner(minExamples=2, mForPrunning=2, \ 
    12                             sameMajorityPruning=True, name='tree') 
     12tree = orngTree.TreeLearner(min_instances=2, m_for_prunning=2, \ 
     13                            same_majority_pruning=True, name='tree') 
    1314learners = [tree, forest] 
    1415 
     
    2122        orngStat.BrierScore(results)[i], 
    2223        orngStat.AUC(results)[i]) 
     24 
Note: See TracChangeset for help on using the changeset viewer.