source: orange/orange/Orange/ensemble/forest.py @ 7687:ede4a15f71d2

Revision 7687:ede4a15f71d2, 19.5 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago
  • moved small tree learner construction in a helper function (used in ScoreFeature, OWRandomTree.py)
  • fixed Orange.statistics.distributions -> Orange.statistics.distribution
from math import sqrt, floor
from operator import add
import Orange.core as orange
import Orange
import Orange.feature.scoring
import random

def default_small_learner(measure=Orange.feature.scoring.Gini(), attributes=None, rand=None):
8    """ A helper function for constructing a small tree learner for use
9    in the RandomForestLearner.
10    :param measure: Feature scoring method for split construction
11    :type measure: :class:`Orange.feature.scoring.Measure`
12    param attributes: number of features used in a randomly drawn
13            subset when searching for best feature to split the node
14            in tree growing
15    :type attributes: int
16    :param rand: random generator used in feature subset selection in split constructor.
17            If None is passed, then Python's Random from random library is
18            used, with seed initialized to 0.
19    :type rand: function
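
    A minimal usage sketch (the dataset name is hypothetical; outside of
    RandomForestLearner the number of drawn features must be set
    explicitly)::

        data = Orange.data.Table("iris")  # hypothetical dataset
        tree_learner = default_small_learner(attributes=2)
        tree = tree_learner(data)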
20    """
    # tree learner assembled as suggested by Breiman (2001)
    smallTreeLearner = Orange.classification.tree.TreeLearner(
            storeNodeClassifier=0, storeContingencies=0,
            storeDistributions=1, minExamples=5).instance()

    smallTreeLearner.split.discreteSplitConstructor.measure = measure
    smallTreeLearner.split.continuousSplitConstructor.measure = measure

    smallTreeLearner.split = SplitConstructor_AttributeSubset(
            smallTreeLearner.split, attributes, rand)
    return smallTreeLearner


class RandomForestLearner(orange.Learner):
    """
    Just like in bagging, classifiers in random forests are trained on
    bootstrap samples of the training data. Here, the classifiers are trees.
    To increase randomness, however, the classifiers are built so that at
    each node the best feature is chosen from a random subset of the features
    in the training set. We closely follow the original algorithm (Breiman,
    2001) both in implementation and parameter defaults.

    :param learner: although not required, one can use this argument
            to pass one's own tree induction algorithm. If None is passed,
            RandomForestLearner will use Orange's tree induction
            algorithm such that nodes with fewer than 5
            data instances will not be considered for (further) splitting.
    :type learner: :class:`Orange.core.Learner`
    :param trees: number of trees in the forest.
    :type trees: int
    :param attributes: number of features used in a randomly drawn
            subset when searching for the best feature to split the node
            in tree growing (default: None; if left unchanged, this is
            set to the square root of the number of features in the
            training set when the data is presented to the learner).
    :type attributes: int
    :param rand: random generator used in bootstrap sampling.
            If None is passed, Python's Random from the random library is
            used, with the seed initialized to 0.
    :type rand: function
    :param callback: a function to be called after every iteration of
            induction of a classifier. It is called with a single argument
            (from 0.0 to 1.0) that estimates learning progress.
    :param name: name of the learner.
    :type name: string
    :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier` or
            :class:`Orange.ensemble.forest.RandomForestLearner`
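
    A minimal usage sketch (the dataset name is hypothetical)::

        data = Orange.data.Table("bupa")  # hypothetical dataset
        forest = RandomForestLearner(trees=50, name="forest")
        classifier = forest(data)
        prediction = classifier(data[0])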
    """

    def __new__(cls, instances=None, weight=0, **kwds):
        self = orange.Learner.__new__(cls, **kwds)
        if instances:
            self.__init__(**kwds)
            return self.__call__(instances, weight)
        else:
            return self

    def __init__(self, learner=None, trees=100, attributes=None,
                    name='Random Forest', rand=None, callback=None):
        self.trees = trees
        self.name = name
        self.learner = learner
        self.attributes = attributes
        self.callback = callback

        if rand:
            self.rand = rand
        else:
            self.rand = random.Random()
            self.rand.seed(0)

        self.randstate = self.rand.getstate()  # original state

    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.

        :param instances: data instances to learn from.
        :type instances: :class:`Orange.data.Table`
        :param weight: weight (not used).
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """

        # if no learner is given, construct the default small tree learner
        if not self.learner:
            # use MSE on a continuous class and Gini on a discrete one
            if isinstance(instances.domain.class_var, Orange.data.variable.Discrete):
                learner = default_small_learner(Orange.feature.scoring.Gini(), self.attributes, self.rand)
            else:
                learner = default_small_learner(Orange.feature.scoring.MSE(), self.attributes, self.rand)
        else:
            learner = self.learner

        # if the number of features for the subset is not set, use the
        # square root of the number of features
        if hasattr(learner.split, 'attributes') and \
                not learner.split.attributes:
            learner.split.attributes = int(sqrt(
                    len(instances.domain.attributes)))

        self.rand.setstate(self.randstate)  # when learning again, start from the same state

        n = len(instances)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw a bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)
            # build the model from the bootstrap sample
            classifiers.append(learner(data))
            # report learning progress (0.0 to 1.0), as documented
            if self.callback:
                self.callback((i + 1.) / self.trees)

        return RandomForestClassifier(classifiers=classifiers, name=self.name,
                    domain=instances.domain, classVar=instances.domain.classVar)

class RandomForestClassifier(orange.Classifier):
    """
    A random forest classifier uses decision trees induced from bootstrapped
    training sets to vote on the class of a presented instance. The most
    frequent vote is returned. However, in our implementation, if class
    probabilities are requested from the classifier, it returns the averaged
    probabilities from each of the trees.

    When constructing the classifier manually, the following parameters can
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list

    :param name: name of the resulting classifier.
    :type name: str

    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`

    :param classVar: the class feature.
    :type classVar: :class:`Orange.data.variable.Variable`

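    A minimal classification sketch (the dataset name is hypothetical)::

        data = Orange.data.Table("voting")  # hypothetical dataset
        classifier = RandomForestLearner(trees=50)(data)
        value, dist = classifier(data[0], Orange.core.GetBoth)
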
    """
    def __init__(self, classifiers, name, domain, classVar, **kwds):
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.classVar = classVar
        self.__dict__.update(kwds)

    def __call__(self, instance, resultType=orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`

        :param resultType: :class:`Orange.classification.Classifier.GetValue`,
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.distribution.Distribution` or a tuple
              with both
        """
        # handle a discrete class

        if isinstance(self.classVar, Orange.data.variable.Discrete):

            # voting for class probabilities
            if resultType == orange.GetProbabilities or resultType == orange.GetBoth:
                prob = [0.] * len(self.domain.classVar.values)
                for c in self.classifiers:
                    a = [x for x in c(instance, orange.GetProbabilities)]
                    prob = map(add, prob, a)
                norm = sum(prob)
                cprob = Orange.statistics.distribution.Discrete(self.classVar)
                for i in range(len(prob)):
                    cprob[i] = prob[i] / norm

            # voting for crisp class membership; note that this
            # may not be the same class as the one obtaining the
            # highest probability through probability voting
            if resultType == orange.GetValue or resultType == orange.GetBoth:
                cfreq = [0] * len(self.domain.classVar.values)
                for c in self.classifiers:
                    cfreq[int(c(instance))] += 1
                index = cfreq.index(max(cfreq))
                cvalue = Orange.data.Value(self.domain.classVar, index)

            if resultType == orange.GetValue: return cvalue
            elif resultType == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)

        else:
            # handle a continuous class

            # voting for class probabilities
            if resultType == orange.GetProbabilities or resultType == orange.GetBoth:
                probs = [c(instance, orange.GetProbabilities) for c in self.classifiers]
                cprob = dict()
                for prob in probs:
                    a = dict(prob.items())
                    cprob = dict((n, a.get(n, 0) + cprob.get(n, 0)) for n in set(a) | set(cprob))
                cprob = Orange.statistics.distribution.Continuous(cprob)
                cprob.normalize()

            # gather the average class value
            if resultType == orange.GetValue or resultType == orange.GetBoth:
                values = [c(instance).value for c in self.classifiers]
                cvalue = Orange.data.Value(self.domain.classVar, sum(values) / len(self.classifiers))

            if resultType == orange.GetValue: return cvalue
            elif resultType == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)

### MeasureAttribute_randomForests

class ScoreFeature(orange.MeasureAttribute):
    """
    :param learner: although not required, one can use this argument to pass
        one's own tree induction algorithm. If None is
        passed, :class:`Orange.ensemble.forest.ScoreFeature` will
        use Orange's tree induction algorithm such that
        nodes with fewer than 5 data instances will not be
        considered for (further) splitting.
    :type learner: None or :class:`Orange.core.Learner`
    :param trees: number of trees in the forest.
    :type trees: int
    :param attributes: number of features used in a randomly drawn
            subset when searching for the best feature to split the node
            in tree growing (default: None; if left unchanged, this is
            set to the square root of the number of features in the
            training set when the data is presented to the learner).
    :type attributes: int
    :param rand: random generator used in bootstrap sampling. If None is
        passed, Python's Random from the random library is used, with the
        seed initialized to 0.
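
    A usage sketch for estimating feature importances (the dataset name is
    hypothetical)::

        data = Orange.data.Table("iris")  # hypothetical dataset
        measure = ScoreFeature(trees=100)
        for attr, imp in zip(data.domain.attributes, measure.importances(data)):
            print "%s: %.2f" % (attr.name, imp)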
    """
    def __init__(self, learner=None, trees=100, attributes=None, rand=None):

        self.trees = trees
        self.learner = learner
        self.bufinstances = None
        self.attributes = attributes

        if self.learner is None:
            self.learner = default_small_learner(attributes=self.attributes)

        if hasattr(self.learner.split, 'attributes'):
            self.origattr = self.learner.split.attributes

        if rand:
            self.rand = rand  # a random generator
        else:
            self.rand = random.Random()
            self.rand.seed(0)

    def __call__(self, feature, instances, apriorClass=None):
        """
        Return the importance of a given feature.

        :param feature: feature to evaluate (by index, name or
            :class:`Orange.data.variable.Variable` object).
        :type feature: int, str or :class:`Orange.data.variable.Variable`

        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`

        :param apriorClass: not used!

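        A usage sketch (the dataset and feature names are hypothetical)::

            data = Orange.data.Table("iris")  # hypothetical dataset
            measure = ScoreFeature(trees=100)
            print measure("petal width", data)
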
        """
        attrNo = None

        if type(feature) == int:  # by attribute index
            attrNo = feature
        elif type(feature) == type("a"):  # by attribute name
            attrName = feature
            attrNo = instances.domain.index(attrName)
        elif isinstance(feature, Orange.data.variable.Variable):
            atrs = [a for a in instances.domain.attributes]
            attrNo = atrs.index(feature)
        else:
            raise Exception("ScoreFeature cannot be called with (contingency, "
                    "classDistribution, apriorClass) as function arguments.")

        self.buffer(instances)

        return self.avimp[attrNo] * 100 / self.trees

    def importances(self, table):
        """
        Return the importance of all features in the dataset as a list. The
        result is buffered, so repeated calls on the same (unchanged) dataset
        are computationally cheap.

        :param table: dataset in which to measure the importance of the
            features.
        :type table: :class:`Orange.data.Table`

        """
        self.buffer(table)

        return [a * 100 / self.trees for a in self.avimp]

    def buffer(self, instances):
        """
        Recalculate the importance of features if needed (i.e. if it has not
        yet been computed for the given dataset).

        :param instances: dataset in which to measure the importance of the
            features.
        :type instances: :class:`Orange.data.Table`

        """
        recalculate = False

        if instances != self.bufinstances:
            recalculate = True
        elif instances.version != self.bufinstances.version:
            recalculate = True

        if recalculate:
            self.bufinstances = instances
            self.avimp = [0.0] * len(self.bufinstances.domain.attributes)
            self.acu = 0

            if hasattr(self.learner.split, 'attributes'):
                self.learner.split.attributes = self.origattr

            # if the number of attributes for the subset is not set, use the
            # square root of the number of attributes
            if hasattr(self.learner.split, 'attributes') and not \
                    self.learner.split.attributes:
                self.learner.split.attributes = int(sqrt(
                        len(instances.domain.attributes)))

            self.importanceAcu(self.bufinstances, self.trees, self.avimp)

    def getOOB(self, instances, selection, nexamples):
        ooblist = filter(lambda x: x not in selection, range(nexamples))
        return instances.getitems(ooblist)

    def numRight(self, oob, classifier):
        """
        Return the number of instances which are classified correctly.
        """
        right = 0
        for el in oob:
            if el.getclass() == classifier(el):
                right = right + 1
        return right

    def numRightMix(self, oob, classifier, attr):
        """
        Return the number of instances which are classified correctly even
        if the values of a given feature are shuffled.
        """
        n = len(oob)

        perm = range(n)
        self.rand.shuffle(perm)

        right = 0

        for i in range(n):
            ex = Orange.data.Instance(oob[i])
            ex[attr] = oob[perm[i]][attr]

            if ex.getclass() == classifier(ex):
                right = right + 1

        return right

    def importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp by importances for a given number of trees."""
        n = len(instances)

        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
            attrnum[instances.domain.attributes[attr].name] = attr

        # build the forest
        for i in range(trees):
            # draw a bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)

            # build the model from the bootstrap sample
            cla = self.learner(data)

            # prepare the out-of-bag (OOB) data
            oob = self.getOOB(instances, selection, n)

            # number of correct classifications on unmixed OOB data
            right = self.numRight(oob, cla)

            presl = list(self.presentInTree(cla.tree, attrnum))

            # randomize each feature in the data and test only
            # those features on which there was a split
            for attr in presl:
                # calculate the number of correct classifications
                # when the values of this feature are permuted randomly
                rightimp = self.numRightMix(oob, cla, attr)
                avimp[attr] += (float(right - rightimp)) / len(oob)
        self.acu += trees

    def presentInTree(self, node, attrnum):
        """Return the features present in the tree (features that split)."""
        if not node:
            return set([])

        if node.branchSelector:
            j = attrnum[node.branchSelector.classVar.name]
            cs = set([])
            for i in range(len(node.branches)):
                s = self.presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
            return set([])

class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
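    """
    Wrap an existing split constructor so that at each node only a randomly
    drawn subset of the candidate features is considered when searching for
    the best split (the randomization step suggested by Breiman, 2001).

    A minimal usage sketch::

        base = Orange.classification.tree.TreeLearner().instance()
        base.split = SplitConstructor_AttributeSubset(base.split, 2)
    """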
    def __init__(self, scons, attributes, rand=None):
        self.scons = scons            # split constructor of the original tree
        self.attributes = attributes  # number of features to consider
        if rand:
            self.rand = rand          # a random generator
        else:
            self.rand = random.Random()
            self.rand.seed(0)

    def __call__(self, gen, weightID, contingencies, apriori, candidates, clsfr):
        cand = [1] * self.attributes + [0] * (len(candidates) - self.attributes)
        self.rand.shuffle(cand)
        # instead of invoking the split constructor with all the features,
        # we invoke it only with a random subset of the features
        t = self.scons(gen, weightID, contingencies, apriori, cand, clsfr)
        return t