source: orange/orange/Orange/ensemble/forest.py @ 7728:038303153275

Revision 7728:038303153275, 18.5 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago

Calling build_stop in default_small_learner.

from math import sqrt, floor
import Orange.core as orange
import Orange
import Orange.feature.scoring
import random

def default_small_learner(measure=Orange.feature.scoring.Gini(), attributes=None, rand=None):
    """ A helper function for constructing a small tree learner for use
    in the RandomForestLearner.

    :param measure: feature scoring method for split construction.
    :type measure: :class:`Orange.feature.scoring.Measure`
    :param attributes: number of features used in a randomly drawn
            subset when searching for the best feature to split the node
            in tree growing.
    :type attributes: int
    :param rand: random generator used in feature subset selection in the
            split constructor. If None is passed, Python's Random from the
            random library is used, with the seed initialized to 0.
    :type rand: function
    """
    # tree learner assembled as suggested by Breiman (2001)
    smallTreeLearner = Orange.classification.tree.TreeLearner(
        storeNodeClassifier=0, storeContingencies=0,
        storeDistributions=1, minExamples=5)

    smallTreeLearner.split = smallTreeLearner.build_split()

    smallTreeLearner.split.discreteSplitConstructor.measure = measure
    smallTreeLearner.split.continuousSplitConstructor.measure = measure

    smallTreeLearner.stop = smallTreeLearner.build_stop()

    smallTreeLearner.split = SplitConstructor_AttributeSubset(
        smallTreeLearner.split, attributes, rand)
    return smallTreeLearner

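# Example (a hedged sketch, not part of the original module): how the helper
# might be used on its own. The "iris.tab" dataset name is illustrative only.
#
#     data = Orange.data.Table("iris.tab")
#     tree = default_small_learner(Orange.feature.scoring.Gini(), attributes=2)
#     model = tree(data)
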
class RandomForestLearner(orange.Learner):
    """
    Just like bagging, classifiers in random forests are trained from bootstrap
    samples of the training data. Here, the classifiers are trees. However, to
    increase randomness, the classifiers are built so that at each node the
    best feature is chosen from a random subset of features in the training
    set. We closely follow the original algorithm (Breiman, 2001) both in
    implementation and parameter defaults.

    :param learner: although not required, one can use this argument
            to pass one's own tree induction algorithm. If None is passed,
            RandomForestLearner will use Orange's tree induction
            algorithm such that nodes with fewer than 5
            data instances will not be considered for (further) splitting.
    :type learner: :class:`Orange.core.Learner`
    :param trees: number of trees in the forest.
    :type trees: int
    :param attributes: number of features used in a randomly drawn
            subset when searching for the best feature to split the node
            in tree growing (default: None; if kept this way, it
            is set to the square root of the number of features in the
            training set when that set is presented to the learner).
    :type attributes: int
    :param rand: random generator used in bootstrap sampling.
            If None is passed, Python's Random from the random library is
            used, with the seed initialized to 0.
    :type rand: function
    :param callback: a function to be called after every iteration of
            induction of a classifier. It is called with a parameter
            (from 0.0 to 1.0) that gives an estimate of learning progress.
    :param name: name of the learner.
    :type name: string
    :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier` or
            :class:`Orange.ensemble.forest.RandomForestLearner`
    """

    def __new__(cls, instances=None, weight=0, **kwds):
        self = orange.Learner.__new__(cls, **kwds)
        if instances:
            self.__init__(**kwds)
            return self.__call__(instances, weight)
        else:
            return self

    def __init__(self, learner=None, trees=100, attributes=None,
                    name='Random Forest', rand=None, callback=None):
        self.trees = trees
        self.name = name
        self.learner = learner
        self.attributes = attributes
        self.callback = callback

        if rand:
            self.rand = rand
        else:
            self.rand = random.Random()
            self.rand.seed(0)

        self.randstate = self.rand.getstate() # original state
    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.

        :param instances: data instances to learn from.
        :type instances: :class:`Orange.data.Table`
        :param weight: weight.
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """

        if not self.learner:
            # use MSE on a continuous class and Gini on a discrete one
            if isinstance(instances.domain.class_var, Orange.data.variable.Discrete):
                learner = default_small_learner(Orange.feature.scoring.Gini(), self.attributes, self.rand)
            else:
                learner = default_small_learner(Orange.feature.scoring.MSE(), self.attributes, self.rand)
        else:
            learner = self.learner

        # if the number of features for the subset is not set, use the square root

        if hasattr(learner.split, 'attributes') and \
            not learner.split.attributes:
            learner.split.attributes = \
                int(sqrt(len(instances.domain.attributes)))

        self.rand.setstate(self.randstate) # when learning again, set the same state

        n = len(instances)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw a bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)
            # build the model from the bootstrap sample
            classifiers.append(learner(data))
            # report progress, as documented for the callback parameter
            if self.callback:
                self.callback((i + 1.0) / self.trees)

        return RandomForestClassifier(classifiers=classifiers, name=self.name,
                    domain=instances.domain, classVar=instances.domain.classVar)

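# Example (a hedged sketch, not part of the original module): training a
# forest and classifying an instance. The "bupa.tab" dataset name is
# illustrative only.
#
#     data = Orange.data.Table("bupa.tab")
#     forest = RandomForestLearner(trees=50, name="forest")
#     classifier = forest(data)
#     print classifier(data[0])
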
class RandomForestClassifier(orange.Classifier):
    """
    A random forest classifier uses the decision trees induced from bootstrap
    samples of the training set to vote on the class of a presented instance.
    The most frequent vote is returned. However, in our implementation, if
    class probabilities are requested from the classifier, it returns the
    probabilities averaged over all of the trees.

    When constructing the classifier manually, the following parameters can
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list

    :param name: name of the resulting classifier.
    :type name: str

    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`

    :param classVar: the class feature.
    :type classVar: :class:`Orange.data.variable.Variable`

    """
    def __init__(self, classifiers, name, domain, classVar, **kwds):
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.classVar = classVar
        self.__dict__.update(kwds)

    def __call__(self, instance, resultType=orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`

        :param resultType: :class:`Orange.classification.Classifier.GetValue` or
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        from operator import add

        # handle a discrete class

        if self.class_var.var_type == Orange.data.variable.Discrete.Discrete:

            # voting for class probabilities
            if resultType == orange.GetProbabilities or resultType == orange.GetBoth:
                prob = [0.] * len(self.domain.classVar.values)
                for c in self.classifiers:
                    a = [x for x in c(instance, orange.GetProbabilities)]
                    prob = map(add, prob, a)
                norm = sum(prob)
                cprob = Orange.statistics.distribution.Discrete(self.classVar)
                for i in range(len(prob)):
                    cprob[i] = prob[i]/norm

            # voting for crisp class membership; note that
            # this may not be the same class as the one obtaining the
            # highest probability through probability voting
            if resultType == orange.GetValue or resultType == orange.GetBoth:
                cfreq = [0] * len(self.domain.classVar.values)
                for c in self.classifiers:
                    cfreq[int(c(instance))] += 1
                index = cfreq.index(max(cfreq))
                cvalue = Orange.data.Value(self.domain.classVar, index)

            if resultType == orange.GetValue: return cvalue
            elif resultType == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)

        else:
            # handle a continuous class

            # voting for class probabilities
            if resultType == orange.GetProbabilities or resultType == orange.GetBoth:
                probs = [c(instance, orange.GetProbabilities) for c in self.classifiers]
                cprob = dict()
                for prob in probs:
                    a = dict(prob.items())
                    cprob = dict((n, a.get(n, 0) + cprob.get(n, 0)) for n in set(a) | set(cprob))
                cprob = Orange.statistics.distribution.Continuous(cprob)
                cprob.normalize()

            # gather the average class value
            if resultType == orange.GetValue or resultType == orange.GetBoth:
                values = [c(instance).value for c in self.classifiers]
                cvalue = Orange.data.Value(self.domain.classVar, sum(values) / len(self.classifiers))

            if resultType == orange.GetValue: return cvalue
            elif resultType == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)

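# Example (a hedged sketch, not part of the original module): querying both
# the crisp class and the averaged class distribution. "classifier" and
# "data" are the hypothetical names from the previous example, and "orange"
# is this module's alias for Orange.core.
#
#     value, dist = classifier(data[0], orange.GetBoth)
#     print value, list(dist)
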
### MeasureAttribute_randomForests

class ScoreFeature(orange.MeasureAttribute):
    """
    :param learner: although not required, one can use this argument to pass
        one's own tree induction algorithm. If None is
        passed, :class:`Orange.ensemble.forest.ScoreFeature` will
        use Orange's tree induction algorithm such that
        nodes with fewer than 5 data instances will not be
        considered for (further) splitting.
    :type learner: None or :class:`Orange.core.Learner`
    :param trees: number of trees in the forest.
    :type trees: int
    :param attributes: number of features used in a randomly drawn
            subset when searching for the best feature to split the node
            in tree growing (default: None; if kept this way, it
            is set to the square root of the number of features in the
            training set when that set is presented to the learner).
    :type attributes: int
    :param rand: random generator used in bootstrap sampling. If None is
        passed, Python's Random from the random library is used, with the
        seed initialized to 0.
    """
    def __init__(self, learner=None, trees=100, attributes=None, rand=None):

        self.trees = trees
        self.learner = learner
        self.bufinstances = None
        self.attributes = attributes

        if self.learner is None:
#           temp = RandomForestLearner(attributes=self.attributes)
#           self.learner = temp.learner
            self.learner = default_small_learner(attributes=self.attributes)

        if hasattr(self.learner.split, 'attributes'):
            self.origattr = self.learner.split.attributes

        if rand:
            self.rand = rand  # a random generator
        else:
            self.rand = random.Random()
            self.rand.seed(0)

    def __call__(self, feature, instances, apriorClass=None):
        """
        Return the importance of a given feature.

        :param feature: feature to evaluate (by index, name or
            :class:`Orange.data.variable.Variable` object).
        :type feature: int, str or :class:`Orange.data.variable.Variable`.

        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`

        :param apriorClass: not used!

        """
        attrNo = None

        if type(feature) == int: # by attribute index
            attrNo = feature
        elif type(feature) == type("a"): # by attribute name
            attrName = feature
            attrNo = instances.domain.index(attrName)
        elif isinstance(feature, Orange.data.variable.Variable):
            atrs = [a for a in instances.domain.attributes]
            attrNo = atrs.index(feature)
        else:
            raise Exception("MeasureAttribute_rf cannot be called with "
                "(contingency, classDistribution, apriorClass) as function arguments.")

        self.buffer(instances)

        return self.avimp[attrNo]*100/self.trees

    def importances(self, table):
        """
        Return the importance of all features in the dataset as a list. The
        result is buffered, so repeated calls on the same (unchanged) dataset
        are computationally cheap.

        :param table: dataset for which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`

        """
        self.buffer(table)

        return [a*100/self.trees for a in self.avimp]

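    # Example (a hedged sketch, not part of the original module): scoring all
    # features at once. The "voting.tab" dataset name is illustrative only.
    #
    #     data = Orange.data.Table("voting.tab")
    #     measure = ScoreFeature(trees=50)
    #     for attr, imp in zip(data.domain.attributes, measure.importances(data)):
    #         print "%15s: %6.2f" % (attr.name, imp)
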
    def buffer(self, instances):
        """
        Recalculate the importance of features if needed (i.e. if it has not
        yet been computed for the given dataset).

        :param instances: dataset for which the features' importance needs to
            be measured.
        :type instances: :class:`Orange.data.Table`

        """
        recalculate = False

        if instances != self.bufinstances:
            recalculate = True
        elif instances.version != self.bufinstances.version:
            recalculate = True

        if recalculate:
            self.bufinstances = instances
            self.avimp = [0.0]*len(self.bufinstances.domain.attributes)
            self.acu = 0

            if hasattr(self.learner.split, 'attributes'):
                self.learner.split.attributes = self.origattr

            # if the number of attributes for the subset is not set, use the square root
            if hasattr(self.learner.split, 'attributes') and \
                    not self.learner.split.attributes:
                self.learner.split.attributes = \
                    int(sqrt(len(instances.domain.attributes)))

            self.importanceAcu(self.bufinstances, self.trees, self.avimp)

    def getOOB(self, instances, selection, nexamples):
        ooblist = filter(lambda x: x not in selection, range(nexamples))
        return instances.getitems(ooblist)

    def numRight(self, oob, classifier):
        """
        Return the number of instances which are classified correctly.
        """
        right = 0
        for el in oob:
            if (el.getclass() == classifier(el)):
                right = right + 1
        return right

    def numRightMix(self, oob, classifier, attr):
        """
        Return the number of instances which are classified
        correctly even if a feature is shuffled.
        """
        n = len(oob)

        perm = range(n)
        self.rand.shuffle(perm)

        right = 0

        for i in range(n):
            ex = Orange.data.Instance(oob[i])
            ex[attr] = oob[perm[i]][attr]

            if (ex.getclass() == classifier(ex)):
                right = right + 1

        return right

    def importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp with the importances for a given number of trees."""
        n = len(instances)

        attrs = len(instances.domain.attributes)

        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
            attrnum[instances.domain.attributes[attr].name] = attr

        # build the forest
        classifiers = []
        for i in range(trees):
            # draw a bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)

            # build the model from the bootstrap sample
            cla = self.learner(data)

            # prepare the OOB data
            oob = self.getOOB(instances, selection, n)

            # number classified right on the unmixed data
            right = self.numRight(oob, cla)

            presl = list(self.presentInTree(cla.tree, attrnum))

            # randomize each feature in the data and test
            # only those on which there was a split
            for attr in presl:
                # calculate the number of right classifications
                # if the values of this feature are permuted randomly
                rightimp = self.numRightMix(oob, cla, attr)
                avimp[attr] += (float(right-rightimp))/len(oob)
        self.acu += trees

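    # Worked example (made-up numbers, for illustration): if a tree classifies
    # 90 of 100 OOB instances correctly, but only 75 after permuting one
    # feature's values, that tree adds (90 - 75) / 100 = 0.15 to avimp for the
    # feature; __call__ and importances() then rescale the accumulated sum by
    # 100 / trees.
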
    def presentInTree(self, node, attrnum):
        """Return the features present in the tree (features that split)."""
        if not node:
            return set([])

        if node.branchSelector:
            j = attrnum[node.branchSelector.classVar.name]
            cs = set([])
            for i in range(len(node.branches)):
                s = self.presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
            return set([])

class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    def __init__(self, scons, attributes, rand=None):
        self.scons = scons            # split constructor of the original tree
        self.attributes = attributes  # number of features to consider
        if rand:
            self.rand = rand          # a random generator
        else:
            self.rand = random.Random()
            self.rand.seed(0)

    def __call__(self, gen, weightID, contingencies, apriori, candidates, clsfr):
        cand = [1]*self.attributes + [0]*(len(candidates) - self.attributes)
        self.rand.shuffle(cand)
        # instead of with all features, invoke the split constructor
        # only on a random subset of the features
        t = self.scons(gen, weightID, contingencies, apriori, cand, clsfr)
        return t
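
# Example (a hedged sketch, not part of the original module): wrapping a
# standard split constructor so that each node considers only 3 randomly
# chosen features, mirroring how default_small_learner builds its learner.
#
#     tree = Orange.classification.tree.TreeLearner()
#     tree.split = SplitConstructor_AttributeSubset(tree.build_split(), 3)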