source: orange/orange/Orange/ensemble/forest.py @ 9349:fa13a2c52fcd

Revision 9349:fa13a2c52fcd, 20.6 KB checked in by mitar, 2 years ago (diff)

Changed way of linking to code in documentation.

Line 
1from math import sqrt, floor
2import Orange.core as orange
3import Orange
4import Orange.feature.scoring
5import random
6import copy
7
def _default_small_learner(attributes=None, rand=None, base=None):
    """Return a randomized TreeLearner wrapper for random forests.

    ``attributes`` is accepted for interface compatibility but is not used
    here; the forest learner assigns the feature-subset size to the
    returned wrapper before training.
    """
    tree = base
    if not tree:
        # tree learner assembled as suggested by Breiman (2001)
        tree = Orange.classification.tree.TreeLearner(
            store_node_classifier=0, store_contingencies=0,
            store_distributions=1, min_instances=5)
    return _RandomForestTreeLearner(base=tree, rand=rand)
16
def _default_simple_learner(base, randorange):
    """Return a randomized SimpleTreeLearner wrapper for random forests.

    If ``base`` is None a default SimpleTreeLearner (min_instances=5) is
    created; ``randorange`` is the Orange random generator used for the
    per-split feature sampling.
    """
    simple = base
    if simple == None:
        simple = Orange.classification.tree.SimpleTreeLearner(min_instances=5)
    return _RandomForestSimpleTreeLearner(base=simple, rand=randorange)
21
def _wrap_learner(base, rand, randorange):
    """Wrap ``base`` into a learner randomized for random-forest induction.

    :param base: None, a SimpleTreeLearner, or a TreeLearner.
    :param rand: Python ``random.Random`` generator (used for TreeLearner).
    :param randorange: ``Orange.core.RandomGenerator`` (used for SimpleTreeLearner).
    :raises ValueError: if ``base`` is a learner of an unsupported type.
    """
    if base == None or isinstance(base, Orange.classification.tree.SimpleTreeLearner):
        return _default_simple_learner(base, randorange)
    elif isinstance(base, Orange.classification.tree.TreeLearner):
        return _default_small_learner(None, rand, base)
    else:
        # was: notRightLearnerToWrap() -- an undefined name that raised a
        # confusing NameError; raise an explicit, descriptive error instead
        raise ValueError("base_learner must be None, a SimpleTreeLearner "
                         "or a TreeLearner")
29 
class _RandomForestSimpleTreeLearner(Orange.core.Learner):
    """ A learner which wraps an ordinary SimpleTreeLearner.  Sets the
    skip_prob so that the number of randomly chosen features for each
    split is  (on average) as specified.  """

    def __new__(cls, examples = None, weightID = 0, **argkw):
        self = Orange.core.Learner.__new__(cls, **argkw)
        if examples:
            # convenience: constructing with data trains immediately and
            # returns the classifier instead of the learner
            self.__init__(**argkw)
            return self.__call__(examples, weightID)
        else:
            return self

    def __init__(self, base, rand):
        self.base = base        # the wrapped SimpleTreeLearner
        self.attributes = None  # target number of features per split; set by the forest learner
        self.rand = rand        # Orange random generator shared by the forest

    def __call__(self, examples, weight=0):
        """Train the wrapped learner with a temporarily adjusted skip_prob."""
        osp, orand = self.base.skip_prob, self.base.random_generator
        # skip_prob is the probability of *ignoring* a feature at a split;
        # choose it so that, on average, `attributes` features are considered
        self.base.skip_prob = 1-float(self.attributes)/len(examples.domain.attributes)
        self.base.random_generator = self.rand
        try:
            return self.base(examples, weight)
        finally:
            # always restore the wrapped learner's settings, even if training
            # raises (previously an exception leaked the modified state)
            self.base.skip_prob, self.base.random_generator = osp, orand
55   
class RandomForestLearner(orange.Learner):
    """
    Just like in bagging, classifiers in random forests are trained from bootstrap
    samples of training data. Here, the classifiers are trees. However, to increase
    randomness, at each node of the tree the best feature is
    chosen from a subset of features in the data. We closely follow the
    original algorithm (Breiman, 2001) both in implementation and parameter
    defaults.
       
    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :param callback: a function called (with no arguments) after each
            tree is induced; useful for tracking learning progress.

    :param name: name of the learner.
    :type name: string

    :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or
            :class:`~Orange.ensemble.forest.RandomForestLearner`

    """

    def __new__(cls, instances=None, weight = 0, **kwds):
        self = orange.Learner.__new__(cls, **kwds)
        if instances:
            # convenience: constructing with data trains immediately and
            # returns the classifier instead of the learner
            self.__init__(**kwds)
            return self.__call__(instances, weight)
        else:
            return self

    def __init__(self, trees=100, attributes=None,\
                    name='Random Forest', rand=None, callback=None, base_learner=None, learner=None):
        self.trees = trees
        self.name = name
        self.attributes = attributes
        self.callback = callback
        self.rand = rand

        self.base_learner = base_learner

        if base_learner != None and learner != None:
            # was: wrongSpecification() -- an undefined name that raised a
            # confusing NameError; raise an explicit, descriptive error instead
            raise ValueError("Specify either base_learner or learner, not both.")

        if not self.rand:
            self.rand = random.Random(0)
        # a separate Orange random generator, deterministically seeded from
        # the Python one, for learners that need an Orange RNG
        self.randorange = Orange.core.RandomGenerator(self.rand.randint(0,2**31-1))

        if learner == None:
            # randomize the base learner with random feature subset selection
            self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange)
        else:
            self.learner = learner
           
        self.randstate = self.rand.getstate() #original state

    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.
       
        :param instances: learning data.
        :type instances: class:`Orange.data.Table`
        :param weight: weight.
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """
        self.rand.setstate(self.randstate) #when learning again, set the same state
        self.randorange.reset()

        if "attributes" in self.learner.__dict__:
            # default feature-subset size: square root of the number of features
            self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes == None else self.attributes

        learner = self.learner

        n = len(instances)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw a bootstrap sample: n instances sampled with replacement
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.get_items_ref(selection)
            # build the model from the bootstrap sample
            classifiers.append(learner(data, weight))
            if self.callback:
                self.callback()

        return RandomForestClassifier(classifiers = classifiers, name=self.name,\
                    domain=instances.domain, class_var=instances.domain.class_var)
172 
173
class RandomForestClassifier(orange.Classifier):
    """
    Uses the trees induced by the :obj:`RandomForestLearner`. An input
    instance is classified into the class with the most frequent vote.
    However, this implementation returns the averaged probabilities from
    each of the trees if class probability is requested.

    When constructed manually, the following parameters have to
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list
   
    :param name: name of the resulting classifier.
    :type name: str
   
    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`
   
    :param class_var: the class feature.
    :type class_var: :class:`Orange.data.variable.Variable`

    """
    def __init__(self, classifiers, name, domain, class_var, **kwds):
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.class_var = class_var
        self.__dict__.update(kwds)

    def __call__(self, instance, resultType = orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
       
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`
       
        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        want_prob = resultType == orange.GetProbabilities or resultType == orange.GetBoth
        want_value = resultType == orange.GetValue or resultType == orange.GetBoth

        if self.class_var.var_type == Orange.data.variable.Discrete.Discrete:
            # --- discrete class ---

            if want_prob:
                # average the class distributions predicted by the trees
                votes = [0.] * len(self.domain.class_var.values)
                for tree in self.classifiers:
                    dist = [p for p in tree(instance, orange.GetProbabilities)]
                    votes = [v + p for v, p in zip(votes, dist)]
                total = sum(votes)
                cprob = Orange.statistics.distribution.Discrete(self.class_var)
                for i in range(len(votes)):
                    cprob[i] = votes[i]/total

            if want_value:
                # majority vote over crisp predictions; note this may differ
                # from the argmax of the averaged probabilities above
                counts = [0] * len(self.domain.class_var.values)
                for tree in self.classifiers:
                    counts[int(tree(instance))] += 1
                cvalue = Orange.data.Value(self.domain.class_var, counts.index(max(counts)))

            if resultType == orange.GetValue:
                return cvalue
            if resultType == orange.GetProbabilities:
                return cprob
            return (cvalue, cprob)

        # --- continuous class ---

        if want_prob:
            # sum the per-tree distributions; a tree without a probability
            # output contributes a point mass at its predicted value
            accum = dict()
            for val, dist in [tree(instance, orange.GetBoth) for tree in self.classifiers]:
                if dist != None:
                    contrib = dict(dist.items())
                else:
                    contrib = { val.value : 1. }
                accum = dict( (k, contrib.get(k, 0)+accum.get(k, 0)) for k in set(contrib)|set(accum) )
            cprob = Orange.statistics.distribution.Continuous(accum)
            cprob.normalize()

        if want_value:
            # predicted value is the plain average over all trees
            predictions = [tree(instance).value for tree in self.classifiers]
            cvalue = Orange.data.Value(self.domain.class_var, sum(predictions) / len(self.classifiers))

        if resultType == orange.GetValue:
            return cvalue
        if resultType == orange.GetProbabilities:
            return cprob
        return (cvalue, cprob)

    def __reduce__(self):
        # support pickling: rebuild from constructor args plus extra state
        args = (self.classifiers, self.name, self.domain, self.class_var)
        return type(self), args, dict(self.__dict__)
274
275### MeasureAttribute_randomForests
276
class ScoreFeature(orange.MeasureAttribute):
    """
    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    """
    def __init__(self, trees=100, attributes=None, rand=None, base_learner=None, learner=None):

        self.trees = trees
        self.learner = learner
        self._bufinstances = None  # data set for which importances are cached
        self.attributes = attributes
        self.rand = rand
        self.base_learner = base_learner

        if base_learner != None and learner != None:
            # was: wrongSpecification() -- an undefined name that raised a
            # confusing NameError; raise an explicit, descriptive error instead
            raise ValueError("Specify either base_learner or learner, not both.")

        if not self.rand:
            self.rand = random.Random(0)
        # a separate Orange random generator, deterministically seeded from
        # the Python one
        self.randorange = Orange.core.RandomGenerator(self.rand.randint(0,2**31-1))

        if learner == None:
            # randomize the base learner with random feature subset selection
            self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange)
        else:
            self.learner = learner

    def __call__(self, feature, instances, apriorClass=None):
        """
        Return importance of a given feature.
        Only the first call on a given data set is computationally expensive.
       
        :param feature: feature to evaluate (by index, name or
            :class:`Orange.data.variable.Variable` object).
        :type feature: int, str or :class:`Orange.data.variable.Variable`.
       
        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`
       
        :param apriorClass: not used!
       
        """
        attrNo = None

        if type(feature) == int: #by attr. index
            attrNo = feature
        elif type(feature) == type("a"): #by attr. name
            attrName = feature
            attrNo = instances.domain.index(attrName)
        elif isinstance(feature, Orange.data.variable.Variable):
            atrs = [a for a in instances.domain.attributes]
            attrNo = atrs.index(feature)
        else:
            raise Exception("MeasureAttribute_rf can not be called with (\
                contingency,classDistribution, apriorClass) as function arguments.")

        self._buffer(instances)

        # normalize the accumulated importance by the number of trees built
        return self._avimp[attrNo]*100/self._acu

    def importances(self, table):
        """
        DEPRECATED. Return importance of all features in the dataset as a list.
       
        :param table: dataset of which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`

        """
        self._buffer(table)
        return [a*100/self._acu for a in self._avimp]

    def _buffer(self, instances):
        """
        Recalculate importance of features if needed (i.e. if it has not
        yet been buffered for the given dataset).

        :param instances: dataset of which the features' importance needs to
            be measured.
        :type instances: :class:`Orange.data.Table`

        """
        if instances != self._bufinstances or \
            instances.version != self._bufinstances.version:

            self._bufinstances = instances
            self._avimp = [0.0]*len(self._bufinstances.domain.attributes)
            self._acu = 0
            self._importanceAcu(self._bufinstances, self.trees, self._avimp)

    def _getOOB(self, instances, selection, nexamples):
        """Return the out-of-bag instances: those absent from the bootstrap
        ``selection``."""
        # membership test against a set is O(1); the previous filter over the
        # selection list made this quadratic in the number of instances
        chosen = set(selection)
        ooblist = [x for x in range(nexamples) if x not in chosen]
        return instances.getitems(ooblist)

    def _numRight(self, oob, classifier):
        """
        Return a number of instances which are classified correctly.
        """
        #TODO How to accommodate regression?
        return sum(1 for el in oob if el.getclass() == classifier(el))

    def _numRightMix(self, oob, classifier, attr):
        """
        Return a number of instances which are classified
        correctly even if a feature is shuffled.
        """
        perm = range(len(oob))
        self.rand.shuffle(perm)

        def shuffle_ex(index):
            # copy the instance and replace attr's value with the value of
            # that attribute from a randomly chosen OOB instance
            ex = Orange.data.Instance(oob[index])
            ex[attr] = oob[perm[index]][attr]
            return ex
        #TODO How to accommodate regression?
        return sum(1 for i in range(len(oob)) if oob[i].getclass() == classifier(shuffle_ex(i)))

    def _importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp by importances for a given number of trees."""
        n = len(instances)

        attrs = len(instances.domain.attributes)

        # map feature name -> index in the domain
        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
            attrnum[instances.domain.attributes[attr].name] = attr

        if "attributes" in self.learner.__dict__:
            # default feature-subset size: square root of the number of features
            self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes == None else self.attributes

        # build the forest
        classifiers = []
        for i in range(trees):
            # draw a bootstrap sample: n instances sampled with replacement
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)

            # build the model from the bootstrap sample
            cla = self.learner(data)

            # prepare OOB data
            oob = self._getOOB(instances, selection, n)

            # number right on unmixed data
            right = self._numRight(oob, cla)

            presl = range(attrs)
            try: #FIXME SimpleTreeLearner does not know how to output attributes yet
                presl = list(self._presentInTree(cla.tree, attrnum))
            except Exception:
                # narrowed from a bare except; still best-effort -- fall back
                # to testing all features when the tree cannot be inspected
                pass

            # randomize each feature in data and test
            # only those on which there was a split
            for attr in presl:
                # calculate number of right classifications
                # if the values of this feature are permuted randomly
                rightimp = self._numRightMix(oob, cla, attr)
                avimp[attr] += (float(right-rightimp))/len(oob)
        self._acu += trees

    def _presentInTree(self, node, attrnum):
        """Return features present in tree (features that split)."""
        if not node:
            return set([])

        if node.branchSelector:
            # this node splits: record its feature and recurse into branches
            j = attrnum[node.branchSelector.class_var.name]
            cs = set([])
            for i in range(len(node.branches)):
                s = self._presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
            return set([])
479
class _RandomForestTreeLearner(Orange.core.Learner):
    """ A learner which wraps an ordinary TreeLearner with
    a new split constructor.
    """

    def __new__(cls, examples = None, weightID = 0, **argkw):
        obj = Orange.core.Learner.__new__(cls, **argkw)
        if not examples:
            return obj
        # convenience: constructing with data trains immediately and
        # returns the classifier instead of the learner
        obj.__init__(**argkw)
        return obj.__call__(examples, weightID)

    def __init__(self, base, rand):
        self.base = base        # the wrapped TreeLearner
        self.attributes = None  # feature-subset size; set by the forest learner
        # one shared Python RNG for all the built trees
        self.rand = rand if rand else random.Random(0)

    def __call__(self, examples, weight=0):
        """ A current tree learner is copied, modified and then used.
        Modification: set a different split constructor, which uses
        a random subset of attributes.
        """
        learner = copy.copy(self.base)

        # if the base tree learner has no scoring measure set, pick one
        # matching the type of the class variable
        if not learner.measure:
            if isinstance(examples.domain.class_var, Orange.data.variable.Discrete):
                learner.measure = Orange.feature.scoring.Gini()
            else:
                learner.measure = Orange.feature.scoring.MSE()

        learner.split = SplitConstructor_AttributeSubset(
            learner.split, self.attributes, self.rand)

        return learner(examples, weight=weight)
517
class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    """A split constructor that delegates to a wrapped split constructor,
    but offers it only a random subset of the candidate features, as in
    random forests."""

    def __init__(self, scons, attributes, rand = None):
        self.scons = scons           # split constructor of original tree
        self.attributes = attributes # number of features to consider (None = sqrt)
        self.rand = rand
        if not self.rand:
            self.rand = random.Random(0)

    def __call__(self, gen, weightID, contingencies, apriori, candidates, clsfr):
        # if the number of features for the subset is not set, use the square
        # root of the number of candidates as the long-standing comment here
        # promised (previously a None value crashed with a TypeError)
        if self.attributes == None:
            subset_size = int(sqrt(len(candidates)))
        else:
            # clamp so the candidate mask never exceeds the candidate list
            # (previously attributes > len(candidates) built an over-long mask)
            subset_size = min(int(self.attributes), len(candidates))
        cand = [1]*subset_size + [0]*(len(candidates) - subset_size)
        self.rand.shuffle(cand)
        # instead of with all features, invoke the wrapped split constructor
        # only with the chosen subset of features
        return self.scons(gen, weightID, contingencies, apriori, cand, clsfr)
Note: See TracBrowser for help on using the repository browser.