source: orange/Orange/ensemble/forest.py @ 10851:b91d80df8bb3

Revision 10851:b91d80df8bb3, 22.1 KB checked in by Miran@…, 2 years ago (diff)

[BUG] fixed a pickling bug in ensemble.forest._RandomForestTreeLearner

Line 
1from math import sqrt, floor
2import Orange.core as orange
3import Orange
4import Orange.feature.scoring
5import random
6import copy
7from Orange.utils import deprecated_keywords
8
def _default_small_learner(attributes=None, rand=None, base=None):
    """Return a randomized TreeLearner wrapper, assembled as suggested
    by Breiman (2001).  When ``base`` is not given, a default
    TreeLearner is constructed (distributions stored, min 5 instances
    per node)."""
    if base:
        return _RandomForestTreeLearner(base=base, rand=rand)
    default_base = Orange.classification.tree.TreeLearner(
        store_node_classifier=0, store_contingencies=0,
        store_distributions=1, min_instances=5)
    return _RandomForestTreeLearner(base=default_base, rand=rand)
17
def _default_simple_learner(base, randorange):
    """Return a randomized SimpleTreeLearner wrapper.

    :param base: a SimpleTreeLearner or None; when None a default one
        (min 5 instances per node) is constructed.
    :param randorange: an :class:`Orange.misc.Random` generator used by
        the wrapped learner.
    """
    # idiom fix: identity comparison with None, not equality
    if base is None:
        base = Orange.classification.tree.SimpleTreeLearner(min_instances=5)
    return _RandomForestSimpleTreeLearner(base=base, rand=randorange)
22
def _wrap_learner(base, rand, randorange):
    """Wrap ``base`` into the matching random-forest learner.

    SimpleTreeLearner (or None) gets the simple wrapper; TreeLearner gets
    the split-constructor wrapper.  Anything else is rejected.

    :raises ValueError: if ``base`` is neither kind of tree learner.
    """
    if base is None or isinstance(base, Orange.classification.tree.SimpleTreeLearner):
        return _default_simple_learner(base, randorange)
    elif isinstance(base, Orange.classification.tree.TreeLearner):
        return _default_small_learner(None, rand, base)
    else:
        # was a call to an undefined name (notRightLearnerToWrap), which
        # raised a confusing NameError; raise a descriptive error instead
        raise ValueError("base_learner must be a TreeLearner or a "
                         "SimpleTreeLearner (or None)")
30 
class _RandomForestSimpleTreeLearner(Orange.core.Learner):
    """A learner which wraps an ordinary SimpleTreeLearner.  Sets the
    skip_prob so that the number of randomly chosen features for each
    split is  (on average) as specified."""

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, base=None, rand=None): # pickle needs an empty init
        self.base = base            # the wrapped SimpleTreeLearner
        self.attributes = None      # target number of features per split
        self.rand = rand            # Orange.misc.Random generator

    def __call__(self, instances, weight=0):
        """Train one randomized tree on ``instances``.

        The shared base learner is temporarily reconfigured (skip_prob,
        random_generator) and restored afterwards; the restore is now in
        a ``finally`` clause so an exception while training no longer
        leaks the modified state into subsequent uses of ``self.base``.
        """
        osp, orand = self.base.skip_prob, self.base.random_generator
        try:
            # skip features with probability such that, on average,
            # `attributes` features are considered at each split
            self.base.skip_prob = \
                1 - float(self.attributes) / len(instances.domain.attributes)
            self.base.random_generator = self.rand
            return self.base(instances, weight)
        finally:
            self.base.skip_prob, self.base.random_generator = osp, orand

_RandomForestSimpleTreeLearner = Orange.utils.deprecated_members(
    {"weightID": "weight_id", "examples": "instances"})(_RandomForestSimpleTreeLearner)
52
53
class _RandomForestTreeLearner(Orange.core.Learner):
    """ A learner which wraps an ordinary TreeLearner with
    a new split constructor.
    """

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, base=None, rand=None): # pickle needs an empty init
        self.base = base
        self.attributes = None
        self.rand = rand
        if not self.rand: #for all the built trees
            self.rand = random.Random(0)

    @deprecated_keywords({"examples":"instances"})
    def __call__(self, instances, weight=0):
        """Copy the wrapped tree learner, install a split constructor that
        scores only a random subset of features, and train with the copy.
        """
        learner = copy.copy(self.base)

        # supply a default scoring measure when the base learner has none:
        # Gini for a discrete class, MSE for a continuous one
        if not learner.measure:
            if isinstance(instances.domain.class_var, Orange.feature.Discrete):
                learner.measure = Orange.feature.scoring.Gini()
            else:
                learner.measure = Orange.feature.scoring.MSE()

        learner.split = SplitConstructor_AttributeSubset(
            learner.split, self.attributes, self.rand)

        return learner(instances, weight=weight)
86
87
88
class RandomForestLearner(Orange.core.Learner):
    """
    Trains an ensemble predictor consisting of trees trained
    on bootstrap
    samples of training data. To increase
    randomness, the tree learner considers only a subset of
    candidate features at each node. The algorithm closely follows
    the original procedure (Breiman, 2001) both in implementation and parameter
    defaults.

    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best one to split the data sets
            in tree nodes. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :param callback: a function to be called after every iteration of
            induction of classifier. The call includes a parameter
            (from 0.0 to 1.0) that provides an estimate
            of completion of the learning progress.

    :param name: learner name.
    :type name: string

    :raises ValueError: if both :obj:`base_learner` and :obj:`learner`
            are specified.

    :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or
            :class:`~Orange.ensemble.forest.RandomForestLearner`

    """

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, trees=100, attributes=None,
                 name='Random Forest', rand=None, callback=None,
                 base_learner=None, learner=None):
        self.trees = trees
        self.name = name
        self.attributes = attributes
        self.callback = callback
        self.rand = rand

        self.base_learner = base_learner

        # base_learner and learner are mutually exclusive; the original
        # called an undefined name (wrongSpecification) here, producing a
        # confusing NameError -- raise a descriptive error instead
        if base_learner is not None and learner is not None:
            raise ValueError("specify either base_learner or learner, "
                             "not both")

        if not self.rand:
            self.rand = random.Random(0)
        # derive an Orange random generator for the wrapped tree learners
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2**31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner,
                                         rand=self.rand,
                                         randorange=self.randorange)
        else:
            self.learner = learner

        self.randstate = self.rand.getstate() #original state

    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.

        :param instances: learning data.
        :type instances: class:`Orange.data.Table`
        :param weight: weight.
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """
        self.rand.setstate(self.randstate) #when learning again, set the same state
        self.randorange.reset()

        # wrapped learners expose an `attributes` attribute; set the
        # feature-subset size (default: sqrt of the number of features)
        if "attributes" in self.learner.__dict__:
            self.learner.attributes = len(instances.domain.attributes)**0.5 \
                if self.attributes is None else self.attributes

        learner = self.learner

        n = len(instances)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw a bootstrap sample: n draws with replacement
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.get_items_ref(selection)
            # build the model from the bootstrap sample
            classifiers.append(learner(data, weight))
            if self.callback:
                self.callback((i + 1.) / self.trees)

        return RandomForestClassifier(classifiers=classifiers, name=self.name,
                    domain=instances.domain,
                    class_var=instances.domain.class_var,
                    class_vars=instances.domain.class_vars)

RandomForestLearner = Orange.utils.deprecated_members(
    {"examples": "instances"})(RandomForestLearner)
203
class RandomForestClassifier(orange.Classifier):
    """
    Uses the trees induced by the :obj:`RandomForestLearner`. An input
    instance is classified into the class with the most frequent vote.
    However, this implementation returns the averaged probabilities from
    each of the trees if class probability is requested.

    When constructed manually, the following parameters have to
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list
   
    :param name: name of the resulting classifier.
    :type name: str
   
    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`
   
    :param class_var: the class feature.
    :type class_var: :class:`Orange.feature.Descriptor`

    :param class_vars: the multi-target class features.
    :type class_vars: list of :class:`Orange.feature.Descriptor`

    """
    def __init__(self, classifiers, name, domain, class_var, class_vars, **kwds):
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.class_var = class_var
        self.class_vars = class_vars
        self.__dict__.update(kwds)
        # empty/None class_vars marks the ordinary single-target case;
        # __call__ branches on this flag throughout
        self.single_class = True if not class_vars else False

    def __call__(self, instance, result_type = orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
       
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`
       
        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        from operator import add

        # convert the instance into this classifier's domain
        instance = Orange.data.Instance(self.domain, instance)
        # get results to avoid multiple calls
        res_both = [c(instance, orange.GetBoth) for c in self.classifiers]

        # transform single class instance to match multi-target instances
        # NOTE(review): this assigns to self.class_vars (and self.class_var
        # inside the loop below), so a prediction mutates classifier state;
        # not thread-safe and state differs after the first call -- confirm
        # before refactoring
        if self.single_class:
            self.class_vars = [self.class_var]
            res_both = [([(r[0])],[(r[1])]) for r in res_both]

        mt_prob = []
        mt_value = []

        # one pass per target variable (exactly one in the single-class case)
        for varn in xrange(len(self.class_vars)):

            self.class_var = self.class_vars[varn]
           
            # handle discrete class
       
            if self.class_var.var_type == Orange.feature.Discrete.Discrete:
       
                # voting for class probabilities: sum per-tree distributions,
                # then normalize into a Discrete distribution
                if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                    prob = [0.] * len(self.class_var.values)
                    for r in res_both:
                        a = [x for x in r[1][varn]]
                        prob = map(add, prob, a)
                    norm = sum(prob)
                    cprob = Orange.statistics.distribution.Discrete(self.class_var)
                    for i in range(len(prob)):
                        cprob[i] = prob[i]/norm
               
                # voting for crisp class membership, notice that
                # this may not be the same class as one obtaining the
                # highest probability through probability voting
                if result_type == orange.GetValue or result_type == orange.GetBoth:
                    cfreq = [0] * len(self.class_var.values)
                    for r in res_both:
                        cfreq[int(r[0][varn])] += 1
                    index = cfreq.index(max(cfreq))
                    cvalue = Orange.data.Value(self.class_var, index)
           
                if result_type == orange.GetValue: mt_value.append(cvalue)
                elif result_type == orange.GetProbabilities: mt_prob.append(cprob)
                else: 
                    mt_value.append(cvalue)
                    mt_prob.append(cprob)
       
            else:
                # Handle continuous class
       
                # voting for class probabilities: merge the trees' value
                # distributions (or their point predictions when a tree
                # outputs no distribution) and normalize
                if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                    probs = [ r for r in res_both]
                    cprob = dict()
     
                    for val,prob in probs:
                        if prob[varn] != None: #no probability output
                            a = dict(prob[varn].items())
                        else:
                            # fall back to a unit mass at the predicted value
                            a = { val[varn].value : 1. }
                        cprob = dict( (n, a.get(n, 0)+cprob.get(n, 0)) for n in set(a)|set(cprob) )
                    cprob = Orange.statistics.distribution.Continuous(cprob)
                    cprob.normalize()
               
                # gather average class value
                if result_type == orange.GetValue or result_type == orange.GetBoth:
                    values = [r[0][varn] for r in res_both]
                    cvalue = Orange.data.Value(self.class_var, sum(values) / len(self.classifiers))
           
                if result_type == orange.GetValue: mt_value.append(cvalue)
                elif result_type == orange.GetProbabilities: mt_prob.append(cprob)
                else: 
                    mt_value.append(cvalue)
                    mt_prob.append(cprob)
       
        # check for singleclass when returning: unwrap the one-element lists
        if self.single_class:
            if result_type == orange.GetValue: return mt_value[0]
            elif result_type == orange.GetProbabilities: return mt_prob[0]
            else: 
                return [mt_value[0],mt_prob[0]] 
           
        if result_type == orange.GetValue: return tuple(mt_value)
        elif result_type == orange.GetProbabilities: return tuple(mt_prob)
        else: 
            return [tuple(mt_value),tuple(mt_prob)]

    def __reduce__(self):
        # pickling support: rebuild from the constructor arguments plus any
        # extra attributes stashed in __dict__
        return type(self), (self.classifiers, self.name, self.domain, self.class_var, self.class_vars), dict(self.__dict__)

RandomForestClassifier = Orange.utils.deprecated_members({"resultType":"result_type", "classVar":"class_var", "example":"instance"})(RandomForestClassifier)
344### MeasureAttribute_randomForests
345
class ScoreFeature(orange.MeasureAttribute):
    """
    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :raises ValueError: if both :obj:`base_learner` and :obj:`learner`
        are specified.

    """
    def __init__(self, trees=100, attributes=None, rand=None,
                 base_learner=None, learner=None):

        self.trees = trees
        self.learner = learner
        self._bufinstances = None   # dataset the importances were computed for
        self.attributes = attributes
        self.rand = rand
        self.base_learner = base_learner

        # base_learner and learner are mutually exclusive; the original
        # called an undefined name (wrongSpecification) here, producing a
        # confusing NameError -- raise a descriptive error instead
        if base_learner is not None and learner is not None:
            raise ValueError("specify either base_learner or learner, "
                             "not both")

        if not self.rand:
            self.rand = random.Random(0)
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2**31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner,
                                         rand=self.rand,
                                         randorange=self.randorange)
        else:
            self.learner = learner

    @deprecated_keywords({"apriorClass": "aprior_class"})
    def __call__(self, feature, instances, aprior_class=None):
        """
        Return importance of a given feature.
        Only the first call on a given data set is computationally expensive.

        :param feature: feature to evaluate (by index, name or
            :class:`Orange.feature.Descriptor` object).
        :type feature: int, str or :class:`Orange.feature.Descriptor`.

        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`

        :param aprior_class: not used!

        """
        attrNo = None

        if type(feature) == int: #by attr. index
            attrNo = feature
        elif type(feature) == type("a"): #by attr. name
            attrName = feature
            attrNo = instances.domain.index(attrName)
        elif isinstance(feature, Orange.feature.Descriptor):
            atrs = [a for a in instances.domain.attributes]
            attrNo = atrs.index(feature)
        else:
            raise Exception("MeasureAttribute_rf can not be called with (\
                contingency,classDistribution, aprior_class) as function arguments.")

        # compute (or reuse buffered) importances for this dataset
        self._buffer(instances)

        return self._avimp[attrNo]*100/self._acu

    def importances(self, table):
        """
        DEPRECATED. Return importance of all features in the dataset as a list.

        :param table: dataset of which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`

        """
        self._buffer(table)
        return [a*100/self._acu for a in self._avimp]

    def _buffer(self, instances):
        """
        Recalculate importance of features if needed (i.e. if it has not
        been buffered for the given dataset yet).

        :param instances: dataset of which the features' importance needs
            to be measured.
        :type instances: :class:`Orange.data.Table`

        """
        # recompute when the dataset (or its version) changed
        if instances != self._bufinstances or \
            instances.version != self._bufinstances.version:

            self._bufinstances = instances
            self._avimp = [0.0]*len(self._bufinstances.domain.attributes)
            self._acu = 0
            self._importanceAcu(self._bufinstances, self.trees, self._avimp)

    def _getOOB(self, instances, selection, nexamples):
        """Return the out-of-bag instances: those not drawn in ``selection``."""
        ooblist = filter(lambda x: x not in selection, range(nexamples))
        return instances.getitems(ooblist)

    def _numRight(self, oob, classifier):
        """
        Return a number of instances which are classified correctly.
        """
        #TODO How to accomodate regression?
        return sum(1 for el in oob if el.getclass() == classifier(el))

    def _numRightMix(self, oob, classifier, attr):
        """
        Return a number of instances which are classified
        correctly even if a feature is shuffled.
        """
        # random permutation used to shuffle the values of one feature
        perm = range(len(oob))
        self.rand.shuffle(perm)

        def shuffle_ex(index):
            ex = Orange.data.Instance(oob[index])
            ex[attr] = oob[perm[index]][attr]
            return ex
        #TODO How to accomodate regression?
        return sum(1 for i in range(len(oob))
                   if oob[i].getclass() == classifier(shuffle_ex(i)))

    def _importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp by importances for a given number of trees."""
        n = len(instances)

        attrs = len(instances.domain.attributes)

        # map feature name -> index for _presentInTree lookups
        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
            attrnum[instances.domain.attributes[attr].name] = attr

        # wrapped learners expose an `attributes` attribute; set the
        # feature-subset size (default: sqrt of the number of features)
        if "attributes" in self.learner.__dict__:
            self.learner.attributes = len(instances.domain.attributes)**0.5 \
                if self.attributes is None else self.attributes

        # build the forest
        classifiers = []
        for i in range(trees):
            # draw a bootstrap sample: n draws with replacement
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)

            # build the model from the bootstrap sample
            cla = self.learner(data)

            #prepare OOB data
            oob = self._getOOB(instances, selection, n)

            #right on unmixed
            right = self._numRight(oob, cla)

            presl = range(attrs)
            try: #FIXME SimpleTreeLearner does not know how to output attributes yet
                presl = list(self._presentInTree(cla.tree, attrnum))
            except Exception:
                # was a bare `except:` which also swallowed SystemExit and
                # KeyboardInterrupt; keep the deliberate best-effort fallback
                # to "all features" but only for ordinary exceptions
                pass

            #randomize each feature in data and test
            #only those on which there was a split
            for attr in presl:
                #calculate number of right classifications
                #if the values of this features are permutated randomly
                rightimp = self._numRightMix(oob, cla, attr)
                avimp[attr] += (float(right-rightimp))/len(oob)
        self._acu += trees

    def _presentInTree(self, node, attrnum):
        """Return features present in tree (features that split)."""
        if not node:
            return set([])

        if node.branchSelector:
            j = attrnum[node.branchSelector.class_var.name]
            cs = set([])
            for i in range(len(node.branches)):
                s = self._presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
            return set([])
549
550
class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    """A split constructor that delegates to a wrapped constructor but
    offers it only a random subset of the candidate features (Random
    Forest feature sampling)."""

    def __init__(self, scons, attributes, rand=None):
        self.scons = scons           # split constructor of original tree
        self.attributes = attributes # number of features to consider
        self.rand = rand
        if not self.rand:
            self.rand = random.Random(0)

    @deprecated_keywords({"weightID": "weight_id"})
    def __call__(self, gen, weight_id, contingencies, apriori, candidates, clsfr):
        # if number of features for subset is not set, use square root --
        # previously this crashed with int(None); now it implements the
        # documented Breiman default
        if self.attributes is None:
            subset_size = int(sqrt(len(candidates)))
        else:
            subset_size = int(self.attributes)
        # mark a random subset of `subset_size` features as candidates
        cand = [1]*subset_size + [0]*(len(candidates) - subset_size)
        self.rand.shuffle(cand)
        # instead with all features, we will invoke split constructor
        # only for the subset of a features
        t = self.scons(gen, weight_id, contingencies, apriori, cand, clsfr)
        return t
Note: See TracBrowser for help on using the repository browser.