source: orange/Orange/ensemble/forest.py @ 10769:8838c9b438d8

Revision 10769:8838c9b438d8, 22.1 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Merge

Line 
1from math import sqrt, floor
2import Orange.core as orange
3import Orange
4import Orange.feature.scoring
5import random
6import copy
7from Orange.utils import deprecated_keywords
8
def _default_small_learner(attributes=None, rand=None, base=None):
    """Return a randomized tree learner assembled as suggested by
    Breiman (2001), wrapping *base* (or a default TreeLearner)."""
    if base:
        tree_base = base
    else:
        # Default base: a lean TreeLearner that keeps only what the
        # forest needs (distributions) and stops at 5 instances.
        tree_base = Orange.classification.tree.TreeLearner(
            store_node_classifier=0, store_contingencies=0,
            store_distributions=1, min_instances=5)
    return _RandomForestTreeLearner(base=tree_base, rand=rand)
17
def _default_simple_learner(base, randorange):
    """Wrap *base* (or a default SimpleTreeLearner) for random-forest use."""
    learner = base if base != None else \
        Orange.classification.tree.SimpleTreeLearner(min_instances=5)
    return _RandomForestSimpleTreeLearner(base=learner, rand=randorange)
22
def _wrap_learner(base, rand, randorange):
    """Wrap a base tree learner with random feature-subset selection.

    :param base: None, a SimpleTreeLearner or a TreeLearner.
    :param rand: Python random generator (used by the TreeLearner wrapper).
    :param randorange: Orange random generator (used by the SimpleTreeLearner
        wrapper).
    :raises ValueError: if *base* is of an unsupported learner type.
    """
    if base is None or isinstance(base, Orange.classification.tree.SimpleTreeLearner):
        return _default_simple_learner(base, randorange)
    elif isinstance(base, Orange.classification.tree.TreeLearner):
        return _default_small_learner(None, rand, base)
    else:
        # The original called the undefined name `notRightLearnerToWrap()`,
        # which raised an unhelpful NameError; raise an explicit error.
        raise ValueError("base_learner must be None, a TreeLearner or "
                         "a SimpleTreeLearner")
30 
class _RandomForestSimpleTreeLearner(Orange.core.Learner):
    """A learner which wraps an ordinary SimpleTreeLearner.  Sets the
    skip_prob so that the number of randomly chosen features for each
    split is (on average) as specified."""

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, base=None, rand=None):  # pickle needs an empty init
        self.base = base
        self.attributes = None  # set by the forest learner before training
        self.rand = rand

    def __call__(self, instances, weight=0):
        """Train one tree, temporarily adjusting the shared base learner.

        The base learner's skip_prob and random_generator are saved,
        overridden for this call and restored afterwards.
        """
        osp, orand = self.base.skip_prob, self.base.random_generator
        # Choose skip_prob so that on average `attributes` features are
        # considered at each split.
        self.base.skip_prob = \
            1 - float(self.attributes) / len(instances.domain.attributes)
        self.base.random_generator = self.rand
        try:
            return self.base(instances, weight)
        finally:
            # Restore the base learner even if training raised; the
            # original left it modified on failure.
            self.base.skip_prob, self.base.random_generator = osp, orand

_RandomForestSimpleTreeLearner = Orange.utils.deprecated_members(
    {"weightID": "weight_id", "examples": "instances"})(
    _RandomForestSimpleTreeLearner)
52   
class RandomForestLearner(Orange.core.Learner):
    """
    Trains an ensemble predictor consisting of trees trained on
    bootstrap samples of training data. To increase randomness, the
    tree learner considers only a subset of candidate features at each
    node. The algorithm closely follows the original procedure
    (Breiman, 2001) both in implementation and parameter defaults.

    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best one to split the data sets
            in tree nodes. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None
        (default), then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :param callback: a function to be called after every iteration of
            induction of classifier. The call includes a parameter
            (from 0.0 to 1.0) that provides an estimate
            of completion of the learning progress.

    :param name: learner name.
    :type name: string

    :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or
            :class:`~Orange.ensemble.forest.RandomForestLearner`

    :raises ValueError: if both :obj:`base_learner` and :obj:`learner`
        are given.
    """

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, trees=100, attributes=None, name='Random Forest',
                 rand=None, callback=None, base_learner=None, learner=None):
        self.trees = trees
        self.name = name
        self.attributes = attributes
        self.callback = callback
        self.rand = rand
        self.base_learner = base_learner

        if base_learner is not None and learner is not None:
            # The original called the undefined name `wrongSpecification()`,
            # which raised an unhelpful NameError.
            raise ValueError("Specify either base_learner or learner, "
                             "not both.")

        if not self.rand:
            self.rand = random.Random(0)
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2 ** 31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner,
                                         rand=self.rand,
                                         randorange=self.randorange)
        else:
            self.learner = learner

        self.randstate = self.rand.getstate()  # original state

    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.

        :param instances: learning data.
        :type instances: class:`Orange.data.Table`
        :param weight: weight.
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """
        # Reset both generators so repeated calls build identical forests.
        self.rand.setstate(self.randstate)
        self.randorange.reset()

        # Only wrapped learners expose `attributes`; a user-supplied
        # `learner` is used as-is.
        if "attributes" in self.learner.__dict__:
            self.learner.attributes = \
                len(instances.domain.attributes) ** 0.5 \
                if self.attributes is None else self.attributes

        learner = self.learner
        n = len(instances)

        # Build the forest.
        classifiers = []
        for i in range(self.trees):
            # Draw a bootstrap sample (with replacement) of size n.
            selection = [self.rand.randrange(n) for j in range(n)]
            data = instances.get_items_ref(selection)
            # Build one tree from the bootstrap sample.
            classifiers.append(learner(data, weight))
            if self.callback:
                self.callback((i + 1.) / self.trees)

        return RandomForestClassifier(classifiers=classifiers, name=self.name,
                    domain=instances.domain,
                    class_var=instances.domain.class_var,
                    class_vars=instances.domain.class_vars)

RandomForestLearner = Orange.utils.deprecated_members(
    {"examples": "instances"})(RandomForestLearner)
168
class RandomForestClassifier(orange.Classifier):
    """
    Uses the trees induced by the :obj:`RandomForestLearner`. An input
    instance is classified into the class with the most frequent vote.
    However, this implementation returns the averaged probabilities from
    each of the trees if class probability is requested.

    When constructed manually, the following parameters have to
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list

    :param name: name of the resulting classifier.
    :type name: str

    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`

    :param class_var: the class feature.
    :type class_var: :class:`Orange.feature.Descriptor`

    :param class_vars: the multi-target class features.
    :type class_vars: list of :class:`Orange.feature.Descriptor`

    """
    def __init__(self, classifiers, name, domain, class_var, class_vars, **kwds):
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.class_var = class_var
        self.class_vars = class_vars
        self.__dict__.update(kwds)
        # Single-target mode iff no multi-target class_vars were given.
        self.single_class = not class_vars

    def __call__(self, instance, result_type=orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`

        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        from operator import add

        instance = Orange.data.Instance(self.domain, instance)
        # Query each tree once (value + distribution) to avoid double calls.
        res_both = [c(instance, orange.GetBoth) for c in self.classifiers]

        # Work on locals: the original rebound self.class_vars and
        # self.class_var here, mutating classifier state on every call.
        if self.single_class:
            class_vars = [self.class_var]
            # Wrap single-class results to match the multi-target shape.
            res_both = [([r[0]], [r[1]]) for r in res_both]
        else:
            class_vars = self.class_vars

        want_value = result_type == orange.GetValue \
            or result_type == orange.GetBoth
        want_prob = result_type == orange.GetProbabilities \
            or result_type == orange.GetBoth

        mt_prob = []
        mt_value = []

        for varn in xrange(len(class_vars)):
            class_var = class_vars[varn]

            if class_var.var_type == Orange.feature.Discrete.Discrete:
                # Discrete target: average the trees' class distributions.
                if want_prob:
                    prob = [0.] * len(class_var.values)
                    for r in res_both:
                        prob = map(add, prob, [x for x in r[1][varn]])
                    norm = sum(prob)
                    cprob = Orange.statistics.distribution.Discrete(class_var)
                    for i in range(len(prob)):
                        cprob[i] = prob[i] / norm

                # Crisp majority vote; note this may disagree with the
                # argmax of the averaged probabilities above.
                if want_value:
                    cfreq = [0] * len(class_var.values)
                    for r in res_both:
                        cfreq[int(r[0][varn])] += 1
                    index = cfreq.index(max(cfreq))
                    cvalue = Orange.data.Value(class_var, index)
            else:
                # Continuous target.
                if want_prob:
                    # Merge the per-tree distributions by summing weights
                    # per predicted value.
                    cprob = dict()
                    for val, pdist in res_both:
                        if pdist[varn] != None:
                            a = dict(pdist[varn].items())
                        else:  # no probability output; use point prediction
                            a = {val[varn].value: 1.}
                        cprob = dict((v, a.get(v, 0) + cprob.get(v, 0))
                                     for v in set(a) | set(cprob))
                    cprob = Orange.statistics.distribution.Continuous(cprob)
                    cprob.normalize()

                # Average class value over all trees.
                if want_value:
                    values = [r[0][varn] for r in res_both]
                    cvalue = Orange.data.Value(
                        class_var, sum(values) / len(self.classifiers))

            if want_value:
                mt_value.append(cvalue)
            if want_prob:
                mt_prob.append(cprob)

        # Unwrap single-class results before returning.
        if self.single_class:
            if result_type == orange.GetValue:
                return mt_value[0]
            elif result_type == orange.GetProbabilities:
                return mt_prob[0]
            else:
                return [mt_value[0], mt_prob[0]]

        if result_type == orange.GetValue:
            return tuple(mt_value)
        elif result_type == orange.GetProbabilities:
            return tuple(mt_prob)
        else:
            return [tuple(mt_value), tuple(mt_prob)]

    def __reduce__(self):
        # Rebuild from constructor args; extra attributes travel in the dict.
        return type(self), (self.classifiers, self.name, self.domain,
                            self.class_var, self.class_vars), \
            dict(self.__dict__)

RandomForestClassifier = Orange.utils.deprecated_members(
    {"resultType": "result_type", "classVar": "class_var",
     "example": "instance"})(RandomForestClassifier)
308### MeasureAttribute_randomForests
309
class ScoreFeature(orange.MeasureAttribute):
    """
    Scores features by their permutation importance in a random forest.

    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None
        (default), then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :raises ValueError: if both :obj:`base_learner` and :obj:`learner`
        are given.
    """
    def __init__(self, trees=100, attributes=None, rand=None,
                 base_learner=None, learner=None):
        self.trees = trees
        self.learner = learner
        self._bufinstances = None  # last dataset for which importances were computed
        self.attributes = attributes
        self.rand = rand
        self.base_learner = base_learner

        if base_learner is not None and learner is not None:
            # The original called the undefined name `wrongSpecification()`,
            # which raised an unhelpful NameError.
            raise ValueError("Specify either base_learner or learner, "
                             "not both.")

        if not self.rand:
            self.rand = random.Random(0)
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2 ** 31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner,
                                         rand=self.rand,
                                         randorange=self.randorange)
        else:
            self.learner = learner

    @deprecated_keywords({"apriorClass": "aprior_class"})
    def __call__(self, feature, instances, aprior_class=None):
        """
        Return importance of a given feature.
        Only the first call on a given data set is computationally expensive.

        :param feature: feature to evaluate (by index, name or
            :class:`Orange.feature.Descriptor` object).
        :type feature: int, str or :class:`Orange.feature.Descriptor`.

        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`

        :param aprior_class: not used!

        :raises ValueError: if *feature* is of an unsupported type.
        """
        attrNo = None

        if type(feature) == int:  # by attr. index
            attrNo = feature
        elif type(feature) == type("a"):  # by attr. name
            attrName = feature
            attrNo = instances.domain.index(attrName)
        elif isinstance(feature, Orange.feature.Descriptor):
            atrs = [a for a in instances.domain.attributes]
            attrNo = atrs.index(feature)
        else:
            # The original raised a bare Exception with a garbled message
            # ("fuction", embedded continuation whitespace).
            raise ValueError(
                "MeasureAttribute_rf cannot be called with (contingency, "
                "classDistribution, aprior_class) as function arguments.")

        self._buffer(instances)

        return self._avimp[attrNo] * 100 / self._acu

    def importances(self, table):
        """
        DEPRECATED. Return importance of all features in the dataset as a list.

        :param table: dataset of which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`

        """
        self._buffer(table)
        return [a * 100 / self._acu for a in self._avimp]

    def _buffer(self, instances):
        """
        Recalculate importance of features if needed (i.e. if it has not
        been buffered for the given dataset yet).

        :param instances: dataset of which the features' importance needs
            to be measured.
        :type instances: :class:`Orange.data.Table`

        """
        # When _bufinstances is None, the first comparison is True and
        # short-circuits, so .version is never read from None.
        if instances != self._bufinstances or \
                instances.version != self._bufinstances.version:
            self._bufinstances = instances
            self._avimp = [0.0] * len(self._bufinstances.domain.attributes)
            self._acu = 0
            self._importanceAcu(self._bufinstances, self.trees, self._avimp)

    def _getOOB(self, instances, selection, nexamples):
        """Return the out-of-bag instances (those not in *selection*)."""
        # Use a set for O(1) membership tests; the original tested
        # membership against the selection list (O(n) per test).
        chosen = set(selection)
        ooblist = [x for x in range(nexamples) if x not in chosen]
        return instances.getitems(ooblist)

    def _numRight(self, oob, classifier):
        """
        Return a number of instances which are classified correctly.
        """
        # TODO How to accomodate regression?
        return sum(1 for el in oob if el.getclass() == classifier(el))

    def _numRightMix(self, oob, classifier, attr):
        """
        Return a number of instances which are classified
        correctly even if a feature is shuffled.
        """
        perm = range(len(oob))
        self.rand.shuffle(perm)

        def shuffle_ex(index):
            # Copy the instance, replacing attr's value with a randomly
            # drawn one from another OOB instance.
            ex = Orange.data.Instance(oob[index])
            ex[attr] = oob[perm[index]][attr]
            return ex
        # TODO How to accomodate regression?
        return sum(1 for i in range(len(oob))
                   if oob[i].getclass() == classifier(shuffle_ex(i)))

    def _importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp by importances for a given number of trees."""
        n = len(instances)
        attrs = len(instances.domain.attributes)

        # Map feature name -> index for _presentInTree lookups.
        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
            attrnum[instances.domain.attributes[attr].name] = attr

        if "attributes" in self.learner.__dict__:
            self.learner.attributes = \
                len(instances.domain.attributes) ** 0.5 \
                if self.attributes is None else self.attributes

        # Build the forest.
        for i in range(trees):
            # Draw a bootstrap sample.
            selection = [self.rand.randrange(n) for j in range(n)]
            data = instances.getitems(selection)

            # Build the model from the bootstrap sample.
            cla = self.learner(data)

            # Prepare OOB data.
            oob = self._getOOB(instances, selection, n)

            # Accuracy on unshuffled OOB data.
            right = self._numRight(oob, cla)

            presl = range(attrs)
            try:  # FIXME SimpleTreeLearner does not know how to output attributes yet
                presl = list(self._presentInTree(cla.tree, attrnum))
            except Exception:
                # Bare `except:` in the original; narrowed so that e.g.
                # KeyboardInterrupt is not swallowed.
                pass

            # Randomize each feature in data and test, but only those on
            # which there was a split.
            for attr in presl:
                # Number of right classifications if the values of this
                # feature are permuted randomly.
                rightimp = self._numRightMix(oob, cla, attr)
                avimp[attr] += (float(right - rightimp)) / len(oob)
        self._acu += trees

    def _presentInTree(self, node, attrnum):
        """Return features present in tree (features that split)."""
        if not node:
            return set([])

        if node.branchSelector:
            j = attrnum[node.branchSelector.class_var.name]
            cs = set([])
            for i in range(len(node.branches)):
                s = self._presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
            return set([])
513
class _RandomForestTreeLearner(Orange.core.Learner):
    """A learner which wraps an ordinary TreeLearner with
    a new split constructor.
    """

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, base, rand):
        self.base = base
        self.attributes = None
        # One generator shared by all the trees this learner builds.
        self.rand = rand if rand else random.Random(0)

    @deprecated_keywords({"examples": "instances"})
    def __call__(self, instances, weight=0):
        """Copy the base tree learner, modify the copy, and train with it.

        Modification: install a split constructor that considers only a
        random subset of attributes at each node.
        """
        learner = copy.copy(self.base)

        # If the base tree learner has no scoring measure set, pick one
        # matching the class variable's type.
        if not learner.measure:
            is_discrete = isinstance(instances.domain.class_var,
                                     Orange.feature.Discrete)
            learner.measure = Orange.feature.scoring.Gini() if is_discrete \
                else Orange.feature.scoring.MSE()

        learner.split = SplitConstructor_AttributeSubset(
            learner.split, self.attributes, self.rand)

        return learner(instances, weight=weight)
546
class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    """Split constructor that delegates to a wrapped constructor but
    exposes only a random subset of the candidate features at each node."""

    def __init__(self, scons, attributes, rand=None):
        self.scons = scons            # split constructor of original tree
        self.attributes = attributes  # number of features to consider
        self.rand = rand
        if not self.rand:
            self.rand = random.Random(0)

    @deprecated_keywords({"weightID": "weight_id"})
    def __call__(self, gen, weight_id, contingencies, apriori, candidates, clsfr):
        # Build a random 0/1 mask over the candidate features.  Clamp the
        # subset size: the original produced a wrong-length mask whenever
        # self.attributes exceeded len(candidates) ([0] * negative == []).
        n_subset = min(int(self.attributes), len(candidates))
        cand = [1] * n_subset + [0] * (len(candidates) - n_subset)
        self.rand.shuffle(cand)
        # Instead of all features, invoke the wrapped split constructor
        # only on the selected subset.
        return self.scons(gen, weight_id, contingencies, apriori, cand, clsfr)
Note: See TracBrowser for help on using the repository browser.