source: orange/Orange/ensemble/forest.py @ 10587:5c9fc139ddfa

Revision 10587:5c9fc139ddfa, 20.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Changed the way callback function is called in RandomForestLearner (the way it is documented). Fixed the widgets correspondingly.

Line 
1from math import sqrt, floor
2import Orange.core as orange
3import Orange
4import Orange.feature.scoring
5import random
6import copy
7from Orange.utils import deprecated_keywords
8
def _default_small_learner(attributes=None, rand=None, base=None):
    """Return a TreeLearner randomized for random forests, assembled
    as suggested by Breiman (2001).

    :param attributes: unused here; feature subset size is configured
        on the returned wrapper by the caller.
    :param rand: random generator shared by the built trees.
    :param base: an existing TreeLearner to wrap; if falsy, a default
        one is constructed.
    """
    if base:
        return _RandomForestTreeLearner(base=base, rand=rand)

    # Defaults per Breiman (2001): keep only class distributions in the
    # nodes and stop splitting below 5 instances.
    default_base = Orange.classification.tree.TreeLearner(
        store_node_classifier=0, store_contingencies=0,
        store_distributions=1, min_instances=5)
    return _RandomForestTreeLearner(base=default_base, rand=rand)
17
def _default_simple_learner(base, randorange):
    """Wrap a SimpleTreeLearner for use inside a random forest.

    :param base: a SimpleTreeLearner, or None to build the default one
        (which stops splitting below 5 instances).
    :param randorange: Orange random generator used for random feature
        subset selection.
    """
    # Fixed: compare with None by identity, not equality.
    if base is None:
        base = Orange.classification.tree.SimpleTreeLearner(min_instances=5)
    return _RandomForestSimpleTreeLearner(base=base, rand=randorange)
22
def _wrap_learner(base, rand, randorange):
    """Wrap a base tree learner so it performs random feature subset
    selection, as required by random forests.

    :param base: the base learner (None, a SimpleTreeLearner, or a
        TreeLearner).
    :param rand: Python random generator (used with TreeLearner).
    :param randorange: Orange random generator (used with
        SimpleTreeLearner).
    :raises ValueError: if ``base`` is not a supported tree learner.
    """
    if base is None or isinstance(base, Orange.classification.tree.SimpleTreeLearner):
        return _default_simple_learner(base, randorange)
    elif isinstance(base, Orange.classification.tree.TreeLearner):
        return _default_small_learner(None, rand, base)
    else:
        # Fixed: the original called the undefined name
        # notRightLearnerToWrap(), producing a confusing NameError.
        raise ValueError("base learner must be None, a TreeLearner or "
                         "a SimpleTreeLearner")
30 
class _RandomForestSimpleTreeLearner(Orange.core.Learner):
    """A learner which wraps an ordinary SimpleTreeLearner.  Sets the
    skip_prob so that the number of randomly chosen features for each
    split is (on average) as specified."""

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, base=None, rand=None): #pickle needs an empty init
        self.base = base          # the wrapped SimpleTreeLearner
        self.attributes = None    # desired number of features per split
        self.rand = rand          # Orange random generator for the base learner

    def __call__(self, instances, weight=0):
        """Build a tree from ``instances`` with the wrapped learner
        temporarily configured for random feature subset selection."""
        # The base learner may be shared; remember its state so it can
        # be restored afterwards.
        osp, orand = self.base.skip_prob, self.base.random_generator
        self.base.skip_prob = \
            1 - float(self.attributes) / len(instances.domain.attributes)
        self.base.random_generator = self.rand
        try:
            return self.base(instances, weight)
        finally:
            # Fixed: restore even if learning raises, so the shared base
            # learner is never left in a randomized state.
            self.base.skip_prob, self.base.random_generator = osp, orand

_RandomForestSimpleTreeLearner = Orange.utils.deprecated_members({"weightID":"weight_id", "examples":"instances"})(_RandomForestSimpleTreeLearner)
52   
class RandomForestLearner(Orange.core.Learner):
    """
    Trains an ensemble predictor consisting of trees trained
    on bootstrap samples of training data. To increase
    randomness, the tree learner considers only a subset of
    candidate features at each node. The algorithm closely follows
    the original procedure (Breiman, 2001) both in implementation and
    parameter defaults.

    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best one to split the data sets
            in tree nodes. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :param callback: a function to be called after every iteration of
            induction of classifier. The call includes a parameter
            (from 0.0 to 1.0) that provides an estimate
            of completion of the learning progress.

    :param name: learner name.
    :type name: string

    :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or
            :class:`~Orange.ensemble.forest.RandomForestLearner`

    """

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, trees=100, attributes=None,
                    name='Random Forest', rand=None, callback=None,
                    base_learner=None, learner=None):
        self.trees = trees
        self.name = name
        self.attributes = attributes
        self.callback = callback
        self.rand = rand

        self.base_learner = base_learner

        # base_learner and learner are mutually exclusive ways of
        # configuring tree induction.
        # Fixed: the original called the undefined name
        # wrongSpecification(), producing a confusing NameError.
        if base_learner is not None and learner is not None:
            raise ValueError("Specify either base_learner or learner, "
                             "but not both.")

        if not self.rand:
            self.rand = random.Random(0)
        # Orange's own random generator, seeded from the Python one so
        # that both sources of randomness are reproducible together.
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2**31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner,
                                         rand=self.rand,
                                         randorange=self.randorange)
        else:
            self.learner = learner

        self.randstate = self.rand.getstate() #original state

    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.

        :param instances: learning data.
        :type instances: class:`Orange.data.Table`
        :param weight: weight.
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """
        # Reset both generators so that repeated calls on the same data
        # build identical forests.
        self.rand.setstate(self.randstate)
        self.randorange.reset()

        if "attributes" in self.learner.__dict__:
            # Breiman's default: square root of the number of features.
            self.learner.attributes = \
                len(instances.domain.attributes)**0.5 \
                if self.attributes is None else self.attributes

        learner = self.learner

        n = len(instances)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw a bootstrap sample: n draws with replacement
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.get_items_ref(selection)
            # build the model from the bootstrap sample
            classifiers.append(learner(data, weight))
            if self.callback:
                # report fraction of completed trees, in (0.0, 1.0]
                self.callback((i + 1.) / self.trees)

        return RandomForestClassifier(classifiers=classifiers, name=self.name,
                    domain=instances.domain, class_var=instances.domain.class_var)
RandomForestLearner = Orange.utils.deprecated_members({"examples":"instances"})(RandomForestLearner)
165
class RandomForestClassifier(orange.Classifier):
    """
    Uses the trees induced by the :obj:`RandomForestLearner`. An input
    instance is classified into the class with the most frequent vote.
    However, this implementation returns the averaged probabilities from
    each of the trees if class probability is requested.

    When constructed manually, the following parameters have to
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list

    :param name: name of the resulting classifier.
    :type name: str

    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`

    :param class_var: the class feature.
    :type class_var: :class:`Orange.feature.Descriptor`

    """
    def __init__(self, classifiers, name, domain, class_var, **kwds):
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.class_var = class_var
        # Extra keyword arguments become plain attributes (kept for
        # backward compatibility with older callers).
        self.__dict__.update(kwds)

    def __call__(self, instance, result_type = orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`

        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        from operator import add

        # Handle a discrete class.

        if self.class_var.var_type == Orange.feature.Discrete.Discrete:

            # Voting for class probabilities: element-wise sum of the
            # per-tree distributions, normalized at the end.
            if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                prob = [0.] * len(self.domain.class_var.values)
                for c in self.classifiers:
                    a = [x for x in c(instance, orange.GetProbabilities)]
                    prob = map(add, prob, a)
                norm = sum(prob)
                cprob = Orange.statistics.distribution.Discrete(self.class_var)
                for i in range(len(prob)):
                    cprob[i] = prob[i]/norm

            # voting for crisp class membership, notice that
            # this may not be the same class as one obtaining the
            # highest probability through probability voting
            if result_type == orange.GetValue or result_type == orange.GetBoth:
                cfreq = [0] * len(self.domain.class_var.values)
                for c in self.classifiers:
                    cfreq[int(c(instance))] += 1
                # On ties, the class with the lowest index wins.
                index = cfreq.index(max(cfreq))
                cvalue = Orange.data.Value(self.domain.class_var, index)

            if result_type == orange.GetValue: return cvalue
            elif result_type == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)

        else:
            # Handle a continuous class.

            # "Probabilities" for regression: merge the per-tree value
            # distributions, falling back to a point mass on the
            # predicted value when a tree gives no distribution.
            if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                probs = [c(instance, orange.GetBoth) for c in self.classifiers]
                cprob = dict()
                for val,prob in probs:
                    if prob != None: #no probability output
                        a = dict(prob.items())
                    else:
                        a = { val.value : 1. }
                    # Merge the accumulated and per-tree distributions by
                    # summing weights over the union of their keys.
                    cprob = dict( (n, a.get(n, 0)+cprob.get(n, 0)) for n in set(a)|set(cprob) )
                cprob = Orange.statistics.distribution.Continuous(cprob)
                cprob.normalize()

            # The predicted value is the plain average over all trees.
            if result_type == orange.GetValue or result_type == orange.GetBoth:
                values = [c(instance).value for c in self.classifiers]
                cvalue = Orange.data.Value(self.domain.class_var, sum(values) / len(self.classifiers))

            if result_type == orange.GetValue: return cvalue
            elif result_type == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)

    def __reduce__(self):
        """Support pickling: rebuild from the constructor arguments plus
        any extra attributes stored on the instance."""
        return type(self), (self.classifiers, self.name, self.domain, self.class_var), dict(self.__dict__)
RandomForestClassifier = Orange.utils.deprecated_members({"resultType":"result_type", "classVar":"class_var", "example":"instance"})(RandomForestClassifier)
267### MeasureAttribute_randomForests
268
class ScoreFeature(orange.MeasureAttribute):
    """
    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    """
    def __init__(self, trees=100, attributes=None, rand=None, base_learner=None, learner=None):

        self.trees = trees
        self.learner = learner
        self._bufinstances = None  # data set the cached importances refer to
        self.attributes = attributes
        self.rand = rand
        self.base_learner = base_learner

        # base_learner and learner are mutually exclusive.
        # Fixed: the original called the undefined name
        # wrongSpecification(), producing a confusing NameError.
        if base_learner is not None and learner is not None:
            raise ValueError("Specify either base_learner or learner, "
                             "but not both.")

        if not self.rand:
            self.rand = random.Random(0)
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2**31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange)
        else:
            self.learner = learner

    @deprecated_keywords({"apriorClass":"aprior_class"})
    def __call__(self, feature, instances, aprior_class=None):
        """
        Return importance of a given feature.
        Only the first call on a given data set is computationally expensive.

        :param feature: feature to evaluate (by index, name or
            :class:`Orange.feature.Descriptor` object).
        :type feature: int, str or :class:`Orange.feature.Descriptor`.

        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`

        :param aprior_class: not used!

        """
        attrNo = None

        if isinstance(feature, int): #by attr. index
            attrNo = feature
        elif isinstance(feature, str): #by attr. name
            attrNo = instances.domain.index(feature)
        elif isinstance(feature, Orange.feature.Descriptor):
            atrs = [a for a in instances.domain.attributes]
            attrNo = atrs.index(feature)
        else:
            # Fixed: raise with a clean, single-line message (the original
            # embedded a line continuation inside the string).
            raise ValueError("ScoreFeature cannot be called with "
                             "(contingency, classDistribution, aprior_class) "
                             "as function arguments.")

        self._buffer(instances)

        return self._avimp[attrNo]*100/self._acu

    def importances(self, table):
        """
        DEPRECATED. Return importance of all features in the dataset as a list.

        :param table: dataset of which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`

        """
        self._buffer(table)
        return [a*100/self._acu for a in self._avimp]

    def _buffer(self, instances):
        """
        Recalculate importance of features if needed (i.e. if it has not
        been buffered for the given dataset yet).

        :param instances: dataset of which the features' importance needs
            to be measured.
        :type instances: :class:`Orange.data.Table`

        """
        if instances != self._bufinstances or \
            instances.version != self._bufinstances.version:

            self._bufinstances = instances
            self._avimp = [0.0]*len(self._bufinstances.domain.attributes)
            self._acu = 0
            self._importanceAcu(self._bufinstances, self.trees, self._avimp)

    def _getOOB(self, instances, selection, nexamples):
        """Return the out-of-bag instances: those whose index does not
        appear in the bootstrap ``selection``."""
        ooblist = filter(lambda x: x not in selection, range(nexamples))
        return instances.getitems(ooblist)

    def _numRight(self, oob, classifier):
        """
        Return a number of instances which are classified correctly.
        """
        #TODO How to accomodate regression?
        return sum(1 for el in oob if el.getclass() == classifier(el))

    def _numRightMix(self, oob, classifier, attr):
        """
        Return a number of instances which are classified
        correctly even if a feature is shuffled.
        """
        perm = range(len(oob))
        self.rand.shuffle(perm)

        def shuffle_ex(index):
            # Copy the instance, replacing the value of `attr` with the
            # value from a randomly chosen other OOB instance.
            ex = Orange.data.Instance(oob[index])
            ex[attr] = oob[perm[index]][attr]
            return ex
        #TODO How to accomodate regression?
        return sum(1 for i in range(len(oob)) if oob[i].getclass() == classifier(shuffle_ex(i)))

    def _importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp by importances for a given number of trees."""
        n = len(instances)

        attrs = len(instances.domain.attributes)

        # Map feature name -> feature index for _presentInTree lookups.
        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
            attrnum[instances.domain.attributes[attr].name] = attr

        if "attributes" in self.learner.__dict__:
            # Breiman's default: square root of the number of features.
            self.learner.attributes = len(instances.domain.attributes)**0.5 \
                if self.attributes is None else self.attributes

        # build the forest
        classifiers = []
        for i in range(trees):
            # draw bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)

            # build the model from the bootstrap sample
            cla = self.learner(data)

            #prepare OOB data
            oob = self._getOOB(instances, selection, n)
            # Fixed: a degenerate bootstrap sample may cover every
            # instance, leaving no OOB data; skip this tree instead of
            # dividing by zero below.
            if not len(oob):
                continue

            #right on unmixed
            right = self._numRight(oob, cla)

            presl = range(attrs)
            try: #FIXME SimpleTreeLearner does not know how to output attributes yet
                presl = list(self._presentInTree(cla.tree, attrnum))
            except Exception:
                # Best effort: fall back to permuting every feature.
                pass

            #randomize each feature in data and test
            #only those on which there was a split
            for attr in presl:
                #calculate number of right classifications
                #if the values of this features are permutated randomly
                rightimp = self._numRightMix(oob, cla, attr)
                avimp[attr] += (float(right-rightimp))/len(oob)
        self._acu += trees

    def _presentInTree(self, node, attrnum):
        """Return features present in tree (features that split)."""
        if not node:
            return set([])

        if node.branchSelector:
            # This node splits: record its feature and recurse into the
            # branches.
            j = attrnum[node.branchSelector.class_var.name]
            cs = set([])
            for i in range(len(node.branches)):
                s = self._presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
            return set([])
472
class _RandomForestTreeLearner(Orange.core.Learner):
    """A learner which wraps an ordinary TreeLearner with
    a new split constructor.
    """

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, base, rand):
        self.base = base
        self.attributes = None
        # One random generator shared by all the trees built by this learner.
        self.rand = rand if rand else random.Random(0)

    @deprecated_keywords({"examples":"instances"})
    def __call__(self, instances, weight=0):
        """Copy the base tree learner, plug in a split constructor that
        considers only a random subset of features at every node, and
        induce a tree from ``instances``.
        """
        tree_learner = copy.copy(self.base)

        # If the base learner carries no scoring measure, supply the
        # conventional default: Gini for classification, MSE for regression.
        if not tree_learner.measure:
            if isinstance(instances.domain.class_var, Orange.feature.Discrete):
                tree_learner.measure = Orange.feature.scoring.Gini()
            else:
                tree_learner.measure = Orange.feature.scoring.MSE()

        tree_learner.split = SplitConstructor_AttributeSubset(
            tree_learner.split, self.attributes, self.rand)

        return tree_learner(instances, weight=weight)
505
class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    """A split constructor that delegates to a wrapped constructor but
    offers it only a random subset of the candidate features."""

    def __init__(self, scons, attributes, rand = None):
        self.scons = scons            # split constructor of the original tree
        self.attributes = attributes  # number of features to consider
        # Deterministic default generator when none is provided.
        self.rand = rand if rand else random.Random(0)

    @deprecated_keywords({"weightID":"weight_id"})
    def __call__(self, gen, weight_id, contingencies, apriori, candidates, clsfr):
        # Mark `attributes` randomly chosen candidates as eligible (1)
        # and the rest as ineligible (0), then let the wrapped
        # constructor search for the best split among that subset only.
        subset_size = int(self.attributes)
        cand = [1] * subset_size + [0] * (len(candidates) - subset_size)
        self.rand.shuffle(cand)
        return self.scons(gen, weight_id, contingencies, apriori, cand, clsfr)
Note: See TracBrowser for help on using the repository browser.