source: orange/Orange/ensemble/forest.py @ 10525:a139c6838b3c

Revision 10525:a139c6838b3c, 20.5 KB checked in by markotoplak, 2 years ago (diff)

Random forests: replaced custom news with Orange.misc_orangenew.

Line 
1from math import sqrt, floor
2import Orange.core as orange
3import Orange
4import Orange.feature.scoring
5import random
6import copy
7from Orange.misc import deprecated_keywords
8
def _default_small_learner(attributes=None, rand=None, base=None):
    """Return a randomized tree learner assembled as suggested by
    Breiman (2001): if no base learner is given, a TreeLearner that
    stores distributions only and refuses to split nodes with fewer
    than 5 instances is used."""
    if base:
        return _RandomForestTreeLearner(base=base, rand=rand)
    # No base learner supplied -- build the default Breiman-style tree.
    default_base = Orange.classification.tree.TreeLearner(
        store_node_classifier=0, store_contingencies=0,
        store_distributions=1, min_instances=5)
    return _RandomForestTreeLearner(base=default_base, rand=rand)
17
def _default_simple_learner(base, randorange):
    """Wrap a SimpleTreeLearner for random-forest use.

    :param base: a SimpleTreeLearner, or None for the default one
        (which does not split nodes with fewer than 5 instances).
    :param randorange: Orange random generator driving the random
        feature subset selection.
    """
    if base is None:  # identity test: do not rely on a custom __eq__
        base = Orange.classification.tree.SimpleTreeLearner(min_instances=5)
    return _RandomForestSimpleTreeLearner(base=base, rand=randorange)
22
def _wrap_learner(base, rand, randorange):
    """Wrap ``base`` with random-forest feature subset selection.

    :param base: None, a SimpleTreeLearner, or a TreeLearner.
    :param rand: Python random generator (used when wrapping a TreeLearner).
    :param randorange: Orange random generator (used when wrapping a
        SimpleTreeLearner).
    :raises ValueError: if ``base`` is not a supported tree learner.
    """
    if base is None or isinstance(base, Orange.classification.tree.SimpleTreeLearner):
        return _default_simple_learner(base, randorange)
    elif isinstance(base, Orange.classification.tree.TreeLearner):
        return _default_small_learner(None, rand, base)
    else:
        # The original code called an undefined function here, which
        # surfaced as an uninformative NameError; raise a descriptive
        # exception instead.
        raise ValueError("base_learner must be None, a SimpleTreeLearner "
                         "or a TreeLearner")
30 
class _RandomForestSimpleTreeLearner(Orange.core.Learner):
    """A learner which wraps an ordinary SimpleTreeLearner.  Sets the
    skip_prob so that the number of randomly chosen features for each
    split is (on average) as specified."""

    __new__ = Orange.misc._orange__new__(Orange.core.Learner)

    def __init__(self, base, rand):
        self.base = base          # the wrapped SimpleTreeLearner
        self.attributes = None    # target number of features per split
        self.rand = rand          # Orange random generator

    def __call__(self, instances, weight=0):
        """Train the wrapped learner with a temporarily adjusted
        skip_prob and random generator; always restore the originals."""
        osp, orand = self.base.skip_prob, self.base.random_generator
        try:
            # Skip each feature with a probability chosen so that, on
            # average, `self.attributes` features remain per split.
            self.base.skip_prob = \
                1 - float(self.attributes) / len(instances.domain.attributes)
            self.base.random_generator = self.rand
            return self.base(instances, weight)
        finally:
            # The original restored these only on the success path and
            # therefore leaked the modified settings if training raised.
            self.base.skip_prob, self.base.random_generator = osp, orand

_RandomForestSimpleTreeLearner = Orange.misc.deprecated_members(
    {"weightID": "weight_id", "examples": "instances"})(
    _RandomForestSimpleTreeLearner)
52   
class RandomForestLearner(Orange.core.Learner):
    """
    Just like in bagging, classifiers in random forests are trained from bootstrap
    samples of training data. Here, the classifiers are trees. However, to increase
    randomness, at each node of the tree the best feature is
    chosen from a subset of features in the data. We closely follow the
    original algorithm (Brieman, 2001) both in implementation and parameter
    defaults.

    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :param callback: a function called with no arguments after each
            tree is induced; there are :obj:`trees` calls in total, so
            it can be used to track learning progress.

    :param name: name of the learner.
    :type name: string

    :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or
            :class:`~Orange.ensemble.forest.RandomForestLearner`

    :raises ValueError: if both :obj:`base_learner` and :obj:`learner`
            are specified (they are mutually exclusive).
    """

    __new__ = Orange.misc._orange__new__(Orange.core.Learner)

    def __init__(self, trees=100, attributes=None,
                 name='Random Forest', rand=None, callback=None,
                 base_learner=None, learner=None):
        self.trees = trees
        self.name = name
        self.attributes = attributes
        self.callback = callback
        self.rand = rand

        self.base_learner = base_learner

        if base_learner is not None and learner is not None:
            # The original code called an undefined function here, which
            # surfaced as an uninformative NameError.
            raise ValueError("base_learner and learner are mutually "
                             "exclusive; specify at most one of them")

        if not self.rand:
            self.rand = random.Random(0)
        # A separate Orange random generator, seeded from the Python one,
        # drives the feature subset sampling inside the tree learners.
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2**31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner,
                                         rand=self.rand,
                                         randorange=self.randorange)
        else:
            self.learner = learner

        self.randstate = self.rand.getstate()  # original state

    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.

        :param instances: learning data.
        :type instances: class:`Orange.data.Table`
        :param weight: weight.
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """
        # Reset both generators so repeated calls build identical forests.
        self.rand.setstate(self.randstate)
        self.randorange.reset()

        # The wrapping learners expose an `attributes` member; set it to
        # the requested subset size (default: sqrt of the feature count).
        if "attributes" in self.learner.__dict__:
            self.learner.attributes = len(instances.domain.attributes)**0.5 \
                if self.attributes is None else self.attributes

        learner = self.learner

        n = len(instances)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw a bootstrap sample: n draws with replacement
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.get_items_ref(selection)
            # build the model from the bootstrap sample
            classifiers.append(learner(data, weight))
            if self.callback:
                self.callback()

        return RandomForestClassifier(classifiers=classifiers, name=self.name,
                                      domain=instances.domain,
                                      class_var=instances.domain.class_var)

RandomForestLearner = Orange.misc.deprecated_members(
    {"examples": "instances"})(RandomForestLearner)
164
class RandomForestClassifier(orange.Classifier):
    """
    Uses the trees induced by the :obj:`RandomForestLearner`. An input
    instance is classified into the class with the most frequent vote.
    However, this implementation returns the averaged probabilities from
    each of the trees if class probability is requested.

    When constructed manually, the following parameters have to
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list

    :param name: name of the resulting classifier.
    :type name: str

    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`

    :param class_var: the class feature.
    :type class_var: :class:`Orange.feature.Descriptor`
    """
    def __init__(self, classifiers, name, domain, class_var, **kwds):
        # Per-tree classifiers whose votes are aggregated in __call__.
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.class_var = class_var
        self.__dict__.update(kwds)

    def __call__(self, instance, result_type = orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`

        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        from operator import add

        # handle discrete class
        if self.class_var.var_type == Orange.feature.Discrete.Discrete:

            # voting for class probabilities: sum the per-tree
            # distributions and normalize.
            if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                prob = [0.] * len(self.domain.class_var.values)
                for c in self.classifiers:
                    a = [x for x in c(instance, orange.GetProbabilities)]
                    # element-wise sum; relies on Python 2 map() returning a list
                    prob = map(add, prob, a)
                norm = sum(prob)
                cprob = Orange.statistics.distribution.Discrete(self.class_var)
                for i in range(len(prob)):
                    cprob[i] = prob[i]/norm

            # voting for crisp class membership, notice that
            # this may not be the same class as one obtaining the
            # highest probability through probability voting
            if result_type == orange.GetValue or result_type == orange.GetBoth:
                cfreq = [0] * len(self.domain.class_var.values)
                for c in self.classifiers:
                    # one vote per tree, keyed by the class value's index
                    cfreq[int(c(instance))] += 1
                index = cfreq.index(max(cfreq))
                cvalue = Orange.data.Value(self.domain.class_var, index)

            if result_type == orange.GetValue: return cvalue
            elif result_type == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)

        else:
            # Handle continuous class

            # voting for class probabilities: merge the per-tree
            # distributions (or point predictions) into one histogram.
            if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                probs = [c(instance, orange.GetBoth) for c in self.classifiers]
                cprob = dict()
                for val, prob in probs:
                    if prob != None: #no probability output
                        a = dict(prob.items())
                    else:
                        # tree gave only a point prediction: count it as
                        # a unit mass at that value
                        a = { val.value : 1. }
                    # merge histograms: sum weights over the union of keys
                    cprob = dict( (n, a.get(n, 0)+cprob.get(n, 0)) for n in set(a)|set(cprob) )
                cprob = Orange.statistics.distribution.Continuous(cprob)
                cprob.normalize()

            # gather average class value over all trees
            if result_type == orange.GetValue or result_type == orange.GetBoth:
                values = [c(instance).value for c in self.classifiers]
                cvalue = Orange.data.Value(self.domain.class_var, sum(values) / len(self.classifiers))

            if result_type == orange.GetValue: return cvalue
            elif result_type == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)

    def __reduce__(self):
        # Support pickling: rebuild from the four constructor arguments
        # plus any extra attributes stashed on the instance.
        return type(self), (self.classifiers, self.name, self.domain, self.class_var), dict(self.__dict__)

RandomForestClassifier = Orange.misc.deprecated_members({"resultType":"result_type", "classVar":"class_var", "example":"instance"})(RandomForestClassifier)
266### MeasureAttribute_randomForests
267
class ScoreFeature(orange.MeasureAttribute):
    """
    Feature importance by permutation, estimated on a random forest's
    out-of-bag samples.

    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :raises ValueError: if both :obj:`base_learner` and :obj:`learner`
        are specified (they are mutually exclusive).
    """
    def __init__(self, trees=100, attributes=None, rand=None, base_learner=None, learner=None):
        self.trees = trees
        self.learner = learner
        self._bufinstances = None    # dataset whose importances are cached
        self.attributes = attributes
        self.rand = rand
        self.base_learner = base_learner

        if base_learner is not None and learner is not None:
            # The original code called an undefined function here, which
            # surfaced as an uninformative NameError.
            raise ValueError("base_learner and learner are mutually "
                             "exclusive; specify at most one of them")

        if not self.rand:
            self.rand = random.Random(0)
        # Orange random generator (seeded from the Python one) for the
        # tree learners' feature subset sampling.
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2**31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner,
                                         rand=self.rand,
                                         randorange=self.randorange)
        else:
            self.learner = learner

    @deprecated_keywords({"apriorClass": "aprior_class"})
    def __call__(self, feature, instances, aprior_class=None):
        """
        Return importance of a given feature.
        Only the first call on a given data set is computationally expensive.

        :param feature: feature to evaluate (by index, name or
            :class:`Orange.feature.Descriptor` object).
        :type feature: int, str or :class:`Orange.feature.Descriptor`.

        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`

        :param aprior_class: not used!
        """
        attrNo = None

        if type(feature) == int:  # by attr. index
            attrNo = feature
        elif type(feature) == type("a"):  # by attr. name
            attrName = feature
            attrNo = instances.domain.index(attrName)
        elif isinstance(feature, Orange.feature.Descriptor):
            atrs = [a for a in instances.domain.attributes]
            attrNo = atrs.index(feature)
        else:
            raise Exception("MeasureAttribute_rf can not be called with (\
                contingency,classDistribution, aprior_class) as fuction arguments.")

        self._buffer(instances)

        return self._avimp[attrNo] * 100 / self._acu

    def importances(self, table):
        """
        DEPRECATED. Return importance of all features in the dataset as a list.

        :param table: dataset of which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`
        """
        self._buffer(table)
        return [a * 100 / self._acu for a in self._avimp]

    def _buffer(self, instances):
        """
        Recalculate importance of features if needed (i.e. if it has not
        been buffered for the given dataset yet).

        :param instances: dataset of which the features' importance needs
            to be measured.
        :type instances: :class:`Orange.data.Table`
        """
        # On the very first call _bufinstances is None; the inequality
        # test short-circuits before .version is dereferenced.
        if instances != self._bufinstances or \
            instances.version != self._bufinstances.version:
            self._bufinstances = instances
            self._avimp = [0.0] * len(self._bufinstances.domain.attributes)
            self._acu = 0
            self._importanceAcu(self._bufinstances, self.trees, self._avimp)

    def _getOOB(self, instances, selection, nexamples):
        """Return the out-of-bag instances: those not drawn into the
        bootstrap ``selection``."""
        ooblist = filter(lambda x: x not in selection, range(nexamples))
        return instances.getitems(ooblist)

    def _numRight(self, oob, classifier):
        """
        Return a number of instances which are classified correctly.
        """
        #TODO How to accomodate regression?
        return sum(1 for el in oob if el.getclass() == classifier(el))

    def _numRightMix(self, oob, classifier, attr):
        """
        Return a number of instances which are classified
        correctly even if a feature is shuffled.
        """
        perm = range(len(oob))
        self.rand.shuffle(perm)

        def shuffle_ex(index):
            # copy the instance, replacing attr's value with the value
            # from a randomly permuted OOB instance
            ex = Orange.data.Instance(oob[index])
            ex[attr] = oob[perm[index]][attr]
            return ex
        #TODO How to accomodate regression?
        return sum(1 for i in range(len(oob)) if oob[i].getclass() == classifier(shuffle_ex(i)))

    def _importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp by importances for a given number of trees."""
        n = len(instances)

        attrs = len(instances.domain.attributes)

        # map feature name -> feature index
        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
            attrnum[instances.domain.attributes[attr].name] = attr

        if "attributes" in self.learner.__dict__:
            self.learner.attributes = len(instances.domain.attributes)**0.5 \
                if self.attributes is None else self.attributes

        # build the forest
        classifiers = []
        for i in range(trees):
            # draw a bootstrap sample: n draws with replacement
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)

            # build the model from the bootstrap sample
            cla = self.learner(data)

            # prepare OOB data
            oob = self._getOOB(instances, selection, n)

            # accuracy on unpermuted OOB data
            right = self._numRight(oob, cla)

            presl = range(attrs)
            try:  # FIXME SimpleTreeLearner does not know how to output attributes yet
                presl = list(self._presentInTree(cla.tree, attrnum))
            except Exception:
                # fall back to testing every feature; the bare `except:`
                # this replaces also swallowed KeyboardInterrupt/SystemExit
                pass

            # randomize each feature in data and test
            # only those on which there was a split
            for attr in presl:
                # number of correct classifications when the values of
                # this feature are permuted randomly
                rightimp = self._numRightMix(oob, cla, attr)
                avimp[attr] += (float(right - rightimp)) / len(oob)
        self._acu += trees

    def _presentInTree(self, node, attrnum):
        """Return features present in tree (features that split)."""
        if not node:
            return set([])

        if node.branchSelector:
            j = attrnum[node.branchSelector.class_var.name]
            cs = set([])
            for i in range(len(node.branches)):
                s = self._presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
            return set([])
471
class _RandomForestTreeLearner(Orange.core.Learner):
    """A learner wrapping an ordinary TreeLearner with a split
    constructor that considers only a random subset of features."""

    __new__ = Orange.misc._orange__new__(Orange.core.Learner)

    def __init__(self, base, rand):
        self.base = base         # the TreeLearner to randomize
        self.attributes = None   # number of features drawn per split
        self.rand = rand
        if not self.rand:  # one generator shared by all the built trees
            self.rand = random.Random(0)

    @deprecated_keywords({"examples": "instances"})
    def __call__(self, instances, weight=0):
        """Train on a shallow copy of the base learner whose split
        constructor is replaced by one that draws a random feature
        subset at every node."""
        tree_learner = copy.copy(self.base)

        # If the base learner carries no scoring measure, choose one
        # matching the class variable's type.
        if not tree_learner.measure:
            if isinstance(instances.domain.class_var, Orange.feature.Discrete):
                tree_learner.measure = Orange.feature.scoring.Gini()
            else:
                tree_learner.measure = Orange.feature.scoring.MSE()

        tree_learner.split = SplitConstructor_AttributeSubset(
            tree_learner.split, self.attributes, self.rand)

        return tree_learner(instances, weight=weight)
504
class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    """Delegates split construction to a wrapped constructor while
    exposing only a random subset of candidate features per call."""

    def __init__(self, scons, attributes, rand=None):
        self.scons = scons            # split constructor of original tree
        self.attributes = attributes  # number of features to consider
        self.rand = rand
        if not self.rand:
            self.rand = random.Random(0)

    @deprecated_keywords({"weightID": "weight_id"})
    def __call__(self, gen, weight_id, contingencies, apriori, candidates, clsfr):
        # Build a candidate mask with exactly `attributes` features
        # enabled at random positions.
        n_chosen = int(self.attributes)
        subset = [1] * n_chosen + [0] * (len(candidates) - n_chosen)
        self.rand.shuffle(subset)
        # Invoke the wrapped split constructor on the reduced feature
        # set instead of on all features.
        return self.scons(gen, weight_id, contingencies, apriori, subset, clsfr)
Note: See TracBrowser for help on using the repository browser.