source: orange/Orange/ensemble/forest.py @ 11403:8ec741dcda0d

Revision 11403:8ec741dcda0d, 22.2 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Translate the instance domain in RandomForestClassifier.

Line 
1from math import sqrt, floor
2import Orange.core as orange
3import Orange
4import Orange.feature.scoring
5import random
6import copy
7from Orange.utils import deprecated_keywords
8from operator import add
9
def _default_small_learner(attributes=None, rand=None, base=None):
    """Return a randomized tree learner configured as suggested by
    Breiman (2001).

    ``attributes`` is accepted for call compatibility but not used here;
    the forest learner assigns the feature-subset size on the wrapper
    afterwards.
    """
    if base:
        tree = base
    else:
        tree = Orange.classification.tree.TreeLearner(
            store_node_classifier=0, store_contingencies=0,
            store_distributions=1, min_instances=5)
    return _RandomForestTreeLearner(base=tree, rand=rand)
18
def _default_simple_learner(base, randorange):
    """Wrap a :class:`SimpleTreeLearner` for use inside a random forest.

    :param base: a SimpleTreeLearner-compatible learner, or None to build
        the default one (no splits on nodes with fewer than 5 instances).
    :param randorange: an :class:`Orange.misc.Random` generator shared by
        all trees of the forest.
    """
    # idiom fix: identity comparison with None instead of ``== None``
    if base is None:
        base = Orange.classification.tree.SimpleTreeLearner(min_instances=5)
    return _RandomForestSimpleTreeLearner(base=base, rand=randorange)
23
def _wrap_learner(base, rand, randorange):
    """Wrap *base* in the matching forest-randomizing learner.

    SimpleTreeLearner / ClusteringTreeLearner (or None) get the simple
    wrapper; a plain TreeLearner gets the split-constructor wrapper.

    :raises ValueError: if *base* is of an unsupported learner type.
    """
    if base is None or isinstance(base, (Orange.classification.tree.SimpleTreeLearner,
                                         Orange.core.ClusteringTreeLearner)):
        return _default_simple_learner(base, randorange)
    elif isinstance(base, Orange.classification.tree.TreeLearner):
        return _default_small_learner(None, rand, base)
    else:
        # bug fix: the original called undefined notRightLearnerToWrap(),
        # which surfaced as an unhelpful NameError
        raise ValueError("base learner must be a TreeLearner, "
                         "SimpleTreeLearner or ClusteringTreeLearner")
31 
class _RandomForestSimpleTreeLearner(Orange.core.Learner):
    """A learner which wraps an ordinary SimpleTreeLearner.  Sets the
    skip_prob so that the number of randomly chosen features for each
    split is (on average) as specified."""

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, base=None, rand=None): # pickle needs an empty init
        self.base = base
        self.attributes = None  # desired avg. number of features per split
        self.rand = rand

    def __call__(self, instances, weight=0):
        """Train the wrapped learner with randomization applied.

        The wrapped (possibly shared) base learner is temporarily
        reconfigured and always restored afterwards.
        """
        # remember the base learner's settings so they can be put back
        osp, orand = self.base.skip_prob, self.base.random_generator
        # skip_prob is chosen so that on average `attributes` features
        # are considered at each split
        self.base.skip_prob = 1 - float(self.attributes) / len(instances.domain.attributes)
        self.base.random_generator = self.rand
        try:
            r = self.base(instances, weight)
        finally:
            # bug fix: restore even when training raises, so the shared
            # base learner is never left in a randomized state
            self.base.skip_prob, self.base.random_generator = osp, orand
        return r

_RandomForestSimpleTreeLearner = Orange.utils.deprecated_members({"weightID":"weight_id", "examples":"instances"})(_RandomForestSimpleTreeLearner)
53
class _RandomForestTreeLearner(Orange.core.Learner):
    """Wraps an ordinary TreeLearner so that every invocation trains with
    a split constructor restricted to a random subset of features.
    """

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, base=None, rand=None): # pickle needs an empty init
        self.base = base
        self.attributes = None
        # one shared generator for all the built trees
        self.rand = rand or random.Random(0)

    @deprecated_keywords({"examples":"instances"})
    def __call__(self, instances, weight=0):
        """Copy the base tree learner, modify the copy, and train it.

        Modification: a split constructor that considers only a random
        subset of the features at each node.
        """
        learner = copy.copy(self.base)

        # supply a default scoring measure when the base learner has
        # none: Gini for discrete classes, MSE for continuous ones
        if not learner.measure:
            if isinstance(instances.domain.class_var, Orange.feature.Discrete):
                learner.measure = Orange.feature.scoring.Gini()
            else:
                learner.measure = Orange.feature.scoring.MSE()

        learner.split = SplitConstructor_AttributeSubset(
            learner.split, self.attributes, self.rand)

        return learner(instances, weight=weight)
86
87
88
class RandomForestLearner(Orange.core.Learner):
    """
    Trains an ensemble predictor consisting of trees trained
    on bootstrap
    samples of training data. To increase
    randomness, the tree learner considers only a subset of
    candidate features at each node. The algorithm closely follows
    the original procedure (Brieman, 2001) both in implementation and parameter
    defaults.

    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best one to split the data sets
            in tree nodes. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :param callback: a function to be called after every iteration of
            induction of classifier. The call includes a parameter
            (from 0.0 to 1.0) that provides an estimate
            of completion of the learning progress.

    :param name: learner name.
    :type name: string

    :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or
            :class:`~Orange.ensemble.forest.RandomForestLearner`

    :raises ValueError: if both :obj:`base_learner` and :obj:`learner`
            are specified.
    """

    __new__ = Orange.utils._orange__new__(Orange.core.Learner)

    def __init__(self, trees=100, attributes=None,
                 name='Random Forest', rand=None, callback=None,
                 base_learner=None, learner=None):
        self.trees = trees
        self.name = name
        self.attributes = attributes
        self.callback = callback
        self.rand = rand

        self.base_learner = base_learner

        if base_learner is not None and learner is not None:
            # bug fix: the original called undefined wrongSpecification(),
            # which surfaced as an unhelpful NameError
            raise ValueError("specify either base_learner or learner, not both")

        if not self.rand:
            self.rand = random.Random(0)
        # a separate Orange-side generator seeded from the Python one
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2**31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange)
        else:
            self.learner = learner

        self.randstate = self.rand.getstate() #original state

    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.

        :param instances: learning data.
        :type instances: class:`Orange.data.Table`
        :param weight: weight.
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """
        # re-learning from the same state keeps results reproducible
        self.rand.setstate(self.randstate)
        self.randorange.reset()

        # wrapped learners expose `attributes`; plain user-supplied
        # learners may not, hence the __dict__ check
        if "attributes" in self.learner.__dict__:
            self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes is None else self.attributes

        learner = self.learner

        n = len(instances)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw a bootstrap sample: n draws with replacement
            selection = [self.rand.randrange(n) for _ in range(n)]
            data = instances.get_items_ref(selection)
            # build the model from the bootstrap sample
            classifiers.append(learner(data, weight))
            if self.callback:
                self.callback((i + 1.) / self.trees)

        return RandomForestClassifier(classifiers=classifiers, name=self.name,
                    domain=instances.domain, class_var=instances.domain.class_var,
                    class_vars=instances.domain.class_vars)


RandomForestLearner = Orange.utils.deprecated_members({"examples":"instances"})(RandomForestLearner)
204
class RandomForestClassifier(orange.Classifier):
    """
    Uses the trees induced by the :obj:`RandomForestLearner`. An input
    instance is classified into the class with the most frequent vote.
    However, this implementation returns the averaged probabilities from
    each of the trees if class probability is requested.

    When constructed manually, the following parameters have to
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list

    :param name: name of the resulting classifier.
    :type name: str

    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`

    :param class_var: the class feature.
    :type class_var: :class:`Orange.feature.Descriptor`

    :param class_vars: the multi-target class features.
    :type class_vars: list of :class:`Orange.feature.Descriptor`

    """
    def __init__(self, classifiers, name, domain, class_var, class_vars, **kwds):
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.class_var = class_var
        self.class_vars = class_vars
        self.__dict__.update(kwds)
        # an empty/None class_vars means ordinary single-target prediction
        self.single_class = True if not class_vars else False

    def __call__(self, instance, result_type = orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`

        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        # translate the instance into the training domain before voting
        instance = Orange.data.Instance(self.domain, instance)
        # query each tree once with GetBoth to avoid multiple calls
        res_both = [c(instance, orange.GetBoth) for c in self.classifiers]

        # transform single class results to match the multi-target shape
        # ([values], [distributions]) so one code path handles both
        # NOTE(review): this mutates self.class_vars (and, in the loop
        # below, self.class_var) during prediction — not thread-safe and
        # leaves the instance state changed after the call; confirm
        # callers do not rely on class_var afterwards
        if self.single_class:
            self.class_vars = [self.class_var]
            res_both = [([(r[0])],[(r[1])]) for r in res_both]

        mt_prob = []   # per-target aggregated distributions
        mt_value = []  # per-target voted/averaged values

        for varn in xrange(len(self.class_vars)):

            self.class_var = self.class_vars[varn]

            # handle discreete class

            if self.class_var.var_type == Orange.feature.Discrete.Discrete:

                # voting for class probabilities: sum the per-tree
                # distributions element-wise, then normalize
                if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                    prob = [0.] * len(self.class_var.values)
                    for r in res_both:
                        a = [x for x in r[1][varn]]
                        prob = map(add, prob, a)  # Python 2 map: returns a list
                    norm = sum(prob)
                    cprob = Orange.statistics.distribution.Discrete(self.class_var)
                    for i in range(len(prob)):
                        cprob[i] = prob[i]/norm

                # voting for crisp class membership, notice that
                # this may not be the same class as one obtaining the
                # highest probability through probability voting
                if result_type == orange.GetValue or result_type == orange.GetBoth:
                    cfreq = [0] * len(self.class_var.values)
                    for r in res_both:
                        cfreq[int(r[0][varn])] += 1
                    # ties are broken in favour of the lowest class index
                    index = cfreq.index(max(cfreq))
                    cvalue = Orange.data.Value(self.class_var, index)

                if result_type == orange.GetValue: mt_value.append(cvalue)
                elif result_type == orange.GetProbabilities: mt_prob.append(cprob)
                else:
                    mt_value.append(cvalue)
                    mt_prob.append(cprob)

            else:
                # Handle continuous class

                # voting for class probabilities: merge per-tree
                # distributions (or point masses when a tree yields no
                # distribution) into one dict, then normalize
                if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                    probs = [ r for r in res_both]
                    cprob = dict()

                    for val,prob in probs:
                        if prob[varn] != None: #no probability output
                            a = dict(prob[varn].items())
                        else:
                            # fall back to a point mass on the predicted value
                            a = { val[varn].value : 1. }
                        cprob = dict( (n, a.get(n, 0)+cprob.get(n, 0)) for n in set(a)|set(cprob) )
                    cprob = Orange.statistics.distribution.Continuous(cprob)
                    cprob.normalize()

                # gather average class value across all trees
                if result_type == orange.GetValue or result_type == orange.GetBoth:
                    values = [r[0][varn] for r in res_both]
                    cvalue = Orange.data.Value(self.class_var, sum(values) / len(self.classifiers))

                if result_type == orange.GetValue: mt_value.append(cvalue)
                elif result_type == orange.GetProbabilities: mt_prob.append(cprob)
                else:
                    mt_value.append(cvalue)
                    mt_prob.append(cprob)

        # check for singleclass when returning: unwrap the 1-element lists
        if self.single_class:
            if result_type == orange.GetValue: return mt_value[0]
            elif result_type == orange.GetProbabilities: return mt_prob[0]
            else:
                return [mt_value[0],mt_prob[0]]

        if result_type == orange.GetValue: return tuple(mt_value)
        elif result_type == orange.GetProbabilities: return tuple(mt_prob)
        else:
            return [tuple(mt_value),tuple(mt_prob)]

    def __reduce__(self):
        # pickling support: rebuild from constructor args plus any extra
        # attributes stored in __dict__
        return type(self), (self.classifiers, self.name, self.domain, self.class_var, self.class_vars), dict(self.__dict__)

RandomForestClassifier = Orange.utils.deprecated_members({"resultType":"result_type", "classVar":"class_var", "example":"instance"})(RandomForestClassifier)
343### MeasureAttribute_randomForests
344
class ScoreFeature(orange.MeasureAttribute):
    """
    Scores features by their (permutation) importance in a random forest.

    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :raises ValueError: if both :obj:`base_learner` and :obj:`learner`
        are specified.
    """
    def __init__(self, trees=100, attributes=None, rand=None, base_learner=None, learner=None):

        self.trees = trees
        self.learner = learner
        self._bufinstances = None  # last dataset importances were computed for
        self.attributes = attributes
        self.rand = rand
        self.base_learner = base_learner

        if base_learner is not None and learner is not None:
            # bug fix: the original called undefined wrongSpecification(),
            # which surfaced as an unhelpful NameError
            raise ValueError("specify either base_learner or learner, not both")

        if not self.rand:
            self.rand = random.Random(0)
        self.randorange = Orange.misc.Random(self.rand.randint(0, 2**31 - 1))

        if learner is None:
            self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange)
        else:
            self.learner = learner

    @deprecated_keywords({"apriorClass":"aprior_class"})
    def __call__(self, feature, instances, aprior_class=None):
        """
        Return importance of a given feature.
        Only the first call on a given data set is computationally expensive.

        :param feature: feature to evaluate (by index, name or
            :class:`Orange.feature.Descriptor` object).
        :type feature: int, str or :class:`Orange.feature.Descriptor`.

        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`

        :param aprior_class: not used!

        """
        attrNo = None

        if type(feature) == int: #by attr. index
          attrNo  = feature
        elif type(feature) == type("a"): #by attr. name
          attrName = feature
          attrNo = instances.domain.index(attrName)
        elif isinstance(feature, Orange.feature.Descriptor):
          atrs = [a for a in instances.domain.attributes]
          attrNo = atrs.index(feature)
        else:
          raise Exception("MeasureAttribute_rf can not be called with (\
                contingency,classDistribution, aprior_class) as fuction arguments.")

        self._buffer(instances)

        # importances are accumulated over self._acu trees; scale to %
        return self._avimp[attrNo]*100/self._acu

    def importances(self, table):
        """
        DEPRECATED. Return importance of all features in the dataset as a list.

        :param table: dataset of which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`

        """
        self._buffer(table)
        return [a*100/self._acu for a in self._avimp]

    def _buffer(self, instances):
        """
        Recalculate importance of features if needed (i.e. if they have
        not been buffered for the given dataset yet).

        :param instances: dataset of which the features' importance needs
            to be measured.
        :type instances: :class:`Orange.data.Table`

        """
        # recompute when the table object or its version changed;
        # short-circuit keeps this safe on the very first call
        # (self._bufinstances is None)
        if instances != self._bufinstances or \
            instances.version != self._bufinstances.version:

            self._bufinstances = instances
            self._avimp = [0.0]*len(self._bufinstances.domain.attributes)
            self._acu = 0
            self._importanceAcu(self._bufinstances, self.trees, self._avimp)

    def _getOOB(self, instances, selection, nexamples):
        """Return the out-of-bag instances for a bootstrap *selection*."""
        ooblist = filter(lambda x: x not in selection, range(nexamples))
        return instances.getitems(ooblist)

    def _numRight(self, oob, classifier):
        """
        Return a number of instances which are classified correctly.
        """
        #TODO How to accomodate regression?
        return sum(1 for el in oob if el.getclass() == classifier(el))

    def _numRightMix(self, oob, classifier, attr):
        """
        Return a number of instances which are classified
        correctly even if a feature is shuffled.
        """
        # random permutation of OOB indices used to shuffle one feature
        perm = range(len(oob))
        self.rand.shuffle(perm)

        def shuffle_ex(index):
            ex = Orange.data.Instance(oob[index])
            ex[attr] = oob[perm[index]][attr]
            return ex
        #TODO How to accomodate regression?
        return sum(1 for i in range(len(oob)) if oob[i].getclass() == classifier(shuffle_ex(i)))

    def _importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp by importances for a given number of trees."""
        n = len(instances)

        attrs = len(instances.domain.attributes)

        # map feature name -> index for _presentInTree lookups
        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
           attrnum[instances.domain.attributes[attr].name] = attr

        if "attributes" in self.learner.__dict__:
            self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes is None else self.attributes

        # build the forest
        classifiers = []
        for i in range(trees):
            # draw bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)

            # build the model from the bootstrap sample
            cla = self.learner(data)

            #prepare OOB data
            oob = self._getOOB(instances, selection, n)

            #right on unmixed
            right = self._numRight(oob, cla)

            presl = range(attrs)
            try: #FIXME SimpleTreeLearner does not know how to output attributes yet
                presl = list(self._presentInTree(cla.tree, attrnum))
            except Exception:
                # bug fix: was a bare except, which also swallowed
                # KeyboardInterrupt/SystemExit; still best-effort here
                pass

            #randomize each feature in data and test
            #only those on which there was a split
            for attr in presl:
                #calculate number of right classifications
                #if the values of this features are permutated randomly
                rightimp = self._numRightMix(oob, cla, attr)
                avimp[attr] += (float(right-rightimp))/len(oob)
        self._acu += trees

    def _presentInTree(self, node, attrnum):
        """Return features present in tree (features that split)."""
        if not node:
          return set([])

        if  node.branchSelector:
            j = attrnum[node.branchSelector.class_var.name]
            cs = set([])
            for i in range(len(node.branches)):
                s = self._presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
          return set([])
548
549
class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    """A split constructor that delegates to a wrapped constructor,
    restricted to a random subset of the candidate features."""

    def __init__(self, scons, attributes, rand = None):
        self.scons = scons           # split constructor of original tree
        self.attributes = attributes # number of features to consider
        self.rand = rand
        if not self.rand:
            self.rand = random.Random(0)

    @deprecated_keywords({"weightID":"weight_id"})
    def __call__(self, gen, weight_id, contingencies, apriori, candidates, clsfr):
        n = len(candidates)
        # if number of features for subset is not set, use square root
        # (Breiman's default); bug fix: the original did int(None) here
        # and raised a TypeError when attributes was None
        if self.attributes is None:
            subset = int(sqrt(n))
        else:
            subset = int(self.attributes)
        # never mark more features than there are candidates (avoids a
        # negative list-repeat producing a too-short mask)
        subset = min(subset, n)
        cand = [1] * subset + [0] * (n - subset)
        self.rand.shuffle(cand)
        # instead of all features, invoke the wrapped split constructor
        # only on the randomly chosen subset
        return self.scons(gen, weight_id, contingencies, apriori, cand, clsfr)
Note: See TracBrowser for help on using the repository browser.