source: orange/Orange/ensemble/forest.py @ 9919:8a2a770ef3af

Revision 9919:8a2a770ef3af, 21.2 KB checked in by markotoplak, 2 years ago (diff)

data.variable -> feature

Line 
1from math import sqrt, floor
2import Orange.core as orange
3import Orange
4import Orange.feature.scoring
5import random
6import copy
7from Orange.misc import deprecated_keywords
8
def _default_small_learner(attributes=None, rand=None, base=None):
    """Return a randomized tree learner for use inside a random forest.

    When no base learner is supplied, one is assembled as suggested by
    Breiman (2001): no stored node classifiers or contingencies, stored
    distributions, and no splits of nodes with fewer than 5 instances.
    """
    if not base:
        base = Orange.classification.tree.TreeLearner(
            store_node_classifier=0,
            store_contingencies=0,
            store_distributions=1,
            min_instances=5)
    return _RandomForestTreeLearner(base=base, rand=rand)
17
def _default_simple_learner(base, randorange):
    """Wrap a SimpleTreeLearner for use inside a random forest.

    :param base: a SimpleTreeLearner, or None for a default one that does
        not split nodes with fewer than 5 instances.
    :param randorange: an Orange random generator used by the wrapped learner.
    """
    if base is None:  # `is None`, not `== None` (identity check is the idiom)
        base = Orange.classification.tree.SimpleTreeLearner(min_instances=5)
    return _RandomForestSimpleTreeLearner(base=base, rand=randorange)
22
def _wrap_learner(base, rand, randorange):
    """Wrap `base` with random feature-subset selection.

    A SimpleTreeLearner (or None) is randomized by tuning its skip_prob;
    a TreeLearner is randomized via a subset-selecting split constructor.

    :raises ValueError: if `base` is neither kind of tree learner.
    """
    if base is None or isinstance(base, Orange.classification.tree.SimpleTreeLearner):
        return _default_simple_learner(base, randorange)
    elif isinstance(base, Orange.classification.tree.TreeLearner):
        return _default_small_learner(None, rand, base)
    else:
        # was: notRightLearnerToWrap() -- an undefined name that raised a
        # bare NameError; raise a descriptive error instead
        raise ValueError("base_learner must be a TreeLearner or "
                         "SimpleTreeLearner, got %r" % (base,))
30 
class _RandomForestSimpleTreeLearner(Orange.core.Learner):
    """A learner which wraps an ordinary SimpleTreeLearner.  Sets the
    skip_prob so that the number of randomly chosen features for each
    split is (on average) as specified."""

    def __new__(cls, instances = None, weight_id = 0, **argkw):
        # Immediate-use pattern: called with data the constructor returns a
        # trained classifier; without data it returns the learner itself.
        self = Orange.core.Learner.__new__(cls, **argkw)
        if instances:
            self.__init__(**argkw)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, base, rand):
        self.base = base        # the wrapped SimpleTreeLearner (shared object)
        self.attributes = None  # target (average) number of features per split;
                                # set by the forest learner before training
        self.rand = rand

    def __call__(self, instances, weight=0):
        """Train the wrapped learner with skip_prob tuned so that, on
        average, `self.attributes` features are considered at each split.
        The wrapped learner's original settings are restored afterwards,
        even when training raises (the original code leaked the modified
        state on error)."""
        osp, orand = self.base.skip_prob, self.base.random_generator
        try:
            self.base.skip_prob = 1 - float(self.attributes) / len(instances.domain.attributes)
            self.base.random_generator = self.rand
            return self.base(instances, weight)
        finally:
            # always restore the shared learner's state
            self.base.skip_prob, self.base.random_generator = osp, orand
_RandomForestSimpleTreeLearner = Orange.misc.deprecated_members({"weightID":"weight_id", "examples":"instances"})(_RandomForestSimpleTreeLearner)
57   
class RandomForestLearner(orange.Learner):
    """
    Just like in bagging, classifiers in random forests are trained from bootstrap
    samples of training data. Here, the classifiers are trees. However, to increase
    randomness, at each node of the tree the best feature is
    chosen from a subset of features in the data. We closely follow the
    original algorithm (Brieman, 2001) both in implementation and parameter
    defaults.
       
    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    :param callback: a function to be called after every iteration of
            induction of classifier. This is called with parameter
            (from 0.0 to 1.0) that gives estimates on learning progress.

    :param name: name of the learner.
    :type name: string

    :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or
            :class:`~Orange.ensemble.forest.RandomForestLearner`

    """

    def __new__(cls, instances=None, weight = 0, **kwds):
        # Immediate-use pattern: RandomForestLearner(data) trains and returns
        # a classifier; RandomForestLearner() returns the learner itself.
        self = orange.Learner.__new__(cls, **kwds)
        if instances:
            self.__init__(**kwds)
            return self.__call__(instances, weight)
        else:
            return self

    def __init__(self, trees=100, attributes=None,\
                    name='Random Forest', rand=None, callback=None, base_learner=None, learner=None):
        self.trees = trees
        self.name = name
        self.attributes = attributes
        self.callback = callback
        self.rand = rand

        self.base_learner = base_learner

        if base_learner != None and learner != None:
            # NOTE(review): `wrongSpecification` is not defined anywhere in
            # this module, so giving both base_learner and learner raises a
            # bare NameError rather than a descriptive error -- confirm and
            # replace with a proper exception.
            wrongSpecification()

        if not self.rand:
            self.rand = random.Random(0)
        # Orange's RNG is seeded from the Python RNG so that both sources of
        # randomness are reproducible from the single `rand` argument.
        self.randorange = Orange.misc.Random(self.rand.randint(0,2**31-1))

        if learner == None:
            # wrap the base learner with random feature-subset selection
            self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange)
        else:
            self.learner = learner
           
        self.randstate = self.rand.getstate() #original state

    def __call__(self, instances, weight=0):
        """
        Learn from the given table of data instances.
       
        :param instances: learning data.
        :type instances: class:`Orange.data.Table`
        :param weight: weight.
        :type weight: int
        :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier`
        """
        self.rand.setstate(self.randstate) #when learning again, set the same state
        self.randorange.reset()       

        if "attributes" in self.learner.__dict__:
            # default feature-subset size: sqrt of the number of features,
            # as recommended by Breiman (2001)
            self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes == None else self.attributes

        learner = self.learner

        n = len(instances)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw bootstrap sample: n draws with replacement from n instances
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.get_items_ref(selection)
            # build the model from the bootstrap sample
            classifiers.append(learner(data, weight))
            if self.callback:
                # NOTE(review): the class docstring promises a progress
                # argument in [0, 1], but the callback is invoked with no
                # arguments -- confirm which contract callers rely on.
                self.callback()
            # if self.callback: self.callback((i+1.)/self.trees)

        return RandomForestClassifier(classifiers = classifiers, name=self.name,\
                    domain=instances.domain, class_var=instances.domain.class_var)
RandomForestLearner = Orange.misc.deprecated_members({"examples":"instances"})(RandomForestLearner)
175
class RandomForestClassifier(orange.Classifier):
    """
    Uses the trees induced by the :obj:`RandomForestLearner`. An input
    instance is classified into the class with the most frequent vote.
    However, this implementation returns the averaged probabilities from
    each of the trees if class probability is requested.

    When constructed manually, the following parameters have to
    be passed:

    :param classifiers: a list of classifiers to be used.
    :type classifiers: list
   
    :param name: name of the resulting classifier.
    :type name: str
   
    :param domain: the domain of the learning set.
    :type domain: :class:`Orange.data.Domain`
   
    :param class_var: the class feature.
    :type class_var: :class:`Orange.feature.Descriptor`

    """
    def __init__(self, classifiers, name, domain, class_var, **kwds):
        self.classifiers = classifiers
        self.name = name
        self.domain = domain
        self.class_var = class_var
        # extra keyword arguments become plain attributes
        self.__dict__.update(kwds)

    def __call__(self, instance, result_type = orange.GetValue):
        """
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
       
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`
       
        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        from operator import add
       
        # handle discreete class
       
        if self.class_var.var_type == Orange.feature.Discrete.Discrete:
       
            # voting for class probabilities: sum the per-tree probability
            # vectors element-wise, then normalize into a distribution
            if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                prob = [0.] * len(self.domain.class_var.values)
                for c in self.classifiers:
                    a = [x for x in c(instance, orange.GetProbabilities)]
                    prob = map(add, prob, a)  # Python 2: map returns a list
                norm = sum(prob)
                cprob = Orange.statistics.distribution.Discrete(self.class_var)
                for i in range(len(prob)):
                    cprob[i] = prob[i]/norm
               
            # voting for crisp class membership, notice that
            # this may not be the same class as one obtaining the
            # highest probability through probability voting
            if result_type == orange.GetValue or result_type == orange.GetBoth:
                cfreq = [0] * len(self.domain.class_var.values)
                for c in self.classifiers:
                    cfreq[int(c(instance))] += 1
                # ties break toward the lowest class index (list.index)
                index = cfreq.index(max(cfreq))
                cvalue = Orange.data.Value(self.domain.class_var, index)
   
            if result_type == orange.GetValue: return cvalue
            elif result_type == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)
       
        else:
            # Handle continuous class
       
            # voting for class probabilities
            if result_type == orange.GetProbabilities or result_type == orange.GetBoth:
                probs = [c(instance, orange.GetBoth) for c in self.classifiers]
                cprob = dict()
                # merge per-tree distributions; a tree without probability
                # output contributes all its mass to its predicted value
                for val,prob in probs:
                    if prob != None: #no probability output
                        a = dict(prob.items())
                    else:
                        a = { val.value : 1. }
                    cprob = dict( (n, a.get(n, 0)+cprob.get(n, 0)) for n in set(a)|set(cprob) )
                cprob = Orange.statistics.distribution.Continuous(cprob)
                cprob.normalize()
               
            # gather average class value: mean of the per-tree predictions
            if result_type == orange.GetValue or result_type == orange.GetBoth:
                values = [c(instance).value for c in self.classifiers]
                cvalue = Orange.data.Value(self.domain.class_var, sum(values) / len(self.classifiers))
           
            if result_type == orange.GetValue: return cvalue
            elif result_type == orange.GetProbabilities: return cprob
            else: return (cvalue, cprob)
           
    def __reduce__(self):
        # support pickling: rebuild from the constructor arguments plus any
        # extra attributes stored on the instance
        return type(self), (self.classifiers, self.name, self.domain, self.class_var), dict(self.__dict__)
RandomForestClassifier = Orange.misc.deprecated_members({"resultType":"result_type", "classVar":"class_var", "example":"instance"})(RandomForestClassifier)
277### MeasureAttribute_randomForests
278
class ScoreFeature(orange.MeasureAttribute):
    """
    :param trees: number of trees in the forest.
    :type trees: int

    :param attributes: number of randomly drawn features among
            which to select the best to split the nodes in tree
            induction. The default, None, means the square root of
            the number of features in the training data. Ignored if
            :obj:`learner` is specified.
    :type attributes: int

    :param base_learner: A base tree learner. The base learner will be
        randomized with Random Forest's random
        feature subset selection.  If None (default),
        :class:`~Orange.classification.tree.SimpleTreeLearner` and it
        will not split nodes with less than 5 data instances.
    :type base_learner: None or
        :class:`Orange.classification.tree.TreeLearner` or
        :class:`Orange.classification.tree.SimpleTreeLearner`

    :param rand: random generator used in bootstrap sampling. If None (default),
        then ``random.Random(0)`` is used.

    :param learner: Tree induction learner. If `None` (default),
        the :obj:`base_learner` will be used (and randomized). If
        :obj:`learner` is specified, it will be used as such
        with no additional transformations.
    :type learner: None or :class:`Orange.core.Learner`

    """
    def __init__(self, trees=100, attributes=None, rand=None, base_learner=None, learner=None):

        self.trees = trees
        self.learner = learner
        self._bufinstances = None  # dataset for which importances are cached
        self.attributes = attributes
        self.rand = rand
        self.base_learner = base_learner

        if base_learner != None and learner != None:
            # NOTE(review): `wrongSpecification` is not defined in this
            # module, so this raises a bare NameError -- confirm and replace
            # with a proper exception.
            wrongSpecification()

        if not self.rand:
            self.rand = random.Random(0)
        # seed Orange's RNG from the Python RNG for reproducibility
        self.randorange = Orange.misc.Random(self.rand.randint(0,2**31-1))

        if learner == None:
            # wrap the base learner with random feature-subset selection
            self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange)
        else:
            self.learner = learner

    @deprecated_keywords({"apriorClass":"aprior_class"})
    def __call__(self, feature, instances, aprior_class=None):
        """
        Return importance of a given feature.
        Only the first call on a given data set is computationally expensive.
       
        :param feature: feature to evaluate (by index, name or
            :class:`Orange.feature.Descriptor` object).
        :type feature: int, str or :class:`Orange.feature.Descriptor`.
       
        :param instances: data instances to use for importance evaluation.
        :type instances: :class:`Orange.data.Table`
       
        :param aprior_class: not used!
       
        """
        attrNo = None

        # resolve `feature` to a feature index, whatever form it came in
        if type(feature) == int: #by attr. index
          attrNo  = feature
        elif type(feature) == type("a"): #by attr. name
          attrName = feature
          attrNo = instances.domain.index(attrName)
        elif isinstance(feature, Orange.feature.Descriptor):
          atrs = [a for a in instances.domain.attributes]
          attrNo = atrs.index(feature)
        else:
          raise Exception("MeasureAttribute_rf can not be called with (\
                contingency,classDistribution, aprior_class) as fuction arguments.")

        # (re)compute importances only when the dataset changed
        self._buffer(instances)

        # average accuracy drop per tree, scaled to a percentage
        return self._avimp[attrNo]*100/self._acu

    def importances(self, table):
        """
        DEPRECATED. Return importance of all features in the dataset as a list.
       
        :param table: dataset of which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`

        """
        self._buffer(table)
        return [a*100/self._acu for a in self._avimp]

    def _buffer(self, instances):
        """
        Recalculate importance of features if needed (ie. if it has been
        buffered for the given dataset yet).

        :param table: dataset of which the features' importance needs to be
            measured.
        :type table: :class:`Orange.data.Table`

        """
        # recompute when the table object differs or its version changed
        # (the `or` short-circuits before .version when the tables differ,
        # so the initial None buffer is never dereferenced)
        if instances != self._bufinstances or \
            instances.version != self._bufinstances.version:

            self._bufinstances = instances
            self._avimp = [0.0]*len(self._bufinstances.domain.attributes)
            self._acu = 0  # number of trees accumulated into _avimp
            self._importanceAcu(self._bufinstances, self.trees, self._avimp)
     
    def _getOOB(self, instances, selection, nexamples):
        # out-of-bag set: instances whose index was never drawn into the
        # bootstrap `selection`
        ooblist = filter(lambda x: x not in selection, range(nexamples))
        return instances.getitems(ooblist)

    def _numRight(self, oob, classifier):
        """
        Return a number of instances which are classified correctly.
        """
        #TODO How to accomodate regression?
        return sum(1 for el in oob if el.getclass() == classifier(el))
   
    def _numRightMix(self, oob, classifier, attr):
        """
        Return a number of instances which are classified
        correctly even if a feature is shuffled.
        """
        # random permutation of OOB indices used to shuffle one feature
        perm = range(len(oob))
        self.rand.shuffle(perm)

        def shuffle_ex(index):
            # copy the instance, replacing feature `attr` with the value
            # from a randomly matched OOB instance
            ex = Orange.data.Instance(oob[index])
            ex[attr] = oob[perm[index]][attr]
            return ex
        #TODO How to accomodate regression?
        return sum(1 for i in range(len(oob)) if oob[i].getclass() == classifier(shuffle_ex(i)))

    def _importanceAcu(self, instances, trees, avimp):
        """Accumulate avimp by importances for a given number of trees."""
        n = len(instances)

        attrs = len(instances.domain.attributes)

        # map feature name -> index, for looking up split features by name
        attrnum = {}
        for attr in range(len(instances.domain.attributes)):
           attrnum[instances.domain.attributes[attr].name] = attr           

        if "attributes" in self.learner.__dict__:
            # default subset size: sqrt of the number of features
            self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes == None else self.attributes

        # build the forest
        classifiers = [] 
        for i in range(trees):
            # draw bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = instances.getitems(selection)
           
            # build the model from the bootstrap sample
            cla = self.learner(data)

            #prepare OOB data
            # NOTE(review): for tiny datasets the OOB set can be empty, which
            # would make len(oob) below divide by zero -- confirm callers
            # always supply enough data.
            oob = self._getOOB(instances, selection, n)
           
            #right on unmixed
            right = self._numRight(oob, cla)
           
            presl = range(attrs)
            try: #FIXME SimpleTreeLearner does not know how to output attributes yet
                presl = list(self._presentInTree(cla.tree, attrnum))
            except:
                # NOTE(review): bare except deliberately falls back to "all
                # features" when the tree cannot report its splits, but it
                # also hides unrelated errors -- consider narrowing.
                pass
                     
            #randomize each feature in data and test
            #only those on which there was a split
            for attr in presl:
                #calculate number of right classifications
                #if the values of this features are permutated randomly
                rightimp = self._numRightMix(oob, cla, attr)               
                avimp[attr] += (float(right-rightimp))/len(oob)
        self._acu += trees 

    def _presentInTree(self, node, attrnum):
        """Return features present in tree (features that split)."""
        if not node:
          return set([])

        if  node.branchSelector:
            # index of the feature this node splits on
            j = attrnum[node.branchSelector.class_var.name]
            cs = set([])
            for i in range(len(node.branches)):
                # union of split features over all subtrees
                s = self._presentInTree(node.branches[i], attrnum)
                cs = s | cs
            cs = cs | set([j])
            return cs
        else:
          return set([])
482
class _RandomForestTreeLearner(Orange.core.Learner):
    """A learner wrapping an ordinary TreeLearner with a split
    constructor that considers only a random subset of features.
    """
    @deprecated_keywords({"weightID":"weight_id", "examples":"instances"})
    def __new__(cls, instances = None, weight_id = 0, **argkw):
        # Immediate-use pattern: with data, train and return a classifier;
        # without data, return the learner object itself.
        self = Orange.core.Learner.__new__(cls, **argkw)
        if not instances:
            return self
        self.__init__(**argkw)
        return self.__call__(instances, weight_id)

    def __init__(self, base, rand):
        self.base = base        # the wrapped TreeLearner
        self.attributes = None  # feature-subset size, set by the forest learner
        # one shared RNG drives the subset selection for all built trees
        self.rand = rand if rand else random.Random(0)

    @deprecated_keywords({"examples":"instances"})
    def __call__(self, instances, weight=0):
        """Copy the wrapped tree learner, install a split constructor
        restricted to a random feature subset, and train the copy.
        """
        learner = copy.copy(self.base)

        # supply a default scoring measure when the base learner has none:
        # Gini for discrete classes, MSE for continuous ones
        if not learner.measure:
            if isinstance(instances.domain.class_var, Orange.feature.Discrete):
                learner.measure = Orange.feature.scoring.Gini()
            else:
                learner.measure = Orange.feature.scoring.MSE()

        learner.split = SplitConstructor_AttributeSubset(
            learner.split, self.attributes, self.rand)

        return learner(instances, weight=weight)
521
class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    """A split constructor that delegates to a wrapped constructor while
    marking only a random subset of the features as candidates."""

    def __init__(self, scons, attributes, rand = None):
        self.scons = scons           # split constructor of original tree
        self.attributes = attributes # number of features to consider
        self.rand = rand
        if not self.rand:
            self.rand = random.Random(0)

    @deprecated_keywords({"weightID":"weight_id"})
    def __call__(self, gen, weight_id, contingencies, apriori, candidates, clsfr):
        # If the subset size is not set, fall back to the square root of the
        # number of candidates (Breiman, 2001).  The previous code promised
        # this fallback in a comment but crashed on int(None).
        subset = self.attributes if self.attributes is not None else sqrt(len(candidates))
        # never mark more features than exist (also guards against an
        # over-long candidate mask when attributes > len(candidates))
        subset = min(int(subset), len(candidates))
        cand = [1] * subset + [0] * (len(candidates) - subset)
        self.rand.shuffle(cand)
        # instead of with all features, invoke the wrapped split constructor
        # only for the randomly chosen subset
        return self.scons(gen, weight_id, contingencies, apriori, cand, clsfr)
Note: See TracBrowser for help on using the repository browser.