| 1 | from math import sqrt, floor |
|---|
| 2 | import Orange.core as orange |
|---|
| 3 | import Orange |
|---|
| 4 | import Orange.feature.scoring |
|---|
| 5 | import random |
|---|
| 6 | import copy |
|---|
| 7 | from Orange.misc import deprecated_keywords |
|---|
| 8 | |
|---|
| 9 | def _default_small_learner(attributes=None, rand=None, base=None): |
|---|
| 10 | # tree learner assembled as suggested by Breiman (2001) |
|---|
| 11 | if not base: |
|---|
| 12 | base = Orange.classification.tree.TreeLearner( |
|---|
| 13 | store_node_classifier=0, store_contingencies=0, |
|---|
| 14 | store_distributions=1, min_instances=5) |
|---|
| 15 | |
|---|
| 16 | return _RandomForestTreeLearner(base=base, rand=rand) |
|---|
| 17 | |
|---|
| 18 | def _default_simple_learner(base, randorange): |
|---|
| 19 | if base == None: |
|---|
| 20 | base = Orange.classification.tree.SimpleTreeLearner(min_instances=5) |
|---|
| 21 | return _RandomForestSimpleTreeLearner(base=base, rand=randorange) |
|---|
| 22 | |
|---|
| 23 | def _wrap_learner(base, rand, randorange): |
|---|
| 24 | if base == None or isinstance(base, Orange.classification.tree.SimpleTreeLearner): |
|---|
| 25 | return _default_simple_learner(base, randorange) |
|---|
| 26 | elif isinstance(base, Orange.classification.tree.TreeLearner): |
|---|
| 27 | return _default_small_learner(None, rand, base) |
|---|
| 28 | else: |
|---|
| 29 | notRightLearnerToWrap() |
|---|
| 30 | |
|---|
| 31 | class _RandomForestSimpleTreeLearner(Orange.core.Learner): |
|---|
| 32 | """A learner which wraps an ordinary SimpleTreeLearner. Sets the |
|---|
| 33 | skip_prob so that the number of randomly chosen features for each |
|---|
| 34 | split is (on average) as specified.""" |
|---|
| 35 | |
|---|
| 36 | __new__ = Orange.misc._orange__new__(Orange.core.Learner) |
|---|
| 37 | |
|---|
| 38 | def __init__(self, base, rand): |
|---|
| 39 | self.base = base |
|---|
| 40 | self.attributes = None |
|---|
| 41 | self.rand = rand |
|---|
| 42 | |
|---|
| 43 | def __call__(self, instances, weight=0): |
|---|
| 44 | osp,orand = self.base.skip_prob, self.base.random_generator |
|---|
| 45 | self.base.skip_prob = 1-float(self.attributes)/len(instances.domain.attributes) |
|---|
| 46 | self.base.random_generator = self.rand |
|---|
| 47 | r = self.base(instances, weight) |
|---|
| 48 | self.base.skip_prob, self.base.random_generator = osp, orand |
|---|
| 49 | return r |
|---|
| 50 | |
|---|
| 51 | _RandomForestSimpleTreeLearner = Orange.misc.deprecated_members({"weightID":"weight_id", "examples":"instances"})(_RandomForestSimpleTreeLearner) |
|---|
| 52 | |
|---|
| 53 | class RandomForestLearner(Orange.core.Learner): |
|---|
| 54 | """ |
|---|
| 55 | Just like in bagging, classifiers in random forests are trained from bootstrap |
|---|
| 56 | samples of training data. Here, the classifiers are trees. However, to increase |
|---|
| 57 | randomness, at each node of the tree the best feature is |
|---|
| 58 | chosen from a subset of features in the data. We closely follow the |
|---|
| 59 | original algorithm (Brieman, 2001) both in implementation and parameter |
|---|
| 60 | defaults. |
|---|
| 61 | |
|---|
| 62 | :param trees: number of trees in the forest. |
|---|
| 63 | :type trees: int |
|---|
| 64 | |
|---|
| 65 | :param attributes: number of randomly drawn features among |
|---|
| 66 | which to select the best to split the nodes in tree |
|---|
| 67 | induction. The default, None, means the square root of |
|---|
| 68 | the number of features in the training data. Ignored if |
|---|
| 69 | :obj:`learner` is specified. |
|---|
| 70 | :type attributes: int |
|---|
| 71 | |
|---|
| 72 | :param base_learner: A base tree learner. The base learner will be |
|---|
| 73 | randomized with Random Forest's random |
|---|
| 74 | feature subset selection. If None (default), |
|---|
| 75 | :class:`~Orange.classification.tree.SimpleTreeLearner` and it |
|---|
| 76 | will not split nodes with less than 5 data instances. |
|---|
| 77 | :type base_learner: None or |
|---|
| 78 | :class:`Orange.classification.tree.TreeLearner` or |
|---|
| 79 | :class:`Orange.classification.tree.SimpleTreeLearner` |
|---|
| 80 | |
|---|
| 81 | :param rand: random generator used in bootstrap sampling. If None (default), |
|---|
| 82 | then ``random.Random(0)`` is used. |
|---|
| 83 | |
|---|
| 84 | :param learner: Tree induction learner. If `None` (default), |
|---|
| 85 | the :obj:`base_learner` will be used (and randomized). If |
|---|
| 86 | :obj:`learner` is specified, it will be used as such |
|---|
| 87 | with no additional transformations. |
|---|
| 88 | :type learner: None or :class:`Orange.core.Learner` |
|---|
| 89 | |
|---|
| 90 | :param callback: a function to be called after every iteration of |
|---|
| 91 | induction of classifier. This is called with parameter |
|---|
| 92 | (from 0.0 to 1.0) that gives estimates on learning progress. |
|---|
| 93 | |
|---|
| 94 | :param name: name of the learner. |
|---|
| 95 | :type name: string |
|---|
| 96 | |
|---|
| 97 | :rtype: :class:`~Orange.ensemble.forest.RandomForestClassifier` or |
|---|
| 98 | :class:`~Orange.ensemble.forest.RandomForestLearner` |
|---|
| 99 | |
|---|
| 100 | """ |
|---|
| 101 | |
|---|
| 102 | __new__ = Orange.misc._orange__new__(Orange.core.Learner) |
|---|
| 103 | |
|---|
| 104 | def __init__(self, trees=100, attributes=None,\ |
|---|
| 105 | name='Random Forest', rand=None, callback=None, base_learner=None, learner=None): |
|---|
| 106 | self.trees = trees |
|---|
| 107 | self.name = name |
|---|
| 108 | self.attributes = attributes |
|---|
| 109 | self.callback = callback |
|---|
| 110 | self.rand = rand |
|---|
| 111 | |
|---|
| 112 | self.base_learner = base_learner |
|---|
| 113 | |
|---|
| 114 | if base_learner != None and learner != None: |
|---|
| 115 | wrongSpecification() |
|---|
| 116 | |
|---|
| 117 | if not self.rand: |
|---|
| 118 | self.rand = random.Random(0) |
|---|
| 119 | self.randorange = Orange.misc.Random(self.rand.randint(0,2**31-1)) |
|---|
| 120 | |
|---|
| 121 | if learner == None: |
|---|
| 122 | self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange) |
|---|
| 123 | else: |
|---|
| 124 | self.learner = learner |
|---|
| 125 | |
|---|
| 126 | self.randstate = self.rand.getstate() #original state |
|---|
| 127 | |
|---|
| 128 | def __call__(self, instances, weight=0): |
|---|
| 129 | """ |
|---|
| 130 | Learn from the given table of data instances. |
|---|
| 131 | |
|---|
| 132 | :param instances: learning data. |
|---|
| 133 | :type instances: class:`Orange.data.Table` |
|---|
| 134 | :param weight: weight. |
|---|
| 135 | :type weight: int |
|---|
| 136 | :rtype: :class:`Orange.ensemble.forest.RandomForestClassifier` |
|---|
| 137 | """ |
|---|
| 138 | self.rand.setstate(self.randstate) #when learning again, set the same state |
|---|
| 139 | self.randorange.reset() |
|---|
| 140 | |
|---|
| 141 | if "attributes" in self.learner.__dict__: |
|---|
| 142 | self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes == None else self.attributes |
|---|
| 143 | |
|---|
| 144 | learner = self.learner |
|---|
| 145 | |
|---|
| 146 | n = len(instances) |
|---|
| 147 | # build the forest |
|---|
| 148 | classifiers = [] |
|---|
| 149 | for i in range(self.trees): |
|---|
| 150 | # draw bootstrap sample |
|---|
| 151 | selection = [] |
|---|
| 152 | for j in range(n): |
|---|
| 153 | selection.append(self.rand.randrange(n)) |
|---|
| 154 | data = instances.get_items_ref(selection) |
|---|
| 155 | # build the model from the bootstrap sample |
|---|
| 156 | classifiers.append(learner(data, weight)) |
|---|
| 157 | if self.callback: |
|---|
| 158 | self.callback() |
|---|
| 159 | # if self.callback: self.callback((i+1.)/self.trees) |
|---|
| 160 | |
|---|
| 161 | return RandomForestClassifier(classifiers = classifiers, name=self.name,\ |
|---|
| 162 | domain=instances.domain, class_var=instances.domain.class_var) |
|---|
| 163 | RandomForestLearner = Orange.misc.deprecated_members({"examples":"instances"})(RandomForestLearner) |
|---|
| 164 | |
|---|
| 165 | class RandomForestClassifier(orange.Classifier): |
|---|
| 166 | """ |
|---|
| 167 | Uses the trees induced by the :obj:`RandomForestLearner`. An input |
|---|
| 168 | instance is classified into the class with the most frequent vote. |
|---|
| 169 | However, this implementation returns the averaged probabilities from |
|---|
| 170 | each of the trees if class probability is requested. |
|---|
| 171 | |
|---|
| 172 | When constructed manually, the following parameters have to |
|---|
| 173 | be passed: |
|---|
| 174 | |
|---|
| 175 | :param classifiers: a list of classifiers to be used. |
|---|
| 176 | :type classifiers: list |
|---|
| 177 | |
|---|
| 178 | :param name: name of the resulting classifier. |
|---|
| 179 | :type name: str |
|---|
| 180 | |
|---|
| 181 | :param domain: the domain of the learning set. |
|---|
| 182 | :type domain: :class:`Orange.data.Domain` |
|---|
| 183 | |
|---|
| 184 | :param class_var: the class feature. |
|---|
| 185 | :type class_var: :class:`Orange.feature.Descriptor` |
|---|
| 186 | |
|---|
| 187 | """ |
|---|
| 188 | def __init__(self, classifiers, name, domain, class_var, **kwds): |
|---|
| 189 | self.classifiers = classifiers |
|---|
| 190 | self.name = name |
|---|
| 191 | self.domain = domain |
|---|
| 192 | self.class_var = class_var |
|---|
| 193 | self.__dict__.update(kwds) |
|---|
| 194 | |
|---|
| 195 | def __call__(self, instance, result_type = orange.GetValue): |
|---|
| 196 | """ |
|---|
| 197 | :param instance: instance to be classified. |
|---|
| 198 | :type instance: :class:`Orange.data.Instance` |
|---|
| 199 | |
|---|
| 200 | :param result_type: :class:`Orange.classification.Classifier.GetValue` or \ |
|---|
| 201 | :class:`Orange.classification.Classifier.GetProbabilities` or |
|---|
| 202 | :class:`Orange.classification.Classifier.GetBoth` |
|---|
| 203 | |
|---|
| 204 | :rtype: :class:`Orange.data.Value`, |
|---|
| 205 | :class:`Orange.statistics.Distribution` or a tuple with both |
|---|
| 206 | """ |
|---|
| 207 | from operator import add |
|---|
| 208 | |
|---|
| 209 | # handle discreete class |
|---|
| 210 | |
|---|
| 211 | if self.class_var.var_type == Orange.feature.Discrete.Discrete: |
|---|
| 212 | |
|---|
| 213 | # voting for class probabilities |
|---|
| 214 | if result_type == orange.GetProbabilities or result_type == orange.GetBoth: |
|---|
| 215 | prob = [0.] * len(self.domain.class_var.values) |
|---|
| 216 | for c in self.classifiers: |
|---|
| 217 | a = [x for x in c(instance, orange.GetProbabilities)] |
|---|
| 218 | prob = map(add, prob, a) |
|---|
| 219 | norm = sum(prob) |
|---|
| 220 | cprob = Orange.statistics.distribution.Discrete(self.class_var) |
|---|
| 221 | for i in range(len(prob)): |
|---|
| 222 | cprob[i] = prob[i]/norm |
|---|
| 223 | |
|---|
| 224 | # voting for crisp class membership, notice that |
|---|
| 225 | # this may not be the same class as one obtaining the |
|---|
| 226 | # highest probability through probability voting |
|---|
| 227 | if result_type == orange.GetValue or result_type == orange.GetBoth: |
|---|
| 228 | cfreq = [0] * len(self.domain.class_var.values) |
|---|
| 229 | for c in self.classifiers: |
|---|
| 230 | cfreq[int(c(instance))] += 1 |
|---|
| 231 | index = cfreq.index(max(cfreq)) |
|---|
| 232 | cvalue = Orange.data.Value(self.domain.class_var, index) |
|---|
| 233 | |
|---|
| 234 | if result_type == orange.GetValue: return cvalue |
|---|
| 235 | elif result_type == orange.GetProbabilities: return cprob |
|---|
| 236 | else: return (cvalue, cprob) |
|---|
| 237 | |
|---|
| 238 | else: |
|---|
| 239 | # Handle continuous class |
|---|
| 240 | |
|---|
| 241 | # voting for class probabilities |
|---|
| 242 | if result_type == orange.GetProbabilities or result_type == orange.GetBoth: |
|---|
| 243 | probs = [c(instance, orange.GetBoth) for c in self.classifiers] |
|---|
| 244 | cprob = dict() |
|---|
| 245 | for val,prob in probs: |
|---|
| 246 | if prob != None: #no probability output |
|---|
| 247 | a = dict(prob.items()) |
|---|
| 248 | else: |
|---|
| 249 | a = { val.value : 1. } |
|---|
| 250 | cprob = dict( (n, a.get(n, 0)+cprob.get(n, 0)) for n in set(a)|set(cprob) ) |
|---|
| 251 | cprob = Orange.statistics.distribution.Continuous(cprob) |
|---|
| 252 | cprob.normalize() |
|---|
| 253 | |
|---|
| 254 | # gather average class value |
|---|
| 255 | if result_type == orange.GetValue or result_type == orange.GetBoth: |
|---|
| 256 | values = [c(instance).value for c in self.classifiers] |
|---|
| 257 | cvalue = Orange.data.Value(self.domain.class_var, sum(values) / len(self.classifiers)) |
|---|
| 258 | |
|---|
| 259 | if result_type == orange.GetValue: return cvalue |
|---|
| 260 | elif result_type == orange.GetProbabilities: return cprob |
|---|
| 261 | else: return (cvalue, cprob) |
|---|
| 262 | |
|---|
| 263 | def __reduce__(self): |
|---|
| 264 | return type(self), (self.classifiers, self.name, self.domain, self.class_var), dict(self.__dict__) |
|---|
| 265 | RandomForestClassifier = Orange.misc.deprecated_members({"resultType":"result_type", "classVar":"class_var", "example":"instance"})(RandomForestClassifier) |
|---|
| 266 | ### MeasureAttribute_randomForests |
|---|
| 267 | |
|---|
| 268 | class ScoreFeature(orange.MeasureAttribute): |
|---|
| 269 | """ |
|---|
| 270 | :param trees: number of trees in the forest. |
|---|
| 271 | :type trees: int |
|---|
| 272 | |
|---|
| 273 | :param attributes: number of randomly drawn features among |
|---|
| 274 | which to select the best to split the nodes in tree |
|---|
| 275 | induction. The default, None, means the square root of |
|---|
| 276 | the number of features in the training data. Ignored if |
|---|
| 277 | :obj:`learner` is specified. |
|---|
| 278 | :type attributes: int |
|---|
| 279 | |
|---|
| 280 | :param base_learner: A base tree learner. The base learner will be |
|---|
| 281 | randomized with Random Forest's random |
|---|
| 282 | feature subset selection. If None (default), |
|---|
| 283 | :class:`~Orange.classification.tree.SimpleTreeLearner` and it |
|---|
| 284 | will not split nodes with less than 5 data instances. |
|---|
| 285 | :type base_learner: None or |
|---|
| 286 | :class:`Orange.classification.tree.TreeLearner` or |
|---|
| 287 | :class:`Orange.classification.tree.SimpleTreeLearner` |
|---|
| 288 | |
|---|
| 289 | :param rand: random generator used in bootstrap sampling. If None (default), |
|---|
| 290 | then ``random.Random(0)`` is used. |
|---|
| 291 | |
|---|
| 292 | :param learner: Tree induction learner. If `None` (default), |
|---|
| 293 | the :obj:`base_learner` will be used (and randomized). If |
|---|
| 294 | :obj:`learner` is specified, it will be used as such |
|---|
| 295 | with no additional transformations. |
|---|
| 296 | :type learner: None or :class:`Orange.core.Learner` |
|---|
| 297 | |
|---|
| 298 | """ |
|---|
| 299 | def __init__(self, trees=100, attributes=None, rand=None, base_learner=None, learner=None): |
|---|
| 300 | |
|---|
| 301 | self.trees = trees |
|---|
| 302 | self.learner = learner |
|---|
| 303 | self._bufinstances = None |
|---|
| 304 | self.attributes = attributes |
|---|
| 305 | self.rand = rand |
|---|
| 306 | self.base_learner = base_learner |
|---|
| 307 | |
|---|
| 308 | if base_learner != None and learner != None: |
|---|
| 309 | wrongSpecification() |
|---|
| 310 | |
|---|
| 311 | if not self.rand: |
|---|
| 312 | self.rand = random.Random(0) |
|---|
| 313 | self.randorange = Orange.misc.Random(self.rand.randint(0,2**31-1)) |
|---|
| 314 | |
|---|
| 315 | if learner == None: |
|---|
| 316 | self.learner = _wrap_learner(base=self.base_learner, rand=self.rand, randorange=self.randorange) |
|---|
| 317 | else: |
|---|
| 318 | self.learner = learner |
|---|
| 319 | |
|---|
| 320 | @deprecated_keywords({"apriorClass":"aprior_class"}) |
|---|
| 321 | def __call__(self, feature, instances, aprior_class=None): |
|---|
| 322 | """ |
|---|
| 323 | Return importance of a given feature. |
|---|
| 324 | Only the first call on a given data set is computationally expensive. |
|---|
| 325 | |
|---|
| 326 | :param feature: feature to evaluate (by index, name or |
|---|
| 327 | :class:`Orange.feature.Descriptor` object). |
|---|
| 328 | :type feature: int, str or :class:`Orange.feature.Descriptor`. |
|---|
| 329 | |
|---|
| 330 | :param instances: data instances to use for importance evaluation. |
|---|
| 331 | :type instances: :class:`Orange.data.Table` |
|---|
| 332 | |
|---|
| 333 | :param aprior_class: not used! |
|---|
| 334 | |
|---|
| 335 | """ |
|---|
| 336 | attrNo = None |
|---|
| 337 | |
|---|
| 338 | if type(feature) == int: #by attr. index |
|---|
| 339 | attrNo = feature |
|---|
| 340 | elif type(feature) == type("a"): #by attr. name |
|---|
| 341 | attrName = feature |
|---|
| 342 | attrNo = instances.domain.index(attrName) |
|---|
| 343 | elif isinstance(feature, Orange.feature.Descriptor): |
|---|
| 344 | atrs = [a for a in instances.domain.attributes] |
|---|
| 345 | attrNo = atrs.index(feature) |
|---|
| 346 | else: |
|---|
| 347 | raise Exception("MeasureAttribute_rf can not be called with (\ |
|---|
| 348 | contingency,classDistribution, aprior_class) as fuction arguments.") |
|---|
| 349 | |
|---|
| 350 | self._buffer(instances) |
|---|
| 351 | |
|---|
| 352 | return self._avimp[attrNo]*100/self._acu |
|---|
| 353 | |
|---|
| 354 | def importances(self, table): |
|---|
| 355 | """ |
|---|
| 356 | DEPRECATED. Return importance of all features in the dataset as a list. |
|---|
| 357 | |
|---|
| 358 | :param table: dataset of which the features' importance needs to be |
|---|
| 359 | measured. |
|---|
| 360 | :type table: :class:`Orange.data.Table` |
|---|
| 361 | |
|---|
| 362 | """ |
|---|
| 363 | self._buffer(table) |
|---|
| 364 | return [a*100/self._acu for a in self._avimp] |
|---|
| 365 | |
|---|
| 366 | def _buffer(self, instances): |
|---|
| 367 | """ |
|---|
| 368 | Recalculate importance of features if needed (ie. if it has been |
|---|
| 369 | buffered for the given dataset yet). |
|---|
| 370 | |
|---|
| 371 | :param table: dataset of which the features' importance needs to be |
|---|
| 372 | measured. |
|---|
| 373 | :type table: :class:`Orange.data.Table` |
|---|
| 374 | |
|---|
| 375 | """ |
|---|
| 376 | if instances != self._bufinstances or \ |
|---|
| 377 | instances.version != self._bufinstances.version: |
|---|
| 378 | |
|---|
| 379 | self._bufinstances = instances |
|---|
| 380 | self._avimp = [0.0]*len(self._bufinstances.domain.attributes) |
|---|
| 381 | self._acu = 0 |
|---|
| 382 | self._importanceAcu(self._bufinstances, self.trees, self._avimp) |
|---|
| 383 | |
|---|
| 384 | def _getOOB(self, instances, selection, nexamples): |
|---|
| 385 | ooblist = filter(lambda x: x not in selection, range(nexamples)) |
|---|
| 386 | return instances.getitems(ooblist) |
|---|
| 387 | |
|---|
| 388 | def _numRight(self, oob, classifier): |
|---|
| 389 | """ |
|---|
| 390 | Return a number of instances which are classified correctly. |
|---|
| 391 | """ |
|---|
| 392 | #TODO How to accomodate regression? |
|---|
| 393 | return sum(1 for el in oob if el.getclass() == classifier(el)) |
|---|
| 394 | |
|---|
| 395 | def _numRightMix(self, oob, classifier, attr): |
|---|
| 396 | """ |
|---|
| 397 | Return a number of instances which are classified |
|---|
| 398 | correctly even if a feature is shuffled. |
|---|
| 399 | """ |
|---|
| 400 | perm = range(len(oob)) |
|---|
| 401 | self.rand.shuffle(perm) |
|---|
| 402 | |
|---|
| 403 | def shuffle_ex(index): |
|---|
| 404 | ex = Orange.data.Instance(oob[index]) |
|---|
| 405 | ex[attr] = oob[perm[index]][attr] |
|---|
| 406 | return ex |
|---|
| 407 | #TODO How to accomodate regression? |
|---|
| 408 | return sum(1 for i in range(len(oob)) if oob[i].getclass() == classifier(shuffle_ex(i))) |
|---|
| 409 | |
|---|
| 410 | def _importanceAcu(self, instances, trees, avimp): |
|---|
| 411 | """Accumulate avimp by importances for a given number of trees.""" |
|---|
| 412 | n = len(instances) |
|---|
| 413 | |
|---|
| 414 | attrs = len(instances.domain.attributes) |
|---|
| 415 | |
|---|
| 416 | attrnum = {} |
|---|
| 417 | for attr in range(len(instances.domain.attributes)): |
|---|
| 418 | attrnum[instances.domain.attributes[attr].name] = attr |
|---|
| 419 | |
|---|
| 420 | if "attributes" in self.learner.__dict__: |
|---|
| 421 | self.learner.attributes = len(instances.domain.attributes)**0.5 if self.attributes == None else self.attributes |
|---|
| 422 | |
|---|
| 423 | # build the forest |
|---|
| 424 | classifiers = [] |
|---|
| 425 | for i in range(trees): |
|---|
| 426 | # draw bootstrap sample |
|---|
| 427 | selection = [] |
|---|
| 428 | for j in range(n): |
|---|
| 429 | selection.append(self.rand.randrange(n)) |
|---|
| 430 | data = instances.getitems(selection) |
|---|
| 431 | |
|---|
| 432 | # build the model from the bootstrap sample |
|---|
| 433 | cla = self.learner(data) |
|---|
| 434 | |
|---|
| 435 | #prepare OOB data |
|---|
| 436 | oob = self._getOOB(instances, selection, n) |
|---|
| 437 | |
|---|
| 438 | #right on unmixed |
|---|
| 439 | right = self._numRight(oob, cla) |
|---|
| 440 | |
|---|
| 441 | presl = range(attrs) |
|---|
| 442 | try: #FIXME SimpleTreeLearner does not know how to output attributes yet |
|---|
| 443 | presl = list(self._presentInTree(cla.tree, attrnum)) |
|---|
| 444 | except: |
|---|
| 445 | pass |
|---|
| 446 | |
|---|
| 447 | #randomize each feature in data and test |
|---|
| 448 | #only those on which there was a split |
|---|
| 449 | for attr in presl: |
|---|
| 450 | #calculate number of right classifications |
|---|
| 451 | #if the values of this features are permutated randomly |
|---|
| 452 | rightimp = self._numRightMix(oob, cla, attr) |
|---|
| 453 | avimp[attr] += (float(right-rightimp))/len(oob) |
|---|
| 454 | self._acu += trees |
|---|
| 455 | |
|---|
| 456 | def _presentInTree(self, node, attrnum): |
|---|
| 457 | """Return features present in tree (features that split).""" |
|---|
| 458 | if not node: |
|---|
| 459 | return set([]) |
|---|
| 460 | |
|---|
| 461 | if node.branchSelector: |
|---|
| 462 | j = attrnum[node.branchSelector.class_var.name] |
|---|
| 463 | cs = set([]) |
|---|
| 464 | for i in range(len(node.branches)): |
|---|
| 465 | s = self._presentInTree(node.branches[i], attrnum) |
|---|
| 466 | cs = s | cs |
|---|
| 467 | cs = cs | set([j]) |
|---|
| 468 | return cs |
|---|
| 469 | else: |
|---|
| 470 | return set([]) |
|---|
| 471 | |
|---|
| 472 | class _RandomForestTreeLearner(Orange.core.Learner): |
|---|
| 473 | """ A learner which wraps an ordinary TreeLearner with |
|---|
| 474 | a new split constructor. |
|---|
| 475 | """ |
|---|
| 476 | |
|---|
| 477 | __new__ = Orange.misc._orange__new__(Orange.core.Learner) |
|---|
| 478 | |
|---|
| 479 | def __init__(self, base, rand): |
|---|
| 480 | self.base = base |
|---|
| 481 | self.attributes = None |
|---|
| 482 | self.rand = rand |
|---|
| 483 | if not self.rand: #for all the built trees |
|---|
| 484 | self.rand = random.Random(0) |
|---|
| 485 | |
|---|
| 486 | @deprecated_keywords({"examples":"instances"}) |
|---|
| 487 | def __call__(self, instances, weight=0): |
|---|
| 488 | """ A current tree learner is copied, modified and then used. |
|---|
| 489 | Modification: set a different split constructor, which uses |
|---|
| 490 | a random subset of attributes. |
|---|
| 491 | """ |
|---|
| 492 | bcopy = copy.copy(self.base) |
|---|
| 493 | |
|---|
| 494 | #if base tree learner has no measure set |
|---|
| 495 | if not bcopy.measure: |
|---|
| 496 | bcopy.measure = Orange.feature.scoring.Gini() \ |
|---|
| 497 | if isinstance(instances.domain.class_var, Orange.feature.Discrete) \ |
|---|
| 498 | else Orange.feature.scoring.MSE() |
|---|
| 499 | |
|---|
| 500 | bcopy.split = SplitConstructor_AttributeSubset(\ |
|---|
| 501 | bcopy.split, self.attributes, self.rand) |
|---|
| 502 | |
|---|
| 503 | return bcopy(instances, weight=weight) |
|---|
| 504 | |
|---|
| 505 | class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor): |
|---|
| 506 | def __init__(self, scons, attributes, rand = None): |
|---|
| 507 | self.scons = scons # split constructor of original tree |
|---|
| 508 | self.attributes = attributes # number of features to consider |
|---|
| 509 | self.rand = rand |
|---|
| 510 | if not self.rand: |
|---|
| 511 | self.rand = random.Random(0) |
|---|
| 512 | |
|---|
| 513 | @deprecated_keywords({"weightID":"weight_id"}) |
|---|
| 514 | def __call__(self, gen, weight_id, contingencies, apriori, candidates, clsfr): |
|---|
| 515 | # if number of features for subset is not set, use square root |
|---|
| 516 | cand = [1]*int(self.attributes) + [0]*(len(candidates) - int(self.attributes)) |
|---|
| 517 | self.rand.shuffle(cand) |
|---|
| 518 | # instead with all features, we will invoke split constructor |
|---|
| 519 | # only for the subset of a features |
|---|
| 520 | t = self.scons(gen, weight_id, contingencies, apriori, cand, clsfr) |
|---|
| 521 | return t |
|---|