# orange/orngEnsemble.py
#
# Ensemble learners for Orange: boosting, bagging, random forests and a
# random-forest based attribute importance measure.
#
# NOTE(review): this text was recovered from a revision listing (r7003 vs.
# r7267) in which line breaks, indentation and some operator characters
# ("-" and "|") had been lost.  Places where an operator was re-inserted are
# marked with a "restored" comment -- confirm against the repository.

import orange, math, orngTest, orngStat, random, orngMisc

# This function is builtin in Python 2.3,
# but we define it to be compatible with 2.2 as well
from operator import add
def sum(x):
    """Return the left-to-right sum of the items of x using operator.add.

    Unlike the builtin, this raises TypeError (from reduce) for an empty x.
    """
    return reduce(add, x)

########################################################################
# Boosting

# Vote weight given to a classifier whose weighted error is zero (or whose
# log-odds weight evaluates to zero, see BoostedLearner.__call__).
inf = 100000

#def BoostedLearner(learner, examples=None, t=10, name='AdaBoost.M1'):
#    learner = BoostedLearnerClass(learner, t, name)
#    if examples:
#        return learner(examples)
#    else:
#        return learner

class BoostedLearner(orange.Learner):
    """Learner that trains up to t weighted models of a base learner.

    After every round the weights of correctly classified instances are
    multiplied by epsilon/(1-epsilon) and all weights are renormalised to
    sum to one.  The default name is 'AdaBoost.M1'.
    """

    def __new__(cls, learner, examples=None, weightId=None, **kwargs):
        """Create the learner, or train it at once when examples are given.

        With examples, the return value is the BoostedClassifier produced
        by __call__ instead of a learner instance.
        """
        self = orange.Learner.__new__(cls, **kwargs)
        if examples is not None:
            # The bound call already supplies self; the original passed it a
            # second time, which bound learner=self and t=learner.
            self.__init__(learner, **kwargs)
            return self.__call__(examples, weightId)
        else:
            return self

    def __init__(self, learner, t=10, name='AdaBoost.M1'):
        """Store the base learner, the number of rounds t and the name."""
        self.t = t
        self.name = name
        self.learner = learner

    def __call__(self, instances, origWeight = 0):
        """Run the boosting rounds on instances and return a BoostedClassifier.

        instances  -- the training data; a temporary weight meta attribute is
                      added to it and removed again before returning
        origWeight -- id of an existing weight meta attribute used to
                      initialise the weights; when false every instance
                      starts with weight 1.0
        """
        weight = orange.newmetaid()
        if origWeight:
            for i in instances:
                i.setweight(weight, i.getweight(origWeight))
        else:
            instances.addMetaAttribute(weight, 1.0)

        n = len(instances)
        classifiers = []
        for i in range(self.t):
            epsilon = 0.0
            classifier = self.learner(instances, weight)
            # corr[k] is 1 when instance k was classified correctly, else 0
            corr = []
            for ex in instances:
                if classifier(ex) != ex.getclass():
                    epsilon += ex.getweight(weight)
                    corr.append(0)
                else:
                    corr.append(1)
            # weighted error = misclassified weight / total weight
            epsilon = epsilon / float(reduce(lambda x,y:x+y.getweight(weight), instances, 0))
            # and/or idiom kept on purpose: a zero error *and* a zero log
            # value both fall through to inf.  ("-" restored)
            classifiers.append((classifier, epsilon and math.log((1-epsilon)/epsilon) or inf))
            if epsilon==0 or epsilon >= 0.499:
                # Stop early.  A model that is (almost) no better than
                # chance is dropped unless it is the only one we have.
                if epsilon >= 0.499 and len(classifiers)>1:
                    del classifiers[-1]  # "-" restored
                instances.removeMetaAttribute(weight)
                return BoostedClassifier(classifiers = classifiers, name=self.name, classVar=instances.domain.classVar)
            beta = epsilon/(1-epsilon)  # "-" restored
            for e in range(n):
                if corr[e]:
                    instances[e].setweight(weight, instances[e].getweight(weight)*beta)
            # renormalise so that the weights sum to one
            f = 1/float(reduce(add, [e.getweight(weight) for e in instances]))
            for e in range(n):
                instances[e].setweight(weight, instances[e].getweight(weight)*f)

        instances.removeMetaAttribute(weight)
        return BoostedClassifier(classifiers = classifiers, name=self.name, classVar=instances.domain.classVar)

class BoostedClassifier(orange.Classifier):
    """Weighted-vote classifier over (classifier, weight) pairs.

    All keyword arguments given to the constructor become attributes; the
    code below reads self.classifiers and self.classVar.
    """

    def __init__(self, **kwds):
        self.__dict__.update(kwds)

    def __call__(self, example, resultType = orange.GetValue):
        """Classify example by summing each classifier's weight on its vote.

        Returns the winning value, the list of normalised vote sums, or a
        (value, votes) tuple, depending on resultType.
        """
        votes = [0.] * len(self.classVar.values)
        for c, e in self.classifiers:
            votes[int(c(example))] += e
        index = orngMisc.selectBestIndex(votes)
        value = orange.Value(self.classVar, index)
        if resultType == orange.GetValue:
            return value
        sv = sum(votes)
        for i in range(len(votes)):
            votes[i] = votes[i]/sv
        if resultType == orange.GetProbabilities:
            return votes
        else:
            return (value, votes)


########################################################################
# Bagging

#def BaggedLearner(learner=None, t=10, name='Bagging', examples=None):
#    learner = BaggedLearnerClass(learner, t, name)
#    if examples:
#        return learner(examples)
#    else:
#        return learner

class BaggedLearner(orange.Learner):
    """Learner that trains t models of a base learner on bootstrap samples."""

    def __new__(cls, learner, examples=None, weightId=None, **kwargs):
        """Create the learner, or train it at once when examples are given.

        With examples, the return value is the BaggedClassifier produced by
        __call__ instead of a learner instance.
        """
        self = orange.Learner.__new__(cls, **kwargs)
        if examples is not None:
            # The bound call already supplies self; the original passed it a
            # second time, which bound learner=self and t=learner.
            self.__init__(learner, **kwargs)
            return self.__call__(examples, weightId)
        else:
            return self

    def __init__(self, learner, t=10, name='Bagging'):
        """Store the base learner, the number of models t and the name."""
        self.t = t
        self.name = name
        self.learner = learner

    def __call__(self, examples, weight=0):
        """Train self.t models on samples drawn with replacement.

        The random generator is re-seeded with 0 on every call, so the same
        index sequences are drawn each time.  Returns a BaggedClassifier.
        """
        r = random.Random()
        r.seed(0)

        n = len(examples)
        classifiers = []
        for i in range(self.t):
            # n indices drawn with replacement from range(n)
            selection = []
            for j in range(n):
                selection.append(r.randrange(n))
            examples = orange.ExampleTable(examples)
            data = examples.getitems(selection)
            classifiers.append(self.learner(data, weight))
        return BaggedClassifier(classifiers = classifiers, name=self.name, classVar=examples.domain.classVar)

class BaggedClassifier(orange.Classifier):
    """Combines bagged models by majority vote or by averaging.

    All keyword arguments given to the constructor become attributes; the
    code below reads self.classifiers and self.classVar.
    """

    def __init__(self, **kwds):
        self.__dict__.update(kwds)

    def __call__(self, example, resultType = orange.GetValue):
        """Predict example.

        Discrete class: the value predicted most often wins and the
        "probabilities" are the vote frequencies.  Continuous class: the
        prediction is the mean of the individual predictions.  Returns a
        value, a distribution, or a (value, distribution) tuple according to
        resultType; falls through (returning None) for any other class
        variable type or resultType.
        """
        if self.classVar.varType == orange.VarTypes.Discrete:
            freq = [0.] * len(self.classVar.values)
            for c in self.classifiers:
                freq[int(c(example))] += 1
            index = freq.index(max(freq))
            value = orange.Value(self.classVar, index)
            if resultType == orange.GetValue:
                return value
            for i in range(len(freq)):
                freq[i] = freq[i]/len(self.classifiers)
            if resultType == orange.GetProbabilities:
                return freq
            else:
                return (value, freq)
        elif self.classVar.varType == orange.VarTypes.Continuous:
            # Probabilities alone are not enough below (the predicted value
            # is needed as the key), so GetProbabilities is queried as GetBoth.
            votes = [c(example, orange.GetBoth if resultType==orange.GetProbabilities else resultType) for c in self.classifiers]
            wsum = float(len(self.classifiers))
            if resultType in [orange.GetBoth, orange.GetProbabilities]:
                pred = sum([float(c) for c, p in votes]) / wsum
#                prob = sum([float(p.modus()) for c, p in votes]) / wsum
                from collections import defaultdict
                prob = defaultdict(float)
                for c, p in votes:
                    try:
                        prob[float(c)] += p[c] / wsum
                    except IndexError: # p[c] sometimes fails with index error
                        prob[float(c)] += 1.0 / wsum
                prob = orange.ContDistribution(prob)
                # Parenthesised: without the parentheses the conditional
                # applied only to the second tuple item, so a tuple was
                # returned even when only probabilities were requested.
                return (self.classVar(pred), prob) if resultType == orange.GetBoth else prob
            elif resultType == orange.GetValue:
                pred = sum([float(c) for c in votes]) / wsum
                return self.classVar(pred)

########################################################################
# Random Forests

from math import sqrt, floor
import orngTree

class SplitConstructor_AttributeSubset(orange.TreeSplitConstructor):
    """Split constructor that offers a random attribute subset to another one."""

    def __init__(self, scons, attributes, rand = None):
        self.scons = scons           # split constructor of original tree
        self.attributes = attributes # number of attributes to consider
        if rand:
            self.rand = rand             # a random generator
        else:
            self.rand = random.Random()
            self.rand.seed(0)

    def __call__(self, gen, weightID, contingencies, apriori, candidates, clsfr):
        """Call the wrapped constructor with a shuffled 0/1 candidate mask.

        The mask has self.attributes ones and is padded with zeros to
        len(candidates).
        """
        cand = [1]*self.attributes + [0]*(len(candidates) - self.attributes)  # "-" restored
        self.rand.shuffle(cand)
        # instead with all attributes, we will invoke split constructor only for the
        # subset of a attributes
        t = self.scons(gen, weightID, contingencies, apriori, cand, clsfr)
        return t

class RandomForestLearner(orange.Learner):
    """Learner that builds a forest of trees on bootstrap samples."""

    def __new__(cls, examples=None, weight = 0, **kwds):
        """Create the learner, or train it at once when examples are given."""
        self = orange.Learner.__new__(cls, **kwds)
        if examples:
            self.__init__(**kwds)
            return self.__call__(examples, weight)
        else:
            return self

    def __init__(self, learner=None, trees=100, attributes=None, name='Random Forest', rand=None, callback=None):
        """random forest learner

        learner    -- base tree learner; when not given, a default tree
                      learner with an attribute-subset split constructor
                      is assembled below
        trees      -- number of trees to build
        attributes -- size of the attribute subset offered at each split
        name       -- name passed on to the resulting classifier
        rand       -- random generator; defaults to random.Random seeded with 0
        callback   -- called without arguments after each tree is built
        """
        self.trees = trees
        self.name = name
        self.learner = learner
        self.attributes = attributes
        self.callback = callback
        if rand:
            self.rand = rand
        else:
            self.rand = random.Random()
            self.rand.seed(0)

        self.randstate = self.rand.getstate() #original state

        if not learner:
            # tree learner assembled as suggested by Brieman (2001)
            smallTreeLearner = orngTree.TreeLearner(storeNodeClassifier = 0, storeContingencies=0, storeDistributions=1, minExamples=5).instance()
            smallTreeLearner.split.discreteSplitConstructor.measure = smallTreeLearner.split.continuousSplitConstructor.measure = orange.MeasureAttribute_gini()
            smallTreeLearner.split = SplitConstructor_AttributeSubset(smallTreeLearner.split, attributes, self.rand)
            self.learner = smallTreeLearner

    def __call__(self, examples, weight=0):
        """Build self.trees trees and return a RandomForestClassifier.

        The weight argument is accepted but not passed to the base learner.
        """
        # if number of attributes for subset is not set, use square root
        if hasattr(self.learner.split, 'attributes') and not self.learner.split.attributes:
            self.learner.split.attributes = int(sqrt(len(examples.domain.attributes)))

        self.rand.setstate(self.randstate) #when learning again, set the same state

        n = len(examples)
        # build the forest
        classifiers = []
        for i in range(self.trees):
            # draw bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = examples.getitems(selection)
            # build the model from the bootstrap sample
            classifiers.append(self.learner(data))
            if self.callback:
                self.callback()
            # if self.callback: self.callback((i+1.)/self.trees)

        return RandomForestClassifier(classifiers = classifiers, name=self.name, domain=examples.domain, classVar=examples.domain.classVar)

class RandomForestClassifier(orange.Classifier):
    """Voting classifier over the trees of a forest.

    All keyword arguments given to the constructor become attributes; the
    code below reads self.classifiers and self.domain.
    """

    def __init__(self, **kwds):
        self.__dict__.update(kwds)

    def __call__(self, example, resultType = orange.GetValue):
        """Return the voted value, the voted probabilities, or both."""
        from operator import add

        # voting for class probabilities
        if resultType == orange.GetProbabilities or resultType == orange.GetBoth:
            cprob = [0.] * len(self.domain.classVar.values)
            for c in self.classifiers:
                a = [x for x in c(example, orange.GetProbabilities)]
                cprob = map(add, cprob, a)
            norm = sum(cprob)
            for i in range(len(cprob)):
                cprob[i] = cprob[i]/norm

        # voting for crisp class membership, notice that
        # this may not be the same class as one obtaining the
        # highest probability through probability voting
        if resultType == orange.GetValue or resultType == orange.GetBoth:
            cfreq = [0] * len(self.domain.classVar.values)
            for c in self.classifiers:
                cfreq[int(c(example))] += 1
            index = cfreq.index(max(cfreq))
            cvalue = orange.Value(self.domain.classVar, index)

        if resultType == orange.GetValue: return cvalue
        elif resultType == orange.GetProbabilities: return cprob
        else: return (cvalue, cprob)


##########################################################
### MeasureAttribute_randomForests

class MeasureAttribute_randomForests(orange.MeasureAttribute):
    """Attribute importance measured on out-of-bag examples of a forest.

    For every tree, the number of correctly classified out-of-bag examples
    is compared with the number still classified correctly after the values
    of one attribute have been permuted among those examples.
    """

    def __init__(self, learner=None, trees = 100, attributes=None, rand=None):
        """Store the settings.

        When no learner is given, the base learner of a freshly created
        RandomForestLearner is used.
        """
        self.trees = trees
        self.learner = learner
        self.bufexamples = None
        self.attributes = attributes

        if self.learner == None:
            temp = RandomForestLearner(attributes=self.attributes)
            self.learner = temp.learner

        if hasattr(self.learner.split, 'attributes'):
            # remembered so that buffer() can reset it before recalculating
            self.origattr = self.learner.split.attributes

        if rand:
            self.rand = rand             # a random generator
        else:
            self.rand = random.Random()
            self.rand.seed(0)

    def __call__(self, a1, a2, a3=None):
        """
        Returns importance of a given attribute. Can be given by index,
        name or as a orange.Variable.
        """
        attrNo = None
        examples = None

        if type(a1) == int: #by attr. index
            attrNo, examples, apriorClass = a1, a2, a3
        elif type(a1) == type("a"): #by attr. name
            attrName, examples, apriorClass = a1, a2, a3
            attrNo = examples.domain.index(attrName)
        elif isinstance(a1, orange.Variable):
            a1, examples, apriorClass = a1, a2, a3
            atrs = [a for a in examples.domain.attributes]
            attrNo = atrs.index(a1)
        else:
            contingency, classDistribution, apriorClass = a1, a2, a3
            raise Exception("MeasureAttribute_rf can not be called with (contingency, classDistribution, apriorClass) as fuction arguments.")

        self.buffer(examples)

        return self.avimp[attrNo]*100/self.trees

    def importances(self, examples):
        """
        Returns importances of all attributes in dataset in a list. Buffered.
        """
        self.buffer(examples)

        return [a*100/self.trees for a in self.avimp]

    def buffer(self, examples):
        """
        Recalculates importances if needed (new examples, or the same
        examples with a different version).
        """
        recalculate = False

        if examples != self.bufexamples:
            recalculate = True
        elif examples.version != self.bufexamples.version:
            recalculate = True

        if (recalculate):
            self.bufexamples = examples
            self.avimp = [0.0]*len(self.bufexamples.domain.attributes)
            self.acu = 0

            if hasattr(self.learner.split, 'attributes'):
                self.learner.split.attributes = self.origattr

            # if number of attributes for subset is not set, use square root
            if hasattr(self.learner.split, 'attributes') and not self.learner.split.attributes:
                self.learner.split.attributes = int(sqrt(len(examples.domain.attributes)))

            self.importanceAcu(self.bufexamples, self.trees, self.avimp)

    def getOOB(self, examples, selection, nexamples):
        """
        Returns the examples whose indices (0..nexamples-1) are not in
        selection, i.e. the out-of-bag examples.
        """
        # a set makes each membership test O(1); the resulting index list
        # is the same as when testing against the selection list itself
        chosen = set(selection)
        ooblist = [i for i in range(nexamples) if i not in chosen]
        return examples.getitems(ooblist)

    def numRight(self, oob, classifier):
        """
        Returns the number of examples which are classified correctly.
        """
        right = 0
        for el in oob:
            if (el.getclass() == classifier(el)):
                right = right + 1
        return right

    def numRightMix(self, oob, classifier, attr):
        """
        Returns the number of examples which are classified
        correctly even if an attribute is shuffled.
        """
        n = len(oob)

        perm = list(range(n))
        self.rand.shuffle(perm)

        right = 0

        for i in range(n):
            # work on a copy so that oob itself is left untouched
            ex = orange.Example(oob[i])
            ex[attr] = oob[perm[i]][attr]

            if (ex.getclass() == classifier(ex)):
                right = right + 1

        return right

    def importanceAcu(self, examples, trees, avimp):
        """
        Accumulates importances into avimp for a given number of trees
        and adds trees to self.acu.
        """
        n = len(examples)

        # attribute name -> index in examples.domain.attributes
        attrnum = {}
        for attr in range(len(examples.domain.attributes)):
            attrnum[examples.domain.attributes[attr].name] = attr

        # build the forest
        for i in range(trees):

            # draw bootstrap sample
            selection = []
            for j in range(n):
                selection.append(self.rand.randrange(n))
            data = examples.getitems(selection)

            # build the model from the bootstrap sample
            cla = self.learner(data)

            #prepare OOB data
            oob = self.getOOB(examples, selection, n)

            #right on unmixed
            right = self.numRight(oob, cla)

            presl = list(self.presentInTree(cla.tree, attrnum))

            #randomize each attribute in data and test
            #only those on which there was a split
            for attr in presl:
                #calculate number of right classifications
                #if the values of this attribute are permutated randomly
                rightimp = self.numRightMix(oob, cla, attr)
                avimp[attr] += (float(right-rightimp))/len(oob)  # "-" restored

        self.acu += trees

    def presentInTree(self, node, attrnum):
        """
        Returns the set of indices of attributes present in the tree
        (attributes that split); attrnum maps attribute names to indices.
        """
        if not node:
            return set()

        if node.branchSelector:
            j = attrnum[node.branchSelector.classVar.name]

            cs = set()
            for i in range(len(node.branches)):
                s = self.presentInTree(node.branches[i], attrnum)
                cs = s | cs  # "|" restored

            cs = cs | set([j])  # "|" restored

            return cs

        else:
            return set()

# NOTE(review): in the recovered listing the code above is followed by three
# lines numbered 1-3, which look like the complete content of the newer
# revision (r7267).  They are kept here as comments because executing them
# after the definitions above could rebind the same names -- confirm which
# revision this file is supposed to hold.
# from Orange.ensemble.bagging import *
# from Orange.ensemble.boosting import *
# from Orange.ensemble.forest import *
