#
source:
orange/Orange/classification/logreg.py
@
11398:7e9b4eb30015

Revision 11398:7e9b4eb30015, 42.9 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff) |
---|

Rev | Line | |
---|---|---|

[8042] | 1 | import Orange |

[10580] | 2 | from Orange.utils import deprecated_keywords, deprecated_members |

[10542] | 3 | from Orange.data import preprocess |

[11397] | 4 | from Orange.data.continuization import DomainContinuizer |

[11060] | 5 | import decimal |

[9818] | 6 | import math |

[10542] | 7 | |

8 | ||

[9818] | 9 | from numpy import dot, array, identity, reshape, diagonal, \ |

10 | transpose, concatenate, sqrt, sign | |

11 | from numpy.linalg import inv | |

12 | from Orange.core import LogRegClassifier, LogRegFitter, LogRegFitter_Cholesky | |

[8042] | 13 | |

def format_decimal(x, prec=2):
    """Format a ``decimal.Decimal`` in scientific notation.

    Allows arbitrary precision: the first ``prec`` digits after the
    leading digit are kept.

    :param x: value to format (a ``decimal.Decimal``).
    :param prec: number of digits after the decimal point.
    """
    tup = x.as_tuple()
    leading = list(tup.digits[:prec + 1])
    prefix = '-' if tup.sign else ''
    fraction = ''.join(str(d) for d in leading[1:])
    # adjusted() gives the exponent of the most significant digit.
    return '{sign}{int}.{dec}e{exp}'.format(sign=prefix, int=leading[0],
                                            dec=fraction, exp=x.adjusted())

22 | ||

def dump(classifier):
    """ Return a formatted string describing the logistic regression model

    :param classifier: logistic regression classifier.
    """
    class_var = classifier.domain.class_var

    # header: class attribute and its values
    lines = ['']
    lines.append("class attribute = " + class_var.name)
    lines.append("class values = " + str(class_var.values))
    lines.append('')

    # column width: the longest feature name in the continuized domain
    features = classifier.continuized_domain.features
    width = 0
    for feature in features:
        width = max(width, len(feature.name))

    # table head
    head_fmt = "%" + str(width) + "s %10s %10s %10s %10s %10s"
    lines.append(head_fmt % ("Feature", "beta", "st. error", "wald Z", "P", "OR=exp(beta)"))
    lines.append('')

    # intercept row has no odds-ratio column
    intercept_fmt = "%" + str(width) + "s %10.2f %10.2f %10.2f %10.2f"
    lines.append(intercept_fmt % ("Intercept", classifier.beta[0],
                                  classifier.beta_se[0],
                                  classifier.wald_Z[0],
                                  classifier.P[0]))

    row_fmt = "%" + str(width) + "s %10.2f %10.2f %10.2f %10.2f %s"
    for i, feature in enumerate(features):
        try:
            odds = decimal.Decimal(math.e) ** decimal.Decimal(classifier.beta[i + 1])
        except TypeError:
            # Python 2.6 does not support creating Decimals from float
            odds = decimal.Decimal(str(math.e)) ** decimal.Decimal(str(classifier.beta[i + 1]))
        lines.append(row_fmt % (feature.name,
                                classifier.beta[i + 1],
                                classifier.beta_se[i + 1],
                                classifier.wald_Z[i + 1],
                                abs(classifier.P[i + 1]),
                                format_decimal(odds)))

    return '\n'.join(lines)

[11063] | 62 | |

[8042] | 63 | |

def has_discrete_values(domain):
    """
    Return whether the given domain contains any discrete features.

    :param domain: domain.
    :type domain: :class:`Orange.data.Domain`
    """
    discrete = Orange.feature.Type.Discrete
    for feature in domain.features:
        if feature.var_type == discrete:
            return True
    return False

73 | ||

[8042] | 74 | |

class LogRegLearner(Orange.classification.Learner):
    """ Logistic regression learner.

    Returns either a learning algorithm (instance of
    :obj:`LogRegLearner`) or, if data is provided, a fitted model
    (instance of :obj:`LogRegClassifier`).

    :param data: data table; it may contain discrete and continuous features
    :type data: Orange.data.Table
    :param weight_id: the ID of the weight meta attribute
    :type weight_id: int
    :param remove_singular: automated removal of constant
        features and singularities (default: `False`)
    :type remove_singular: bool
    :param fitter: the fitting algorithm (default: :obj:`LogRegFitter_Cholesky`)
    :param stepwise_lr: enables stepwise feature selection (default: `False`)
    :type stepwise_lr: bool
    :param add_crit: threshold for adding a feature in stepwise
        selection (default: 0.2)
    :type add_crit: float
    :param delete_crit: threshold for removing a feature in stepwise
        selection (default: 0.3)
    :type delete_crit: float
    :param num_features: number of features in stepwise selection
        (default: -1, no limit)
    :type num_features: int
    :rtype: :obj:`LogRegLearner` or :obj:`LogRegClassifier`

    """

    @deprecated_keywords({"weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **argkw):
        # Learner convention: constructing with data immediately trains
        # and returns the classifier instead of the learner.
        self = Orange.classification.Learner.__new__(cls, **argkw)
        if data:
            self.__init__(**argkw)
            return self.__call__(data, weight_id)
        else:
            return self

    @deprecated_keywords({"removeSingular": "remove_singular"})
    def __init__(self, remove_singular=0, fitter=None, **kwds):
        self.__dict__.update(kwds)
        self.remove_singular = remove_singular
        # BUG FIX: this was `self.fitter = None`, which silently discarded
        # a user-supplied fitter even though the docstring documents the
        # `fitter` parameter.
        self.fitter = fitter

    @deprecated_keywords({"examples": "data"})
    def __call__(self, data, weight=0):
        """Fit a model to the given data.

        :param data: Data instances.
        :type data: :class:`~Orange.data.Table`
        :param weight: Id of meta attribute with instance weights
        :type weight: int
        :rtype: :class:`~Orange.classification.logreg.LogRegClassifier`
        """
        imputer = getattr(self, "imputer", None) or None
        if getattr(self, "remove_missing", 0):
            data = Orange.core.Preprocessor_dropMissing(data)
        if not len(data):
            return None
        if getattr(self, "stepwise_lr", 0):
            # optional stepwise feature subset selection before fitting
            add_crit = getattr(self, "add_crit", 0.2)
            delete_crit = getattr(self, "delete_crit", 0.3)
            num_features = getattr(self, "num_features", -1)
            attributes = StepWiseFSS(data, add_crit=add_crit,
                delete_crit=delete_crit, imputer=imputer,
                num_features=num_features)
            tmp_domain = Orange.data.Domain(attributes,
                data.domain.class_var)
            tmp_domain.addmetas(data.domain.getmetas())
            data = data.select(tmp_domain)
        learner = Orange.core.LogRegLearner()  # Yes, it has to be from core.
        learner.imputer_constructor = imputer
        if imputer:
            data = self.imputer(data)(data)
            data = Orange.core.Preprocessor_dropMissing(data)
        if self.fitter:
            learner.fitter = self.fitter
        if self.remove_singular:
            lr = learner.fit_model(data, weight)
        else:
            lr = learner(data, weight)
        # fit_model may return the offending feature (a Descriptor) instead
        # of a classifier; drop it from the domain and refit until a proper
        # classifier comes back.
        while isinstance(lr, Orange.feature.Descriptor):
            if isinstance(lr.getValueFrom, Orange.core.ClassifierFromVar) and isinstance(lr.getValueFrom.transformer, Orange.core.Discrete2Continuous):
                lr = lr.getValueFrom.variable
            attributes = data.domain.features[:]
            if lr in attributes:
                attributes.remove(lr)
            else:
                attributes.remove(lr.getValueFrom.variable)
            new_domain = Orange.data.Domain(attributes,
                data.domain.class_var)
            new_domain.addmetas(data.domain.getmetas())
            data = data.select(new_domain)
            lr = learner.fit_model(data, weight)
        return lr

LogRegLearner = deprecated_members({"removeSingular": "remove_singular",
                                    "weightID": "weight_id",
                                    "stepwiseLR": "stepwise_lr",
                                    "addCrit": "add_crit",
                                    "deleteCrit": "delete_crit",
                                    "numFeatures": "num_features",
                                    "removeMissing": "remove_missing"
                                    })(LogRegLearner)

[8042] | 181 | |

class UnivariateLogRegLearner(Orange.classification.Learner):
    """Fit a separate univariate logistic regression model for every
    feature and collect their coefficients and statistics into a single
    :obj:`UnivariateLogRegClassifier`.
    """

    def __new__(cls, data=None, **argkw):
        # Learner convention: constructing with data trains immediately.
        self = Orange.classification.Learner.__new__(cls, **argkw)
        if data:
            self.__init__(**argkw)
            return self.__call__(data)
        else:
            return self

    def __init__(self, **kwds):
        self.__dict__.update(kwds)

    @deprecated_keywords({"examples": "data"})
    def __call__(self, data):
        data = createFullNoDiscTable(data)
        # one single-feature model per feature
        classifiers = map(lambda x: LogRegLearner(Orange.core.Preprocessor_dropMissing(
            data.select(Orange.data.Domain(x,
            data.domain.class_var)))), data.domain.features)
        # intercept-only (majority) model supplies position 0 of each vector
        maj_classifier = LogRegLearner(Orange.core.Preprocessor_dropMissing
            (data.select(Orange.data.Domain(data.domain.class_var))))
        beta = [maj_classifier.beta[0]] + [x.beta[1] for x in classifiers]
        beta_se = [maj_classifier.beta_se[0]] + [x.beta_se[1] for x in classifiers]
        P = [maj_classifier.P[0]] + [x.P[1] for x in classifiers]
        wald_Z = [maj_classifier.wald_Z[0]] + [x.wald_Z[1] for x in classifiers]
        domain = data.domain

        # BUG FIX: this returned `Univariate_LogRegClassifier`, a name that
        # does not exist anywhere in the module (NameError at runtime); the
        # class defined below is `UnivariateLogRegClassifier`.
        return UnivariateLogRegClassifier(beta=beta, beta_se=beta_se, P=P,
                                          wald_Z=wald_Z, domain=domain)

209 | ||

class UnivariateLogRegClassifier(Orange.classification.Classifier):
    """Holder for univariate logistic regression coefficients and their
    statistics (``beta``, ``beta_se``, ``P``, ``wald_Z``); it cannot
    classify instances yet.
    """

    def __init__(self, **kwds):
        self.__dict__.update(kwds)

    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue):
        # Classification not implemented yet.  For now this class only
        # provides regression coefficients and their statistics.
        # BUG FIX: `raise NotImplemented` raised a TypeError because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError("classification not implemented yet")

[8042] | 218 | |

219 | ||

class LogRegLearnerGetPriors(object):
    """Fit a logistic regression model and additionally estimate, for each
    feature, the a-priori "zero point" contribution of an unknown value.
    Calling it returns the tuple ``(orig_model, betas_ap)``.
    """

    def __new__(cls, data=None, weight_id=0, **argkw):
        # Learner convention: constructing with data trains immediately.
        self = object.__new__(cls)
        if data:
            self.__init__(**argkw)
            return self.__call__(data, weight_id)
        else:
            return self

    @deprecated_keywords({"removeSingular": "remove_singular"})
    def __init__(self, remove_singular=0, **kwds):
        self.__dict__.update(kwds)
        self.remove_singular = remove_singular

    @deprecated_keywords({"examples": "data"})
    def __call__(self, data, weight=0):
        # next function changes data set to a extended with unknown values
        def createLogRegExampleTable(data, weight_id):
            # For every feature, build one extended table: the original rows
            # (full weight) plus a near-zero-weight copy in which that
            # feature is forced to an "unknown" value.
            sets_of_data = []
            for at in data.domain.features:
                # create a new example table new_data for each feature and
                # add a new indicator attribute to it
                if at.var_type == Orange.feature.Type.Continuous:
                    # indicator variable: 1 on rows where `at` is zeroed out
                    at_disc = Orange.feature.Continuous(at.name+ "Disc")
                    new_domain = Orange.data.Domain(data.domain.features+[at_disc,data.domain.class_var])
                    new_domain.addmetas(data.domain.getmetas())
                    new_data = Orange.data.Table(new_domain,data)
                    alt_data = Orange.data.Table(new_domain,data)
                    for i,d in enumerate(new_data):
                        d[at_disc] = 0
                        d[weight_id] = 1*data[i][weight_id]
                    for i,d in enumerate(alt_data):
                        d[at_disc] = 1
                        d[at] = 0
                        # tiny weight so the alternate rows barely影响 the fit
                        d[weight_id] = 0.000001*data[i][weight_id]
                elif at.var_type == Orange.feature.Type.Discrete:
                    # add one extra value to feature "at", named after the
                    # feature with an "X" suffix, to stand for "unknown"
                    at_new = Orange.feature.Discrete(at.name, values = at.values + [at.name+"X"])
                    new_domain = Orange.data.Domain(filter(lambda x: x!=at, data.domain.features)+[at_new,data.domain.class_var])
                    new_domain.addmetas(data.domain.getmetas())
                    new_data = Orange.data.Table(new_domain,data)
                    alt_data = Orange.data.Table(new_domain,data)
                    for i,d in enumerate(new_data):
                        d[at_new] = data[i][at]
                        d[weight_id] = 1*data[i][weight_id]
                    for i,d in enumerate(alt_data):
                        d[at_new] = at.name+"X"
                        d[weight_id] = 0.000001*data[i][weight_id]
                new_data.extend(alt_data)
                sets_of_data.append(new_data)
            return sets_of_data

        learner = LogRegLearner(imputer=Orange.feature.imputation.ImputerConstructor_average(),
            remove_singular = self.remove_singular)
        # get Original Model
        orig_model = learner(data, weight)
        if orig_model.fit_status:
            print "Warning: model did not converge"

        # get extended Model (you should not change data)
        if weight == 0:
            weight = Orange.feature.Descriptor.new_meta_id()
            data.addMetaAttribute(weight, 1.0)
        extended_set_of_examples = createLogRegExampleTable(data, weight)
        extended_models = [learner(extended_examples, weight) \
                           for extended_examples in extended_set_of_examples]

        # compute the deviations: sum the per-feature "unknown value" betas
        # (the last continuized feature of each extended model is the
        # indicator added above)
        beta = 0
        betas_ap = []
        for m in extended_models:
            beta_add = m.beta[m.continuized_domain.features[-1]]
            betas_ap.append(beta_add)
            beta = beta + beta_add

        # substract it from intercept
        #print "beta", beta
        logistic_prior = orig_model.beta[0]+beta

        # compare it to bayes prior
        bayes = Orange.classification.bayes.NaiveLearner(data)
        bayes_prior = math.log(bayes.distribution[1]/bayes.distribution[0])

        # error normalization - to avoid errors due to assumption of
        # independence of unknown values: positive and negative betas are
        # rescaled separately so their sum moves toward the bayes prior
        dif = bayes_prior - logistic_prior
        positives = sum(filter(lambda x: x>=0, betas_ap))
        negatives = -sum(filter(lambda x: x<0, betas_ap))
        if not negatives == 0:
            kPN = positives/negatives
            diffNegatives = dif/(1+kPN)
            diffPositives = kPN*diffNegatives
            kNegatives = (negatives-diffNegatives)/negatives
            kPositives = positives/(positives-diffPositives)

            # NOTE(review): the export lost indentation; this loop is kept
            # inside the `negatives != 0` branch, otherwise kNegatives /
            # kPositives would be undefined (NameError) -- confirm against
            # upstream history.
            for i,b in enumerate(betas_ap):
                if b<0: betas_ap[i]*=kNegatives
                else: betas_ap[i]*=kPositives

        # return the original model and the corresponding a-priori zero points
        return (orig_model, betas_ap)

LogRegLearnerGetPriors = deprecated_members({"removeSingular":
                                             "remove_singular"}
                                            )(LogRegLearnerGetPriors)

[8042] | 346 | |

class LogRegLearnerGetPriorsOneTable:
    """Variant of :obj:`LogRegLearnerGetPriors` that builds one single
    extended table instead of one table per feature.

    NOTE(review): this class is broken as written (see the inline notes on
    `newData` and `extended_models` below) and cannot run to completion.
    """

    @deprecated_keywords({"removeSingular": "remove_singular"})
    def __init__(self, remove_singular=0, **kwds):
        self.__dict__.update(kwds)
        self.remove_singular = remove_singular

    @deprecated_keywords({"examples": "data"})
    def __call__(self, data, weight=0):
        # next function changes data set to a extended with unknown values
        def createLogRegExampleTable(data, weightID):
            finalData = Orange.data.Table(data)
            orig_data = Orange.data.Table(data)
            for at in data.domain.features:
                # for each feature create a new example table newData and
                # add a new continuous indicator attribute to dataOrig,
                # dataFinal and newData
                if at.var_type == Orange.feature.Type.Continuous:
                    atDisc = Orange.feature.Continuous(at.name + "Disc")
                    newDomain = Orange.data.Domain(orig_data.domain.features+[atDisc,data.domain.class_var])
                    # NOTE(review): `newData` is not assigned yet on the
                    # first continuous feature -- UnboundLocalError here;
                    # presumably `orig_data.domain.getmetas()` was intended.
                    newDomain.addmetas(newData.domain.getmetas())
                    finalData = Orange.data.Table(newDomain,finalData)
                    newData = Orange.data.Table(newDomain,orig_data)
                    orig_data = Orange.data.Table(newDomain,orig_data)
                    for d in orig_data:
                        d[atDisc] = 0
                    for d in finalData:
                        d[atDisc] = 0
                    for i,d in enumerate(newData):
                        d[atDisc] = 1
                        d[at] = 0
                        d[weightID] = 100*data[i][weightID]

                elif at.var_type == Orange.feature.Type.Discrete:
                    # add one extra value to feature "at", named after the
                    # feature with an "X" suffix, to stand for "unknown"
                    at_new = Orange.feature.Discrete(at.name, values = at.values + [at.name+"X"])
                    newDomain = Orange.data.Domain(filter(lambda x: x!=at, orig_data.domain.features)+[at_new,orig_data.domain.class_var])
                    newDomain.addmetas(orig_data.domain.getmetas())
                    temp_finalData = Orange.data.Table(finalData)
                    finalData = Orange.data.Table(newDomain,finalData)
                    newData = Orange.data.Table(newDomain,orig_data)
                    temp_origData = Orange.data.Table(orig_data)
                    orig_data = Orange.data.Table(newDomain,orig_data)
                    for i,d in enumerate(orig_data):
                        d[at_new] = temp_origData[i][at]
                    for i,d in enumerate(finalData):
                        d[at_new] = temp_finalData[i][at]
                    for i,d in enumerate(newData):
                        d[at_new] = at.name+"X"
                        d[weightID] = 10*data[i][weightID]
                finalData.extend(newData)
            return finalData

        learner = LogRegLearner(imputer = Orange.feature.imputation.ImputerConstructor_average(), removeSingular = self.remove_singular)
        # get Original Model
        orig_model = learner(data,weight)

        # get extended Model (you should not change data)
        if weight == 0:
            weight = Orange.feature.Descriptor.new_meta_id()
            data.addMetaAttribute(weight, 1.0)
        extended_examples = createLogRegExampleTable(data, weight)
        extended_model = learner(extended_examples, weight)

        # compute the deviations: get sum of all betas
        beta = 0
        betas_ap = []
        # NOTE(review): `extended_models` is undefined in this method (only
        # a single `extended_model` is built above) -- NameError; this looks
        # like a copy-paste from LogRegLearnerGetPriors.__call__.
        for m in extended_models:
            beta_add = m.beta[m.continuized_domain.features[-1]]
            betas_ap.append(beta_add)
            beta = beta + beta_add

        # substract it from intercept
        #print "beta", beta
        logistic_prior = orig_model.beta[0]+beta

        # compare it to bayes prior
        bayes = Orange.classification.bayes.NaiveLearner(data)
        bayes_prior = math.log(bayes.distribution[1]/bayes.distribution[0])

        # normalize errors: single scale factor moving the logistic prior
        # toward the bayes prior
        k = (bayes_prior-orig_model.beta[0])/(logistic_prior-orig_model.beta[0])
        betas_ap = [k*x for x in betas_ap]

        # return the original model and the corresponding a-priori zero points
        return (orig_model, betas_ap)

LogRegLearnerGetPriorsOneTable = deprecated_members({"removeSingular":
                                                     "remove_singular"}
                                                    )(LogRegLearnerGetPriorsOneTable)

[8042] | 449 | |

450 | ||

451 | ###################################### | |

452 | #### Fitters for logistic regression (logreg) learner #### | |

453 | ###################################### | |

454 | ||

def pr(x, betas):
    """Logistic (sigmoid) probability for instance vector *x* under
    coefficient vector *betas*."""
    odds = math.exp(dot(x, betas))
    return odds / (1 + odds)

458 | ||

def lh(x, y, betas):
    """Log-likelihood of binary labels *y* given instances *x* and
    coefficients *betas*.

    Probabilities are clamped to [1e-6, 1 - 1e-6] before taking the log
    to avoid -inf.
    """
    llh = 0.0
    for i, x_i in enumerate(x):
        # BUG FIX: the local used to be named `pr`, shadowing the
        # module-level pr() function and causing an UnboundLocalError;
        # `log` was also unqualified (NameError) -- use math.log.
        p = pr(x_i, betas)
        llh += y[i] * math.log(max(p, 1e-6)) + (1 - y[i]) * math.log(max(1 - p, 1e-6))
    return llh

465 | ||

466 | ||

def diag(vector):
    """Return a square numpy matrix with *vector* on its diagonal and
    zeros elsewhere."""
    size = len(vector)
    mat = identity(size)
    for idx in range(size):
        mat[idx][idx] = vector[idx]
    return mat

472 | ||

class SimpleFitter(LogRegFitter):
    """Logistic regression fitter using Newton/IRLS-style updates with an
    optional ridge (L2) penalty.

    :param penalty: ridge penalty coefficient added to the Hessian diagonal.
    :param se_penalty: if true, scale the penalty per-coefficient by the
        standard errors estimated from an unpenalized initial weighting.
    """

    def __init__(self, penalty=0, se_penalty = False):
        self.penalty = penalty
        self.se_penalty = se_penalty

    def __call__(self, data, weight=0):
        # convert the table to a plain numeric matrix: discrete values
        # become their value indices
        ml = data.native(0)
        for i in range(len(data.domain.features)):
            a = data.domain.features[i]
            if a.var_type == Orange.feature.Type.Discrete:
                for m in ml:
                    m[i] = a.values.index(m[i])
        for m in ml:
            m[-1] = data.domain.class_var.values.index(m[-1])
        Xtmp = array(ml)
        y = Xtmp[:,-1]   # true probabilities (1's or 0's)
        one = reshape(array([1]*len(data)), (len(data),1)) # intercept column
        X=concatenate((one, Xtmp[:,:-1]),1)  # intercept first, then data

        betas = array([0.0] * (len(data.domain.features)+1))
        oldBetas = array([1.0] * (len(data.domain.features)+1))
        N = len(data)

        # per-coefficient penalty vector (ridge diagonal)
        pen_matrix = array([self.penalty] * (len(data.domain.features)+1))
        if self.se_penalty:
            # scale the penalty by each coefficient's standard error from
            # a weighted least-squares pass at betas = 0
            p = array([pr(X[i], betas) for i in range(len(data))])
            W = identity(len(data))
            pp = p * (1.0-p)
            for i in range(N):
                W[i,i] = pp[i]
            se = sqrt(diagonal(inv(dot(transpose(X), dot(W, X)))))
            # NOTE: the loop variable `p` shadows the probability vector
            # above; harmless only because p is recomputed before next use
            for i,p in enumerate(pen_matrix):
                pen_matrix[i] *= se[i]
        # predict the probability for an instance, x and betas are vectors
        # start the computation; iterate until the penalized log-likelihood
        # stabilizes
        likelihood = 0.
        likelihood_new = 1.
        while abs(likelihood - likelihood_new)>1e-5:
            likelihood = likelihood_new
            oldBetas = betas
            p = array([pr(X[i], betas) for i in range(len(data))])

            # W = diag(p * (1 - p)), the IRLS weight matrix
            W = identity(len(data))
            pp = p * (1.0-p)
            for i in range(N):
                W[i,i] = pp[i]

            WI = inv(W)
            z = dot(X, betas) + dot(WI, y - p)

            # penalized update step: (X'WX + diag(pen))^-1 X'(y - p)
            tmpA = inv(dot(transpose(X), dot(W, X))+diag(pen_matrix))
            tmpB = dot(transpose(X), y-p)
            betas = oldBetas + dot(tmpA,tmpB)
            likelihood_new = lh(X,y,betas)-self.penalty*sum([b*b for b in betas])
            print likelihood_new

        # standard errors from the inverse of the observed information
        # matrix at the fitted betas
        p = array([pr(X[i], betas) for i in range(len(data))])
        W = identity(len(data))
        pp = p * (1.0-p)
        for i in range(N):
            W[i,i] = pp[i]
        diXWX = sqrt(diagonal(inv(dot(transpose(X), dot(W, X)))))
        xTemp = dot(dot(inv(dot(transpose(X), dot(W, X))),transpose(X)),y)
        beta = []
        beta_se = []
        print "likelihood ridge", likelihood
        for i in range(len(betas)):
            beta.append(betas[i])
            beta_se.append(diXWX[i])
        return (self.OK, beta, beta_se, 0)

553 | ||

def pr_bx(bx):
    """Numerically-stable logistic function of a scalar linear term *bx*.

    Saturates to exactly 1 (resp. 0) when bx > 35 (resp. < -35) so the
    exponential cannot overflow.
    """
    if bx > 35:
        return 1
    if bx < -35:
        return 0
    # BUG FIX: `exp` was unqualified -- it is neither a builtin nor among
    # the names imported from numpy at the top of this module, so this
    # line raised NameError; use math.exp.
    return math.exp(bx) / (1 + math.exp(bx))

560 | ||

class BayesianFitter(LogRegFitter):
    """Logistic regression fitter with "anchor" examples and an L1 (tau)
    penalty, fitted by per-coordinate Newton steps with trust-region style
    step limiting.

    :param penalty: weight applied to the anchor examples' contribution.
    :param anch_examples: list of (id, examples) anchor example pairs.
    :param tau: L1 penalty coefficient on the non-intercept coefficients.
    """

    def __init__(self, penalty=0, anch_examples=[], tau = 0):
        # NOTE(review): mutable default `anch_examples=[]` is shared across
        # instances that do not pass it explicitly.
        self.penalty = penalty
        self.anch_examples = anch_examples
        self.tau = tau

    def create_array_data(self,data):
        """Convert an example table to a numeric (X, y) pair with an
        intercept column prepended to X."""
        if not len(data):
            return (array([]),array([]))
        # convert data to numeric: discrete values become value indices
        ml = data.native(0)
        for i,a in enumerate(data.domain.features):
            if a.var_type == Orange.feature.Type.Discrete:
                for m in ml:
                    m[i] = a.values.index(m[i])
        for m in ml:
            m[-1] = data.domain.class_var.values.index(m[-1])
        Xtmp = array(ml)
        y = Xtmp[:,-1]   # true probabilities (1's or 0's)
        one = reshape(array([1]*len(data)), (len(data),1)) # intercept column
        X=concatenate((one, Xtmp[:,:-1]),1)  # intercept first, then data
        return (X,y)

    def __call__(self, data, weight=0):
        (X,y)=self.create_array_data(data)

        # collect the anchor examples into one table in the same domain
        exTable = Orange.data.Table(data.domain)
        for id,ex in self.anch_examples:
            exTable.extend(Orange.data.Table(ex,data.domain))
        (X_anch,y_anch)=self.create_array_data(exTable)

        betas = array([0.0] * (len(data.domain.features)+1))

        likelihood,betas = self.estimate_beta(X,y,betas,[0]*(len(betas)),X_anch,y_anch)

        # get attribute groups atGroup = [(startIndex, number of values), ...)
        # features derived from the same original variable (via getValueFrom)
        # are grouped together
        ats = data.domain.features
        atVec=reduce(lambda x,y: x+[(y,not y==x[-1][0])], [a.getValueFrom and a.getValueFrom.whichVar or a for a in ats],[(ats[0].getValueFrom and ats[0].getValueFrom.whichVar or ats[0],0)])[1:]
        atGroup=[[0,0]]
        for v_i,v in enumerate(atVec):
            if v[1]==0: atGroup[-1][1]+=1
            else: atGroup.append([v_i,1])

        # compute zero values for attributes: refit with each group removed
        # and accumulate the intercept shifts
        sumB = 0.
        for ag in atGroup:
            X_temp = concatenate((X[:,:ag[0]+1],X[:,ag[0]+1+ag[1]:]),1)
            if X_anch:
                X_anch_temp = concatenate((X_anch[:,:ag[0]+1],X_anch[:,ag[0]+1+ag[1]:]),1)
            else: X_anch_temp = X_anch
            likelihood_temp,betas_temp=self.estimate_beta(X_temp,y,concatenate((betas[:ag[0]+1],betas[ag[0]+ag[1]+1:])),[0]+[1]*(len(betas)-1-ag[1]),X_anch_temp,y_anch)
            print "finBetas", betas, betas_temp
            print "betas", betas[0], betas_temp[0]
            sumB += betas[0]-betas_temp[0]
        apriori = Orange.statistics.distribution.Distribution(data.domain.class_var, data)
        aprioriProb = apriori[0]/apriori.abs

        print "koncni rezultat", sumB, math.log((1-aprioriProb)/aprioriProb), betas[0]

        # standard errors are not estimated by this fitter (reported as 0)
        beta = []
        beta_se = []
        print "likelihood2", likelihood
        for i in range(len(betas)):
            beta.append(betas[i])
            beta_se.append(0.0)
        return (self.OK, beta, beta_se, 0)

    def estimate_beta(self,X,y,betas,const_betas,X_anch,y_anch):
        """Coordinate-wise Newton optimization of the penalized likelihood.

        ``const_betas[j]`` truthy freezes coefficient j; ``max_delta``
        limits each coordinate step, shrinking when a step worsens the
        likelihood.  Returns ``(likelihood, betas)``.
        """
        N,N_anch = len(y),len(y_anch)
        # r, r_anch: current linear terms X.betas for data / anchor rows
        r,r_anch = array([dot(X[i], betas) for i in range(N)]),\
                   array([dot(X_anch[i], betas) for i in range(N_anch)])
        p = array([pr_bx(ri) for ri in r])
        X_sq = X*X

        max_delta = [1.]*len(const_betas)
        likelihood = -1.e+10
        likelihood_new = -1.e+9
        while abs(likelihood - likelihood_new)>0.01 and max(max_delta)>0.01:
            likelihood = likelihood_new
            print likelihood
            betas_temp = [b for b in betas]
            for j in range(len(betas)):
                if const_betas[j]: continue
                # first derivative of the log-likelihood w.r.t. beta_j
                dl = dot(X[:,j], transpose(y-p))
                for xi,x in enumerate(X_anch):
                    dl += self.penalty*x[j]*(y_anch[xi] - pr_bx(r_anch[xi]*self.penalty))

                # second derivative (observed information) w.r.t. beta_j
                ddl = dot(X_sq[:,j], transpose(p*(1-p)))
                for xi,x in enumerate(X_anch):
                    ddl += self.penalty*x[j]*pr_bx(r[xi]*self.penalty)*(1-pr_bx(r[xi]*self.penalty))

                if j==0:
                    # intercept: plain Newton step, no L1 penalty
                    dv = dl/max(ddl,1e-6)
                elif betas[j] == 0: # special handling due to non-defined first and second derivatives
                    dv = (dl-self.tau)/max(ddl,1e-6)
                    if dv < 0:
                        dv = (dl+self.tau)/max(ddl,1e-6)
                        if dv > 0:
                            dv = 0
                else:
                    # soft-thresholded step; do not let the coefficient
                    # cross zero
                    dl -= sign(betas[j])*self.tau
                    dv = dl/max(ddl,1e-6)
                    if not sign(betas[j] + dv) == sign(betas[j]):
                        dv = -betas[j]
                # clamp the step to the per-coordinate trust region
                dv = min(max(dv,-max_delta[j]),max_delta[j])
                r+= X[:,j]*dv
                p = array([pr_bx(ri) for ri in r])
                if N_anch:
                    r_anch+=X_anch[:,j]*dv
                betas[j] += dv
                max_delta[j] = max(2*abs(dv),max_delta[j]/2)
            likelihood_new = lh(X,y,betas)
            for xi,x in enumerate(X_anch):
                try:
                    # NOTE(review): `log` and `exp` are unqualified here and
                    # are not imported in this module, so this line raises
                    # NameError and the bare `except` below always falls
                    # back to the linear approximation -- confirm intent.
                    likelihood_new += y_anch[xi]*r_anch[xi]*self.penalty-log(1+exp(r_anch[xi]*self.penalty))
                except:
                    likelihood_new += r_anch[xi]*self.penalty*(y_anch[xi]-1)
            likelihood_new -= sum([abs(b) for b in betas[1:]])*self.tau
            if likelihood_new < likelihood:
                # step made things worse: shrink the trust region, restore
                # the previous betas and force another iteration
                max_delta = [md/4 for md in max_delta]
                likelihood_new = likelihood
                likelihood = likelihood_new + 1.
                betas = [b for b in betas_temp]
        print "betas", betas
        print "init_like", likelihood_new
        print "pure_like", lh(X,y,betas)
        return (likelihood,betas)

692 | ||

693 | ############################################################ | |

694 | # Feature subset selection for logistic regression | |

695 | ||

@deprecated_keywords({"examples": "data"})
def get_likelihood(fitter, data):
    """Fit `data` with `fitter` and return the model's log-likelihood.

    A large negative value (-100 * len(data)) is returned when the fit
    did not converge, or when the fitted coefficients are dominated by
    their standard errors (a sign of a degenerate model).
    """
    result = fitter(data)
    if result[0] not in [fitter.OK]: #, fitter.Infinity, fitter.Divergence]:
        return -100 * len(data)
    status, beta, beta_se, likelihood = result
    # Reject fits whose total coefficient magnitude is smaller than the
    # total magnitude of the standard errors.
    total_beta = sum([abs(b) for b in beta])
    total_se = sum([abs(b) for b in beta_se])
    if total_beta < total_se:
        return -100 * len(data)
    return likelihood

[8042] | 706 | |

707 | ||

708 | ||

class StepWiseFSS(object):
    """
    A learning algorithm for logistic regression that implements a
    stepwise feature subset selection as described in Applied Logistic
    Regression (Hosmer and Lemeshow, 2000).

    Each step of the algorithm is composed of two parts. The first is
    backward elimination in which the least significant variable in the
    model is removed if its p-value is above the prescribed threshold
    :obj:`delete_crit`. The second step is forward selection in which
    all variables are tested for addition to the model, and the one with
    the most significant contribution is added if the corresponding
    p-value is smaller than the prescribed :obj:`add_crit`. The
    algorithm stops when no more variables can be added or removed.

    The model can be additionally constrained by setting
    :obj:`num_features` to a non-negative value. The algorithm will then
    stop when the number of variables exceeds the given limit.

    Significances are assessed by the likelihood ratio chi-square
    test. Normal F test is not appropriate since the errors are assumed
    to follow a binomial distribution.

    The class constructor returns an instance of learning algorithm or,
    if given training data, a list of selected variables.

    :param table: training data.
    :type table: Orange.data.Table

    :param add_crit: threshold for adding a variable (default: 0.2)
    :type add_crit: float

    :param delete_crit: threshold for removing a variable
        (default: 0.3); should be higher than :obj:`add_crit`.
    :type delete_crit: float

    :param num_features: maximum number of selected features,
        use -1 for infinity.
    :type num_features: int
    :rtype: :obj:`StepWiseFSS` or list of features

    """

    def __new__(cls, data=None, **argkw):
        # If training data is supplied, run the selection immediately and
        # return the list of chosen features instead of a learner instance.
        self = object.__new__(cls)
        if data is not None:
            self.__init__(**argkw)
            return self.__call__(data)
        else:
            return self

    @deprecated_keywords({"addCrit": "add_crit", "deleteCrit": "delete_crit",
                          "numFeatures": "num_features"})
    def __init__(self, add_crit=0.2, delete_crit=0.3, num_features=-1, **kwds):
        # Extra keyword arguments (e.g. `imputer`, `removeMissing`) are kept
        # on the instance and consulted in __call__.
        self.__dict__.update(kwds)
        self.add_crit = add_crit
        self.delete_crit = delete_crit
        self.num_features = num_features

    def __call__(self, examples):
        """Run stepwise selection on `examples`; return the selected features."""
        # Optional preprocessing, enabled through constructor kwargs.
        if getattr(self, "imputer", 0):
            examples = self.imputer(examples)(examples)
        if getattr(self, "removeMissing", 0):
            examples = preprocess.DropMissing(examples)
        # Continuizer used to turn candidate domains into a purely numeric
        # representation accepted by the Cholesky fitter.
        continuizer = preprocess.DomainContinuizer(zeroBased=1,
            continuousTreatment=preprocess.DomainContinuizer.Leave,
            multinomialTreatment = preprocess.DomainContinuizer.FrequentIsBase,
            classTreatment = preprocess.DomainContinuizer.Ignore)
        attr = []                                    # currently selected features
        remain_attr = examples.domain.features[:]    # candidates not yet selected

        # Baseline log-likelihood of the empty (intercept-only) model.
        tempDomain = Orange.data.Domain(attr,examples.domain.class_var)
        tempData = Orange.core.Preprocessor_dropMissing(examples.select(tempDomain))

        ll_Old = get_likelihood(LogRegFitter_Cholesky(), tempData)
        ll_Best = -1000000
        length_Old = float(len(tempData))

        stop = 0
        while not stop:
            # Loop until all variables are added or neither deletion nor
            # addition of an attribute is possible.
            worstAt = None
            # Backward elimination only makes sense with >= 2 attributes.
            if len(attr) >= 2:
                minG = 1000
                worstAt = attr[0]
                ll_Best = ll_Old
                length_Best = length_Old
                for at in attr:
                    # Check each attribute: does its presence increase the
                    # log-likelihood enough to keep it?
                    tempAttr = filter(lambda x: x!=at, attr)
                    tempDomain = Orange.data.Domain(tempAttr,examples.domain.class_var)
                    tempDomain.addmetas(examples.domain.getmetas())
                    # Continuize the reduced domain and refit.
                    tempDomain = continuizer(Orange.core.Preprocessor_dropMissing(examples.select(tempDomain)))
                    tempData = Orange.core.Preprocessor_dropMissing(examples.select(tempDomain))

                    ll_Delete = get_likelihood(LogRegFitter_Cholesky(), tempData)
                    length_Delete = float(len(tempData))
                    # Likelihoods are normalized by data size because dropping
                    # missing values can change the number of usable rows.
                    length_Avg = (length_Delete + length_Old)/2.0

                    # Likelihood-ratio statistic G = -2 * (LL_without - LL_with).
                    G=-2*length_Avg*(ll_Delete/length_Delete-ll_Old/length_Old)

                    # Track the least significant (smallest G) attribute.
                    if G<minG:
                        worstAt = at
                        minG=G
                        ll_Best = ll_Delete
                        length_Best = length_Delete
                # Degrees of freedom: 1 for continuous attributes,
                # (number of values - 1) for discrete ones.
                if worstAt.var_type==Orange.feature.Type.Continuous:
                    P=lchisqprob(minG,1);
                else:
                    P=lchisqprob(minG,len(worstAt.values)-1);
                if P>=self.delete_crit:
                    # Not significant enough -- remove it from the model.
                    attr.remove(worstAt)
                    remain_attr.append(worstAt)
                    nodeletion=0
                    ll_Old = ll_Best
                    length_Old = length_Best
                else:
                    nodeletion=1
            else:
                nodeletion = 1
            # END OF DELETION PART

            # If enough attributes have been chosen, allow no further additions.
            if self.num_features>-1 and len(attr)>=self.num_features:
                remain_attr=[]

            # Forward selection: try adding each remaining attribute.
            maxG=-1
            ll_Best = ll_Old
            length_Best = length_Old
            bestAt = None
            for at in remain_attr:
                tempAttr = attr + [at]
                tempDomain = Orange.data.Domain(tempAttr,examples.domain.class_var)
                tempDomain.addmetas(examples.domain.getmetas())
                # Continuize the extended domain and refit.
                tempDomain = continuizer(Orange.core.Preprocessor_dropMissing(examples.select(tempDomain)))
                tempData = Orange.core.Preprocessor_dropMissing(examples.select(tempDomain))
                ll_New = get_likelihood(LogRegFitter_Cholesky(), tempData)

                length_New = float(len(tempData)) # get number of examples in tempData to normalize likelihood

                # P = PR(CHI^2 > G), G = -2(L(0)-L(1)) = 2(E(0)-E(1))
                length_avg = (length_New + length_Old)/2
                G=-2*length_avg*(ll_Old/length_Old-ll_New/length_New);
                if G>maxG:
                    bestAt = at
                    maxG=G
                    ll_Best = ll_New
                    length_Best = length_New
            if not bestAt:
                # Nothing left to add -- terminate.
                stop = 1
                continue

            if bestAt.var_type==Orange.feature.Type.Continuous:
                P=lchisqprob(maxG,1);
            else:
                P=lchisqprob(maxG,len(bestAt.values)-1);
            # Add the attribute with the smallest P to the selected set.
            if P<=self.add_crit:
                attr.append(bestAt)
                remain_attr.remove(bestAt)
                ll_Old = ll_Best
                length_Old = length_Best

            # Stop when no addition is significant and no deletion happened,
            # or when the algorithm oscillates (adds what it just removed).
            if (P>self.add_crit and nodeletion) or (bestAt == worstAt):
                stop = 1

        return attr

886 | ||

# Backwards-compatible aliases: expose the pre-2.5 camelCase attribute
# names (addCrit, deleteCrit, numFeatures) on StepWiseFSS instances.
StepWiseFSS = deprecated_members({"addCrit": "add_crit",
                                  "deleteCrit": "delete_crit",
                                  "numFeatures": "num_features"})(StepWiseFSS)

890 | ||

[8042] | 891 | |

class StepWiseFSSFilter(object):
    """Data filter that keeps only the features chosen by :obj:`StepWiseFSS`.

    If constructed with data, the filtered data table is returned
    immediately; otherwise a callable filter instance is returned.

    :param add_crit: threshold for adding a variable (default: 0.2).
    :param delete_crit: threshold for removing a variable (default: 0.3).
    :param num_features: maximum number of selected features, -1 for infinity.
    """

    def __new__(cls, data=None, **argkw):
        self = object.__new__(cls)
        # Explicit identity test (consistent with StepWiseFSS.__new__): an
        # empty-but-present data table must still be filtered rather than
        # silently yielding an uninitialized instance.
        if data is not None:
            self.__init__(**argkw)
            return self.__call__(data)
        else:
            return self

    @deprecated_keywords({"addCrit": "add_crit", "deleteCrit": "delete_crit",
                          "numFeatures": "num_features"})
    def __init__(self, add_crit=0.2, delete_crit=0.3, num_features=-1):
        # Thresholds are forwarded verbatim to StepWiseFSS in __call__.
        self.add_crit = add_crit
        self.delete_crit = delete_crit
        self.num_features = num_features

    @deprecated_keywords({"examples": "data"})
    def __call__(self, data):
        """Return `data` restricted to the stepwise-selected features."""
        attr = StepWiseFSS(data, add_crit=self.add_crit,
                           delete_crit=self.delete_crit,
                           num_features=self.num_features)
        return data.select(Orange.data.Domain(attr, data.domain.class_var))

[9818] | 913 | |

# Backwards-compatible aliases for the old camelCase attribute names.
StepWiseFSSFilter = deprecated_members({"addCrit": "add_crit",
                                        "deleteCrit": "delete_crit",
                                        "numFeatures": "num_features"})\
                    (StepWiseFSSFilter)

918 | ||

[8042] | 919 | |

920 | #################################### | |

921 | ## PROBABILITY CALCULATIONS | |

922 | ||

def lchisqprob(chisq, df):
    """
    Return the (1-tailed) probability value associated with the provided
    chi-square value and df. Adapted from chisq.c in Gary Perlman's |Stat.

    :param chisq: chi-square statistic; values <= 0 give probability 1.0.
    :param df: degrees of freedom; values < 1 give probability 1.0.
    """
    BIG = 20.0

    def ex(x):
        # exp() clipped to 0.0 for very negative arguments to avoid
        # underflow; uses the enclosing BIG (the redundant local
        # re-definition of BIG was removed).
        if x < -BIG:
            return 0.0
        else:
            return math.exp(x)

    if chisq <= 0 or df < 1:
        return 1.0
    a = 0.5 * chisq
    if df % 2 == 0:
        even = 1
    else:
        even = 0
    if df > 1:
        y = ex(-a)
    if even:
        s = y
    else:
        # Odd df: seed the series with the normal-tail term.
        s = 2.0 * zprob(-math.sqrt(chisq))
    if (df > 2):
        chisq = 0.5 * (df - 1.0)
        if even:
            z = 1.0
        else:
            z = 0.5
        if a > BIG:
            # Accumulate the series in log space to avoid overflow of
            # the factorial-like terms.
            if even:
                e = 0.0
            else:
                e = math.log(math.sqrt(math.pi))
            c = math.log(a)
            while (z <= chisq):
                e = math.log(z) + e
                s = s + ex(c * z - a - e)
                z = z + 1.0
            return s
        else:
            # Direct evaluation of the finite series.
            if even:
                e = 1.0
            else:
                e = 1.0 / math.sqrt(math.pi) / math.sqrt(a)
            c = 0.0
            while (z <= chisq):
                e = e * (a / float(z))
                c = c + e
                z = z + 1.0
            return (c * y + s)
    else:
        return s

979 | ||

980 | ||

def zprob(z):
    """
    Return the area under the normal curve 'to the left of' the given z value.
    Thus::

        for z<0, zprob(z) = 1-tail probability
        for z>0, 1.0-zprob(z) = 1-tail probability
        for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability

    Adapted from z.c in Gary Perlman's |Stat.
    """
    Z_MAX = 6.0  # maximum meaningful z-value; beyond it the tail is ~0
    if z == 0.0:
        area = 0.0
    else:
        half_abs = 0.5 * math.fabs(z)
        if half_abs >= (Z_MAX * 0.5):
            # |z| >= Z_MAX: the two-sided area is saturated at 1.
            area = 1.0
        elif half_abs < 1.0:
            # Small |z|: polynomial approximation in w = (|z|/2)^2.
            w = half_abs * half_abs
            area = ((((((((0.000124818987 * w
                - 0.001075204047) * w + 0.005198775019) * w
                - 0.019198292004) * w + 0.059054035642) * w
                - 0.151968751364) * w + 0.319152932694) * w
                - 0.531923007300) * w + 0.797884560593) * half_abs * 2.0
        else:
            # Larger |z|: polynomial approximation in t = |z|/2 - 2.
            t = half_abs - 2.0
            area = (((((((((((((-0.000045255659 * t
                + 0.000152529290) * t - 0.000019538132) * t
                - 0.000676904986) * t + 0.001390604284) * t
                - 0.000794620820) * t - 0.002034254874) * t
                + 0.006549791214) * t - 0.010557625006) * t
                + 0.011630447319) * t - 0.009279453341) * t
                + 0.005353579108) * t - 0.002141268741) * t
                + 0.000535310849) * t + 0.999936657524
    # Convert the symmetric two-sided area into a left-tail probability.
    return ((area + 1.0) * 0.5) if z > 0.0 else ((1.0 - area) * 0.5)

1021 | ||

[10677] | 1022 | |

1023 | """ | |

1024 | Logistic regression learner from LIBLINEAR | |

1025 | """ | |

1026 | ||

[11397] | 1027 | |

class LibLinearLogRegLearner(Orange.core.LinearLearner):
    """A logistic regression learner from `LIBLINEAR`_.

    Supports L2 (and, via ``L1R_LR``, L1) regularized learning.

    .. _`LIBLINEAR`: http://www.csie.ntu.edu.tw/~cjlin/liblinear/

    """

    # Solver-type constants re-exported from Orange.core.LinearLearner.
    L2R_LR = Orange.core.LinearLearner.L2R_LR
    L2R_LR_DUAL = Orange.core.LinearLearner.L2R_LR_Dual
    L1R_LR = Orange.core.LinearLearner.L1R_LR

    __new__ = Orange.utils._orange__new__(base=Orange.core.LinearLearner)

    def __init__(self, solver_type=L2R_LR, C=1.0, eps=0.01, normalization=True,
                 bias=-1.0, multinomial_treatment=DomainContinuizer.NValues,
                 **kwargs):
        """
        :param solver_type: One of the following class constants:
            ``L2R_LR``, ``L2R_LR_DUAL``, ``L1R_LR``.

        :param C: Regularization parameter (default 1.0). Higher values of
            C mean less regularization (C is a coefficient for the loss
            function).
        :type C: float

        :param eps: Stopping criteria (default 0.01)
        :type eps: float

        :param normalization: Normalize the input data prior to learning
            (default True)
        :type normalization: bool

        :param bias: If positive, use it as a bias (default -1).
        :type bias: float

        :param multinomial_treatment: Defines how to handle multinomial
            features for learning. It can be one of the
            :class:`~.DomainContinuizer` `multinomial_treatment`
            constants (default: `DomainContinuizer.NValues`).

        :type multinomial_treatment: int

        .. versionadded:: 2.6.1
            Added `multinomial_treatment`

        """
        self.solver_type = solver_type
        self.C = C
        self.eps = eps
        self.normalization = normalization
        self.bias = bias
        self.multinomial_treatment = multinomial_treatment

        # Any additional keyword arguments become instance attributes.
        for name, value in kwargs.items():
            setattr(self, name, value)

    def __call__(self, data, weight_id=None):
        """Train on `data` and return the fitted linear classifier."""
        if not isinstance(data.domain.class_var, Orange.feature.Discrete):
            raise TypeError("Can only learn a discrete class.")

        # Continuize discrete features and/or normalize continuous ones
        # before handing the data to the LIBLINEAR backend.
        if data.domain.has_discrete_attributes(False) or self.normalization:
            dc = DomainContinuizer()
            dc.multinomial_treatment = self.multinomial_treatment
            dc.class_treatment = dc.Ignore
            dc.continuous_treatment = \
                dc.NormalizeByVariance if self.normalization else dc.Leave
            c_domain = dc(data)
            data = data.translate(c_domain)
        return super(LibLinearLogRegLearner, self).__call__(data, weight_id)

**Note:**See TracBrowser for help on using the repository browser.