# source: orange-reliability/orangecontrib/reliability/__init__.py
# revision: 38:7aabb3dd2321 (checked in by markotoplak)

[0] | 1 | import Orange |

2 | ||

3 | import random | |

4 | from Orange import statc | |

5 | import math | |

6 | import warnings | |

7 | import numpy | |

8 | ||

9 | from collections import defaultdict | |

10 | from itertools import izip | |

11 | ||

# All the estimator method constants
# (integer ids used in Estimate.method and as keys of METHOD_NAME)
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
# NOTE: ids 11 and 12 have names in METHOD_NAME ("RF Variance", "RF Std")
# but no module-level constant here.
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14
ERR_ABSOLUTE = 15
STACKING = 101

# Type of estimator constant (Estimate.signed_or_absolute)
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods, keyed by method id
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
               101: "Stacking" }

[0] | 40 | |

def get_reliability_estimation_list(res, i):
    """Collect the i-th reliability estimate of every tested instance.

    Returns a tuple of (list of estimate values, the signed/absolute flag
    of that estimator, and its method id).
    """
    estimates = []
    for result in res.results:
        estimates.append(result.probabilities[0].reliability_estimate[i].estimate)
    first = res.results[0].probabilities[0].reliability_estimate[i]
    return estimates, first.signed_or_absolute, first.method

[0] | 45 | |

def get_prediction_error_list(res):
    """Signed prediction error (actual - predicted) for every tested instance."""
    errors = []
    for result in res.results:
        errors.append(result.actual_class - result.classes[0])
    return errors

48 | ||

def get_description_list(res, i):
    """Text description of the i-th reliability estimate for every instance."""
    return [r.probabilities[0].reliability_estimate[i].text_description
            for r in res.results]

51 | ||

def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
    wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of
    the used reliability estimates, together with the p-value of each
    coefficient.
    """
    prediction_error = get_prediction_error_list(res)
    # absolute estimators correlate with the magnitude of the error
    absolute_error = [abs(pe) for pe in prediction_error]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)

    results = []
    for i in xrange(n_estimates):
        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        errors = prediction_error if signed_or_absolute == SIGNED else absolute_error
        try:
            r, p = statc.pearsonr(errors, reliability_estimate)
        except Exception:
            # degenerate input (e.g. constant estimates) - report NaN
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results

75 | ||

def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
    wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of
    the used reliability estimates, together with the p-value of each
    coefficient.
    """
    prediction_error = get_prediction_error_list(res)
    # absolute estimators correlate with the magnitude of the error
    absolute_error = [abs(pe) for pe in prediction_error]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)

    results = []
    for i in xrange(n_estimates):
        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        errors = prediction_error if signed_or_absolute == SIGNED else absolute_error
        try:
            r, p = statc.spearmanr(errors, reliability_estimate)
        except Exception:
            # degenerate input (e.g. constant estimates) - report NaN
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results

99 | ||

def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
    wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return average Pearson's coefficient over all folds between prediction error
    and each of the used estimates.
    """
    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    number_of_instances = len(res.results)
    number_of_folds = len(results_by_fold)
    # per-estimator accumulators: summed r, signed/absolute flag, method id
    results = [0 for _ in xrange(number_of_estimates)]
    sig = [0 for _ in xrange(number_of_estimates)]
    method_list = [0 for _ in xrange(number_of_estimates)]

    # NOTE: `res` is deliberately rebound to each fold's results below,
    # shadowing the parameter (it is not needed afterwards).
    for res in results_by_fold:
        prediction_error = get_prediction_error_list(res)
        for i in xrange(number_of_estimates):
            reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            try:
                # signed estimators correlate with the signed error,
                # absolute estimators with its magnitude
                if signed_or_absolute == SIGNED:
                    r, _ = statc.pearsonr(prediction_error, reliability_estimate)
                else:
                    r, _ = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
            except Exception:
                r = float("NaN")
            results[i] += r
            sig[i] = signed_or_absolute
            method_list[i] = method

    # Average r over folds, then derive p-values from the averaged r
    results = [float(res) / number_of_folds for res in results]
    ps = [p_value_from_r(r, number_of_instances) for r in results]

    return zip(results, ps, sig, method_list)

137 | ||

def p_value_from_r(r, n):
    """
    Calculate the p-value from the Pearson coefficient and the sample size.
    """
    df = n - 2
    # t statistic of r; the 1e-30 terms guard against a division by zero
    # when |r| is exactly 1
    denominator = (-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30)
    t = r * (df / denominator) ** 0.5
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))

145 | ||

[5] | 146 | |

# Distances between two discrete probability distributions
#TODO Document those.
def normalize_both(p, q):
    """Normalize distributions p and q in place (when needed) and return them."""
    for dist in (p, q):
        if not dist.normalized:
            dist.normalize()
    return p, q

155 | ||

def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order m between distributions p and q."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)

162 | ||

def manhattan_distance(p, q):
    """City-block distance: Minkowski distance of order 1."""
    return minkowsky_dist(p, q, 1)

165 | ||

def euclidean_dist(p, q):
    """Euclidean distance: Minkowski distance of order 2."""
    return minkowsky_dist(p, q, 2)

168 | ||

def variance_dist(p, q):
    """Squared Euclidean distance between p and q."""
    d = euclidean_dist(p, q)
    return d ** 2

171 | ||

def max_dist(p, q):
    """Chebyshev distance: the largest per-component difference."""
    p, q = normalize_both(p, q)
    return max(abs(p[i] - q[i]) for i in range(len(p)))

175 | ||

def hellinger_dist(p, q):
    """Sum of squared differences of the square roots of the components
    of p and q (squared Hellinger distance without the 1/2 factor)."""
    p, q = normalize_both(p, q)
    total = 0
    for i in range(len(p)):
        delta = math.sqrt(p[i]) - math.sqrt(q[i])
        total += delta ** 2
    return total

182 | ||

def my_log(x):
    """Return x*log(x), using the 0*log(0) = 0 convention."""
    if x == 0:
        return 0
    return x * math.log(x)

185 | ||

def kullback_leibler(p, q):
    """Kullback-Leibler divergence between discrete distributions p and q.

    :math:`KL(p||q) = \\sum_i p_i \\log(p_i / q_i)`, with the convention
    :math:`0 \\log 0 = 0`.

    Bug fix: the original accumulated ``my_log(p[i] - q[i])``, i.e.
    ``(p-q)*log(p-q)``, which is not the KL divergence and raised a math
    domain error whenever ``p[i] < q[i]``.
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        if p[i] > 0:
            # q[i] == 0 with p[i] > 0 makes the divergence infinite
            dist += float("inf") if q[i] == 0 else p[i] * math.log(p[i] / q[i])
    return dist

192 | ||

def cosine(p, q):
    """Cosine distance (1 - cosine similarity) between distributions p and q.

    Bug fix: the original called ``numpy.dot(x, y)`` with undefined names
    ``x`` and ``y``, raising NameError on every call; the materialized
    lists ``p`` and ``q`` are the intended operands.
    """
    p, q = normalize_both(p, q)
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))

197 | ||

198 | ||

class Estimate:
    """
    Reliability estimate. Contains attributes that describe the results of
    reliability estimation.

    .. attribute:: estimate

        A numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Determines whether the method used gives a signed or absolute result.
        Has a value of either :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        An integer ID of reliability estimation method used.

    .. attribute:: method_name

        Name (string) of reliability estimation method used.

    .. attribute:: icv_method

        An integer ID of reliability estimation method that performed best,
        as determined by ICV, and of which estimate is stored in the
        :obj:`estimate` field. (:obj:`None` when ICV was not used.)

    .. attribute:: icv_method_name

        Name (string) of reliability estimation method that performed best,
        as determined by ICV. (:obj:`None` when ICV was not used.)

    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method= -1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        # human-readable name resolved via the module-level METHOD_NAME map
        self.method_name = METHOD_NAME[method]
        # -1 is the sentinel meaning "ICV was not used"
        self.icv_method = icv_method
        self.icv_method_name = METHOD_NAME[icv_method] if icv_method != -1 else ""
        # filled in later, e.g. by DescriptiveAnalysisClassifier
        self.text_description = None

241 | ||

class DescriptiveAnalysis:
    """Wrap a reliability estimator and attach textual descriptions to its
    numerical estimates.

    Borders between the description bands are computed with cross
    validation on the training data.

    :param estimator: the reliability estimation learner to wrap.
    :param desc: textual labels of the bands (defaults to
        ``["high", "medium", "low"]``).
    :param procentage: lower percentile bound of each band, same length as
        ``desc`` (defaults to ``[0.00, 0.33, 0.66]``).
    :param name: learner name.
    """
    def __init__(self, estimator, desc=None, procentage=None, name="da"):
        # Bug fix: the defaults were mutable list literals shared between
        # all instances; build a fresh list per instance instead.
        self.desc = desc if desc is not None else ["high", "medium", "low"]
        self.procentage = procentage if procentage is not None else [0.00, 0.33, 0.66]
        self.estimator = estimator
        self.name = name

    def __call__(self, instances, weight=None, **kwds):

        # Calculate borders using cross validation
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            sorted_estimates = sorted(abs(x) for x in estimates)
            # the estimate value sitting at each requested percentile
            borders = [sorted_estimates[int(len(estimates) * p) - 1] for p in self.procentage]
            all_borders.append(borders)

        # Learn on whole train data
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)

264 | ||

class DescriptiveAnalysisClassifier:
    """Classifier wrapper that labels each reliability estimate with a
    textual description, based on precomputed borders."""

    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # Give each estimate the label of the highest border it reaches,
        # falling back to the first description.
        for border_list, est in zip(self.all_borders, probabilities.reliability_estimate):
            est.text_description = self.desc[0]
            for low, label in zip(border_list, self.desc):
                if est.estimate >= low:
                    est.text_description = label

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities

287 | ||

class SensitivityAnalysis:
    """

    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats

    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`

    To estimate the reliability of prediction for given instance,
    the learning set is extended with this instance, labeled with
    :math:`K + \epsilon (l_{max} - l_{min})`,
    where :math:`K` denotes the initial prediction,
    :math:`\epsilon` is sensitivity parameter and :math:`l_{min}` and
    :math:`l_{max}` denote lower and the upper bound of the learning
    instances' labels. After computing different sensitivity predictions
    using different values of :math:`\epsilon`, the prediction are combined
    into SAvar and SAbias. SAbias can be used in a signed or absolute form.

    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`

    """
    def __init__(self, e=None, name="sa"):
        # Bug fix: the default was a mutable list literal shared between
        # all instances; build a fresh list per instance instead.
        self.e = e if e is not None else [0.01, 0.1, 0.5, 1.0, 2.0]
        self.name = name

    def __call__(self, instances, learner):
        # single scan for the label range (l_min, l_max)
        min_value = max_value = instances[0].getclass().value
        for ex in instances:
            value = ex.getclass().value
            if value > max_value:
                max_value = value
            if value < min_value:
                min_value = value
        return SensitivityAnalysisClassifier(self.e, instances, min_value, max_value, learner)

325 | ||

class SensitivityAnalysisClassifier:
    # Computes the SAvar and SAbias reliability estimates by retraining the
    # learner on the training data extended with the predicted instance,
    # relabeled with perturbed predictions (see SensitivityAnalysis).
    def __init__(self, e, instances, min_value, max_value, learner):
        self.e = e
        self.instances = instances
        self.max_value = max_value
        self.min_value = min_value
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        # Create new dataset (a copy, so the original data is untouched)
        r_data = Orange.data.Table(self.instances)

        # Create new instance
        modified_instance = Orange.data.Instance(instance)

        # Append it to the data
        r_data.append(modified_instance)

        # Calculate SAvar & SAbias
        SAvar = SAbias = 0

        for eps in self.e:
            # +epsilon: relabel the appended instance above the prediction
            # and retrain
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # -epsilon: relabel below the prediction and retrain
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # calculate part SAvar and SAbias: spread of the two perturbed
            # predictions, and their shift relative to the original one
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]

371 | ||

[10] | 372 | |

373 | ||

class ReferenceExpectedError:
    """

    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`

    Reference reliability estimation method for classification, as used in
    Evaluating Reliability of Single Classifications of Neural Networks,
    Darko Pevec, 2011.

    :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`

    where :math:`\hat y` is the estimated probability of the predicted class.

    Note that for this method, in contrast with all others, a greater
    estimate means lower reliability (greater expected error).

    """
    def __init__(self, name="reference"):
        self.name = name

    def __call__(self, instances, learner):
        # train once on the full data; the classifier supplies the
        # class-probability estimates
        return ReferenceExpectedErrorClassifier(learner(instances))

396 | ||

397 | ||

class ReferenceExpectedErrorClassifier:
    """Returns the reference expected error 2*y*(1-y), where y is the
    probability of the most probable class."""

    def __init__(self, classifier):
        self.classifier = classifier

    def __call__(self, instance, *args):
        # probability assigned to the predicted (most probable) class
        prob = max(self.classifier(instance, Orange.classification.Classifier.GetProbabilities))
        return [Estimate(2 * prob * (1 - prob), ABSOLUTE, ERR_ABSOLUTE)]

406 | ||

407 | ||

class BaggingVariance:
    """

    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`

    :math:`m` different bagging models are constructed and used to estimate
    the value of dependent variable for a given instance. In regression,
    the variance of those predictions is used as a prediction reliability
    estimate.

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
    predictions of individual constructed models. Note that a greater value
    implies greater error.

    For classification, 1 minus the average Euclidean distance between class
    probability distributions predicted by the model, and distributions
    predicted by the individual bagged models, is used as the BAGV reliability
    measure. Note that in this case a greater value implies a better
    prediction.

    This reliability measure can run out of memory fast if individual classifiers
    use a lot of memory, as it build m of them, thereby using :math:`m` times memory
    for a single classifier. If instances for measuring predictions
    are given as a parameter, this class can only compute their reliability,
    which allows less memory use.

    """
    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
        """
        for_instances: optional instances whose reliability will be
        precomputed during learning, so bagged models need not be kept.
        """
        self.m = m
        self.name = name
        # sampling-with-replacement index generator, seeded for repeatability
        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
        self.for_instances = for_instances

    def __call__(self, instances, learner):
        classifiers = []

        # for classification a reference classifier on the full data is
        # needed to compare distributions against (see _bagged_value)
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        # precomputed bagged values per hashable instance key
        for_inst_class = defaultdict(list)
        this_iteration = None

        if self.for_instances:
            his = map(_hashable_instance, self.for_instances)

        # Create bagged classifiers using sampling with replacement
        for i in xrange(self.m):
            # guards against adding duplicate for_instances twice per model
            this_iteration = set()
            selection = self.select_with_repeat(len(instances))
            data = instances.select(selection)
            cl = learner(data)
            if cl:
                if self.for_instances:  # predict reliability for testing instances and throw cl away
                    for instance, hi in zip(self.for_instances, his):
                        if hi not in this_iteration:
                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
                            this_iteration.add(hi)
                else:
                    classifiers.append(cl)

        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))

[0] | 480 | |

class BaggingVarianceClassifier:
    """Computes the BAGV reliability estimate from bagged model predictions."""

    def __init__(self, classifiers, classifier=None, for_inst_class=None):
        self.classifiers = classifiers
        self.classifier = classifier
        self.for_inst_class = for_inst_class

    def __call__(self, instance, *args):
        # Per-model bagged values: either precomputed at learning time
        # (for_instances mode) or computed now from the stored classifiers.
        if self.for_inst_class:
            bagged_values = self.for_inst_class[_hashable_instance(instance)]
        else:
            bagged_values = [_bagged_value(instance, c, self.classifier)
                             for c in self.classifiers]

        mean = sum(bagged_values) / len(bagged_values)
        BAGV = sum((v - mean) ** 2 for v in bagged_values) / len(bagged_values)

        # for discrete classes a greater value means a better prediction,
        # so the scale is flipped
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]

503 | ||

[37] | 504 | def _hashable_instance(instance): |

505 | return tuple(instance[i].value for i in range(len(instance.domain.attributes))) | |

506 | ||

def _bagged_value(instance, c, classifier):
    """Value contributed by one bagged model `c` for this instance.

    Regression: the model's numeric prediction.  Classification: Euclidean
    distance between the bagged model's class distribution and the
    reference `classifier`'s distribution.
    """
    var_type = instance.domain.class_var.var_type
    if var_type == Orange.feature.Descriptor.Continuous:
        return c(instance, Orange.core.GetValue).value
    if var_type == Orange.feature.Descriptor.Discrete:
        reference = classifier(instance, Orange.core.GetProbabilities)
        return euclidean_dist(c(instance, Orange.core.GetProbabilities), reference)

513 | ||

514 | ||

class LocalCrossValidation:
    """

    :param k: Number of nearest neighbours used in LCV estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification reliability estimation,
        use an average distance between distributions, weighted by :math:`e^{-d}`,
        where :math:`d` is the distance between predicted instance and the
        neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    :math:`k` nearest neighbours to the given instance are found and put in
    a separate data set. On this data set, a leave-one-out validation is
    performed. Reliability estimate for regression is then the distance
    weighted absolute prediction error. In classification, 1 minus the average
    distance between the predicted class probability distribution and the
    (trivial) probability distributions of the nearest neighbour.

    If a special value 0 is passed as :math:`k` (as is by default),
    it is set as 1/20 of data set size (or 5, whichever is greater).

    Summary of the algorithm for regression:

    1. Determine the set of k nearest neighours :math:`N = { (x_1, c_1),...,
       (x_k, c_k)}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # Bug fix: the original overwrote self.k here, so the learner kept
        # the k computed for the first data set on all later calls; use a
        # local value instead.
        k = self.k if self.k != 0 else max(5, len(instances) / 20)

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)

[0] | 570 | |

class LocalCrossValidationClassifier:
    # Performs leave-one-out on the k nearest neighbours of the predicted
    # instance and aggregates the per-neighbour errors into the LCV estimate
    # (see LocalCrossValidation).
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        self.distance_id = distance_id
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        # copies the extra options (distance, distance_weighted) onto self
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        LCVer = 0  # weighted error sum
        LCVdi = 0  # weight sum

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # regression: absolute prediction error on the held-out neighbour
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # classification: distance between the predicted distribution
                # and the neighbour's trivial (one-hot) class distribution
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # closer neighbours get exponentially larger weight
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # for classification a greater value means a better prediction
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]

616 | ||

class CNeighbours:
    """

    :param k: Number of nearest neighbours used in CNK estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`

    For regression, CNK is the difference between the average label of the
    instance's nearest neighbours and its prediction; it can be used in a
    signed or absolute form.

    :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`

    where :math:`k` is the number of neighbours, :math:`C_i` the neighbours'
    labels and :math:`K` the instance's prediction. A greater value implies
    greater prediction error.

    For classification, CNK equals 1 minus the average distance between the
    predicted class distribution and the (trivial) class distributions of
    the :math:`k` nearest neighbours from the learning set; there a greater
    value implies better prediction.

    """
    def __init__(self, k=5, distance=hellinger_dist, name="cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        # nearest-neighbour finder over the training data (Euclidean
        # distance); neighbour distances land in a fresh meta attribute
        finder_constructor = Orange.classification.knn.FindNearestConstructor()
        finder_constructor.distanceConstructor = Orange.distance.Euclidean()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        finder = finder_constructor(instances, 0, meta_id)
        return CNeighboursClassifier(finder, self.k, distance=self.distance)

[0] | 658 | |

class CNeighboursClassifier:
    """Computes the CNK reliability estimate(s) (see CNeighbours)."""

    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        CNK = 0

        # Find k nearest neighbors
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Bug fix: the original branched on `ex.domain...`, where `ex` was
        # the leaked list-comprehension variable -- that only worked by
        # accident in Python 2 and raises NameError in Python 3; the class
        # variable of the predicted instance is what is meant.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # signed difference between the neighbours' average label and
            # the prediction
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # 1 - average distance between the predicted distribution and
            # the distributions a kNN model predicts for the neighbours
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]

[0] | 690 | |

class Mahalanobis:
    """

    :param k: Number of nearest neighbours used in Mahalanobis estimate.
    :type k: int

    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`

    The reliability estimate is the sum of
    `mahalanobis distances <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    to the evaluated instance's :math:`k` nearest neighbours.

    """
    def __init__(self, k=3, name="mahalanobis"):
        self.k = k
        self.name = name

    def __call__(self, instances, *args):
        # nearest-neighbour finder using Mahalanobis distance; per-neighbour
        # distances are written to a fresh meta attribute
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Mahalanobis()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        return MahalanobisClassifier(self.k, finder(instances, 0, meta_id), meta_id)

716 | ||

class MahalanobisClassifier:
    """Sums the Mahalanobis distances to the k nearest neighbours."""

    def __init__(self, k, nnm, mid):
        self.k = k
        self.nnm = nnm  # nearest-neighbour finder
        self.mid = mid  # meta id under which neighbour distances are stored

    def __call__(self, instance, *args):
        total = 0
        for ex in self.nnm(instance, self.k):
            total += ex[self.mid].value

        return [ Estimate(total, ABSOLUTE, MAHAL_ABSOLUTE) ]

729 | ||

class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`

    The reliability estimate is the
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.

    """
    def __init__(self, name="mahalanobis to center"):
        self.name = name

    def __call__(self, instances, *args):
        # continuize the domain so the centroid can be computed numerically
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = dc(instances)
        new_instances = instances.translate(new_domain)

        # centroid: column-wise average of the numeric data matrix
        X, _, _ = new_instances.to_numpy()
        centroid = numpy.average(X, 0)

        # fit the Mahalanobis metric on the continuized data
        distance = Orange.distance.Mahalanobis()(new_instances)

        # centroid as an instance with an unknown class value
        average_instance = Orange.data.Instance(new_instances.domain, list(centroid) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)

761 | ||

class MahalanobisToCenterClassifier:
    """Estimates reliability as the Mahalanobis distance between an
    instance and the centroid of the (continuized) training data."""

    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance
        self.average_instance = average_instance
        self.new_domain = new_domain

    def __call__(self, instance, *args):
        # Translate the query instance into the continuized domain before
        # measuring its distance to the stored centroid.
        translated = Orange.data.Instance(self.new_domain, instance)
        center_distance = self.distance(translated, self.average_instance)
        return [Estimate(center_distance, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE)]

775 | ||

776 | ||

class BaggingVarianceCNeighbours:
    """
    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`

    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`

    :param name: Name of this estimator.
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`

    BVCK is a combination (average) of Bagging variance and local modeling of
    prediction error.
    """
    def __init__(self, bagv=None, cnk=None, name="bvck"):
        if bagv is None:
            bagv = BaggingVariance()
        if cnk is None:
            cnk = CNeighbours()
        self.bagv = bagv
        self.cnk = cnk
        # BUG FIX: the `name` argument was previously ignored and the
        # attribute was always hard-coded to "bvck".
        self.name = name

    def __call__(self, instances, learner):
        # Train both underlying estimators on the same data and learner.
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)

805 | ||

class BaggingVarianceCNeighboursClassifier:
    """Averages the BAGV estimate with the absolute CNK estimate into a
    single BVCK reliability value, followed by the individual estimates
    of both underlying methods."""

    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # NOTE(review): index 1 presumably selects the absolute CNK variant
        # (index 0 being the signed one) -- this holds for regression; for
        # classification CNK returns a single estimate, so verify callers.
        combined = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        result = [Estimate(combined, ABSOLUTE, BVCK_ABSOLUTE)]
        result.extend(bagv_estimates)
        result.extend(cnk_estimates)
        return result

820 | ||

class ErrorPredicting:
    """Reliability estimator that models the wrapped learner's signed
    cross-validated prediction error with a random forest."""

    def __init__(self, name = "ep"):
        self.name = name

    def __call__(self, instances, learner):
        # Signed prediction errors of the learner, obtained by cross-validation.
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Same attributes as the input data, but with the prediction
        # error ("pe") as a continuous class variable.
        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        new_dataset = Orange.data.Table(new_domain, instances)

        # assumes cross_validation keeps results in instance order so the
        # errors line up with the instances -- TODO confirm
        for instance, prediction_error in izip(new_dataset, prediction_errors):
            instance.set_class(prediction_error)

        # Model the error as a function of the attributes.
        rf = Orange.ensemble.forest.RandomForestLearner()
        rf_classifier = rf(new_dataset)

        return ErrorPredictingClassification(rf_classifier, new_domain)

839 | ||

class ErrorPredictingClassification:
    """Predicts the error of a new instance using the random forest
    trained on cross-validated prediction errors."""

    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier
        self.new_domain = new_domain

    def __call__(self, instance, predicted, probabilities):
        # Map the instance into the error-prediction domain, then query
        # the forest for its predicted error.
        translated = Orange.data.Instance(self.new_domain, instance)
        predicted_error = self.rf_classifier(translated, Orange.core.GetValue)

        # NOTE(review): the estimate is tagged with SABIAS_SIGNED rather
        # than a dedicated error-prediction constant -- confirm intent.
        return [Estimate(predicted_error.value, SIGNED, SABIAS_SIGNED)]

850 | ||

def gauss_kernel(x, sigma=1):
    """Gaussian (normal) density with standard deviation *sigma*, evaluated at *x*."""
    normalization = 1. / (sigma * math.sqrt(2 * math.pi))
    exponent = -0.5 * (x / sigma) ** 2
    return normalization * math.exp(exponent)

853 | ||

class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Returns a value that estimates a density of problem space around the
    instance being predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=None, name="density"):
        # BUG FIX: the previous default `d_measure=Orange.distance.Euclidean()`
        # was evaluated once at class-definition time and shared across all
        # ParzenWindowDensityBased instances; build a fresh one per instance.
        if d_measure is None:
            d_measure = Orange.distance.Euclidean()
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances, learner):

        self.distance = self.d_measure(instances)

        def density(x):
            # Mean kernel-weighted distance from x to every training instance.
            total = 0.
            for ex in instances:
                total += self.K(self.distance(x, ex))
            return total / len(instances)

        # Density at the densest training point, used as the reference
        # value by the classifier.
        max_density = max(density(ex) for ex in instances)

        return ParzenWindowDensityBasedClassifier(density, max_density)

885 | ||

class ParzenWindowDensityBasedClassifier:
    """Turns a density function into a reliability estimate: instances in
    sparser regions of the problem space receive a larger estimate."""

    def __init__(self, density, max_density):
        self.density = density
        self.max_density = max_density

    def __call__(self, instance, *args):
        # Distance from the densest training point's density; larger means
        # the instance lies in a less populated region.
        sparseness = self.max_density - self.density(instance)
        return [Estimate(sparseness, ABSOLUTE, DENS_ABSOLUTE)]

898 | ||

class Stacking:
    """Learn a stacking meta-model that maps the reliability estimates of
    several basic estimators to the observed absolute prediction error.

    :param stack_learner: learner used to induce the meta-model on the
        cross-validated reliability estimates.
    :param estimators: list of reliability estimators; defaults to
        SensitivityAnalysis, LocalCrossValidation, BVCK, Mahalanobis and
        Mahalanobis-to-center.
    :param folds: number of internal cross-validation folds used to build
        (estimates, error) training pairs; with ``folds <= 1`` a half/half
        split is used instead.
    :param save_data: if True, keep the meta-model training table in
        ``self.classifier_data``.
    """

    def __init__(self, stack_learner, estimators=None, folds=10, save_data=False):
        self.stack_learner = stack_learner
        self.estimators = estimators
        self.folds = folds
        self.save_data = save_data
        # Build the default battery here (not as a default argument) so
        # instances do not share estimator objects.
        if self.estimators is None:
            self.estimators = [SensitivityAnalysis(),
                               LocalCrossValidation(),
                               BaggingVarianceCNeighbours(),
                               Mahalanobis(),
                               MahalanobisToCenter()]

    def __call__(self, data, learner):

        newfeatures = None

        if self.folds > 1:

            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
            data_cv = [None] * len(data)
            for f in set(cvi):  # for each fold
                learn = data.select(cvi, f, negate=True)
                test = data.select(cvi, f)

                # learn reliability estimates for the learning set
                lf = Learner(learner, estimators=self.estimators)(learn)

                # pos is used to retain the order of instances
                for ex, pos in zip(test, [i for i, n in enumerate(cvi) if n == f]):
                    pred = lf(ex, Orange.core.GetBoth)
                    re = pred[1].reliability_estimate
                    names = [e.method_name for e in re]
                    # all instances must yield estimates in the same order
                    assert newfeatures is None or names == newfeatures
                    newfeatures = names
                    estimates = [abs(e.estimate) for e in re]
                    error = ex[-1].value - pred[0].value
                    data_cv[pos] = estimates + [abs(error)]

        else:

            # use half of the data to learn reliability estimates
            # and the other half for induction of a stacking classifier
            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
            data_cv = []

            learn = data.select(cvi, 0, negate=True)
            test = data.select(cvi, 0)

            # learn reliability estimates for the learning set
            lf = Learner(learner, estimators=self.estimators)(learn)

            for ex in test:
                pred = lf(ex, Orange.core.GetBoth)
                re = pred[1].reliability_estimate
                names = [e.method_name for e in re]
                assert newfeatures is None or names == newfeatures
                newfeatures = names
                estimates = [abs(e.estimate) for e in re]
                error = ex[-1].value - pred[0].value
                data_cv.append(estimates + [abs(error)])

        # BUG FIX: removed leftover debug output
        # (`print "DCV", len(data_cv)`) that polluted stdout.

        lf = None

        # induce the classifier on cross-validated reliability estimates
        newfeatures = [Orange.feature.Continuous(name=n) for n in newfeatures]
        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
        classifier_data = Orange.data.Table(newdomain, data_cv)
        stack_classifier = self.stack_learner(classifier_data)

        # induce reliability estimates on the whole data set
        lf = Learner(learner, estimators=self.estimators)(data)

        if self.save_data:
            self.classifier_data = classifier_data

        return StackingClassifier(stack_classifier, lf, newdomain)

979 | ||

980 | ||

class StackingClassifier:
    """Predicts reliability by mapping an instance's reliability estimates
    through the stacking meta-model."""

    def __init__(self, stacking_classifier, reliability_classifier, domain):
        # BUG FIX: removed leftover debug output
        # (`print self.stacking_classifier`) from the constructor.
        self.stacking_classifier = stacking_classifier
        self.domain = domain
        self.reliability_classifier = reliability_classifier

    def convert(self, instance):
        """ Return example in the space of reliability estimates. """
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        # take absolute values for all estimates; the class ("error") is unknown
        tex = [ abs(e.estimate) for e in re ] + [ "?" ]
        tex = Orange.data.Instance(self.domain, tex)
        return tex

    def __call__(self, instance, *args):
        tex = self.convert(instance)
        r = self.stacking_classifier(tex)
        r = float(r)
        # clamp at zero: a negative predicted error is meaningless
        r = max(0., r)
        return [ Estimate(r, ABSOLUTE, STACKING) ]

1003 | ||

class ICV:
    """ Perform internal cross validation (as in Automatic selection of
    reliability estimates for individual regression predictions,
    Zoran Bosnic, 2010) and return id of the method
    that scored best on this data.
    """

    def __init__(self, estimators=None, folds=10):
        # Candidate estimators; the default battery is built here (not as
        # a default argument) so ICV instances do not share estimator objects.
        self.estimators = estimators
        if self.estimators is None:
            self.estimators = [SensitivityAnalysis(),
                LocalCrossValidation(),
                BaggingVarianceCNeighbours(),
                Mahalanobis(),
                MahalanobisToCenter()]
        self.folds = folds

    def __call__(self, data, learner):

        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
        # Running sum and count of per-fold correlations, keyed by
        # (method id, signed-or-absolute).
        sum_of_rs = defaultdict(float)
        n_rs = defaultdict(int)

        elearner = Learner(learner, estimators=self.estimators)

        #average correlations from each fold
        for f in set(cvi):
            learn = data.select(cvi, f, negate=True)
            test = data.select(cvi, f)

            # Correlation between each method's estimates and the actual
            # errors on this fold's test set.
            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
            results = get_pearson_r(res)

            for r, p, sa, method in results:
                if not math.isnan(r): #ignore NaN values
                    sum_of_rs[(method, sa)] += r
                    n_rs[(method, sa)] += 1

        # Mean correlation per method; the best-correlated method wins.
        avg_rs = [ (k,(sum_of_rs[k]/n_rs[k])) for k in sum_of_rs ]

        avg_rs = sorted(avg_rs, key=lambda estimate: estimate[1], reverse=True)
        chosen = avg_rs[0][0]

        # The final reliability classifier is trained on the full data set.
        lf = elearner(data)
        return ICVClassifier(chosen, lf)

1049 | ||

1050 | ||

class ICVClassifier:
    """Reports, as the ICV estimate, the value of the single reliability
    method selected by internal cross-validation.

    :param chosen: (method id, signed-or-absolute) pair identifying the
        winning estimator.
    :param reliability_classifier: reliability-wrapped classifier that
        produces estimates for all candidate methods.
    """

    def __init__(self, chosen, reliability_classifier):
        self.chosen = chosen
        self.reliability_classifier = reliability_classifier

    def __call__(self, instance, *args):
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        r = None
        # Pick the estimate produced by the chosen (method, signedness) pair;
        # as before, the last match wins if there are several.
        for e in re:
            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
                r = e.estimate
        # BUG FIX: previously a missing match raised an opaque NameError on
        # the unbound `r`; fail with an explicit message instead.
        if r is None:
            raise ValueError("no reliability estimate matches the chosen method %r" % (self.chosen,))

        return [ Estimate(r, self.chosen[1], ICV_METHOD) ]

1064 | ||

class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.

    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`

    :param estimators: List of different reliability estimation methods we
        want to use on the chosen learner.
    :type estimators: :obj:`list` of reliability estimators

    :param name: Name of this reliability learner
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        self.estimators = estimators
        # Default estimator battery; built here (not as a default argument)
        # so each Learner gets its own estimator instances.
        if self.estimators is None:
            self.estimators = [SensitivityAnalysis(),
                LocalCrossValidation(),
                BaggingVarianceCNeighbours(),
                Mahalanobis(),
                MahalanobisToCenter()]

        self.box_learner = box_learner
        self.blending = False


    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.

        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """

        # Blending is currently disabled (self.blending is always False),
        # so no blending domain or classifier is prepared here.
        blending_classifier = None
        new_domain = None

#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
#            raise Exception("This method only works on data with continuous class.")

        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)

[37] | 1120 | |

class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Train the learner with original data
        self.classifier = box_learner(instances)

        # Train all the estimators and create their classifiers
        self.estimation_classifiers = [estimator(instances, box_learner) for estimator in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is an instance of
        :class:`~Orange.evaluation.reliability.Estimate`,
        is added to the distribution object.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Create a place holder for estimates: regression classifiers may
        # return no distribution, so provide an empty continuous one to
        # carry the reliability_estimate attribute.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        #with warnings.catch_warnings():
        #    warnings.simplefilter("ignore")
        probabilities.setattr('reliability_estimate', [])

        # Calculate all the estimates and add them to the results
        for estimate in self.estimation_classifiers:
            probabilities.reliability_estimate.extend(estimate(instance, predicted, probabilities))

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        elif result_type == Orange.core.GetProbabilities:
            return probabilities
        else:
            return predicted, probabilities

[5] | 1188 | |

1189 | # Functions for testing and plotting | |

1190 | #TODO Document those. | |

def get_acc_rel(method, data, learner):
    """Cross-validate *learner* wrapped with the single reliability
    estimator *method* on *data*; return parallel lists of reliability
    estimates and of the probability assigned to the true class."""
    wrapped = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    #results = Orange.evaluation.testing.leave_one_out([wrapped], data)
    results = Orange.evaluation.testing.cross_validation([wrapped], data)

    rels = [res.probabilities[0].reliability_estimate[0].estimate
            for res in results.results]
    acc = [res.probabilities[0][res.actual_class]
           for res in results.results]

    return rels, acc

1204 | ||

[12] | 1205 | |

def rel_acc_plot(rels, acc, file_name=None, colors=None):
    """Scatter-plot accuracy against reliability; show the figure
    interactively, or save it when *file_name* is given."""
    import matplotlib.pylab as plt

    plt.scatter(rels, acc, c="k" if colors is None else colors)
    plt.xlim(0., 1.)
    plt.ylim(ymin=0.)
    plt.xlabel("Reliability")
    plt.ylabel("Accuracy")
    if file_name is not None:
        plt.savefig(file_name)
    else:
        plt.show()

1221 | ||

def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
    """Compute reliability/accuracy pairs for *method* on *data* and plot
    them with :func:`rel_acc_plot`."""
    # BUG FIX: `plt` was used without an import in this scope, and the
    # plotting helper was invoked as the undefined name `el_acc_plot`
    # (typo for `rel_acc_plot`) with its positional arguments swapped.
    import matplotlib.pylab as plt

    plt.clf()

    rels, acc = get_acc_rel(method, data, learner)
    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)

1228 | ||

[5] | 1229 | |

def acc_rel_correlation(method, data, learner):
    """Spearman rank correlation between accuracy and reliability of
    *method* on *data*."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    rho, _ = scipy.stats.spearmanr(acc, rels)
    return rho

**Note:** See TracBrowser for help on using the repository browser.