source: orange-reliability/_reliability/__init__.py @ 12:469af3a0790f

Revision 12:469af3a0790f, 38.7 KB checked in by Matija Polajnar <matija.polajnar@…>, 22 months ago (diff)

Make plots nicer, along with axis reversal to make them like the ones in Pevec's paper.

Line 
1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
# Labels and final variables
labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]

# NOTE: dead code kept for reference -- these constants are unused.
"""
# All the estimators calculation constants
DO_SA = 0
DO_BAGV = 1
DO_CNK = 2
DO_LCV = 3
DO_BVCK = 4
DO_MAHAL = 5
"""

# All the estimator method constants
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14
ERR_ABSOLUTE = 15

# Type of estimator constant
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods, keyed by the method constants above.
# Ids 11 and 12 have no constant in this module -- presumably produced by
# random-forest based estimators defined elsewhere; TODO confirm.
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK_absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error"}

# Shared sampler used for bootstrap sampling (selection with replacement).
select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
select_with_repeat.random_generator = Orange.misc.Random()
54
def get_reliability_estimation_list(res, i):
    """Return (estimates, signed_or_absolute, method) for the i-th reliability
    estimate, with one estimate value per tested instance in ``res``."""
    estimates = [result.probabilities[0].reliability_estimate[i].estimate
                 for result in res.results]
    first = res.results[0].probabilities[0].reliability_estimate[i]
    return estimates, first.signed_or_absolute, first.method
57
def get_prediction_error_list(res):
    """Return the signed prediction error (actual - predicted) for every
    tested instance in ``res``."""
    errors = []
    for result in res.results:
        errors.append(result.actual_class - result.classes[0])
    return errors
60
def get_description_list(res, i):
    """Collect the textual description of the i-th reliability estimate for
    every tested instance in ``res``."""
    descriptions = []
    for result in res.results:
        descriptions.append(result.probabilities[0].reliability_estimate[i].text_description)
    return descriptions
63
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Compute Pearson's correlation between the prediction error and every
    reliability estimate, returning a list of
    (r, p-value, signed_or_absolute, method) tuples.
    """
    prediction_error = get_prediction_error_list(res)
    absolute_error = [abs(err) for err in prediction_error]
    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)

    results = []
    for i in xrange(number_of_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        # Signed estimates correlate with the signed error, absolute ones
        # with the error magnitude.
        errors = prediction_error if signed_or_absolute == SIGNED else absolute_error
        try:
            r, p = statc.pearsonr(errors, estimates)
        except Exception:
            # statc fails e.g. on constant sequences; report NaN instead.
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results
87
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Compute Spearman's rank correlation between the prediction error and every
    reliability estimate, returning a list of
    (r, p-value, signed_or_absolute, method) tuples.
    """
    prediction_error = get_prediction_error_list(res)
    absolute_error = [abs(err) for err in prediction_error]
    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)

    results = []
    for i in xrange(number_of_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        # Signed estimates correlate with the signed error, absolute ones
        # with the error magnitude.
        errors = prediction_error if signed_or_absolute == SIGNED else absolute_error
        try:
            r, p = statc.spearmanr(errors, estimates)
        except Exception:
            # statc fails e.g. on constant sequences; report NaN instead.
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results
111
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return average Pearson's coefficient over all folds between prediction error
    and each of the used estimates.
    """
    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    number_of_instances = len(res.results)
    number_of_folds = len(results_by_fold)
    # One accumulator slot per reliability estimation method.
    results = [0 for _ in xrange(number_of_estimates)]
    sig = [0 for _ in xrange(number_of_estimates)]
    method_list = [0 for _ in xrange(number_of_estimates)]

    # Sum the per-fold correlation coefficients for each estimate.
    for res in results_by_fold:
        prediction_error = get_prediction_error_list(res)
        for i in xrange(number_of_estimates):
            reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            try:
                # Signed estimates correlate with the signed error,
                # absolute ones with the error magnitude.
                if signed_or_absolute == SIGNED:
                    r, _ = statc.pearsonr(prediction_error, reliability_estimate)
                else:
                    r, _ = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
            except Exception:
                # statc fails e.g. on constant sequences; record NaN for
                # this fold (note: NaN poisons the fold average).
                r = float("NaN")
            results[i] += r
            sig[i] = signed_or_absolute
            method_list[i] = method

    # Average over folds, then derive p-values from the averaged coefficients.
    results = [float(res) / number_of_folds for res in results]
    ps = [p_value_from_r(r, number_of_instances) for r in results]

    return zip(results, ps, sig, method_list)
149
def p_value_from_r(r, n):
    """
    Calculate the p-value from the Pearson coefficient and the sample size.
    """
    # Two-sided p-value via the t statistic t = r*sqrt(df/(1-r^2)); the
    # 1e-30 terms guard against division by zero when |r| == 1.
    df = n - 2
    t = r * (df / ((-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30))) ** 0.5
    return statc.betai (df * 0.5, 0.5, df / (df + t * t))
157
158
159# Distances between two discrete probability distributions
160#TODO Document those.
def normalize_both(p, q):
    """Normalize both distributions in place (only if not yet normalized)
    and return them as a pair."""
    for distribution in (p, q):
        if not distribution.normalized:
            distribution.normalize()
    return p, q
167
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order ``m`` between two discrete distributions
    (both are normalized first)."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)
174
def manhattan_distance(p, q):
    """Manhattan (L1) distance between two discrete distributions."""
    return minkowsky_dist(p, q, m=1)
177
def euclidean_dist(p, q):
    """Euclidean (L2) distance between two discrete distributions."""
    return minkowsky_dist(p, q, m=2)
180
def variance_dist(p, q):
    """Squared Euclidean distance between two discrete distributions."""
    return euclidean_dist(p, q) ** 2
183
def max_dist(p, q):
    """Chebyshev distance: the largest per-component absolute difference
    between two (normalized) discrete distributions."""
    p, q = normalize_both(p, q)
    differences = [abs(p[i] - q[i]) for i in range(len(p))]
    return max(differences)
187
def hellinger_dist(p, q):
    """Hellinger-style distance between two (normalized) discrete
    distributions: the sum of squared differences of component square roots.
    (No square root or 1/2 factor is applied, matching the original code.)"""
    p, q = normalize_both(p, q)
    total = sum((math.sqrt(p[i]) - math.sqrt(q[i])) ** 2 for i in range(len(p)))
    return total
194
def my_log(x):
    """Return x*log(x), using the continuity convention 0*log(0) == 0."""
    if x == 0:
        return 0
    return x * math.log(x)
197
def kullback_leibler(p, q):
    """
    Kullback-Leibler divergence D(p || q) between two (normalized) discrete
    distributions, using the convention 0*log(0/q) == 0.  Returns infinity
    when some p[i] > 0 has q[i] == 0.

    Bug fix: the original summed my_log(p[i]-q[i]) == (p-q)*log(p-q), which
    is not the KL divergence (and fails for p[i] < q[i]); the correct term
    is p[i]*log(p[i]/q[i]).
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        if p[i] > 0:
            if q[i] > 0:
                dist += p[i] * math.log(p[i] / q[i])
            else:
                # Absolutely-continuity violated: divergence is infinite.
                return float("inf")
    return dist
204
def cosine(p, q):
    """
    Cosine distance (1 - cosine similarity) between two (normalized)
    discrete distributions.

    Bug fix: the original computed numpy.dot(x, y) with undefined names
    ``x`` and ``y``, raising NameError on every call; the dot product must
    be taken over the distributions ``p`` and ``q`` themselves.
    """
    p, q = normalize_both(p, q)
    # Materialize the Orange distributions as plain lists for numpy.
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
209
210
class Estimate:
    """
    A single reliability estimation result.

    .. attribute:: estimate

        The numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Whether the method produces a signed or an absolute value; one of
        :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        Integer ID of the reliability estimation method used.

    .. attribute:: method_name

        Human-readable name of the reliability estimation method used.

    .. attribute:: icv_method

        Integer ID of the method ICV selected as the best one, whose value
        is stored in :obj:`estimate`.  (-1 when ICV was not used.)

    .. attribute:: icv_method_name

        Name of the method ICV selected as the best one; an empty string
        when ICV was not used.
    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method= -1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        self.method_name = METHOD_NAME[method]
        self.icv_method = icv_method
        # -1 is the "no ICV" sentinel; only real method IDs have names.
        if icv_method == -1:
            self.icv_method_name = ""
        else:
            self.icv_method_name = METHOD_NAME[icv_method]
        # Filled in later by DescriptiveAnalysisClassifier, if used.
        self.text_description = None
253
class DescriptiveAnalysis:
    # Wraps a reliability estimator and augments its estimates with textual
    # descriptions ("high"/"medium"/"low"), with borders between categories
    # determined by cross validation on the training data.
    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66]):
        # desc: category labels, ordered by ascending estimate value.
        # procentage: quantile fractions marking each category's lower border.
        # NOTE: default lists are shared between instances; neither is mutated.
        self.desc = desc
        self.procentage = procentage
        self.estimator = estimator

    def __call__(self, instances, weight=None, **kwds):
        """Determine the category borders by cross validation, then train the
        estimator on all data and return the describing classifier."""

        # Calculate borders using cross validation
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            sorted_estimates = sorted(abs(x) for x in estimates)
            # NOTE(review): for p == 0.00 the index is int(0) - 1 == -1, which
            # selects the LARGEST estimate as the first border -- looks like an
            # off-by-one; confirm the intended border for the lowest category.
            borders = [sorted_estimates[int(len(estimates) * p) - 1]  for p in self.procentage]
            all_borders.append(borders)

        # Learn on whole train data
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
275
class DescriptiveAnalysisClassifier:
    # Predicts with the wrapped classifier and attaches a textual category
    # (e.g. "high"/"medium"/"low") to each reliability estimate.
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """Predict ``instance`` and label every reliability estimate with the
        description of the highest border it reaches."""
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            # Walk the ascending borders; the last border the estimate
            # reaches determines its description.
            description = self.desc[0]
            for lower_border, text_desc in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    description = text_desc
            estimate.text_description = description

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
298
class SensitivityAnalysis:
    """
    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats

    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`

    To estimate reliability, the learning set is extended with the predicted
    instance, labeled with :math:`K + \epsilon (l_{max} - l_{min})`, where
    :math:`K` is the initial prediction, :math:`\epsilon` a sensitivity
    parameter, and :math:`l_{min}`, :math:`l_{max}` the lower and upper
    bounds of the learning instances' labels.  Sensitivity predictions
    obtained for the different :math:`\epsilon` values are combined into
    SAvar and SAbias; SAbias exists in a signed and an absolute form.

    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0]):
        # NOTE: the default list is shared between instances; it is never
        # mutated by this class.
        self.e = e

    def __call__(self, instances, learner):
        """Find the label range of the training data and build the classifier."""
        class_values = [ex.getclass().value for ex in instances]
        return SensitivityAnalysisClassifier(self.e, instances,
                                             min(class_values), max(class_values),
                                             learner)
335
class SensitivityAnalysisClassifier:
    # Computes SAvar/SAbias for a single instance by retraining the learner
    # on the training data extended with that instance, labeled at +/-epsilon
    # offsets around the initial prediction.
    def __init__(self, e, instances, min_value, max_value, learner):
        self.e = e                  # list of epsilon sensitivity parameters
        self.instances = instances  # original training data
        self.max_value = max_value  # upper bound of the training labels
        self.min_value = min_value  # lower bound of the training labels
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        """Return SAvar (absolute) and SAbias (signed and absolute) estimates."""
        # Create new dataset
        r_data = Orange.data.Table(self.instances)

        # Create new instance
        modified_instance = Orange.data.Instance(instance)

        # Append it to the data
        r_data.append(modified_instance)

        # Calculate SAvar & SAbias
        SAvar = SAbias = 0

        for eps in self.e:
            # +epsilon: label the appended instance above the prediction,
            # retrain, and re-predict the instance.
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # -epsilon: same, with the label below the prediction.
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # calculate part SAvar and SAbias
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
381
382
383
class ReferenceExpectedError:
    # Learner wrapper for the "reference expected error" reliability estimate.

    def __init__(self, name="reference"):
        self.name = name

    def __call__(self, instances, learner):
        """Train the learner on all data and wrap it into the reference
        expected-error classifier."""
        return ReferenceExpectedErrorClassifier(learner(instances))
392
393   
class ReferenceExpectedErrorClassifier:
    # Estimates reliability as the expected error 2*y_hat*(1-y_hat) derived
    # from the model's most probable class probability.

    def __init__(self, classifier):
        self.classifier = classifier

    def __call__(self, instance, *args):
        """Return the reference expected error estimate for ``instance``."""
        # NOTE(review): max() is taken directly over the returned probability
        # distribution -- assumes iterating it yields the class probabilities;
        # confirm for Orange distribution objects.
        y_hat = max(self.classifier(instance, Orange.classification.Classifier.GetProbabilities))
        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
402
403   
404
class BaggingVariance:
    """
   
    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
   
    :math:`m` different bagging models are constructed and used to estimate
    the value of dependent variable for a given instance. In regression,
    the variance of those predictions is used as a prediction reliability
    estimate.

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
    predictions of individual constructed models. Note that a greater value
    implies greater error.

    For classification, 1 minus the average Euclidean distance between class
    probability distributions predicted by the model, and distributions
    predicted by the individual bagged models, is used as the BAGV reliability
    measure. Note that in this case a greater value implies a better
    prediction.
   
    """
    def __init__(self, m=50, name="bv"):
        self.m = m        # number of bootstrap models
        self.name = name  # identifier of this estimate

    def __call__(self, instances, learner):
        """Train m bootstrap-sampled models and return the BAGV classifier."""
        classifiers = []

        # For classification a reference model trained on all data is needed
        # to compare the bagged class distributions against; regression does
        # not use it.
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        # Create bagged classifiers using sampling with replacement
        for _ in xrange(self.m):
            selection = select_with_repeat(len(instances))
            data = instances.select(selection)
            classifiers.append(learner(data))
        return BaggingVarianceClassifier(classifiers, classifier)
449
class BaggingVarianceClassifier:
    def __init__(self, classifiers, classifier=None):
        self.classifiers = classifiers  # the m bagged models
        self.classifier = classifier    # all-data model (classification only)

    def __call__(self, instance, *args):
        """Return the BAGV reliability estimate for ``instance``."""
        BAGV = 0

        # Calculate the bagging variance
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Regression: collect the bagged point predictions.
            bagged_values = [c(instance, Orange.core.GetValue).value for c in self.classifiers if c is not None]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # Classification: distances between each bagged model's class
            # distribution and the all-data model's distribution.
            estimate = self.classifier(instance, Orange.core.GetProbabilities)
            bagged_values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate) for c in self.classifiers if c is not None]
        k = sum(bagged_values) / len(bagged_values)

        # Variance of the collected values around their mean k.
        BAGV = sum((bagged_value - k) ** 2 for bagged_value in bagged_values) / len(bagged_values)
        # For classification a greater value means a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
471
class LocalCrossValidation:
    """

    :param k: Number of nearest neighbours used in LCV estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification reliability estimation,
        use an average distance between distributions, weighted by :math:`e^{-d}`,
        where :math:`d` is the distance between predicted instance and the
        neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    :math:`k` nearest neighbours to the given instance are found and put in
    a separate data set. On this data set, a leave-one-out validation is
    performed. Reliability estimate for regression is then the distance
    weighted absolute prediction error. In classification, 1 minus the average
    distance between the predicted class probability distribution and the
    (trivial) probability distributions of the nearest neighbour.

    If a special value 0 is passed as :math:`k` (as is by default),
    it is set as 1/20 of data set size (or 5, whichever is greater).

    Summary of the algorithm for regression:

    1. Determine the set of k nearest neighours :math:`N = { (x_1, c_1),...,
       (x_k, c_k)}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        """Index the training data for neighbour search and return the LCV
        classifier."""
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        # Meta attribute that will hold the neighbour distances.
        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # Bug fix: the original wrote the computed default back to self.k,
        # permanently overwriting the "auto" setting (0) after the first
        # call; compute the effective k locally instead.
        k = self.k if self.k != 0 else max(5, len(instances) / 20)

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
527
class LocalCrossValidationClassifier:
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        self.distance_id = distance_id            # meta id of neighbour distances
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        # Extra options (distance, distance_weighted) become attributes.
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        """Return the LCV reliability estimate for ``instance``."""
        LCVer = 0  # weighted error sum
        LCVdi = 0  # weight sum

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute error of the leave-one-out prediction.
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the neighbour's trivial (one-hot) class distribution.
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight each error by e^-d, d = distance to the neighbour.
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # For classification a greater value means a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
573
class CNeighbours:
    """
   
    :param k: Number of nearest neighbours used in CNK estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function
   
    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`
   
    For regression, CNK is defined for an unlabeled instance as a difference
    between average label of its nearest neighbours and its prediction. CNK
    can be used as a signed or absolute estimate.
   
    :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`
   
    where :math:`k` denotes number of neighbors, C :sub:`i` denotes neighbours'
    labels and :math:`K` denotes the instance's prediction. Note that a greater
    value implies greater prediction error.

    For classification, CNK is equal to 1 minus the average distance between
    predicted class distribution and (trivial) class distributions of the
    $k$ nearest neighbours from the learning set. Note that in this case
    a greater value implies better prediction.
   
    """
    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        """Index the training data for nearest-neighbour search and return
        the CNK classifier."""
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        # Meta attribute that will hold the neighbour distances.
        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
        return CNeighboursClassifier(nearest_neighbours, self.k, distance=self.distance)
615
class CNeighboursClassifier:
    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance  # distribution distance (classification only)

    def __call__(self, instance, predicted, probabilities):
        """Return the CNK reliability estimate(s) for ``instance``."""
        CNK = 0

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Bug fix: the original branched on `ex.domain...`, relying on the
        # comprehension variable leaking out of the list comprehension above
        # (a Python 2-only accident); use the queried instance's domain.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Regression: CNK = average neighbour label minus the prediction.
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # Classification: 1 minus the average distance between the
            # predicted distribution and each neighbour's kNN distribution.
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
647
class Mahalanobis:
    """
   
    :param k: Number of nearest neighbours used in Mahalanobis estimate.
    :type k: int
   
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`
   
    Mahalanobis distance reliability estimate is defined as
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    to the evaluated instance's :math:`k` nearest neighbours.

   
    """
    def __init__(self, k=3):
        self.k = k

    def __call__(self, instances, *args):
        """Index the training data using a Mahalanobis distance measure and
        return the classifier."""
        nnm = Orange.classification.knn.FindNearestConstructor()
        nnm.distanceConstructor = Orange.distance.Mahalanobis()

        # Meta attribute that will hold the neighbour distances.
        mid = Orange.feature.Descriptor.new_meta_id()
        nnm = nnm(instances, 0, mid)
        return MahalanobisClassifier(self.k, nnm, mid)
672
class MahalanobisClassifier:
    # Sums the Mahalanobis distances (stored in meta attribute `mid`) from
    # the queried instance to each of its k nearest neighbours.
    def __init__(self, k, nnm, mid):
        self.k = k
        self.nnm = nnm
        self.mid = mid

    def __call__(self, instance, *args):
        """Return the Mahalanobis reliability estimate for ``instance``."""
        neighbours = self.nnm(instance, self.k)
        mahalanobis_distance = sum(ex[self.mid].value for ex in neighbours)
        return [ Estimate(mahalanobis_distance, ABSOLUTE, MAHAL_ABSOLUTE) ]
685
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`
   
    Mahalanobis distance to center reliability estimate is defined as a
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.

   
    """
    def __init__(self):
        pass

    def __call__(self, instances, *args):
        """Continuize the data, compute its centroid and a Mahalanobis
        distance measure, and return the classifier."""
        # Continuize so every feature is numeric and span-normalized.
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = dc(instances)
        new_instances = instances.translate(new_domain)

        X, _, _ = new_instances.to_numpy()
        instance_avg = numpy.average(X, 0)

        distance_constructor = Orange.distance.Mahalanobis()
        distance = distance_constructor(new_instances)

        # Centroid instance with an unknown ("?") class value.
        average_instance = Orange.data.Instance(new_instances.domain, list(instance_avg) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
717
class MahalanobisToCenterClassifier:
    # Measures the Mahalanobis distance from an instance to the precomputed
    # centroid of the (continuized) training data.
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance
        self.average_instance = average_instance
        self.new_domain = new_domain

    def __call__(self, instance, *args):
        """Return the distance-to-centroid reliability estimate."""
        # Translate the instance into the continuized domain first.
        translated = Orange.data.Instance(self.new_domain, instance)
        mahalanobis_to_center = self.distance(translated, self.average_instance)
        return [ Estimate(mahalanobis_to_center, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE) ]
731
732
class BaggingVarianceCNeighbours:
    """
   
    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`
   
    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
   
    BVCK is a combination (average) of Bagging variance and local modeling of
    prediction error.
   
    """
    def __init__(self, bagv=None, cnk=None):
        # Bug fix: the original used shared default instances
        # (bagv=BaggingVariance(), cnk=CNeighbours()) created once at class
        # definition time; create fresh defaults per wrapper instead.
        self.bagv = bagv if bagv is not None else BaggingVariance()
        self.cnk = cnk if cnk is not None else CNeighbours()

    def __call__(self, instances, learner):
        """Fit both underlying estimators and return the BVCK classifier."""
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
756
class BaggingVarianceCNeighboursClassifier:
    # Averages the BAGV estimate with the absolute CNK estimate.
    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        """Return the BVCK estimate followed by all underlying BAGV and CNK
        estimates."""
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # BVCK = mean of BAGV and the absolute CNK value.
        average = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        return ([ Estimate(average, ABSOLUTE, BVCK_ABSOLUTE) ]
                + bagv_estimates + cnk_estimates)
771
class ErrorPredicting:
    # Learns a random forest that directly predicts the (signed) prediction
    # error of the wrapped learner, as measured by cross validation.
    def __init__(self):
        pass

    def __call__(self, instances, learner):
        """Fit the error-predicting random forest and return the classifier."""
        # Cross-validate the learner to get per-instance prediction errors.
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Relabel the data: same attributes, class = prediction error ("pe").
        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        new_dataset = Orange.data.Table(new_domain, instances)

        for instance, prediction_error in izip(new_dataset, prediction_errors):
            instance.set_class(prediction_error)

        rf = Orange.ensemble.forest.RandomForestLearner()
        rf_classifier = rf(new_dataset)

        return ErrorPredictingClassification(rf_classifier, new_domain)
790
class ErrorPredictingClassification:
    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier  # forest trained to predict errors
        self.new_domain = new_domain        # domain with the "pe" class

    def __call__(self, instance, predicted, probabilities):
        """Return the predicted (signed) error as the reliability estimate."""
        new_instance = Orange.data.Instance(self.new_domain, instance)
        value = self.rf_classifier(new_instance, Orange.core.GetValue)

        # NOTE(review): the estimate is tagged SABIAS_SIGNED, apparently
        # reusing the SAbias method id because no dedicated constant exists
        # for this method -- confirm before relying on the method id.
        return [Estimate(value.value, SIGNED, SABIAS_SIGNED)]
801
def gauss_kernel(x, sigma=1):
    """Gaussian (normal) kernel with standard deviation *sigma*, evaluated at *x*."""
    return math.exp(-0.5 * (x / sigma) ** 2) / (sigma * math.sqrt(2 * math.pi))
804
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Returns a value that estimates a density of problem space around the
    instance being predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances, learner):
        self.distance = self.d_measure(instances)

        def density(x):
            # Mean kernel value of the distances from x to every training instance.
            total = 0
            for other in instances:
                total += self.K(self.distance(x, other))
            return total / len(instances)

        # Highest density seen on the training data; used as the reference
        # point by the classifier.
        max_density = max(density(inst) for inst in instances)
        return ParzenWindowDensityBasedClassifier(density, max_density)
836
class ParzenWindowDensityBasedClassifier:
    """Return, as the reliability estimate, how far the instance's estimated
    density falls below the maximal density seen on the training data."""

    def __init__(self, density, max_density):
        self.density = density
        self.max_density = max_density

    def __call__(self, instance, *args):
        # Sparser neighbourhoods (lower density) yield larger estimates,
        # i.e. less reliable predictions.
        dens = self.max_density - self.density(instance)
        return [Estimate(dens, ABSOLUTE, DENS_ABSOLUTE)]
849
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.

    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`

    :param estimators: List of different reliability estimation methods we
                       want to use on the chosen learner. If omitted, a fresh
                       default set (SA, LCV, BVCK, Mahalanobis,
                       Mahalanobis-to-center) is built for this instance.
    :type estimators: :obj:`list` of reliability estimators

    :param name: Name of this reliability learner
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        # A mutable default argument list would be shared between every
        # Learner instance (and would hold shared estimator objects); build
        # the default estimator list per instance instead.
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        self.blending = False


    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.

        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """

        blending_classifier = None
        new_domain = None

#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
#            raise Exception("This method only works on data with continuous class.")

        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)

    def internal_cross_validation(self, instances, folds=10):
        """ Perform the internal cross validation for getting the best
        reliability estimate. It uses the reliability estimators defined in
        estimators attribute.

        Returns the id of the method that scored the best.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        res = Orange.evaluation.testing.cross_validation([self], instances, folds=folds)
        results = get_pearson_r(res)
        # Each result is a tuple whose last element is the method id; the
        # highest correlation wins.
        sorted_results = sorted(results)
        return sorted_results[-1][3]

    def internal_cross_validation_testing(self, instances, folds=10):
        """ Perform internal cross validation (as in Automatic selection of
        reliability estimates for individual regression predictions,
        Zoran Bosnic, 2010) and return id of the method
        that scored best on this data.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        cv_indices = Orange.core.MakeRandomIndicesCV(instances, folds)

        # Sum, per method, the Pearson r over all the outer folds; the
        # method with the largest total correlation is selected.
        sum_of_rs = defaultdict(float)

        for fold in xrange(folds):
            data = instances.select(cv_indices, fold)
            # Too small a fold for 10-fold CV: fall back to leave-one-out.
            if len(data) < 10:
                res = Orange.evaluation.testing.leave_one_out([self], data)
            else:
                res = Orange.evaluation.testing.cross_validation([self], data)
            results = get_pearson_r(res)
            for r, _, _, method in results:
                sum_of_rs[method] += r
        sorted_sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
        return sorted_sum_of_rs[0][0]

    labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
954
class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Fit the wrapped learner on the full training data.
        self.classifier = box_learner(instances)

        # Fit every reliability estimator alongside it.
        self.estimation_classifiers = [est(instances, box_learner) for est in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is an instance of
        :class:`~Orange.evaluation.reliability.Estimate`,
        is added to the distribution object.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Regression classifiers may not return a distribution; create an
        # empty one so the estimates have somewhere to live.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Collect the estimates from every fitted estimator.
        for estimator in self.estimation_classifiers:
            probabilities.reliability_estimate.extend(estimator(instance, predicted, probabilities))

        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
1022
1023# Functions for testing and plotting
1024#TODO Document those.
def get_acc_rel(method, data, learner):
    """Cross-validate *learner* wrapped with the single reliability estimator
    *method* and return parallel lists of reliability estimates and
    accuracies (probability assigned to the actual class)."""
    reliability = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels = [res.probabilities[0].reliability_estimate[0].estimate
            for res in results.results]
    acc = [res.probabilities[0][res.actual_class]
           for res in results.results]
    return rels, acc
1038
1039
def rel_acc_plot(rels, acc, file_name=None, colors=None):
    """Scatter-plot accuracy against reliability; show the figure, or save
    it to *file_name* when one is given."""

    import matplotlib.pylab as plt

    plt.scatter(rels, acc, c="k" if colors is None else colors)
    plt.xlim(0., 1.)
    plt.ylim(ymin=0.)
    plt.xlabel("Reliability")
    plt.ylabel("Accuracy")
    if file_name is None:
        plt.show()
    else:
        plt.savefig(file_name)
1055
def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
    """Compute reliability/accuracy pairs for *method* on *data* and plot them.

    Fixes two NameErrors in the original: ``plt`` was used without being
    imported, and the plotting helper was misspelled ``el_acc_plot``.
    """
    import matplotlib.pylab as plt

    plt.clf()

    rels, acc = get_acc_rel(method, data, learner)
    # NOTE(review): accuracy is passed as the first (x) argument — this looks
    # like a deliberate axis reversal; confirm against the intended plots.
    rel_acc_plot(acc, rels, file_name=file_name, colors=colors)
1062   
1063
def acc_rel_correlation(method, data, learner):
    """Return the Spearman rank correlation between accuracy and the
    reliability estimates produced by *method*."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    correlation, _pvalue = scipy.stats.spearmanr(acc, rels)
    return correlation
# Note: See TracBrowser for help on using the repository browser.