source: orange-reliability/orangecontrib/reliability/__init__.py @ 35:4e777e39f98c

Revision 35:4e777e39f98c, 39.4 KB checked in by Ales Erjavec <ales.erjavec@…>, 7 months ago (diff)

Moved _reliability package into orangecontrib namespace.

Line 
1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
# Labels and final variables
labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]

# All the estimator method constants.
# (Values 11 and 12 are reserved for RF variance / RF std, see METHOD_NAME.)
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14
ERR_ABSOLUTE = 15

# Type of estimator constant
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods.
# FIX: entry 7 was "BVCK_absolute"; made consistent with the other
# "<method> absolute" display names.
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error"}
51
# Index sampler that draws indices with replacement (used for bagging samples).
select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
select_with_repeat.random_generator = Orange.misc.Random()
54
def get_reliability_estimation_list(res, i):
    """Return ([estimates], signed_or_absolute, method) of the i-th estimator
    across all tested instances in evaluation results ``res``."""
    first = res.results[0].probabilities[0].reliability_estimate[i]
    estimates = [r.probabilities[0].reliability_estimate[i].estimate
                 for r in res.results]
    return estimates, first.signed_or_absolute, first.method
57
def get_prediction_error_list(res):
    """Signed prediction errors (actual - predicted) for every tested instance."""
    errors = []
    for r in res.results:
        errors.append(r.actual_class - r.classes[0])
    return errors
60
def get_description_list(res, i):
    """Textual descriptions of the i-th reliability estimate for every instance."""
    return [r.probabilities[0].reliability_estimate[i].text_description
            for r in res.results]
63
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of the
    used reliability estimates, together with the p-value of each coefficient.
    """
    errors = get_prediction_error_list(res)
    abs_errors = [abs(e) for e in errors]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for idx in xrange(n_estimates):
        estimates, soa, method = get_reliability_estimation_list(res, idx)
        # Signed estimates correlate with signed errors, absolute with |error|.
        target = errors if soa == SIGNED else abs_errors
        try:
            r, p = statc.pearsonr(target, estimates)
        except Exception:
            r = p = float("NaN")
        scores.append((r, p, soa, method))
    return scores
87
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of the
    used reliability estimates, together with the p-value of each coefficient.
    """
    errors = get_prediction_error_list(res)
    abs_errors = [abs(e) for e in errors]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for idx in xrange(n_estimates):
        estimates, soa, method = get_reliability_estimation_list(res, idx)
        # Signed estimates correlate with signed errors, absolute with |error|.
        target = errors if soa == SIGNED else abs_errors
        try:
            r, p = statc.spearmanr(target, estimates)
        except Exception:
            r = p = float("NaN")
        scores.append((r, p, soa, method))
    return scores
111
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return average Pearson's coefficient over all folds between prediction
    error and each of the used estimates.
    """
    folds = Orange.evaluation.scoring.split_by_iterations(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    n_instances = len(res.results)
    n_folds = len(folds)
    r_sums = [0] * n_estimates
    sig = [0] * n_estimates
    methods = [0] * n_estimates

    for fold in folds:
        errors = get_prediction_error_list(fold)
        abs_errors = [abs(e) for e in errors]
        for idx in xrange(n_estimates):
            estimates, soa, method = get_reliability_estimation_list(fold, idx)
            try:
                if soa == SIGNED:
                    r, _ = statc.pearsonr(errors, estimates)
                else:
                    r, _ = statc.pearsonr(abs_errors, estimates)
            except Exception:
                r = float("NaN")
            r_sums[idx] += r
            sig[idx] = soa
            methods[idx] = method

    # Average over folds; p-values are derived from the averaged r.
    averages = [float(total) / n_folds for total in r_sums]
    ps = [p_value_from_r(r, n_instances) for r in averages]

    return zip(averages, ps, sig, methods)
149
def p_value_from_r(r, n):
    """
    Calculate the p-value of a Pearson coefficient ``r`` for sample size ``n``.
    """
    df = n - 2
    # t statistic; the 1e-30 terms guard against division by zero at |r| = 1.
    denom = (1.0 - r + 1e-30) * (1.0 + r + 1e-30)
    t = r * math.sqrt(df / denom)
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))
157
158
159# Distances between two discrete probability distributions
160#TODO Document those.
def normalize_both(p, q):
    """Normalize both distributions in place (when needed) and return them."""
    for dist in (p, q):
        if not dist.normalized:
            dist.normalize()
    return p, q
167
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order ``m`` between two discrete distributions."""
    p, q = normalize_both(p, q)
    total = sum(abs(a - b) ** m for a, b in zip(p, q))
    return total ** (1. / m)
174
def manhattan_distance(p, q):
    """L1 (Manhattan) distance between two discrete distributions."""
    return minkowsky_dist(p, q, 1)
177
def euclidean_dist(p, q):
    """L2 (Euclidean) distance between two discrete distributions."""
    return minkowsky_dist(p, q)
180
def variance_dist(p, q):
    """Squared Euclidean distance between two discrete distributions."""
    d = euclidean_dist(p, q)
    return d * d
183
def max_dist(p, q):
    """Chebyshev (maximum component) distance between two distributions."""
    p, q = normalize_both(p, q)
    return max(abs(a - b) for a, b in zip(p, q))
187
def hellinger_dist(p, q):
    """Squared Hellinger-style distance between two discrete distributions.

    NOTE(review): this returns the sum of squared sqrt-differences without the
    conventional 1/2 factor or final square root — kept as in the original.
    """
    p, q = normalize_both(p, q)
    return sum((math.sqrt(a) - math.sqrt(b)) ** 2 for a, b in zip(p, q))
194
def my_log(x):
    """Return ``x * log(x)`` using the convention ``0 * log(0) == 0``."""
    if x == 0:
        return 0
    return x * math.log(x)
197
def kullback_leibler(p, q):
    """Kullback-Leibler divergence D(p || q) between two discrete distributions.

    FIX: the original accumulated ``my_log(p[i] - q[i])``, i.e. x*log(x) of the
    *difference* of probabilities — mathematically meaningless and a domain
    error whenever p[i] < q[i]. KL divergence is sum_i p_i * log(p_i / q_i).
    Components where p_i == 0 contribute 0 (standard convention); components
    with p_i > 0 and q_i == 0 are skipped to avoid a domain error (the true
    divergence is infinite there).
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        if p[i] > 0 and q[i] > 0:
            dist += p[i] * math.log(p[i] / q[i])
    return dist
204
def cosine(p, q):
    """Cosine distance (1 - cosine similarity) between two distributions.

    FIX: the original computed ``numpy.dot(x, y)`` where ``x`` and ``y`` are
    undefined names (NameError on every call); the dot product must be taken
    over ``p`` and ``q`` themselves.
    """
    p, q = normalize_both(p, q)
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
209
210
class Estimate:
    """
    Reliability estimate; describes the result of a reliability estimation.

    .. attribute:: estimate

        A numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Whether the method gives a signed or absolute result; either
        :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        An integer ID of the reliability estimation method used.

    .. attribute:: method_name

        Name (string) of the reliability estimation method used.

    .. attribute:: icv_method

        An integer ID of the reliability estimation method that performed
        best, as determined by ICV, whose estimate is stored in the
        :obj:`estimate` field. (:obj:`None` when ICV was not used.)

    .. attribute:: icv_method_name

        Name (string) of the reliability estimation method that performed
        best, as determined by ICV. (:obj:`None` when ICV was not used.)

    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method= -1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        self.method_name = METHOD_NAME[method]
        self.icv_method = icv_method
        # -1 marks "ICV not used"; resolve the name only for real method IDs.
        if icv_method != -1:
            self.icv_method_name = METHOD_NAME[icv_method]
        else:
            self.icv_method_name = ""
        self.text_description = None
253
class DescriptiveAnalysis:
    """Wrap an estimator and attach textual labels ("high"/"medium"/"low")
    to its numerical estimates, with label borders derived by cross-validation."""
    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66], name="da"):
        self.desc = desc
        self.procentage = procentage
        self.estimator = estimator
        self.name = name

    def __call__(self, instances, weight=None, **kwds):
        # Derive the label borders from cross-validated estimates: for each
        # estimator, take the |estimate| values at the requested percentiles.
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
        all_borders = []
        for idx in xrange(n_estimates):
            estimates, soa, method = get_reliability_estimation_list(res, idx)
            ranked = sorted(abs(value) for value in estimates)
            all_borders.append([ranked[int(len(estimates) * share) - 1]
                                for share in self.procentage])

        # Train the final estimator on the full training data.
        return DescriptiveAnalysisClassifier(self.estimator(instances), all_borders, self.desc)
276
class DescriptiveAnalysisClassifier:
    """Attaches a textual label to each reliability estimate of a prediction."""
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # Each estimate gets the label of the last border it reaches;
        # the first label is the fallback when no border is reached.
        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            estimate.text_description = self.desc[0]
            for lower_border, label in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    estimate.text_description = label

        # Return whatever shape of result the caller asked for.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
299
class SensitivityAnalysis:
    """
    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats

    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`

    To estimate the reliability of a prediction, the learning set is extended
    with the given instance labeled :math:`K + \epsilon (l_{max} - l_{min})`,
    where :math:`K` is the initial prediction, :math:`\epsilon` the
    sensitivity parameter, and :math:`l_{min}`, :math:`l_{max}` the bounds of
    the learning instances' labels. Predictions obtained for the different
    :math:`\epsilon` values are combined into SAvar and SAbias (the latter
    usable in signed or absolute form):

    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0], name="sa"):
        self.e = e
        self.name = name

    def __call__(self, instances, learner):
        # Bounds of the class values in the training data.
        low = high = instances[0].getclass().value
        for ex in instances:
            value = ex.getclass().value
            if value > high:
                high = value
            if value < low:
                low = value
        return SensitivityAnalysisClassifier(self.e, instances, low, high, learner)
337
class SensitivityAnalysisClassifier:
    # Computes SAvar and SAbias for a single instance by retraining the
    # learner on the training data extended with that instance, labelled
    # with perturbed versions of the prediction.
    def __init__(self, e, instances, min_value, max_value, learner):
        # e: list of epsilon sensitivity parameters
        # min_value / max_value: bounds of class values in the training data
        self.e = e
        self.instances = instances
        self.max_value = max_value
        self.min_value = min_value
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        # Copy the training data so repeated calls do not accumulate instances.
        r_data = Orange.data.Table(self.instances)

        # Copy of the query instance whose class label we will perturb.
        modified_instance = Orange.data.Instance(instance)

        # Append it to the data
        r_data.append(modified_instance)

        # Accumulators for the sensitivity estimates.
        SAvar = SAbias = 0

        for eps in self.e:
            # Label the appended instance with prediction + eps * label span,
            # retrain, and re-predict the query instance.
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # The same with prediction - eps * label span.
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # SAvar accumulates the spread between the two perturbed
            # predictions; SAbias the signed shift from the original one.
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
383
384
385
class ReferenceExpectedError:
    """
    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`

    Reference reliability estimation method for classification, as used in
    "Evaluating Reliability of Single Classifications of Neural Networks",
    Darko Pevec, 2011:

    :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`

    where :math:`\hat y` is the estimated probability of the predicted class.

    Note that for this method, in contrast with all others, a greater estimate
    means lower reliability (greater expected error).
    """
    def __init__(self, name="reference"):
        self.name = name

    def __call__(self, instances, learner):
        # Only the trained classifier is needed at prediction time.
        return ReferenceExpectedErrorClassifier(learner(instances))
408
409   
class ReferenceExpectedErrorClassifier:
    """Computes the reference expected error 2*y_hat*(1 - y_hat)."""

    def __init__(self, classifier):
        self.classifier = classifier

    def __call__(self, instance, *args):
        # y_hat is the probability of the most probable class.
        probabilities = self.classifier(instance, Orange.classification.Classifier.GetProbabilities)
        y_hat = max(probabilities)
        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
418
419   
420
class BaggingVariance:
    """
    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`

    :math:`m` bagged models predict the dependent value of a given instance.
    In regression, the variance of those predictions is the reliability
    estimate:

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are the
    individual models' predictions (greater value: greater error).

    For classification, 1 minus the average Euclidean distance between the
    class probability distribution predicted by the model and those predicted
    by the bagged models is used (greater value: better prediction).
    """
    def __init__(self, m=50, name="bv"):
        self.m = m
        self.name = name

    def __call__(self, instances, learner):
        # A reference classifier over the full data is needed only for
        # classification (used to compare bagged distributions against).
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        # Bagged models: train on bootstrap samples (sampling with replacement).
        classifiers = []
        for _ in xrange(self.m):
            sample = instances.select(select_with_repeat(len(instances)))
            classifiers.append(learner(sample))

        return BaggingVarianceClassifier(classifiers, classifier)
465
class BaggingVarianceClassifier:
    """Computes the BAGV estimate from a list of bagged classifiers."""
    def __init__(self, classifiers, classifier=None):
        self.classifiers = classifiers
        self.classifier = classifier

    def __call__(self, instance, *args):
        var_type = instance.domain.class_var.var_type

        # Collect one value per bagged model: the raw prediction (regression)
        # or the distance of its distribution to the reference one (classification).
        if var_type == Orange.feature.Descriptor.Continuous:
            values = [c(instance, Orange.core.GetValue).value
                      for c in self.classifiers if c is not None]
        elif var_type == Orange.feature.Descriptor.Discrete:
            reference = self.classifier(instance, Orange.core.GetProbabilities)
            values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), reference)
                      for c in self.classifiers if c is not None]

        mean = sum(values) / len(values)
        BAGV = sum((v - mean) ** 2 for v in values) / len(values)

        # For classification a greater value means a better prediction.
        if var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
487
class LocalCrossValidation:
    """
    :param k: Number of nearest neighbours used in LCV estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification reliability estimation,
        use an average distance between distributions, weighted by
        :math:`e^{-d}`, where :math:`d` is the distance between predicted
        instance and the neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    The :math:`k` nearest neighbours of the given instance form a separate
    data set on which leave-one-out validation is performed. For regression
    the estimate is the distance-weighted absolute prediction error; for
    classification, 1 minus the average distance between the predicted class
    probability distribution and the (trivial) distributions of the
    neighbours.

    If the special value 0 is passed as :math:`k` (the default), it is set
    to 1/20 of the data set size (or 5, whichever is greater).

    Summary of the algorithm for regression:

    1. Determine the set of k nearest neighours :math:`N = { (x_1, c_1),...,
       (x_k, c_k)}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`
    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, distance_id)

        if self.k == 0:
            # NOTE(review): the derived k is stored back on self, exactly as
            # in the original — the first data set seen fixes k for later calls.
            self.k = max(5, len(instances) / 20)

        return LocalCrossValidationClassifier(distance_id, neighbours, self.k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
543
class LocalCrossValidationClassifier:
    # Performs leave-one-out validation over the query instance's k nearest
    # neighbours; the distance-weighted average error is the LCV estimate.
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        self.distance_id = distance_id  # meta attribute storing neighbour distance
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        # Optional settings passed by LocalCrossValidation: `distance`
        # (distribution distance used for classification) and
        # `distance_weighted` (whether to weight errors by exp(-d)).
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        LCVer = 0  # weighted sum of leave-one-out errors
        LCVdi = 0  # sum of weights

        # The k nearest neighbours of the query instance.

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Leave-one-out over the neighbourhood: train on k-1 neighbours and
        # measure the prediction error on the held-out one.
        for i in xrange(len(knn)):
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute error of the held-out prediction.
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the held-out neighbour's one-hot class distribution.
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight the error by exp(-distance to the query instance),
            # or uniformly when distance weighting is disabled.
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        # Guard against an empty/degenerate neighbourhood.
        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # For classification a greater value means a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
589
class CNeighbours:
    """
    :param k: Number of nearest neighbours used in CNK estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`

    For regression, CNK is the difference between the average label of the
    instance's nearest neighbours and its prediction; usable as a signed or
    absolute estimate:

    :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`

    where :math:`k` is the number of neighbours, C :sub:`i` the neighbours'
    labels and :math:`K` the instance's prediction (greater value: greater
    error).

    For classification, CNK equals 1 minus the average distance between the
    predicted class distribution and the (trivial) class distributions of the
    :math:`k` nearest neighbours (greater value: better prediction).
    """
    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Euclidean()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, meta_id)
        return CNeighboursClassifier(neighbours, self.k, distance=self.distance)
631
class CNeighboursClassifier:
    """Computes the CNK estimate from the query instance's nearest neighbours."""
    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        CNK = 0

        # Find k nearest neighbors
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # FIX: the original branched on `ex.domain`, relying on the loop
        # variable leaking out of the list comprehension (Python 2 only, and
        # undefined when knn is empty). Use the query instance's domain.
        var_type = instance.domain.class_var.var_type

        if var_type == Orange.feature.Descriptor.Continuous:
            # Regression: average neighbour label minus the prediction.
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif var_type == Orange.feature.Descriptor.Discrete:
            # Classification: 1 - average distance between the predicted
            # distribution and a kNN model's distributions on the neighbours.
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
663
class Mahalanobis:
    """
    :param k: Number of nearest neighbours used in Mahalanobis estimate.
    :type k: int

    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`

    The reliability estimate is the `mahalanobis distance
    <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_ to the evaluated
    instance's :math:`k` nearest neighbours.
    """
    def __init__(self, k=3, name="mahalanobis"):
        self.k = k
        self.name = name

    def __call__(self, instances, *args):
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Mahalanobis()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, meta_id)
        return MahalanobisClassifier(self.k, neighbours, meta_id)
689
class MahalanobisClassifier:
    """Sums the Mahalanobis distances to the k nearest neighbours."""
    def __init__(self, k, nnm, mid):
        self.k = k
        self.nnm = nnm  # nearest-neighbour finder
        self.mid = mid  # meta attribute holding the precomputed distance

    def __call__(self, instance, *args):
        total = sum(neighbour[self.mid].value
                    for neighbour in self.nnm(instance, self.k))
        return [ Estimate(total, ABSOLUTE, MAHAL_ABSOLUTE) ]
702
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`

    Mahalanobis distance to center reliability estimate is defined as a
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.
    """
    def __init__(self, name="mahalanobis to center"):
        self.name = name

    def __call__(self, instances, *args):
        # Continuize the domain: ignore the class attribute, normalize
        # continuous features by their span, expand discrete features into
        # one indicator column per value.
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = dc(instances)
        new_instances = instances.translate(new_domain)

        # Centroid (per-feature mean) of the continuized data.
        X, _, _ = new_instances.to_numpy()
        instance_avg = numpy.average(X, 0)

        # Mahalanobis distance fitted on the continuized data.
        distance_constructor = Orange.distance.Mahalanobis()
        distance = distance_constructor(new_instances)

        # "?" leaves the centroid instance's class value undefined.
        average_instance = Orange.data.Instance(new_instances.domain, list(instance_avg) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
734
class MahalanobisToCenterClassifier:
    """Measures the Mahalanobis distance from an instance to the data centroid."""
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance
        self.average_instance = average_instance
        self.new_domain = new_domain

    def __call__(self, instance, *args):
        # Translate the instance into the continuized domain before measuring.
        translated = Orange.data.Instance(self.new_domain, instance)
        d = self.distance(translated, self.average_instance)
        return [ Estimate(d, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE) ]
748
749
class BaggingVarianceCNeighbours:
    """
    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`

    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`

    BVCK is a combination (average) of Bagging variance and local modeling of
    prediction error.
    """
    def __init__(self, bagv=BaggingVariance(), cnk=CNeighbours(), name="bvck"):
        self.bagv = bagv
        self.cnk = cnk
        # FIX: was hard-coded to "bvck", silently ignoring the `name` argument.
        self.name = name

    def __call__(self, instances, learner):
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
774
class BaggingVarianceCNeighboursClassifier:
    """Averages the BAGV and absolute-CNK estimates into a BVCK estimate."""
    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # NOTE(review): index [1] is the absolute CNK estimate, which exists
        # only for regression (classification CNK returns a single estimate)
        # — confirm this classifier is only used on regression problems.
        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        return [ Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE) ] + bagv_estimates + cnk_estimates
789
class ErrorPredicting:
    # Trains a random forest to predict the wrapped learner's prediction
    # error; the forest's output is used as the reliability estimate.
    def __init__(self, name = "ep"):
        self.name = name

    def __call__(self, instances, learner):
        # Per-instance signed prediction errors from cross-validation.
        # NOTE(review): assumes res.results come back in the same order as
        # `instances` — confirm with the testing module.
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Same attributes, but with the prediction error ("pe") as the class.
        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        new_dataset = Orange.data.Table(new_domain, instances)

        for instance, prediction_error in izip(new_dataset, prediction_errors):
            instance.set_class(prediction_error)

        # Fit a random forest on the error-labelled data.
        rf = Orange.ensemble.forest.RandomForestLearner()
        rf_classifier = rf(new_dataset)

        return ErrorPredictingClassification(rf_classifier, new_domain)
808
class ErrorPredictingClassification:
    """Predicts the expected signed error of a prediction using a
    pre-trained random forest over the error-labelled domain."""

    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier  # forest trained on error labels
        self.new_domain = new_domain        # domain with "pe" class variable

    def __call__(self, instance, predicted, probabilities):
        # Project the instance into the error domain and query the forest
        # for its expected prediction error.
        projected = Orange.data.Instance(self.new_domain, instance)
        expected_error = self.rf_classifier(projected, Orange.core.GetValue)

        return [Estimate(expected_error.value, SIGNED, SABIAS_SIGNED)]
819
def gauss_kernel(x, sigma=1):
    """Gaussian (normal) density kernel with standard deviation *sigma*."""
    normalizer = 1. / (sigma * math.sqrt(2 * math.pi))
    return normalizer * math.exp(-0.5 * (x / sigma) ** 2)
822
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Returns a value that estimates a density of problem space around the
    instance being predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances, learner):

        # Distance function adapted to the training data.
        self.distance = self.d_measure(instances)

        def density(x):
            # Mean kernel value over distances from x to all training
            # instances -- a Parzen window density estimate.
            total = 0
            for other in instances:
                total += self.K(self.distance(x, other))
            return total / len(instances)

        # Densest point of the training set, used later as reference.
        peak = max(density(ex) for ex in instances)

        return ParzenWindowDensityBasedClassifier(density, peak)
854
class ParzenWindowDensityBasedClassifier:
    """Turns a training-set density estimate into a reliability score."""

    def __init__(self, density, max_density):
        self.density = density          # callable: instance -> density value
        self.max_density = max_density  # peak density over the training data

    def __call__(self, instance, *args):
        # Sparser neighbourhood => larger (less reliable) estimate.
        sparseness = self.max_density - self.density(instance)
        return [Estimate(sparseness, ABSOLUTE, DENS_ABSOLUTE)]
867
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.
   
    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`
   
    :param estimators: List of different reliability estimation methods we
                       want to use on the chosen learner. If None (default),
                       a fresh list of SensitivityAnalysis,
                       LocalCrossValidation, BaggingVarianceCNeighbours,
                       Mahalanobis and MahalanobisToCenter is used.
    :type estimators: :obj:`list` of reliability estimators
   
    :param name: Name of this reliability learner
    :type name: string
   
    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        # Build the default estimator list per instance. A list in the
        # argument default would be created once at def time and shared
        # (together with the stateful estimator objects inside it) by
        # every Learner instance -- the mutable-default-argument pitfall.
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        self.blending = False


    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.
       
        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """

        blending_classifier = None
        new_domain = None

#        NOTE(review): this guard is intentionally disabled; the method is
#        designed for continuous (regression) classes.
#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
#            raise Exception("This method only works on data with continuous class.")

        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)

    def internal_cross_validation(self, instances, folds=10):
        """ Perform the internal cross validation for getting the best
        reliability estimate. It uses the reliability estimators defined in
        estimators attribute.

        Returns the id of the method that scored the best.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        res = Orange.evaluation.testing.cross_validation([self], instances, folds=folds)
        results = get_pearson_r(res)
        # Each result is a tuple whose last element is the method id;
        # pick the tuple with the highest correlation.
        sorted_results = sorted(results)
        return sorted_results[-1][3]

    def internal_cross_validation_testing(self, instances, folds=10):
        """ Perform internal cross validation (as in Automatic selection of
        reliability estimates for individual regression predictions,
        Zoran Bosnic, 2010) and return id of the method
        that scored best on this data.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        cv_indices = Orange.core.MakeRandomIndicesCV(instances, folds)

        list_of_rs = []

        # Accumulate the Pearson r of each method across all folds.
        sum_of_rs = defaultdict(float)

        for fold in xrange(folds):
            data = instances.select(cv_indices, fold)
            # Leave-one-out on very small folds, cross-validation otherwise.
            if len(data) < 10:
                res = Orange.evaluation.testing.leave_one_out([self], data)
            else:
                res = Orange.evaluation.testing.cross_validation([self], data)
            results = get_pearson_r(res)
            for r, _, _, method in results:
                sum_of_rs[method] += r
        sorted_sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
        return sorted_sum_of_rs[0][0]

    labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
972
class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Fit the wrapped learner on the full training data.
        self.classifier = box_learner(instances)

        # Fit every reliability estimator alongside it.
        self.estimation_classifiers = [e(instances, box_learner)
                                       for e in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is an instance of
        :class:`~Orange.evaluation.reliability.Estimate`,
        is added to the distribution object.
       
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`
       
        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Regression classifiers may return no distribution; create one so
        # the reliability estimates have a place to live.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Attach each estimator's output to the distribution object.
        for estimator in self.estimation_classifiers:
            estimates = estimator(instance, predicted, probabilities)
            probabilities.reliability_estimate.extend(estimates)

        # Return whatever combination the caller asked for.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
1040
1041# Functions for testing and plotting
1042#TODO Document those.
def get_acc_rel(method, data, learner):
    """Cross-validate *learner* wrapped with the reliability *method* on
    *data*; return parallel lists of reliability estimates and the
    probability assigned to the actual class (accuracy)."""
    wrapped = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    results = Orange.evaluation.testing.cross_validation([wrapped], data)

    rels, acc = [], []
    for res in results.results:
        probs = res.probabilities[0]
        rels.append(probs.reliability_estimate[0].estimate)
        acc.append(probs[res.actual_class])

    return rels, acc
1056
1057
def rel_acc_plot(rels, acc, file_name=None, colors=None):
    """Scatter-plot accuracy against reliability; show interactively, or
    save to *file_name* when one is given."""

    import matplotlib.pylab as plt

    point_colors = "k" if colors is None else colors
    plt.scatter(rels, acc, c=point_colors)
    plt.xlim(0., 1.)
    plt.ylim(ymin=0.)
    plt.xlabel("Reliability")
    plt.ylabel("Accuracy")
    if file_name is not None:
        plt.savefig(file_name)
    else:
        plt.show()
1073
def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
    """Compute reliability/accuracy pairs for *method* on *data* and plot
    them via :func:`rel_acc_plot`.

    Fixes over the original: ``plt`` was referenced without being imported
    in this scope (NameError), the plotting helper was misspelled as
    ``el_acc_plot``, and the ``(rels, acc)`` arguments were passed in
    swapped order.
    """
    import matplotlib.pylab as plt

    plt.clf()  # start from a clean figure

    rels, acc = get_acc_rel(method, data, learner)
    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)
1081
def acc_rel_correlation(method, data, learner):
    """Return the Spearman rank correlation between prediction accuracy
    and reliability estimates of *method* for *learner* on *data*."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    rho, _ = scipy.stats.spearmanr(acc, rels)
    return rho
# Note: See TracBrowser for help on using the repository browser.