source: orange-reliability/_reliability/__init__.py @ 13:f98f0417ceb6

Revision 13:f98f0417ceb6, 39.2 KB checked in by Matija Polajnar <matija.polajnar@…>, 22 months ago (diff)

Documentation of reference method for classification.

Line 
1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
# Labels and final variables
labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]

"""
# All the estimators calculation constants
DO_SA = 0
DO_BAGV = 1
DO_CNK = 2
DO_LCV = 3
DO_BVCK = 4
DO_MAHAL = 5
"""

# All the estimator method constants
# Integer ids stored in Estimate.method; METHOD_NAME below maps an id to a
# human-readable name.  (Ids 11 and 12 appear only in METHOD_NAME.)
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14
ERR_ABSOLUTE = 15

# Type of estimator constant
# Estimate.signed_or_absolute: SIGNED estimates correlate with the signed
# prediction error, ABSOLUTE ones with its magnitude.
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK_absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error"}

# Shared sampler used by BaggingVariance to draw bootstrap samples
# (indices sampled with replacement).
select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
select_with_repeat.random_generator = Orange.misc.Random()
54
def get_reliability_estimation_list(res, i):
    """
    Collect the i-th reliability estimate of every tested instance, plus
    that estimate's sign mode and method id (taken from the first result;
    all results carry the same mode and method at a given index).
    """
    estimates = [result.probabilities[0].reliability_estimate[i].estimate
                 for result in res.results]
    first = res.results[0].probabilities[0].reliability_estimate[i]
    return estimates, first.signed_or_absolute, first.method
57
def get_prediction_error_list(res):
    """Return the signed prediction error (actual - predicted) per result."""
    errors = []
    for result in res.results:
        errors.append(result.actual_class - result.classes[0])
    return errors
60
def get_description_list(res, i):
    """Return the textual description of the i-th estimate for every result."""
    descriptions = []
    for result in res.results:
        descriptions.append(result.probabilities[0].reliability_estimate[i].text_description)
    return descriptions
63
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of the
    used reliability estimates. Also, return the p-value of each of
    the coefficients.
    """
    prediction_error = get_prediction_error_list(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    results = []
    for i in xrange(n_estimates):
        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        try:
            # Signed estimates are correlated with the signed error,
            # absolute ones with its magnitude.
            if signed_or_absolute == SIGNED:
                r, p = statc.pearsonr(prediction_error, reliability_estimate)
            else:
                r, p = statc.pearsonr(map(abs, prediction_error), reliability_estimate)
        except Exception:
            # Best effort: report NaN when the coefficient cannot be computed.
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results
87
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of the
    used reliability estimates. Also, return the p-value of each of
    the coefficients.
    """
    prediction_error = get_prediction_error_list(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    results = []
    for i in xrange(n_estimates):
        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        try:
            # Signed estimates are correlated with the signed error,
            # absolute ones with its magnitude.
            if signed_or_absolute == SIGNED:
                r, p = statc.spearmanr(prediction_error, reliability_estimate)
            else:
                r, p = statc.spearmanr(map(abs, prediction_error), reliability_estimate)
        except Exception:
            # Best effort: report NaN when the coefficient cannot be computed.
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results
111
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return average Pearson's coefficient over all folds between prediction error
    and each of the used estimates.
    """
    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    number_of_instances = len(res.results)
    number_of_folds = len(results_by_fold)
    # Per-estimate accumulators: summed r over folds, sign mode, method id.
    results = [0 for _ in xrange(number_of_estimates)]
    sig = [0 for _ in xrange(number_of_estimates)]
    method_list = [0 for _ in xrange(number_of_estimates)]

    # Note: this loop rebinds the ``res`` parameter to each fold's results.
    for res in results_by_fold:
        prediction_error = get_prediction_error_list(res)
        for i in xrange(number_of_estimates):
            reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            try:
                # Signed estimates correlate with the signed error,
                # absolute ones with its magnitude.
                if signed_or_absolute == SIGNED:
                    r, _ = statc.pearsonr(prediction_error, reliability_estimate)
                else:
                    r, _ = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
            except Exception:
                # Best effort: a fold whose coefficient cannot be computed
                # contributes NaN (which propagates into the average).
                r = float("NaN")
            results[i] += r
            sig[i] = signed_or_absolute
            method_list[i] = method

    # Calculate p-values
    # Average the summed coefficients over the folds, then derive a p-value
    # from each averaged r and the total sample size.
    results = [float(res) / number_of_folds for res in results]
    ps = [p_value_from_r(r, number_of_instances) for r in results]

    return zip(results, ps, sig, method_list)
149
def p_value_from_r(r, n):
    """
    Calculate the p-value from the Pearson coefficient ``r`` and the
    sample size ``n``.
    """
    df = n - 2
    # The 1e-30 guard terms avoid a division by zero when |r| == 1.
    denom = (-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30)
    t = r * (df / denom) ** 0.5
    # Two-tailed p-value via the incomplete beta function.
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))
157
158
159# Distances between two discrete probability distributions
160#TODO Document those.
def normalize_both(p, q):
    """Normalize both distributions in place (when needed) and return them."""
    for distribution in (p, q):
        if not distribution.normalized:
            distribution.normalize()
    return p, q
167
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order ``m`` between two discrete distributions."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)
174
def manhattan_distance(p, q):
    """Manhattan (L1) distance between two discrete distributions."""
    return minkowsky_dist(p, q, 1)
177
def euclidean_dist(p, q):
    """Euclidean (L2) distance between two discrete distributions."""
    return minkowsky_dist(p, q, 2)
180
def variance_dist(p, q):
    """Squared Euclidean distance between two discrete distributions."""
    d = euclidean_dist(p, q)
    return d ** 2
183
def max_dist(p, q):
    """Chebyshev (maximum-component) distance between two distributions."""
    p, q = normalize_both(p, q)
    return max(abs(p[i] - q[i]) for i in range(len(p)))
187
def hellinger_dist(p, q):
    """
    Squared Hellinger distance between two discrete distributions
    (the sum of squared differences of the square roots, without the
    conventional 1/2 normalization factor).
    """
    p, q = normalize_both(p, q)
    return sum((math.sqrt(p[i]) - math.sqrt(q[i])) ** 2 for i in range(len(p)))
194
def my_log(x):
    """Return x * log(x), using the conventional limit value 0 at x == 0."""
    if x == 0:
        return 0
    return x * math.log(x)
197
def kullback_leibler(p, q):
    """
    Kullback-Leibler divergence D(p || q) between two discrete
    distributions.

    Terms with p[i] == 0 contribute nothing (the limit of x*log(x) at 0);
    terms with p[i] > 0 but q[i] == 0 would be infinite and are skipped to
    keep the result finite.
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        # Fix: the original summed my_log(p[i] - q[i]), i.e.
        # (p-q)*log(p-q), which is not the KL divergence and raises a
        # math domain error whenever p[i] < q[i].
        if p[i] > 0 and q[i] > 0:
            dist += p[i] * math.log(p[i] / q[i])
    return dist
204
def cosine(p, q):
    """Cosine distance (1 - cosine similarity) between two distributions."""
    p, q = normalize_both(p, q)
    # Materialize as plain lists so numpy can work on them.
    p, q = [pp for pp in p], [qq for qq in q]
    # Fix: the original called numpy.dot(x, y) with undefined names x and y,
    # raising NameError; the normalized distributions p and q are meant.
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
209
210
class Estimate:
    """
    Reliability estimate. Contains attributes that describe the results of
    reliability estimation.

    .. attribute:: estimate

        A numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Determines whether the method used gives a signed or absolute result.
        Has a value of either :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        An integer ID of reliability estimation method used.

    .. attribute:: method_name

        Name (string) of reliability estimation method used.

    .. attribute:: icv_method

        An integer ID of reliability estimation method that performed best,
        as determined by ICV, and of which estimate is stored in the
        :obj:`estimate` field. (:obj:`None` when ICV was not used.)

    .. attribute:: icv_method_name

        Name (string) of reliability estimation method that performed best,
        as determined by ICV. (:obj:`None` when ICV was not used.)

    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method=-1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        self.method_name = METHOD_NAME[method]
        self.icv_method = icv_method
        # -1 means "ICV not used"; otherwise resolve the best method's name.
        self.icv_method_name = "" if icv_method == -1 else METHOD_NAME[icv_method]
        # Filled in later by DescriptiveAnalysisClassifier.
        self.text_description = None
253
class DescriptiveAnalysis:
    # Wraps a reliability estimator and learns, via cross validation, the
    # value borders that map numeric estimates to the textual labels in
    # ``desc``.  The default lists are never mutated, so sharing them across
    # instances is harmless.
    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66]):
        self.desc = desc
        self.procentage = procentage
        self.estimator = estimator

    def __call__(self, instances, weight=None, **kwds):

        # Calculate borders using cross validation
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            sorted_estimates = sorted(abs(x) for x in estimates)
            # NOTE(review): for p == 0.00 the index evaluates to -1, i.e. the
            # *largest* sorted estimate rather than the smallest -- confirm
            # this is the intended border for the first label.
            borders = [sorted_estimates[int(len(estimates) * p) - 1]  for p in self.procentage]
            all_borders.append(borders)

        # Learn on whole train data
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
275
class DescriptiveAnalysisClassifier:
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """Predict, then attach a textual description to each estimate."""
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # Each estimate gets the label of the last border it reaches; the
        # first label is the fallback for values below every border.
        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            estimate.text_description = self.desc[0]
            for lower_border, label in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    estimate.text_description = label

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
298
class SensitivityAnalysis:
    """
   
    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats
   
    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`
   
    To estimate the reliability of prediction for given instance,
    the learning set is extended with this instance, labeled with
    :math:`K + \epsilon (l_{max} - l_{min})`,
    where :math:`K` denotes the initial prediction,
    :math:`\epsilon` is sensitivity parameter and :math:`l_{min}` and
    :math:`l_{max}` denote lower and the upper bound of the learning
    instances' labels. After computing different sensitivity predictions
    using different values of :math:`\epsilon`, the prediction are combined
    into SAvar and SAbias. SAbias can be used in a signed or absolute form.

    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0]):
        self.e = e

    def __call__(self, instances, learner):
        # The label range (l_min, l_max) scales the epsilon perturbations.
        class_values = [ex.getclass().value for ex in instances]
        return SensitivityAnalysisClassifier(self.e, instances,
                                             min(class_values), max(class_values),
                                             learner)
335
class SensitivityAnalysisClassifier:
    def __init__(self, e, instances, min_value, max_value, learner):
        # e: list of epsilon values; min/max_value: label range of the
        # training data, used to scale the perturbations.
        self.e = e
        self.instances = instances
        self.max_value = max_value
        self.min_value = min_value
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        """Return SAvar and SAbias (signed and absolute) for ``instance``."""
        # Copy the training data so the original table is not modified.
        r_data = Orange.data.Table(self.instances)

        # Create new instance
        modified_instance = Orange.data.Instance(instance)

        # Append it to the data; its class is overwritten on every iteration
        # below via r_data[-1].
        r_data.append(modified_instance)

        # Calculate SAvar & SAbias
        SAvar = SAbias = 0

        for eps in self.e:
            # +epsilon: relabel the appended instance above the prediction,
            # retrain, and record the new prediction K_eps.
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # -epsilon: same with the label below the prediction (K_-eps).
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # Accumulate this epsilon's contribution to SAvar and SAbias
            # (see the formulas in the SensitivityAnalysis docstring).
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
381
382
383
class ReferenceExpectedError:
    """

    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`

    Reference reliability estimation method for classification as used in Evaluating Reliability of Single
    Classifications of Neural Networks, Darko Pevec, 2011.

    :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`

    where :math:`\hat y` is the estimated probability of the predicted class.

    Note that for this method, in contrast with all others, a greater estimate means lower reliability (greater
    expected error).

    """
    def __init__(self, name="reference"):
        self.name = name

    def __call__(self, instances, learner):
        # Train on the full data; the classifier's probabilities are all
        # the reference method needs.
        return ReferenceExpectedErrorClassifier(learner(instances))
406
407   
class ReferenceExpectedErrorClassifier:

    def __init__(self, classifier):
        self.classifier = classifier

    def __call__(self, instance, *args):
        """Return 2*y(1-y) where y is the highest predicted class probability."""
        probabilities = self.classifier(instance, Orange.classification.Classifier.GetProbabilities)
        y_hat = max(probabilities)
        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
416
417   
418
class BaggingVariance:
    """
   
    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
   
    :math:`m` different bagging models are constructed and used to estimate
    the value of dependent variable for a given instance. In regression,
    the variance of those predictions is used as a prediction reliability
    estimate.

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
    predictions of individual constructed models. Note that a greater value
    implies greater error.

    For classification, 1 minus the average Euclidean distance between class
    probability distributions predicted by the model, and distributions
    predicted by the individual bagged models, is used as the BAGV reliability
    measure. Note that in this case a greater value implies a better
    prediction.
   
    """
    def __init__(self, m=50, name="bv"):
        self.m = m
        self.name = name

    def __call__(self, instances, learner):
        # For classification an extra model trained on all the data is
        # needed: its class distribution is the reference the bagged
        # models are compared against.
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        # Train m models, each on a bootstrap sample (with replacement).
        classifiers = []
        for _ in xrange(self.m):
            sample = instances.select(select_with_repeat(len(instances)))
            classifiers.append(learner(sample))
        return BaggingVarianceClassifier(classifiers, classifier)
463
class BaggingVarianceClassifier:
    def __init__(self, classifiers, classifier=None):
        # classifiers: the bagged models; classifier: model trained on the
        # whole data (classification only, None for regression).
        self.classifiers = classifiers
        self.classifier = classifier

    def __call__(self, instance, *args):
        """Return the BAGV reliability estimate for ``instance``."""
        var_type = instance.domain.class_var.var_type
        if var_type == Orange.feature.Descriptor.Continuous:
            # Regression: values are the bagged models' predictions.
            values = [c(instance, Orange.core.GetValue).value
                      for c in self.classifiers if c is not None]
        elif var_type == Orange.feature.Descriptor.Discrete:
            # Classification: values are the distances between each bagged
            # model's distribution and the full-data model's distribution.
            reference = self.classifier(instance, Orange.core.GetProbabilities)
            values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), reference)
                      for c in self.classifiers if c is not None]
        mean = sum(values) / len(values)

        BAGV = sum((value - mean) ** 2 for value in values) / len(values)
        if var_type == Orange.feature.Descriptor.Discrete:
            # For classification a greater value means a better prediction.
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
485
class LocalCrossValidation:
    """

    :param k: Number of nearest neighbours used in LCV estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification reliability estimation,
        use an average distance between distributions, weighted by :math:`e^{-d}`,
        where :math:`d` is the distance between predicted instance and the
        neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    :math:`k` nearest neighbours to the given instance are found and put in
    a separate data set. On this data set, a leave-one-out validation is
    performed. Reliability estimate for regression is then the distance
    weighted absolute prediction error. In classification, 1 minus the average
    distance between the predicted class probability distribution and the
    (trivial) probability distributions of the nearest neighbour.

    If a special value 0 is passed as :math:`k` (as is by default),
    it is set as 1/20 of data set size (or 5, whichever is greater).

    Summary of the algorithm for regression:

    1. Determine the set of k nearest neighours :math:`N = { (x_1, c_1),...,
       (x_k, c_k)}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # Fix: resolve the default k into a local variable instead of
        # assigning it back to self.k -- the original mutated the
        # estimator's state on the first call, so a later call on a
        # different-sized data set silently reused the stale k.
        k = self.k
        if k == 0:
            k = max(5, len(instances) / 20)

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
541
class LocalCrossValidationClassifier:
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        # distance_id: meta attribute id under which the neighbour search
        # stores each neighbour's distance to the queried instance.
        self.distance_id = distance_id
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        # Extra options (``distance``, ``distance_weighted``) become attributes.
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        """Return the LCV reliability estimate for ``instance``."""
        LCVer = 0
        LCVdi = 0

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            # Train on all neighbours except the i-th, held-out one.
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute prediction error on the held-out neighbour.
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the neighbour's trivial 0/1 distribution (the equality
                # comparisons yield booleans, which act as 0/1 probabilities).
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight the error by e^(-d), d being the neighbour's distance to
            # the queried instance, when distance weighting is enabled.
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # For classification a greater value means a better prediction.
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
587
class CNeighbours:
    """
   
    :param k: Number of nearest neighbours used in CNK estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function
   
    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`
   
    For regression, CNK is defined for an unlabeled instance as a difference
    between average label of its nearest neighbours and its prediction. CNK
    can be used as a signed or absolute estimate.
   
    :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`
   
    where :math:`k` denotes number of neighbors, C :sub:`i` denotes neighbours'
    labels and :math:`K` denotes the instance's prediction. Note that a greater
    value implies greater prediction error.

    For classification, CNK is equal to 1 minus the average distance between
    predicted class distribution and (trivial) class distributions of the
    $k$ nearest neighbours from the learning set. Note that in this case
    a greater value implies better prediction.
   
    """
    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        # Build a Euclidean nearest-neighbour search over the training data.
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Euclidean()
        meta_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = finder(instances, 0, meta_id)
        return CNeighboursClassifier(nearest_neighbours, self.k, distance=self.distance)
629
class CNeighboursClassifier:
    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        """Return the CNK reliability estimate(s) for ``instance``."""
        CNK = 0

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Fix: the original tested ``ex.domain`` here, relying on the loop
        # variable leaking out of the list comprehension above (Python 2
        # only, and it named the wrong object anyway); the queried
        # instance's domain is meant.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Regression: difference between the neighbours' average label
            # and the prediction.
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # Classification: 1 minus the average distance between the
            # predicted distribution and the neighbours' kNN distributions.
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
661
class Mahalanobis:
    """
   
    :param k: Number of nearest neighbours used in Mahalanobis estimate.
    :type k: int
   
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`
   
    Mahalanobis distance reliability estimate is defined as
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    to the evaluated instance's :math:`k` nearest neighbours.

   
    """
    def __init__(self, k=3):
        self.k = k

    def __call__(self, instances, *args):
        # Neighbour search using the Mahalanobis metric; each neighbour's
        # distance is stored under the freshly created meta attribute id.
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Mahalanobis()
        meta_id = Orange.feature.Descriptor.new_meta_id()
        return MahalanobisClassifier(self.k, finder(instances, 0, meta_id), meta_id)
686
class MahalanobisClassifier:
    def __init__(self, k, nnm, mid):
        # nnm: nearest-neighbour finder; mid: meta attribute id holding
        # each neighbour's distance.
        self.k = k
        self.nnm = nnm
        self.mid = mid

    def __call__(self, instance, *args):
        """Sum of Mahalanobis distances to the k nearest neighbours."""
        neighbours = self.nnm(instance, self.k)
        total = sum(ex[self.mid].value for ex in neighbours)
        return [Estimate(total, ABSOLUTE, MAHAL_ABSOLUTE)]
699
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`
   
    Mahalanobis distance to center reliability estimate is defined as a
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.

   
    """
    def __init__(self):
        pass

    def __call__(self, instances, *args):
        # Continuize the domain: ignore the class, normalize continuous
        # features by span, expand multinomial features to N indicator values.
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = dc(instances)
        new_instances = instances.translate(new_domain)

        # Centroid: the feature-wise average of the continuized data.
        X, _, _ = new_instances.to_numpy()
        instance_avg = numpy.average(X, 0)

        distance_constructor = Orange.distance.Mahalanobis()
        distance = distance_constructor(new_instances)

        # The synthetic centroid instance gets an unknown ("?") class value.
        average_instance = Orange.data.Instance(new_instances.domain, list(instance_avg) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
731
class MahalanobisToCenterClassifier:
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance
        self.average_instance = average_instance
        self.new_domain = new_domain

    def __call__(self, instance, *args):
        """Mahalanobis distance from ``instance`` to the data centroid."""
        # Translate the instance into the continuized domain first.
        translated = Orange.data.Instance(self.new_domain, instance)
        d = self.distance(translated, self.average_instance)
        return [Estimate(d, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE)]
745
746
class BaggingVarianceCNeighbours:
    """
   
    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`
   
    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
   
    BVCK is a combination (average) of Bagging variance and local modeling of
    prediction error.
   
    """
    def __init__(self, bagv=None, cnk=None):
        # Fix: the original defaults (bagv=BaggingVariance(), cnk=CNeighbours())
        # were evaluated once at class-definition time, so every
        # BaggingVarianceCNeighbours created without arguments shared the
        # same two estimator objects.  Create fresh defaults per instance.
        self.bagv = bagv if bagv is not None else BaggingVariance()
        self.cnk = cnk if cnk is not None else CNeighbours()

    def __call__(self, instances, learner):
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
770
class BaggingVarianceCNeighboursClassifier:
    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        """Return BVCK (mean of BAGV and absolute CNK) plus both components."""
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # NOTE(review): cnk_estimates[1] (the absolute CNK) exists only on the
        # regression path of CNeighboursClassifier; for discrete classes it
        # returns a single estimate and this indexing would raise IndexError
        # -- confirm BVCK is intended for regression only.
        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        bvck_estimates = [ Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE) ]
        bvck_estimates.extend(bagv_estimates)
        bvck_estimates.extend(cnk_estimates)
        return bvck_estimates
785
class ErrorPredicting:
    def __init__(self):
        pass

    def __call__(self, instances, learner):
        """Build a random forest that predicts the learner's signed error."""
        # Signed prediction errors obtained by cross-validating the learner.
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Same features, but the class is the prediction error ("pe").
        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        new_dataset = Orange.data.Table(new_domain, instances)

        # NOTE(review): errors are assigned positionally -- this assumes
        # res.results come back in the same order as ``instances``; confirm.
        for instance, prediction_error in izip(new_dataset, prediction_errors):
            instance.set_class(prediction_error)

        rf = Orange.ensemble.forest.RandomForestLearner()
        rf_classifier = rf(new_dataset)

        return ErrorPredictingClassification(rf_classifier, new_domain)
804
class ErrorPredictingClassification:
    """Estimates the expected prediction error of a new instance with the
    random forest built by :class:`ErrorPredicting`."""
    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier
        self.new_domain = new_domain

    def __call__(self, instance, predicted, probabilities):
        # Map the instance into the error-prediction domain, then query the
        # forest for the expected (signed) error.
        converted = Orange.data.Instance(self.new_domain, instance)
        error = self.rf_classifier(converted, Orange.core.GetValue)

        # NOTE(review): the estimate is tagged SABIAS_SIGNED rather than a
        # dedicated error-prediction method id — confirm this is intended.
        return [Estimate(error.value, SIGNED, SABIAS_SIGNED)]
815
def gauss_kernel(x, sigma=1):
    """Gaussian (normal) kernel with standard deviation *sigma* at *x*."""
    coefficient = 1. / (sigma * math.sqrt(2 * math.pi))
    return coefficient * math.exp(-0.5 * (x / sigma) ** 2)
818
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Returns a value that estimates a density of problem space around the
    instance being predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances, learner):
        # Capture the fitted distance in a local variable: the closure used
        # to read ``self.distance``, so a second call to this estimator
        # silently changed the behavior of every previously returned
        # classifier.  ``self.distance`` is still set for backward
        # compatibility with any external readers.
        distance = self.d_measure(instances)
        self.distance = distance
        kernel = self.K

        def density(x):
            # Parzen-window estimate: mean kernel value of the distances
            # from x to every training instance.
            total = 0
            for ex in instances:
                total += kernel(distance(x, ex))
            return total / len(instances)

        # Generator expression: no need to materialize the density list.
        max_density = max(density(ex) for ex in instances)

        return ParzenWindowDensityBasedClassifier(density, max_density)
850
class ParzenWindowDensityBasedClassifier:
    """Turns a density function into a reliability estimate: the estimate
    is the gap between the maximum density seen on the training data and
    the density at the given instance (sparser region => larger value)."""

    def __init__(self, density, max_density):
        self.density = density
        self.max_density = max_density

    def __call__(self, instance, *args):
        gap = self.max_density - self.density(instance)
        return [Estimate(gap, ABSOLUTE, DENS_ABSOLUTE)]
863
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.
   
    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`
   
    :param estimators: List of different reliability estimation methods we
        want to use on the chosen learner; when None, a fresh default list
        (SA, LCV, BVCK, Mahalanobis, Mahalanobis-to-center) is built.
    :type estimators: :obj:`list` of reliability estimators
   
    :param name: Name of this reliability learner
    :type name: string
   
    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        # Build the default estimator list per instance.  A list constructed
        # in the signature is a shared mutable default: every Learner using
        # the defaults would share the very same estimator objects, and
        # mutating the list on one Learner would affect all others.
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        self.blending = False


    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.
       
        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """
#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
#            raise Exception("This method only works on data with continuous class.")

        # Blending is not performed here: no blending domain or blending
        # classifier is built, so both are passed as None.
        return Classifier(instances, self.box_learner, self.estimators,
                          self.blending, None, None)

    def internal_cross_validation(self, instances, folds=10):
        """ Perform the internal cross validation for getting the best
        reliability estimate. It uses the reliability estimators defined in
        estimators attribute.

        Returns the id of the method that scored the best.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        res = Orange.evaluation.testing.cross_validation([self], instances, folds=folds)
        results = get_pearson_r(res)
        # The result tuples sort primarily by the correlation r; the last
        # (best) tuple carries the method id at index 3.
        sorted_results = sorted(results)
        return sorted_results[-1][3]

    def internal_cross_validation_testing(self, instances, folds=10):
        """ Perform internal cross validation (as in Automatic selection of
        reliability estimates for individual regression predictions,
        Zoran Bosnic, 2010) and return id of the method
        that scored best on this data.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        cv_indices = Orange.core.MakeRandomIndicesCV(instances, folds)

        sum_of_rs = defaultdict(float)

        # Sum each method's Pearson r over all folds; fall back to
        # leave-one-out on very small folds where CV would be unreliable.
        for fold in xrange(folds):
            data = instances.select(cv_indices, fold)
            if len(data) < 10:
                res = Orange.evaluation.testing.leave_one_out([self], data)
            else:
                res = Orange.evaluation.testing.cross_validation([self], data)
            for r, _, _, method in get_pearson_r(res):
                sum_of_rs[method] += r
        sorted_sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
        return sorted_sum_of_rs[0][0]

    labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
968
class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Fit the wrapped learner on the full training data.
        self.classifier = box_learner(instances)

        # Fit every reliability estimator as well; each yields a callable
        # producing Estimate objects for a single instance.
        self.estimation_classifiers = [est(instances, box_learner)
                                       for est in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is an instance of
        :class:`~Orange.evaluation.reliability.Estimate`,
        is added to the distribution object.
       
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`
       
        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Regressors may return no distribution; supply an empty one so the
        # reliability estimates always have somewhere to live.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Gather the estimates from every trained estimation classifier.
        for est_classifier in self.estimation_classifiers:
            probabilities.reliability_estimate.extend(
                est_classifier(instance, predicted, probabilities))

        # Return the shape of result the caller asked for.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
1036
1037# Functions for testing and plotting
1038#TODO Document those.
def get_acc_rel(method, data, learner):
    """Cross-validate *learner* wrapped with the single reliability *method*
    and return two parallel lists: the reliability estimate and the accuracy
    (probability assigned to the true class) for each tested instance."""
    reliability = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels = [res.probabilities[0].reliability_estimate[0].estimate
            for res in results.results]
    acc = [res.probabilities[0][res.actual_class] for res in results.results]

    return rels, acc
1052
1053
def rel_acc_plot(rels, acc, file_name=None, colors=None):
    """Scatter-plot accuracy against reliability.  Shows the plot
    interactively when *file_name* is None, otherwise saves it there."""
    import matplotlib.pylab as plt

    plt.scatter(rels, acc, c="k" if colors is None else colors)
    plt.xlim(0., 1.)
    plt.ylim(ymin=0.)
    plt.xlabel("Reliability")
    plt.ylabel("Accuracy")
    if file_name is None:
        plt.show()
    else:
        plt.savefig(file_name)
1069
def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
    """Compute reliability/accuracy pairs for *method* and plot them.

    Fixes three bugs in the previous revision: ``plt`` is now imported
    before ``clf()`` is called (it was a NameError), the helper is spelled
    ``rel_acc_plot`` (the undefined name ``el_acc_plot`` was called), and
    the arguments are passed in the ``(rels, acc)`` order that
    :func:`rel_acc_plot` expects (they were swapped).
    """
    import matplotlib.pylab as plt
    plt.clf()  # start from a clean figure

    rels, acc = get_acc_rel(method, data, learner)
    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)
1076   
1077
def acc_rel_correlation(method, data, learner):
    """Spearman rank correlation between accuracy and the reliability
    estimates produced by *method* on *data*."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    correlation, _pvalue = scipy.stats.spearmanr(acc, rels)
    return correlation
# (Removed Trac repository-browser footer accidentally captured from the web view; it is not Python source.)