source: orange-reliability/orangecontrib/reliability/__init__.py @ 54:2a1c28cec845

Revision 54:2a1c28cec845, 44.4 KB checked in by markotoplak, 6 months ago

Documentation updates.

1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
12# All the estimator method constants
13SAVAR_ABSOLUTE = 0
14SABIAS_SIGNED = 1
15SABIAS_ABSOLUTE = 2
16BAGV_ABSOLUTE = 3
17CNK_SIGNED = 4
18CNK_ABSOLUTE = 5
19LCV_ABSOLUTE = 6
20BVCK_ABSOLUTE = 7
21MAHAL_ABSOLUTE = 8
22BLENDING_ABSOLUTE = 9
23ICV_METHOD = 10
24MAHAL_TO_CENTER_ABSOLUTE = 13
25DENS_ABSOLUTE = 14
26ERR_ABSOLUTE = 15
27STACKING = 101
28
29# Type of estimator constant
30SIGNED = 0
31ABSOLUTE = 1
32
33# Names of all the estimator methods
34METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
35               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
36               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
37               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
38               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
39               101: "Stacking" }
40
41def get_reliability_estimation_list(res, i):
42    return [ result.probabilities[0].reliability_estimate[i].estimate for result in res.results], \
43        res.results[0].probabilities[0].reliability_estimate[i].signed_or_absolute, \
44        res.results[0].probabilities[0].reliability_estimate[i].method
45
46def get_prediction_error_list(res):
47    return [result.actual_class - result.classes[0] for result in res.results]
48
49def get_description_list(res, i):
50    return [result.probabilities[0].reliability_estimate[i].text_description for result in res.results]
51
52def get_pearson_r(res):
53    """
54    :param res: Evaluation results with :obj:`reliability_estimate`.
55    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
56
57    Compute Pearson correlation coefficients (with p-values) between
58    prediction errors and reliability estimates.
59    """
60    prediction_error = get_prediction_error_list(res)
61    results = []
62    for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
63        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
64        try:
65            if signed_or_absolute == SIGNED:
66                r, p = statc.pearsonr(prediction_error, reliability_estimate)
67            else:
68                r, p = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
69        except Exception:
70            r = p = float("NaN")
71        results.append((r, p, signed_or_absolute, method))
72    return results
73
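# An illustrative usage sketch (assumes the bundled "housing" data set and the
# Learner wrapper defined further below): cross-validate a reliability-wrapped
# learner and correlate each estimate with the observed prediction errors.
def _correlation_scoring_sketch():
    data = Orange.data.Table("housing")
    rel = Learner(Orange.classification.knn.kNNLearner())
    res = Orange.evaluation.testing.cross_validation([rel], data)
    # Each returned tuple is (r, p, signed_or_absolute, method).
    return get_pearson_r(res)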
74def get_spearman_r(res):
75    """
76    :param res: Evaluation results with :obj:`reliability_estimate`.
77    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
78
79    Compute Spearman correlation coefficients (with p-values) between
80    prediction errors and reliability estimates.
81    """
82    prediction_error = get_prediction_error_list(res)
83    results = []
84    for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
85        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
86        try:
87            if signed_or_absolute == SIGNED:
88                r, p = statc.spearmanr(prediction_error, reliability_estimate)
89            else:
90                r, p = statc.spearmanr([abs(pe) for pe in prediction_error], reliability_estimate)
91        except Exception:
92            r = p = float("NaN")
93        results.append((r, p, signed_or_absolute, method))
94    return results
95
96def get_pearson_r_by_iterations(res):
97    """
98    :param res: Evaluation results with :obj:`reliability_estimate`.
99    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
100
101    Compute Pearson correlation coefficients between prediction errors
102    and reliability estimates, averaged over all cross-validation folds.
103    """
104    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
105    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
106    number_of_instances = len(res.results)
107    number_of_folds = len(results_by_fold)
108    results = [0 for _ in xrange(number_of_estimates)]
110    sig = [0 for _ in xrange(number_of_estimates)]
111    method_list = [0 for _ in xrange(number_of_estimates)]
112
113    for res in results_by_fold:
114        prediction_error = get_prediction_error_list(res)
115        for i in xrange(number_of_estimates):
116            reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
117            try:
118                if signed_or_absolute == SIGNED:
119                    r, _ = statc.pearsonr(prediction_error, reliability_estimate)
120                else:
121                    r, _ = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
122            except Exception:
123                r = float("NaN")
124            results[i] += r
125            sig[i] = signed_or_absolute
126            method_list[i] = method
127
128    # Calculate p-values
129    results = [float(res) / number_of_folds for res in results]
130    ps = [p_value_from_r(r, number_of_instances) for r in results]
131
132    return zip(results, ps, sig, method_list)
133
134def p_value_from_r(r, n):
135    """
136    Calculate the two-sided p-value from a Pearson correlation coefficient and the sample size.
137    """
138    df = n - 2
139    t = r * (df / ((-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30))) ** 0.5
140    return statc.betai (df * 0.5, 0.5, df / (df + t * t))
141
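# An illustrative cross-check of p_value_from_r (assumes scipy is installed):
# the same two-sided p-value obtained through the t distribution with
# n - 2 degrees of freedom, using t = r * sqrt((n - 2) / (1 - r^2)).
def _p_value_cross_check(r=0.5, n=20):
    import scipy.stats
    t = r * ((n - 2) / (1.0 - r ** 2)) ** 0.5
    return 2 * scipy.stats.t.sf(abs(t), n - 2)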
142
143# Distances between two discrete probability distributions
144#TODO Document those.
145def normalize_both(p, q):
146    if not p.normalized:
147        p.normalize()
148    if not q.normalized:
149        q.normalize()
150    return p, q
151
152def minkowsky_dist(p, q, m=2):
153    p, q = normalize_both(p, q)
154    dist = 0
155    for i in range(len(p)):
156        dist += abs(p[i]-q[i])**m
157    return dist**(1./m)
158
159def manhattan_distance(p, q):
160    return minkowsky_dist(p, q, m=1)
161
162def euclidean_dist(p, q):
163    return minkowsky_dist(p, q, m=2)
164
165def variance_dist(p, q):
166    return euclidean_dist(p, q) ** 2
167
168def max_dist(p, q):
169    p, q = normalize_both(p, q)
170    return max([abs(p[i]-q[i]) for i in range(len(p))])
171
172def hellinger_dist(p, q):
173    p, q = normalize_both(p, q)
174    dist = 0
175    for i in range(len(p)):
176        dist += (math.sqrt(p[i])-math.sqrt(q[i])) ** 2
177    return dist
178
179def my_log(x):
180    return 0 if x == 0 else x * math.log(x)
181
182def kullback_leibler(p, q):
183    p, q = normalize_both(p, q)
184    dist = 0
185    for i in range(len(p)):
186        dist += p[i] * math.log(p[i] / q[i]) if p[i] > 0 and q[i] > 0 else 0
187    return dist
188
189def cosine(p, q):
190    p, q = normalize_both(p, q)
191    p, q = [pp for pp in p], [qq for qq in q]
192    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
193
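# An illustrative sketch of the distances above on two discrete distributions,
# built with the same list-based constructor that LocalCrossValidation uses below.
def _distribution_distance_sketch():
    p = Orange.statistics.distribution.Discrete([0.2, 0.8])
    q = Orange.statistics.distribution.Discrete([0.5, 0.5])
    return hellinger_dist(p, q), euclidean_dist(p, q), max_dist(p, q)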
194
195class Estimate:
196    """
197    Describes a reliability estimate.
198
199    .. attribute:: estimate
200
201        Value of reliability.
202
203    .. attribute:: signed_or_absolute
204
205        Determines whether the method returned a signed or absolute result.
206        Has a value of either :obj:`SIGNED` or :obj:`ABSOLUTE`.
207
208    .. attribute:: method
209
210        An integer ID of the reliability estimation method used.
211
212    .. attribute:: method_name
213
214        Name (string) of the reliability estimation method used.
215
216    """
217    def __init__(self, estimate, signed_or_absolute, method):
218        self.estimate = estimate
219        self.signed_or_absolute = signed_or_absolute
220        self.method = method
221        self.method_name = METHOD_NAME[method]
222        self.text_description = None
223
224class DescriptiveAnalysis:
225    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66], name="da"):
226        self.desc = desc
227        self.procentage = procentage
228        self.estimator = estimator
229        self.name = name
230
231    def __call__(self, instances, weight=None, **kwds):
232
233        # Calculate borders using cross validation
234        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
235        all_borders = []
236        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
237            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
238            sorted_estimates = sorted(abs(x) for x in estimates)
239            borders = [sorted_estimates[int(len(estimates) * p) - 1]  for p in self.procentage]
240            all_borders.append(borders)
241
242        # Learn on whole train data
243        estimator_classifier = self.estimator(instances)
244
245        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
246
247class DescriptiveAnalysisClassifier:
248    def __init__(self, estimator_classifier, all_borders, desc):
249        self.estimator_classifier = estimator_classifier
250        self.all_borders = all_borders
251        self.desc = desc
252
253    def __call__(self, instance, result_type=Orange.core.GetValue):
254        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)
255
256        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
257            estimate.text_description = self.desc[0]
258            for lower_border, text_desc in zip(borders, self.desc):
259                if estimate.estimate >= lower_border:
260                    estimate.text_description = text_desc
261
262        # Return the appropriate type of result
263        if result_type == Orange.core.GetValue:
264            return predicted
265        elif result_type == Orange.core.GetProbabilities:
266            return probabilities
267        else:
268            return predicted, probabilities
269
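# An illustrative usage sketch (assumes the bundled "housing" data set): wrap a
# reliability Learner (defined below) with DescriptiveAnalysis so that every
# estimate also carries a textual description ("high"/"medium"/"low").
def _descriptive_analysis_sketch():
    data = Orange.data.Table("housing")
    da = DescriptiveAnalysis(Learner(Orange.classification.knn.kNNLearner()))
    classifier = da(data)
    _, probabilities = classifier(data[0], Orange.core.GetBoth)
    return [(e.method_name, e.estimate, e.text_description)
            for e in probabilities.reliability_estimate]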
270class SensitivityAnalysis:
271    """
272   
273    :param e: Values of :math:`\epsilon`.
274    :type e: list of floats
275   
276    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`
277   
278    The learning set is extended with the instance being predicted, whose label is set to
279    :math:`K + \epsilon (l_{max} - l_{min})` (:math:`K` is the initial prediction,
280    :math:`\epsilon` a sensitivity parameter, and :math:`l_{min}` and
281    :math:`l_{max}` the lower and upper bounds of labels on training data).
282    Results for multiple values of :math:`\epsilon` are combined
283    into SAvar and SAbias. SAbias has a signed or absolute form.
284
285    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`
286    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
287   
288   
289    """
290    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0], name="sa"):
291        self.e = e
292        self.name = name
293
294    def __call__(self, instances, learner):
295        min_value = max_value = instances[0].getclass().value
296        for ex in instances:
297            if ex.getclass().value > max_value:
298                max_value = ex.getclass().value
299            if ex.getclass().value < min_value:
300                min_value = ex.getclass().value
301        return SensitivityAnalysisClassifier(self.e, instances, min_value, max_value, learner)
302
303class SensitivityAnalysisClassifier:
304    def __init__(self, e, instances, min_value, max_value, learner):
305        self.e = e
306        self.instances = instances
307        self.max_value = max_value
308        self.min_value = min_value
309        self.learner = learner
310
311    def __call__(self, instance, predicted, probabilities):
312        # Create new dataset
313        r_data = Orange.data.Table(self.instances)
314
315        # Create new instance
316        modified_instance = Orange.data.Instance(instance)
317
318        # Append it to the data
319        r_data.append(modified_instance)
320
321        # Calculate SAvar & SAbias
322        SAvar = SAbias = 0
323
324        for eps in self.e:
325            # +epsilon
326            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
327            c = self.learner(r_data)
328            k_plus = c(instance, Orange.core.GetValue)
329
330            # -epsilon
331            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
332            c = self.learner(r_data)
333            k_minus = c(instance, Orange.core.GetValue)
334            #print len(r_data)
335            #print eps*(self.max_value - self.min_value)
336            #print k_plus
337            #print k_minus
338            # calculate part SAvar and SAbias
339            SAvar += k_plus.value - k_minus.value
340            SAbias += k_plus.value + k_minus.value - 2 * predicted.value
341
342        SAvar /= len(self.e)
343        SAbias /= 2 * len(self.e)
344
345        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
346                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
347                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
348
349
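# A worked numeric example of the formulas above for E = {eps_1, eps_2}:
# SAvar is the mean spread between the +eps and -eps predictions, SAbias the
# mean signed shift from the initial prediction K (the values are made up).
def _sensitivity_analysis_example(k=10.0, k_plus=(10.4, 10.9), k_minus=(9.8, 9.2)):
    savar = sum(kp - km for kp, km in zip(k_plus, k_minus)) / float(len(k_plus))
    sabias = sum((kp - k) + (km - k)
                 for kp, km in zip(k_plus, k_minus)) / (2.0 * len(k_plus))
    return savar, sabias   # here: (1.15, 0.075)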
350
351class ReferenceExpectedError:
352    """
353
354    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`
355
356    Reference estimate for classification: :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`, where :math:`\hat y` is the estimated probability of the predicted class [Pevec2011]_.
357
358    A greater estimate means a greater expected error.
359
360    """
361    def __init__(self, name="reference"):
362        self.name = name
363
364    def __call__(self, instances, learner):
365        classifier = learner(instances)
366        return ReferenceExpectedErrorClassifier(classifier)
367
368   
369class ReferenceExpectedErrorClassifier:
370
371    def __init__(self, classifier):
372        self.classifier = classifier
373
374    def __call__(self, instance, *args):
375        y_hat = max(self.classifier(instance, Orange.classification.Classifier.GetProbabilities))
376        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
377
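# A worked example of the formula above: if the predicted class receives
# probability 0.8, the reference expected error is 2 * 0.8 * 0.2 = 0.32; it is
# largest (0.5) for a maximally uncertain binary prediction.
def _reference_error_example(y_hat=0.8):
    return 2 * y_hat * (1 - y_hat)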
378
379class BaggingVariance:
380    """
381   
382    :param m: Number of bagged models. Default: 50.
383    :type m: int
384   
385    :param for_instances:  Optional. If test instances
386      are given as a parameter, this class can compute their reliabilities
387      on the fly, which saves memory.
388
389    :type for_instances: Orange.data.Table
390   
391    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
392   
393    For regression, BAGV is the variance of predictions:
394
395    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`, where
396    :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
397    predictions of individual models.
398
399    For classification, BAGV is 1 minus the average Euclidean
400    distance between class probability distributions predicted by the
401    model and the distributions predicted by the individual bagged models;
402    a greater value implies a better prediction.
403
404    This reliability measure can run out of memory if individual classifiers themselves
405    use a lot of memory; it needs :math:`m` times the memory
406    of a single classifier.
407    """
408    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
409
410        self.m = m
411        self.name = name
412        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
413        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
414        self.for_instances = for_instances
415
416    def __call__(self, instances, learner):
417        classifiers = []
418
419        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
420            classifier = learner(instances)
421        else:
422            classifier = None
423
424        for_inst_class = defaultdict(list)
425        this_iteration = None
426       
427        if self.for_instances:
428            his = map(_hashable_instance, self.for_instances)
429
430        # Create bagged classifiers using sampling with replacement
431        for i in xrange(self.m):
432            this_iteration = set()
433            selection = self.select_with_repeat(len(instances))
434            data = instances.select(selection)
435            cl = learner(data)
436            if cl:
437                if self.for_instances: # predict reliability for testing instances and throw cl away
438                    for instance, hi in zip(self.for_instances, his):
439                        if hi not in this_iteration:
440                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
441                            this_iteration.add(hi)
442                else:
443                    classifiers.append(cl)
444
445        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))
446
447class BaggingVarianceClassifier:
448    def __init__(self, classifiers, classifier=None, for_inst_class=None):
449        self.classifiers = classifiers
450        self.classifier = classifier
451        self.for_inst_class = for_inst_class
452
453    def __call__(self, instance, *args):
454        BAGV = 0
455
456        # Calculate the bagging variance
457        if self.for_inst_class:
458            bagged_values = self.for_inst_class[_hashable_instance(instance)]
459        else:
460            bagged_values = [ _bagged_value(instance, c, self.classifier) for c in self.classifiers ]
461
462        k = sum(bagged_values) / len(bagged_values)
463
464        BAGV = sum((bagged_value - k) ** 2 for bagged_value in bagged_values) / len(bagged_values)
465        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
466            BAGV = 1 - BAGV
467
468        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
469
470def _hashable_instance(instance):
471    return tuple(instance[i].value for i in range(len(instance.domain.attributes)))
472
473def _bagged_value(instance, c, classifier):
474    if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
475        return c(instance, Orange.core.GetValue).value
476    elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
477        estimate = classifier(instance, Orange.core.GetProbabilities)
478        return euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate)
479
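# A worked example of the regression BAGV formula above: the variance of the
# individual bagged predictions around their mean (the values are made up).
def _bagv_example(bagged_values=(2.1, 2.4, 1.9, 2.0)):
    k = sum(bagged_values) / float(len(bagged_values))
    return sum((v - k) ** 2 for v in bagged_values) / len(bagged_values)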
480
481class LocalCrossValidation:
482    """
483
484    :param k: Number of nearest neighbours used. Default: 0, which denotes
485        1/20 of data set size (or 5, whichever is greater).
486    :type k: int
487
488    :param distance: Function that computes a distance between two discrete
489        distributions (used only in classification problems). The default
490        is Hellinger distance.
491    :type distance: function
492
493    :param distance_weighted: Relevant only for classification;
494        use an average distance between distributions, weighted by :math:`e^{-d}`,
495        where :math:`d` is the distance between the predicted instance and the
496        neighbour.
497
498    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`
499
500    Leave-one-out validation is
501    performed on :math:`k` nearest neighbours to the given instance.
502    Reliability estimate for regression is then the distance
503    weighted absolute prediction error. For classification, it is 1 minus the average
504    distance between the predicted class probability distribution and the
505    (trivial) probability distributions of the nearest neighbours.
506    """
507    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
508        self.k = k
509        self.distance = distance
510        self.distance_weighted = distance_weighted
511        self.name = name
512
513    def __call__(self, instances, learner):
514        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
515        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()
516
517        distance_id = Orange.feature.Descriptor.new_meta_id()
518        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
519
520        if self.k == 0:
521            self.k = max(5, len(instances) / 20)
522
523        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, self.k, learner,
524            distance=self.distance, distance_weighted=self.distance_weighted)
525
526class LocalCrossValidationClassifier:
527    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
528        self.distance_id = distance_id
529        self.nearest_neighbours = nearest_neighbours
530        self.k = k
531        self.learner = learner
532        for a,b in kwds.items():
533            setattr(self, a, b)
534
535    def __call__(self, instance, *args):
536        LCVer = 0
537        LCVdi = 0
538
539        # Find k nearest neighbors
540
541        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]
542
543        # leave one out of prediction error
544        for i in xrange(len(knn)):
545            train = knn[:]
546            del train[i]
547
548            classifier = self.learner(Orange.data.Table(train))
549
550            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
551                returned_value = classifier(knn[i], Orange.core.GetValue)
552                e = abs(knn[i].getclass().value - returned_value.value)
553
554            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
555                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
556                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
557                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))
558
559            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
560            LCVer += e * dist
561            LCVdi += dist
562
563        LCV = LCVer / LCVdi if LCVdi != 0 else 0
564        if math.isnan(LCV):
565            LCV = 0.0
566
567        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
568            LCV = 1 - LCV
569
570        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
571
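# An illustrative usage sketch (assumes the bundled "iris" data set and the
# Learner wrapper defined below): LCV alone, with 10 neighbours and the
# default Hellinger distance, on a classification problem.
def _lcv_sketch():
    data = Orange.data.Table("iris")
    rel = Learner(Orange.classification.knn.kNNLearner(),
                  estimators=[LocalCrossValidation(k=10)])
    _, probabilities = rel(data)(data[0], Orange.core.GetBoth)
    return probabilities.reliability_estimate[0].estimate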
572class CNeighbours:
573    """
574   
575    :param k: Number of nearest neighbours.
576    :type k: int
577
578    :param distance: function that computes a distance between two discrete
579        distributions (used only in classification problems). The default
580        is Hellinger distance.
581    :type distance: function
582   
583    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`
584   
585    For regression, CNK is the difference
586    between the average label of the instance's nearest neighbours and the prediction. CNK
587    can be either signed or absolute. A greater value implies greater prediction error.
588
589    For classification, CNK is equal to 1 minus the average distance between
590    predicted class distribution and (trivial) class distributions of the
591    :math:`k` nearest neighbours from the learning set. A greater value implies better prediction.
592   
593    """
594    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
595        self.k = k
596        self.distance = distance
597        self.name = name
598
599    def __call__(self, instances, learner):
600        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
601        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()
602
603        distance_id = Orange.feature.Descriptor.new_meta_id()
604        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
605        return CNeighboursClassifier(nearest_neighbours, self.k, distance=self.distance)
606
607class CNeighboursClassifier:
608    def __init__(self, nearest_neighbours, k, distance):
609        self.nearest_neighbours = nearest_neighbours
610        self.k = k
611        self.distance = distance
612
613    def __call__(self, instance, predicted, probabilities):
614        CNK = 0
615
616        # Find k nearest neighbors
617
618        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]
619
620        # average label of neighbors
621        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
622            for ex in knn:
623                CNK += ex.getclass().value
624            CNK /= self.k
625            CNK -= predicted.value
626
627            return [Estimate(CNK, SIGNED, CNK_SIGNED),
628                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
629        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
630            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
631            knn_c = knn_l(knn)
632            for ex in knn:
633                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
634            CNK /= self.k
635            CNK += 1
636
637            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
638
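# A worked example of the regression CNK formula above: the average label of
# the k nearest neighbours minus the prediction, signed and absolute
# (the values are made up).
def _cnk_example(neighbour_labels=(20.0, 22.0, 19.0, 21.0, 23.0), prediction=24.0):
    cnk = sum(neighbour_labels) / float(len(neighbour_labels)) - prediction
    return cnk, abs(cnk)   # here: (-3.0, 3.0)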
639class Mahalanobis:
640    """
641   
642    :param k: Number of nearest neighbours used in Mahalanobis estimate.
643    :type k: int
644   
645    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`
646   
647    The Mahalanobis distance reliability estimate is the sum of
648    `Mahalanobis distances <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
649    from the evaluated instance to its :math:`k` nearest neighbours.
650
651   
652    """
653    def __init__(self, k=3, name="mahalanobis"):
654        self.k = k
655        self.name = name
656
657    def __call__(self, instances, *args):
658        nnm = Orange.classification.knn.FindNearestConstructor()
659        nnm.distanceConstructor = Orange.distance.Mahalanobis()
660
661        mid = Orange.feature.Descriptor.new_meta_id()
662        nnm = nnm(instances, 0, mid)
663        return MahalanobisClassifier(self.k, nnm, mid)
664
665class MahalanobisClassifier:
666    def __init__(self, k, nnm, mid):
667        self.k = k
668        self.nnm = nnm
669        self.mid = mid
670
671    def __call__(self, instance, *args):
672        mahalanobis_distance = 0
673
674        mahalanobis_distance = sum(ex[self.mid].value for ex in self.nnm(instance, self.k))
675
676        return [ Estimate(mahalanobis_distance, ABSOLUTE, MAHAL_ABSOLUTE) ]
677
678class MahalanobisToCenter:
679    """
680    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`
681   
682    Mahalanobis distance to center reliability estimate is defined as a
683    `Mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
684    between the predicted instance and the centroid of the data.
685
686   
687    """
688    def __init__(self, name="mahalanobis to center"):
689        self.name = name
690
691    def __call__(self, instances, *args):
692        dc = Orange.core.DomainContinuizer()
693        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
694        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
695        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues
696
697        new_domain = dc(instances)
698        new_instances = instances.translate(new_domain)
699
700        X, _, _ = new_instances.to_numpy()
701        instance_avg = numpy.average(X, 0)
702
703        distance_constructor = Orange.distance.Mahalanobis()
704        distance = distance_constructor(new_instances)
705
706        average_instance = Orange.data.Instance(new_instances.domain, list(instance_avg) + ["?"])
707
708        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
709
710class MahalanobisToCenterClassifier:
711    def __init__(self, distance, average_instance, new_domain):
712        self.distance = distance
713        self.average_instance = average_instance
714        self.new_domain = new_domain
715
716    def __call__(self, instance, *args):
717
718        inst = Orange.data.Instance(self.new_domain, instance)
719
720        mahalanobis_to_center = self.distance(inst, self.average_instance)
721
722        return [ Estimate(mahalanobis_to_center, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE) ]
723
724
725class BaggingVarianceCNeighbours:
726    """
727   
728    :param bagv: Instance of Bagging Variance estimator.
729    :type bagv: :class:`BaggingVariance`
730   
731    :param cnk: Instance of CNK estimator.
732    :type cnk: :class:`CNeighbours`
733   
734    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
735   
736    BVCK is the average of the bagging variance (BAGV) estimate and the
737    absolute CNK estimate of the prediction error.
738   
739    """
740    def __init__(self, bagv=None, cnk=None, name="bvck"):
741        if bagv is None:
742            bagv = BaggingVariance()
743        if cnk is None:
744            cnk = CNeighbours()
745        self.bagv = bagv
746        self.cnk = cnk
747        self.name = name
748
749    def __call__(self, instances, learner):
750        bagv_classifier = self.bagv(instances, learner)
751        cnk_classifier = self.cnk(instances, learner)
752        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
753
754class BaggingVarianceCNeighboursClassifier:
755    def __init__(self, bagv_classifier, cnk_classifier):
756        self.bagv_classifier = bagv_classifier
757        self.cnk_classifier = cnk_classifier
758
759    def __call__(self, instance, predicted, probabilities):
760        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
761        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)
762
763        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
764        bvck_estimates = [ Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE) ]
765        bvck_estimates.extend(bagv_estimates)
766        bvck_estimates.extend(cnk_estimates)
767        return bvck_estimates
768
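# A worked example of the combination above: BVCK is the plain average of the
# BAGV estimate and the absolute CNK estimate (the values are made up).
def _bvck_example(bagv_estimate=0.8, cnk_absolute=1.2):
    return (bagv_estimate + cnk_absolute) / 2.0   # here: 1.0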
769class ErrorPredicting:
770    def __init__(self, name = "ep"):
771        self.name = name
772
773    def __call__(self, instances, learner):
774        res = Orange.evaluation.testing.cross_validation([learner], instances)
775        prediction_errors = get_prediction_error_list(res)
776
777        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
778        new_dataset = Orange.data.Table(new_domain, instances)
779
780        for instance, prediction_error in izip(new_dataset, prediction_errors):
781            instance.set_class(prediction_error)
782
783        rf = Orange.ensemble.forest.RandomForestLearner()
784        rf_classifier = rf(new_dataset)
785
786        return ErrorPredictingClassification(rf_classifier, new_domain)
787
788class ErrorPredictingClassification:
789    def __init__(self, rf_classifier, new_domain):
790        self.rf_classifier = rf_classifier
791        self.new_domain = new_domain
792
793    def __call__(self, instance, predicted, probabilities):
794        new_instance = Orange.data.Instance(self.new_domain, instance)
795        value = self.rf_classifier(new_instance, Orange.core.GetValue)
796
797        return [Estimate(value.value, SIGNED, SABIAS_SIGNED)]
798
799def gauss_kernel(x, sigma=1):
800    return 1./(sigma*math.sqrt(2*math.pi)) * math.exp(-1./2*(x/sigma)**2)
801
802class ParzenWindowDensityBased:
803    """
804    :param K: kernel function. Default: gaussian.
805    :type K: function
806
807    :param d_measure: distance measure for inter-instance distance.
808    :type d_measure: :class:`Orange.distance.DistanceConstructor`
809
810    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`
811
812    Returns the difference between the maximum density observed on the training
813    data and the density around the instance being predicted; greater values indicate sparser regions.
814    """
815    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
816        self.K = K
817        self.d_measure = d_measure
818        self.name = name
819
820    def __call__(self, instances, learner):
821
822        self.distance = self.d_measure(instances)
823
824        def density(x):
825            l, dens = len(instances), 0
826            for ex in instances:
827                dens += self.K(self.distance(x,ex))
828            return dens / l
829
830        max_density = max([density(ex) for ex in instances])
831
832        return ParzenWindowDensityBasedClassifier(density, max_density)
833
834class ParzenWindowDensityBasedClassifier:
835
836    def __init__(self, density, max_density):
837        self.density = density
838        self.max_density = max_density
839
840
841    def __call__(self, instance, *args):
842
843        DENS = self.max_density-self.density(instance)
844
845        return [Estimate(DENS, ABSOLUTE, DENS_ABSOLUTE)]
846
847
848def _normalize(data):
849    dc = Orange.core.DomainContinuizer()
850    dc.classTreatment = Orange.core.DomainContinuizer.Ignore
851    dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeByVariance
852    domain = dc(data)
853    data = data.translate(domain)
854    return data
855
856class _NormalizedLearner(Orange.classification.Learner):
857    """
858    Wrapper for normalization.
859    """
860    def __init__(self, learner):
861        self.learner = learner
862
863    def __call__(self, data, *args, **kwargs):
864        return self.learner(_normalize(data), *args, **kwargs)
865
866class Stacking:
867    """
868
869    This method builds a model that integrates reliability estimates
870    from all available reliability scoring techniques (see [Wolpert1992]_ and [Dzeroski2004]_). It
871    performs internal cross-validation and therefore takes roughly the same time
872    as :class:`ICV`.
873
874    :param stack_learner: a data modelling method. Default (if None): unregularized linear regression with prior normalization.
875    :type stack_learner: :obj:`Orange.classification.Learner`
876
877    :param estimators: Reliability estimation methods to choose from. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
878    :type estimators: :obj:`list` of reliability estimators
879 
880    :param folds: The number of folds for internal cross validation (default 10).
881    :type folds: :obj:`int`
882
883    :param save_data: If True, save the data used for training the
884        integration model into the resulting classifier's .data attribute (default False).
885    :type save_data: :obj:`bool`
886 
887    """
888 
889    def __init__(self, 
890        stack_learner=None, 
891        estimators=None, 
892        folds=10, 
893        save_data=False):
894        self.stack_learner = stack_learner
895        self.estimators = estimators
896        self.folds = folds
897        self.save_data = save_data
898        if self.stack_learner is None:
899            self.stack_learner=_NormalizedLearner(Orange.regression.linear.LinearRegressionLearner(ridge_lambda=0.0))
900        if self.estimators is None:
901             self.estimators = [SensitivityAnalysis(),
902                           LocalCrossValidation(),
903                           BaggingVarianceCNeighbours(),
904                           Mahalanobis(),
905                           MahalanobisToCenter()]
906   
907    def __call__(self, data, learner):
908
909        newfeatures = None
910       
911        if self.folds > 1:
912
913            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
914            data_cv = [ None ] * len(data)
915            for f in set(cvi): #for each fold
916                learn = data.select(cvi, f, negate=True)
917                test = data.select(cvi, f)
918
919                #learn reliability estimates for the learning set
920                lf = Learner(learner, estimators=self.estimators)(learn)
921               
922                #pos is used to retain the order of instances
923                for ex, pos in zip(test, [ i for i,n in enumerate(cvi) if n == f ]):
924                    pred = lf(ex, Orange.core.GetBoth)
925                    re = pred[1].reliability_estimate
926                    names = [ e.method_name for e in re ]
927                    assert newfeatures is None or names == newfeatures
928                    newfeatures = names
929                    estimates = [ abs(e.estimate) for e in re ]
930                    error = ex[-1].value - pred[0].value
931                    data_cv[pos] = estimates + [ abs(error) ]
932
933        else:
934 
935            #use half of the data to learn reliability estimates
936            #and the other half for induction of a stacking classifier
937            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
938            data_cv = []
939
940            learn = data.select(cvi, 0, negate=True)
941            test = data.select(cvi, 0)
942
943            #learn reliability estimates for the learning set
944            lf = Learner(learner, estimators=self.estimators)(learn)
945           
946            for ex in test:
947                pred = lf(ex, Orange.core.GetBoth)
948                re = pred[1].reliability_estimate
949                names = [ e.method_name for e in re ]
950                assert newfeatures is None or names == newfeatures
951                newfeatures = names
952                estimates = [ abs(e.estimate) for e in re ]
953                error = ex[-1].value - pred[0].value
954                data_cv.append(estimates + [ abs(error) ])
955
956        lf = None
957
958        #induce the classifier on cross-validated reliability estimates
959        newfeatures = [ Orange.feature.Continuous(name=n) for n in newfeatures ]
960        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
961        classifier_data = Orange.data.Table(newdomain, data_cv)
962        stack_classifier = self.stack_learner(classifier_data)
963
964        #induce reliability estimates on the whole data set
965        lf = Learner(learner, estimators=self.estimators)(data)
966
967        return StackingClassifier(stack_classifier, lf, newdomain, data=classifier_data if self.save_data else None)
968
969
970class StackingClassifier:
971
972    def __init__(self, stacking_classifier, reliability_classifier, domain, data=None):
973        self.stacking_classifier = stacking_classifier
974        self.domain = domain
975        self.reliability_classifier = reliability_classifier
976        self.data = data
977
978    def convert(self, instance):
979        """ Return example in the space of reliability estimates. """
980        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
981        #take absolute values for all
982        tex = [ abs(e.estimate) for e in re ] + [ "?" ]
983        tex =  Orange.data.Instance(self.domain, tex)
984        return tex
985
986    def __call__(self, instance, *args):
987        tex = self.convert(instance)
988        r = self.stacking_classifier(tex)
989        r = float(r)
990        r = max(0., r)
991        return [ Estimate(r, ABSOLUTE, STACKING) ]
992
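# An illustrative usage sketch (assumes the bundled "housing" data set):
# Stacking as the only estimator; with save_data=True the table of
# cross-validated estimates used to fit the stacked model is kept for inspection.
def _stacking_sketch():
    data = Orange.data.Table("housing")
    rel = Learner(Orange.classification.knn.kNNLearner(),
                  estimators=[Stacking(folds=5, save_data=True)])
    c = rel(data)
    _, probabilities = c(data[0], Orange.core.GetBoth)
    return probabilities.reliability_estimate[0].estimate, c.estimation_classifiers[0].data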
993class ICV:
994    """ Selects the best reliability estimator for
995    the given data with internal cross validation [Bosnic2010]_.
996
997    :param estimators: Reliability estimation methods to choose from. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
998    :type estimators: :obj:`list` of reliability estimators
999 
1000    :param folds: The number of folds for internal cross validation (default 10).
1001    :type folds: :obj:`int`
1002 
1003    """
1004 
1005    def __init__(self, estimators=None, folds=10):
1006        self.estimators = estimators
1007        if self.estimators is None:
1008             self.estimators = [SensitivityAnalysis(),
1009                           LocalCrossValidation(),
1010                           BaggingVarianceCNeighbours(),
1011                           Mahalanobis(),
1012                           MahalanobisToCenter()]
1013        self.folds = folds
1014   
1015    def __call__(self, data, learner):
1016
1017        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
1018        sum_of_rs = defaultdict(float)
1019        n_rs = defaultdict(int)
1020
1021        elearner = Learner(learner, estimators=self.estimators)
1022
1023        #average correlations from each fold
1024        for f in set(cvi):
1025            learn = data.select(cvi, f, negate=True)
1026            test = data.select(cvi, f)
1027
1028            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
1029            results = get_pearson_r(res)
1030   
1031            for r, p, sa, method in results:
1032                if not math.isnan(r): #ignore NaN values
1033                    sum_of_rs[(method, sa)] += r
1034                    n_rs[(method, sa)] += 1 
1035
1036        avg_rs = [ (k,(sum_of_rs[k]/n_rs[k])) for k in sum_of_rs ]
1037
1038        avg_rs = sorted(avg_rs, key=lambda estimate: estimate[1], reverse=True)
1039        chosen = avg_rs[0][0]
1040
1041        lf = elearner(data)
1042        return ICVClassifier(chosen, lf)
1043
1044
1045class ICVClassifier:
1046
1047    def __init__(self, chosen, reliability_classifier):
1048        self.chosen = chosen
1049        self.reliability_classifier = reliability_classifier
1050
1051    def __call__(self, instance, *args):
1052        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
1053        for e in re:
1054            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
1055                r = e.estimate
1056
1057        return [ Estimate(r, self.chosen[1], ICV_METHOD) ]
1058
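# An illustrative usage sketch (assumes the bundled "housing" data set): ICV
# picks the estimator whose estimates correlate best with the errors in
# internal cross validation; the choice can be read off the fitted classifier.
def _icv_sketch():
    data = Orange.data.Table("housing")
    rel = Learner(Orange.classification.knn.kNNLearner(), estimators=[ICV(folds=5)])
    c = rel(data)
    chosen_method, _ = c.estimation_classifiers[0].chosen
    return METHOD_NAME[chosen_method]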
1059class Learner:
1060    """
1061    Adds reliability estimation to any prediction method.
1062    This class can be used like any other Orange learner,
1063    but returns the classifier wrapped into an instance of
1064    :class:`Orange.evaluation.reliability.Classifier`.
1065
1066    :param box_learner: Learner to wrap into a reliability estimation
1067        classifier.
1068    :type box_learner: :obj:`~Orange.classification.Learner`
1069   
1070    :param estimators: List of reliability estimation methods. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
1071    :type estimators: :obj:`list` of reliability estimators
1072   
1073    :param name: Name of this reliability learner.
1074    :type name: string
1075   
1076    :rtype: :class:`Orange.evaluation.reliability.Learner`
1077    """
1078    def __init__(self, box_learner, name="Reliability estimation",
1079                 estimators=None,
1080                 **kwds):
1081        self.__dict__.update(kwds)
1082        self.name = name
1083        self.estimators = estimators
1084        if self.estimators is None:
1085             self.estimators = [SensitivityAnalysis(),
1086                           LocalCrossValidation(),
1087                           BaggingVarianceCNeighbours(),
1088                           Mahalanobis(),
1089                           MahalanobisToCenter()]
1090 
1091        self.box_learner = box_learner
1092        self.blending = False
1093
1094
1095    def __call__(self, instances, weight=None, **kwds):
1096        """Construct a classifier.
1097       
1098        :param instances: Learning data.
1099        :type instances: Orange.data.Table
1100        :param weight: Id of meta attribute with weights of instances
1101        :type weight: int
1102
1103        :rtype: :class:`Orange.evaluation.reliability.Classifier`
1104        """
1105
1106        blending_classifier = None
1107        new_domain = None
1108
1109#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
1110#            raise Exception("This method only works on data with continuous class.")
1111
1112        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)
1113 
1114class Classifier:
1115    """
1116    A reliability estimation wrapper for classifiers.
1117    The returned probabilities contain an
1118    additional attribute :obj:`reliability_estimate`, which is a list of
1119    :class:`~Orange.evaluation.reliability.Estimate` (see :obj:`~Classifier.__call__`).
1120    """
1121
1122    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
1123        self.__dict__.update(kwds)
1124        self.instances = instances
1125        self.box_learner = box_learner
1126        self.estimators = estimators
1127        self.blending = blending
1128        self.blending_domain = blending_domain
1129        self.rf_classifier = rf_classifier
1130
1131        # Train the learner with original data
1132        self.classifier = box_learner(instances)
1133
1134        # Train all the estimators and create their classifiers
1135        self.estimation_classifiers = [estimator(instances, box_learner) for estimator in estimators]
1136
1137    def __call__(self, instance, result_type=Orange.core.GetValue):
1138        """
1139        Classify and estimate reliability for a new instance.
1140        When :obj:`result_type` is set to
1141        :obj:`Orange.classification.Classifier.GetBoth` or
1142        :obj:`Orange.classification.Classifier.GetProbabilities`,
1143        an additional attribute :obj:`reliability_estimate`
1144        (a list of :class:`~Orange.evaluation.reliability.Estimate`)
1145        is added to the distribution object.
1146       
1147        :param instance: instance to be classified.
1148        :type instance: :class:`Orange.data.Instance`
1149        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
1150              :class:`Orange.classification.Classifier.GetProbabilities` or
1151              :class:`Orange.classification.Classifier.GetBoth`
1152       
1153        :rtype: :class:`Orange.data.Value`,
1154              :class:`Orange.statistics.Distribution` or a tuple with both
1155        """
1156        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)
1157
1158        # Create a place holder for estimates
1159        if probabilities is None:
1160            probabilities = Orange.statistics.distribution.Continuous()
1161        #with warnings.catch_warnings():
1162        #    warnings.simplefilter("ignore")
1163        probabilities.setattr('reliability_estimate', [])
1164
1165        # Calculate all the estimates and add them to the results
1166        for estimate in self.estimation_classifiers:
1167            probabilities.reliability_estimate.extend(estimate(instance, predicted, probabilities))
1168
1169        # Return the appropriate type of result
1170        if result_type == Orange.core.GetValue:
1171            return predicted
1172        elif result_type == Orange.core.GetProbabilities:
1173            return probabilities
1174        else:
1175            return predicted, probabilities
1176
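# An illustrative end-to-end sketch (assumes the bundled "housing" data set):
# wrap a learner with the default estimators, train, and inspect all
# reliability estimates for a single instance.
def _reliability_learner_sketch():
    data = Orange.data.Table("housing")
    classifier = Learner(Orange.classification.knn.kNNLearner())(data)
    value, probabilities = classifier(data[0], Orange.core.GetBoth)
    return value, [(e.method_name, e.estimate)
                   for e in probabilities.reliability_estimate]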
1177# Functions for testing and plotting
1178#TODO Document those.
1179def get_acc_rel(method, data, learner):
1180    estimators = [method]
1181    reliability = Orange.evaluation.reliability.Learner(learner, estimators=estimators)
1182    #results = Orange.evaluation.testing.leave_one_out([reliability], data)
1183    results = Orange.evaluation.testing.cross_validation([reliability], data)
1184
1185    rels, acc = [], []
1186
1187    for res in results.results:
1188        rels.append(res.probabilities[0].reliability_estimate[0].estimate)
1189        acc.append(res.probabilities[0][res.actual_class])
1190
1191    return rels, acc
1192
1193
1194def rel_acc_plot(rels, acc, file_name=None, colors=None):
1195
1196    import matplotlib.pylab as plt
1197   
1198    if colors is None:
1199        colors = "k"
1200    plt.scatter(rels, acc, c=colors)
1201    plt.xlim(0.,1.)
1202    plt.ylim(ymin=0.)
1203    plt.xlabel("Reliability")
1204    plt.ylabel("Accuracy")
1205    if file_name is None:
1206        plt.show()
1207    else:
1208        plt.savefig(file_name)
1209
1210def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
1211    import matplotlib.pylab as plt
1212    plt.clf()
1213
1214    rels, acc = get_acc_rel(method, data, learner)
1215    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)
1216   
1217
1218def acc_rel_correlation(method, data, learner):
1219    import scipy.stats
1220    rels, acc = get_acc_rel(method, data, learner)
1221    return scipy.stats.spearmanr(acc, rels)[0]