source: orange-reliability/orangecontrib/reliability/__init__.py @ 53:ba8bc7d59e7a

Revision 53:ba8bc7d59e7a, 45.2 KB checked in by markotoplak, 7 months ago (diff)

Updates to documentation.

Line 
1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
# All the estimator method constants.
# Integer IDs identifying each reliability estimation method; stored in
# Estimate.method and mapped to a display string through METHOD_NAME below.
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14
ERR_ABSOLUTE = 15
STACKING = 101

# Type of estimator constant: whether an estimate carries a sign (direction
# of the prediction bias) or only a magnitude.
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods.
# NOTE(review): IDs 11 and 12 have names but no module-level constants here;
# presumably they are produced by estimators defined elsewhere — confirm.
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
               101: "Stacking" }
40
def get_reliability_estimation_list(res, i):
    """Collect the i-th reliability estimate of every evaluation result.

    Returns a triple: the list of estimate values, the estimate's
    signed/absolute flag, and its method ID (taken from the first result).
    """
    estimates = [r.probabilities[0].reliability_estimate[i] for r in res.results]
    first = estimates[0]
    return [e.estimate for e in estimates], first.signed_or_absolute, first.method
45
def get_prediction_error_list(res):
    """Signed prediction errors (actual minus predicted) for all results."""
    errors = []
    for r in res.results:
        errors.append(r.actual_class - r.classes[0])
    return errors
48
def get_description_list(res, i):
    """Textual descriptions of the i-th reliability estimate of each result."""
    descriptions = []
    for r in res.results:
        descriptions.append(r.probabilities[0].reliability_estimate[i].text_description)
    return descriptions
51
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of the
    used reliability estimates. Also, return the p-value of each of
    the coefficients.
    """
    errors = get_prediction_error_list(res)
    absolute_errors = [abs(e) for e in errors]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for i in xrange(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        # Signed estimates correlate with the raw error, absolute ones with |error|.
        errs = errors if signed_or_absolute == SIGNED else absolute_errors
        try:
            r, p = statc.pearsonr(errs, estimates)
        except Exception:
            r = p = float("NaN")
        scores.append((r, p, signed_or_absolute, method))
    return scores
75
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of the
    used reliability estimates. Also, return the p-value of each of
    the coefficients.
    """
    errors = get_prediction_error_list(res)
    absolute_errors = [abs(e) for e in errors]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for i in xrange(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        # Signed estimates correlate with the raw error, absolute ones with |error|.
        errs = errors if signed_or_absolute == SIGNED else absolute_errors
        try:
            r, p = statc.spearmanr(errs, estimates)
        except Exception:
            r = p = float("NaN")
        scores.append((r, p, signed_or_absolute, method))
    return scores
99
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return average Pearson's coefficient over all folds between prediction error
    and each of the used estimates.
    """
    folds = Orange.evaluation.scoring.split_by_iterations(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    n_instances = len(res.results)
    n_folds = len(folds)
    r_sums = [0] * n_estimates
    sig = [0] * n_estimates
    methods = [0] * n_estimates

    for fold_res in folds:
        errors = get_prediction_error_list(fold_res)
        absolute_errors = [abs(e) for e in errors]
        for i in xrange(n_estimates):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(fold_res, i)
            errs = errors if signed_or_absolute == SIGNED else absolute_errors
            try:
                r, _ = statc.pearsonr(errs, estimates)
            except Exception:
                r = float("NaN")
            r_sums[i] += r
            sig[i] = signed_or_absolute
            methods[i] = method

    # Average the per-fold coefficients, then derive p-values from the
    # averages and the total sample size.
    avg_r = [float(s) / n_folds for s in r_sums]
    ps = [p_value_from_r(r, n_instances) for r in avg_r]

    return zip(avg_r, ps, sig, methods)
137
def p_value_from_r(r, n):
    """
    Calculate the two-tailed p-value from the Pearson coefficient ``r`` and
    the sample size ``n``, via a t statistic with n - 2 degrees of freedom.
    """
    df = n - 2
    # t statistic for a correlation coefficient; the 1e-30 terms guard
    # against division by zero when |r| == 1.
    t = r * (df / ((-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30))) ** 0.5
    # Tail probability of the t distribution via the incomplete beta function.
    return statc.betai (df * 0.5, 0.5, df / (df + t * t))
145
146
# Distances between two discrete probability distributions
# TODO: document these distance functions.
def normalize_both(p, q):
    """Normalize both distributions in place (when needed) and return them."""
    for distribution in (p, q):
        if not distribution.normalized:
            distribution.normalize()
    return p, q
155
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order m between two distributions."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)
162
def manhattan_distance(p, q):
    """Sum of absolute component differences (Minkowski distance, order 1)."""
    return minkowsky_dist(p, q, 1)
165
def euclidean_dist(p, q):
    """Euclidean distance (Minkowski distance, order 2)."""
    return minkowsky_dist(p, q, 2)
168
def variance_dist(p, q):
    """Squared Euclidean distance between two distributions."""
    d = euclidean_dist(p, q)
    return d * d
171
def max_dist(p, q):
    """Chebyshev distance: the largest absolute component difference."""
    p, q = normalize_both(p, q)
    return max(abs(p[i] - q[i]) for i in range(len(p)))
175
def hellinger_dist(p, q):
    """Sum of squared differences of componentwise square roots.

    NOTE(review): this is the squared Hellinger distance up to a constant
    factor (the textbook definition halves the sum and takes a square root);
    kept as-is because callers in this module rely on this form.
    """
    p, q = normalize_both(p, q)
    return sum((math.sqrt(p[i]) - math.sqrt(q[i])) ** 2 for i in range(len(p)))
182
def my_log(x):
    """Return x * log(x), using the 0 * log(0) == 0 convention."""
    if x == 0:
        return 0
    return x * math.log(x)
185
def kullback_leibler(p, q):
    """Kullback-Leibler divergence KL(p || q) between two distributions.

    Terms with p[i] == 0 contribute 0 (the 0 * log 0 convention); any term
    with p[i] > 0 and q[i] == 0 makes the divergence infinite.

    Fixed: the previous implementation summed my_log(p[i] - q[i]), which is
    not the KL divergence (sum of p * log(p / q)) and raised a math domain
    error whenever p[i] - q[i] was negative.
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        if p[i] > 0:
            if q[i] == 0:
                return float("inf")
            dist += p[i] * math.log(float(p[i]) / q[i])
    return dist
192
def cosine(p, q):
    """Cosine distance (1 - cosine similarity) between two distributions.

    Fixed: the similarity was previously computed as numpy.dot(x, y) with
    undefined names x and y, raising NameError on every call.
    """
    p, q = normalize_both(p, q)
    # Materialize as plain lists so numpy can consume the distributions.
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
197
198
class Estimate:
    """
    Reliability estimate: the result of one reliability estimation method.

    .. attribute:: estimate

        A numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Either :obj:`SIGNED` or :obj:`ABSOLUTE`, depending on whether the
        method produces signed or absolute estimates.

    .. attribute:: method

        An integer ID of the reliability estimation method used.

    .. attribute:: method_name

        Name (string) of the reliability estimation method used.
    """
    def __init__(self, estimate, signed_or_absolute, method):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        # Human-readable name resolved from the module-level lookup table.
        self.method_name = METHOD_NAME[method]
        # Filled in later by descriptive analysis, if it is used.
        self.text_description = None
228
class DescriptiveAnalysis:
    """Wrap a reliability estimator and attach textual labels (by default
    "high"/"medium"/"low") to its numeric estimates, using borders computed
    by cross-validation on the training data."""
    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66], name="da"):
        # desc: candidate labels; procentage: quantile fractions that
        # delimit the label bands (one border per label).
        self.desc = desc
        self.procentage = procentage
        self.estimator = estimator
        self.name = name

    def __call__(self, instances, weight=None, **kwds):

        # Calculate borders using cross validation: for each reliability
        # estimate, sort the absolute estimate values and read off the
        # values at the `procentage` quantiles.
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            sorted_estimates = sorted(abs(x) for x in estimates)
            # NOTE(review): for p == 0.0 the index is -1, i.e. the *largest*
            # sorted estimate; sorted_estimates[0] may have been intended.
            borders = [sorted_estimates[int(len(estimates) * p) - 1]  for p in self.procentage]
            all_borders.append(borders)

        # Learn on whole train data
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
251
class DescriptiveAnalysisClassifier:
    """Classifier wrapper that labels each reliability estimate with the
    textual description of the highest border it reaches."""
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # Attach a textual label to every estimate: start with the first
        # label, then upgrade it for each border the estimate reaches.
        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            estimate.text_description = self.desc[0]
            for lower_border, label in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    estimate.text_description = label

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
274
class SensitivityAnalysis:
    """
   
    :param e: Values of :math:`\epsilon`.
    :type e: list of floats
   
    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`
   
    To estimate the reliability of prediction for a given instance,
    the learning set is extended with that instance, relabelled to
    :math:`K + \epsilon (l_{max} - l_{min})` (:math:`K` is the initial prediction,
    :math:`\epsilon` a sensitivity parameter, and :math:`l_{min}` and
    :math:`l_{max}` the lower and upper bounds of labels on training data).
    Results for multiple values of :math:`\epsilon` are combined
    into SAvar and SAbias. SAbias can be used either in a signed or absolute form.

    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`
    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0], name="sa"):
        self.e = e
        self.name = name

    def __call__(self, instances, learner):
        # Lower and upper bound of the class labels on the training data.
        min_value = max_value = instances[0].getclass().value
        for ex in instances:
            value = ex.getclass().value
            if value > max_value:
                max_value = value
            if value < min_value:
                min_value = value
        return SensitivityAnalysisClassifier(self.e, instances, min_value,
                                             max_value, learner)
308
class SensitivityAnalysisClassifier:
    """Computes SAvar and SAbias by retraining on data extended with the
    predicted instance relabelled to K +/- eps * (max - min)."""
    def __init__(self, e, instances, min_value, max_value, learner):
        self.e = e
        self.instances = instances
        self.max_value = max_value
        self.min_value = min_value
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        # Training data extended with a modifiable copy of the instance.
        extended = Orange.data.Table(self.instances)
        extended.append(Orange.data.Instance(instance))

        label_span = self.max_value - self.min_value
        SAvar = SAbias = 0

        for eps in self.e:
            # Retrain once with the label shifted up, once shifted down,
            # and record the resulting predictions for this instance.
            shifted = []
            for sign in (1, -1):
                extended[-1].setclass(predicted.value + sign * eps * label_span)
                model = self.learner(extended)
                shifted.append(model(instance, Orange.core.GetValue).value)
            k_plus, k_minus = shifted

            SAvar += k_plus - k_minus
            SAbias += k_plus + k_minus - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
354
355
356
class ReferenceExpectedError:
    """

    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`

    Reference estimate for classification: :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`, where :math:`\hat y` is the estimated probability of the predicted class [Pevec2011]_.

    A greater estimate means a greater expected error.

    """
    def __init__(self, name="reference"):
        self.name = name

    def __call__(self, instances, learner):
        # Train the underlying model; the classifier derives the estimate
        # from its predicted class probabilities.
        return ReferenceExpectedErrorClassifier(learner(instances))
373
374   
class ReferenceExpectedErrorClassifier:
    """Computes 2 * y_hat * (1 - y_hat) for the largest predicted class
    probability y_hat of the wrapped classifier."""
    def __init__(self, classifier):
        self.classifier = classifier

    def __call__(self, instance, *args):
        probabilities = self.classifier(instance, Orange.classification.Classifier.GetProbabilities)
        y_hat = max(probabilities)
        reference_error = 2 * y_hat * (1 - y_hat)
        return [Estimate(reference_error, ABSOLUTE, ERR_ABSOLUTE)]
383
384
class BaggingVariance:
    """
   
    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int
   
    :param for_instances:  Optional. If test instances
      are given as a parameter, this class can compute their reliabilities
      on the fly, which saves memory.

    :type for_instances: Orange.data.Table
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
   
   
    :math:`m` different bagging models are used to estimate
    the value of dependent variable for a given instance. For regression,
    the variance of predictions is a reliability
    estimate:

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`, where
    :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
    predictions of individual models.

    For classification, 1 minus the average Euclidean distance between class
    probability distributions predicted by the model, and distributions
    predicted by the individual bagged models, is the BAGV reliability
    measure. For classification, a greater value implies a better
    prediction.
   
    This reliability measure can run out of memory if individual classifiers themselves
    use a lot of memory; it needs :math:`m` times memory
    for a single classifier.
    """
    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):

        self.m = m
        self.name = name
        # Sampler producing bootstrap (with replacement) index vectors,
        # seeded for reproducibility.
        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
        self.for_instances = for_instances

    def __call__(self, instances, learner):
        classifiers = []

        # For classification, a reference model trained on the full data is
        # needed by _bagged_value to compare distributions against.
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        # hashable instance -> list of bagged predictions (one per model)
        for_inst_class = defaultdict(list)
        this_iteration = None
       
        if self.for_instances:
            his = map(_hashable_instance, self.for_instances)

        # Create bagged classifiers using sampling with replacement
        for i in xrange(self.m):
            # Guard against the same test instance appearing twice within
            # one bootstrap iteration (duplicates share a hash key).
            this_iteration = set()
            selection = self.select_with_repeat(len(instances))
            data = instances.select(selection)
            cl = learner(data)
            if cl:
                if self.for_instances: # predict reliability for testing instances and throw cl away
                    for instance, hi in zip(self.for_instances, his):
                        if hi not in this_iteration:
                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
                            this_iteration.add(hi)
                else:
                    classifiers.append(cl)

        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))
457
class BaggingVarianceClassifier:
    """Computes the BAGV estimate from bagged model predictions."""
    def __init__(self, classifiers, classifier=None, for_inst_class=None):
        self.classifiers = classifiers
        self.classifier = classifier
        self.for_inst_class = for_inst_class

    def __call__(self, instance, *args):
        # Use the precomputed per-instance predictions when available,
        # otherwise query every bagged model now.
        if self.for_inst_class:
            bagged_values = self.for_inst_class[_hashable_instance(instance)]
        else:
            bagged_values = [_bagged_value(instance, c, self.classifier)
                             for c in self.classifiers]

        mean = sum(bagged_values) / len(bagged_values)
        BAGV = sum((v - mean) ** 2 for v in bagged_values) / len(bagged_values)

        # For classification, report 1 - variance so that a greater value
        # implies a better prediction (see BaggingVariance docs).
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
480
def _hashable_instance(instance):
    """A hashable key for an instance: the tuple of its attribute values."""
    attributes = instance.domain.attributes
    return tuple(instance[i].value for i in xrange(len(attributes)))
483
def _bagged_value(instance, c, classifier):
    """Prediction of bagged model ``c`` used for the variance computation.

    Regression: the predicted value itself.  Classification: Euclidean
    distance between c's class distribution and the reference model's.
    """
    var_type = instance.domain.class_var.var_type
    if var_type == Orange.feature.Descriptor.Continuous:
        return c(instance, Orange.core.GetValue).value
    if var_type == Orange.feature.Descriptor.Discrete:
        reference = classifier(instance, Orange.core.GetProbabilities)
        return euclidean_dist(c(instance, Orange.core.GetProbabilities), reference)
490
491
class LocalCrossValidation:
    """

    :param k: Number of nearest neighbours used. Default: 0, which denotes
        1/20 of data set size (or 5, whichever is greater).
    :type k: int

    :param distance: Function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: For classification,
        use an average distance between distributions, weighted by :math:`e^{-d}`,
        where :math:`d` is the distance between predicted instance and the
        neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    Leave-one-out validation is
    performed on :math:`k` nearest neighbours to the given instance.
    Reliability estimate for regression is then the distance
    weighted absolute prediction error. For classification, it is 1 minus the average
    distance between the predicted class probability distribution and the
    (trivial) probability distributions of the nearest neighbour.
    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # Fixed: resolve the k == 0 default into a local value instead of
        # overwriting self.k, so the estimator can be reused on data sets of
        # different sizes and self.k keeps its "automatic" sentinel.
        k = self.k
        if k == 0:
            k = max(5, len(instances) // 20)

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
536
class LocalCrossValidationClassifier:
    """Computes the LCV estimate: leave-one-out over the k nearest
    neighbours of the predicted instance (see :class:`LocalCrossValidation`)."""
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        # distance_id: meta attribute holding each neighbour's distance
        self.distance_id = distance_id
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        # Optional settings passed by the constructor: distance,
        # distance_weighted.
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        # Weighted sum of leave-one-out errors and the sum of weights.
        LCVer = 0
        LCVdi = 0

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute error on the held-out neighbour.
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the held-out neighbour's trivial distribution (the
                # booleans act as 0/1 probabilities).
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight each error by e^-d of the neighbour's distance, if requested.
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # For classification, a greater value implies a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
582
class CNeighbours:
    """
   
    :param k: Number of nearest neighbours.
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function
   
    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`
   
    For regression, CNK is defined as the difference
    between the average label of the nearest neighbours and the prediction.
    CNK can be either signed or absolute. A greater value implies greater prediction error.

    For classification, CNK is equal to 1 minus the average distance between
    predicted class distribution and (trivial) class distributions of the
    $k$ nearest neighbours from the learning set. A greater value implies better prediction.
   
    """
    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        # Neighbour search with the Euclidean metric; distances are stored
        # in a fresh meta attribute.
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Euclidean()
        meta_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, meta_id)
        return CNeighboursClassifier(neighbours, self.k, distance=self.distance)
617
class CNeighboursClassifier:
    """Computes the CNK estimate from the k nearest neighbours
    (see :class:`CNeighbours`)."""
    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        CNK = 0

        # Find k nearest neighbors
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Fixed: the class variable was previously read from the loop
        # variable `ex` *before* it was assigned, which only worked through
        # the Python 2 list-comprehension scope leak; read it from the
        # predicted instance instead.
        var_type = instance.domain.class_var.var_type

        if var_type == Orange.feature.Descriptor.Continuous:
            # CNK = average neighbour label minus the prediction.
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif var_type == Orange.feature.Descriptor.Discrete:
            # CNK = 1 minus the average distance between the predicted
            # distribution and a kNN model's distributions on the neighbours.
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
649
class Mahalanobis:
    """
   
    :param k: Number of nearest neighbours used in Mahalanobis estimate.
    :type k: int
   
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`
   
    Mahalanobis distance reliability estimate is defined as
    `Mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    to the evaluated instance's :math:`k` nearest neighbours.
    """
    def __init__(self, k=3, name="mahalanobis"):
        self.k = k
        self.name = name

    def __call__(self, instances, *args):
        # Neighbour search under the Mahalanobis metric fitted to the data;
        # distances are stored in a fresh meta attribute.
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Mahalanobis()
        meta_id = Orange.feature.Descriptor.new_meta_id()
        return MahalanobisClassifier(self.k, finder(instances, 0, meta_id), meta_id)
675
class MahalanobisClassifier:
    """Sums the Mahalanobis distances to the k nearest neighbours."""
    def __init__(self, k, nnm, mid):
        self.k = k
        self.nnm = nnm
        self.mid = mid

    def __call__(self, instance, *args):
        # The neighbour finder stores each distance in meta attribute `mid`.
        neighbours = self.nnm(instance, self.k)
        total_distance = sum(ex[self.mid].value for ex in neighbours)
        return [ Estimate(total_distance, ABSOLUTE, MAHAL_ABSOLUTE) ]
688
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`
   
    Mahalanobis distance to center reliability estimate is defined as a
    `Mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.
    """
    def __init__(self, name="mahalanobis to center"):
        self.name = name

    def __call__(self, instances, *args):
        # Continuize the domain: ignore the class, normalize continuous
        # features by their span, encode discrete features as N indicators.
        continuizer = Orange.core.DomainContinuizer()
        continuizer.classTreatment = Orange.core.DomainContinuizer.Ignore
        continuizer.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        continuizer.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = continuizer(instances)
        new_instances = instances.translate(new_domain)

        # Centroid of the continuized data (class value left unknown).
        X, _, _ = new_instances.to_numpy()
        centroid = Orange.data.Instance(new_instances.domain,
                                        list(numpy.average(X, 0)) + ["?"])

        distance = Orange.distance.Mahalanobis()(new_instances)

        return MahalanobisToCenterClassifier(distance, centroid, new_domain)
720
class MahalanobisToCenterClassifier:
    """Measures the Mahalanobis distance of an instance to the data centroid."""
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance
        self.average_instance = average_instance
        self.new_domain = new_domain

    def __call__(self, instance, *args):
        # Translate the instance into the continuized domain, then measure
        # its distance to the precomputed centroid.
        translated = Orange.data.Instance(self.new_domain, instance)
        d = self.distance(translated, self.average_instance)
        return [ Estimate(d, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE) ]
734
735
class BaggingVarianceCNeighbours:
    """
   
    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`
   
    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
   
    BVCK is an average of Bagging variance and local modeling of
    prediction error.
   
    """
    def __init__(self, bagv=None, cnk=None, name="bvck"):
        # Default sub-estimators are created per object (not as mutable
        # defaults), so instances do not share state.
        if bagv is None:
            bagv = BaggingVariance()
        if cnk is None:
            cnk = CNeighbours()
        self.bagv = bagv
        self.cnk = cnk
        # Fixed: the `name` argument was previously ignored and the
        # attribute hard-coded to "bvck".
        self.name = name

    def __call__(self, instances, learner):
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
764
class BaggingVarianceCNeighboursClassifier:
    """Averages the BAGV estimate with the absolute CNK estimate."""
    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # BVCK = mean of the BAGV estimate and cnk_estimates[1].
        # NOTE(review): cnk_estimates[1] (absolute CNK) only exists for
        # regression; for classification CNK returns a single estimate, so
        # this would raise IndexError — confirm BVCK is regression-only.
        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        return ([ Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE) ]
                + bagv_estimates + cnk_estimates)
779
class ErrorPredicting:
    """Learns a random forest that predicts the (signed) prediction error
    of the wrapped learner, estimated by cross-validation."""
    def __init__(self, name = "ep"):
        self.name = name

    def __call__(self, instances, learner):
        # Estimate per-instance prediction errors with cross-validation.
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Relabel the data: same attributes, prediction error as the class.
        error_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        error_data = Orange.data.Table(error_domain, instances)
        for inst, error in izip(error_data, prediction_errors):
            inst.set_class(error)

        # A random forest then learns to predict the error itself.
        forest = Orange.ensemble.forest.RandomForestLearner()(error_data)

        return ErrorPredictingClassification(forest, error_domain)
798
class ErrorPredictingClassification:
    """Predicts the signed prediction error of a new instance using the
    random forest trained by :class:`ErrorPredicting`."""

    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier
        self.new_domain = new_domain

    def __call__(self, instance, predicted, probabilities):
        converted = Orange.data.Instance(self.new_domain, instance)
        error = self.rf_classifier(converted, Orange.core.GetValue)
        # NOTE(review): reuses the SABIAS_SIGNED method id instead of a
        # dedicated error-prediction constant -- confirm this is intended.
        return [Estimate(error.value, SIGNED, SABIAS_SIGNED)]
809
def gauss_kernel(x, sigma=1):
    """Gaussian (normal) kernel value at *x* with standard deviation *sigma*.

    :param x: distance from the kernel center.
    :param sigma: kernel width (standard deviation), default 1.
    :rtype: float

    Fix: coerce ``x`` to float before dividing so that integer arguments do
    not truncate under Python 2 division (e.g. ``gauss_kernel(1, 2)``).
    """
    u = float(x) / sigma
    return math.exp(-0.5 * u * u) / (sigma * math.sqrt(2 * math.pi))
812
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Returns a value that estimates a density of problem space around the
    instance being predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances, learner):
        self.distance = self.d_measure(instances)

        def density(x):
            # Mean kernel value over distances from x to every training instance.
            total = 0
            for inst in instances:
                total += self.K(self.distance(x, inst))
            return total / len(instances)

        # Peak density over the training set; used as the reference point.
        peak = max(density(inst) for inst in instances)
        return ParzenWindowDensityBasedClassifier(density, peak)
844
class ParzenWindowDensityBasedClassifier:
    """Turns the density deficit around an instance into an (absolute)
    reliability estimate: sparser neighborhoods yield larger values."""

    def __init__(self, density, max_density):
        # density: callable mapping an instance to its estimated density
        self.density = density
        # max_density: highest density observed on the training set
        self.max_density = max_density

    def __call__(self, instance, *args):
        deficit = self.max_density - self.density(instance)
        return [Estimate(deficit, ABSOLUTE, DENS_ABSOLUTE)]
857
858
def _normalize(data):
    """Return *data* continuized and normalized by variance (class ignored)."""
    continuizer = Orange.core.DomainContinuizer()
    continuizer.classTreatment = Orange.core.DomainContinuizer.Ignore
    continuizer.continuousTreatment = Orange.core.DomainContinuizer.NormalizeByVariance
    return data.translate(continuizer(data))
866
class _NormalizedLearner(Orange.classification.Learner):
    """
    Wraps a learner so that its training data is normalized first.
    """
    def __init__(self, learner):
        self.learner = learner

    def __call__(self, data, *args, **kwargs):
        normalized = _normalize(data)
        return self.learner(normalized, *args, **kwargs)
876
class Stacking:
    """

    This methods develops a model that integrates reliability estimates
    from all available reliability scoring techniques (see [Wolpert1992]_ and [Dzeroski2004]_). It
    performs internal cross-validation and therefore takes roughly the same time
    as :class:`ICV`.

    :param stack_learner: a data modelling method. Default (if None): unregularized linear regression with prior normalization.
    :type stack_learner: :obj:`Orange.classification.Learner`

    :param estimators: Reliability estimation methods to choose from. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
    :type estimators: :obj:`list` of reliability estimators

    :param folds: The number of fold for cross validation (default 10).
    :type folds: :obj:`int`

    :param save_data: If True, save the data used for training the
        integration model into resulting classifier's .data attribute (default False).
    :type save_data: :obj:`bool`

    """

    def __init__(self,
        stack_learner=None,
        estimators=None,
        folds=10,
        save_data=False):
        self.stack_learner = stack_learner
        self.estimators = estimators
        self.folds = folds
        self.save_data = save_data
        if self.stack_learner is None:
            self.stack_learner=_NormalizedLearner(Orange.regression.linear.LinearRegressionLearner(ridge_lambda=0.0))
        if self.estimators is None:
             self.estimators = [SensitivityAnalysis(),
                           LocalCrossValidation(),
                           BaggingVarianceCNeighbours(),
                           Mahalanobis(),
                           MahalanobisToCenter()]

    def __call__(self, data, learner):
        """Build a stacked model mapping reliability estimates to |error|.

        Collects, via internal cross-validation, one row per instance of
        (absolute reliability estimates, absolute prediction error), trains
        ``stack_learner`` on those rows, and wraps it together with
        reliability classifiers induced on the full data set.
        """

        # Names of the estimate features; fixed by the first prediction and
        # asserted identical for every subsequent instance.
        newfeatures = None

        if self.folds > 1:

            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
            data_cv = [ None ] * len(data)
            for f in set(cvi): #for each fold
                learn = data.select(cvi, f, negate=True)
                test = data.select(cvi, f)

                #learn reliability estimates for the learning set
                lf = Learner(learner, estimators=self.estimators)(learn)

                #pos is used to retain the order of instances
                for ex, pos in zip(test, [ i for i,n in enumerate(cvi) if n == f ]):
                    pred = lf(ex, Orange.core.GetBoth)
                    re = pred[1].reliability_estimate
                    names = [ e.method_name for e in re ]
                    assert newfeatures is None or names == newfeatures
                    newfeatures = names
                    estimates = [ abs(e.estimate) for e in re ]
                    error = ex[-1].value - pred[0].value
                    data_cv[pos] = estimates + [ abs(error) ]

        else:

            #use half of the data to learn reliability estimates
            #and the other half for induction of a stacking classifier
            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
            data_cv = []

            learn = data.select(cvi, 0, negate=True)
            test = data.select(cvi, 0)

            #learn reliability estimates for the learning set
            lf = Learner(learner, estimators=self.estimators)(learn)

            for ex in test:
                pred = lf(ex, Orange.core.GetBoth)
                re = pred[1].reliability_estimate
                names = [ e.method_name for e in re ]
                assert newfeatures is None or names == newfeatures
                newfeatures = names
                estimates = [ abs(e.estimate) for e in re ]
                error = ex[-1].value - pred[0].value
                data_cv.append(estimates + [ abs(error) ])

        # Drop the fold-local classifier before building the final one.
        lf = None

        #induce the classifier on cross-validated reliability estimates
        newfeatures = [ Orange.feature.Continuous(name=n) for n in newfeatures ]
        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
        classifier_data = Orange.data.Table(newdomain, data_cv)
        stack_classifier = self.stack_learner(classifier_data)

        #induce reliability estimates on the whole data set
        lf = Learner(learner, estimators=self.estimators)(data)

        return StackingClassifier(stack_classifier, lf, newdomain, data=classifier_data if self.save_data else None)
979
980
class StackingClassifier:
    """Applies the stacked integration model: converts an instance into the
    space of reliability estimates and predicts its absolute error."""

    def __init__(self, stacking_classifier, reliability_classifier, domain, data=None):
        self.stacking_classifier = stacking_classifier
        self.domain = domain
        self.reliability_classifier = reliability_classifier
        self.data = data

    def convert(self, instance):
        """ Return example in the space of reliability estimates. """
        estimates = self.reliability_classifier(
            instance, Orange.core.GetProbabilities).reliability_estimate
        # Absolute values for all estimates; class (the error) is unknown.
        values = [abs(e.estimate) for e in estimates]
        values.append("?")
        return Orange.data.Instance(self.domain, values)

    def __call__(self, instance, *args):
        predicted_error = float(self.stacking_classifier(self.convert(instance)))
        # A negative predicted error is meaningless; clip at zero.
        return [Estimate(max(0., predicted_error), ABSOLUTE, STACKING)]
1003
class ICV:
    """ Selects the best reliability estimator for
    the given data with internal cross validation [Bosnic2010]_.

    :param estimators: reliability estimation methods to choose from. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter` ]
    :type estimators: :obj:`list` of reliability estimators

    :param folds: The number of fold for cross validation (default 10).
    :type folds: :obj:`int`

    """

    def __init__(self, estimators=None, folds=10):
        self.estimators = estimators
        if self.estimators is None:
             self.estimators = [SensitivityAnalysis(),
                           LocalCrossValidation(),
                           BaggingVarianceCNeighbours(),
                           Mahalanobis(),
                           MahalanobisToCenter()]
        self.folds = folds

    def __call__(self, data, learner):
        """Cross-validate all estimators and pick the one whose estimates
        correlate best (Pearson r) with the actual prediction errors."""

        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
        # Per-(method, signed/absolute) running sums of correlations and counts.
        sum_of_rs = defaultdict(float)
        n_rs = defaultdict(int)

        elearner = Learner(learner, estimators=self.estimators)

        #average correlations from each fold
        for f in set(cvi):
            learn = data.select(cvi, f, negate=True)
            test = data.select(cvi, f)

            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
            results = get_pearson_r(res)

            for r, p, sa, method in results:
                if not math.isnan(r): #ignore NaN values
                    sum_of_rs[(method, sa)] += r
                    n_rs[(method, sa)] += 1

        avg_rs = [ (k,(sum_of_rs[k]/n_rs[k])) for k in sum_of_rs ]

        # Highest average correlation wins; chosen is a (method, sa) pair.
        avg_rs = sorted(avg_rs, key=lambda estimate: estimate[1], reverse=True)
        chosen = avg_rs[0][0]

        # Final reliability classifiers are induced on the whole data set.
        lf = elearner(data)
        return ICVClassifier(chosen, lf)
1054
1055
class ICVClassifier:
    """Returns, for each instance, the estimate of the single method that
    internal cross validation (:class:`ICV`) selected as best."""

    def __init__(self, chosen, reliability_classifier):
        # chosen: (method id, signed_or_absolute) pair selected by ICV
        self.chosen = chosen
        self.reliability_classifier = reliability_classifier

    def __call__(self, instance, *args):
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        for e in re:
            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
                return [ Estimate(e.estimate, self.chosen[1], ICV_METHOD) ]
        # Fix: previously a missing match left `r` unbound and raised an
        # obscure NameError; fail with a clear message instead.
        raise ValueError("no reliability estimate found for the chosen method %r" % (self.chosen,))
1069
class Learner:
    """
    Adds reliability estimation to any learner: multiple reliability estimation
    algorithms can be used simultaneously.
    This learner can be used as any other learner,
    but returns the classifier wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.

    :param box_learner: Learner to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`

    :param estimators: List of reliability estimation methods. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
    :type estimators: :obj:`list` of reliability estimators

    :param name: Name of this reliability learner.
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        # Blending is currently disabled.
        self.blending = False

    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.

        :param instances: Data to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int

        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """
        # No blending domain/classifier is built while blending is disabled.
        # (A former check restricted this to continuous classes; it is
        # intentionally left out so classification data is accepted too.)
        return Classifier(instances, self.box_learner, self.estimators,
                          self.blending, None, None)
1125 
class Classifier:
    """
    A reliability estimation wrapper for classifiers.
    The returned probabilities contain an
    additional attribute :obj:`reliability_estimate`, which is a list of
    :class:`~Orange.evaluation.reliability.Estimate` (see :obj:`~Classifier.__call__`).
    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Fit the wrapped model on the original data.
        self.classifier = box_learner(instances)
        # Fit one estimation classifier per reliability method.
        self.estimation_classifiers = [est(instances, box_learner)
                                       for est in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`
        (a list of :class:`~Orange.evaluation.reliability.Estimate`)
        is added to the distribution object.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Regressors may return no distribution; create one to hold estimates.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Collect estimates from every estimation classifier.
        for est in self.estimation_classifiers:
            probabilities.reliability_estimate.extend(
                est(instance, predicted, probabilities))

        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
1188
1189# Functions for testing and plotting
1190#TODO Document those.
def get_acc_rel(method, data, learner):
    """Cross-validate *learner* wrapped with one reliability *method* and
    return parallel lists: reliability estimates and accuracies (probability
    assigned to the actual class)."""
    reliability = Orange.evaluation.reliability.Learner(
        learner, estimators=[method])
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels, acc = [], []
    for res in results.results:
        rels.append(res.probabilities[0].reliability_estimate[0].estimate)
        acc.append(res.probabilities[0][res.actual_class])
    return rels, acc
1204
1205
def rel_acc_plot(rels, acc, file_name=None, colors=None):
    """Scatter-plot accuracy against reliability; show the figure, or save
    it when *file_name* is given."""
    import matplotlib.pylab as plt

    plt.scatter(rels, acc, c=colors if colors is not None else "k")
    plt.xlim(0., 1.)
    plt.ylim(ymin=0.)
    plt.xlabel("Reliability")
    plt.ylabel("Accuracy")
    if file_name is None:
        plt.show()
    else:
        plt.savefig(file_name)
1221
def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
    """Compute reliability/accuracy pairs for *method* and plot them.

    Fixes three defects in the original: ``plt`` was undefined in this scope
    (it was only imported inside :func:`rel_acc_plot`); the call used the
    undefined name ``el_acc_plot`` (typo for ``rel_acc_plot``); and the
    arguments were swapped relative to ``rel_acc_plot(rels, acc, ...)``.
    """
    import matplotlib.pylab as plt

    plt.clf()
    rels, acc = get_acc_rel(method, data, learner)
    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)
1228   
1229
def acc_rel_correlation(method, data, learner):
    """Spearman rank correlation between accuracies and reliability
    estimates obtained by cross-validating *learner* with *method*."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    rho, _p = scipy.stats.spearmanr(acc, rels)
    return rho
# Note: See TracBrowser for help on using the repository browser.