source: orange-reliability/orangecontrib/reliability/__init__.py @ 53:ba8bc7d59e7a

[0]1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
12# All the estimator method constants
13SAVAR_ABSOLUTE = 0
14SABIAS_SIGNED = 1
15SABIAS_ABSOLUTE = 2
16BAGV_ABSOLUTE = 3
17CNK_SIGNED = 4
18CNK_ABSOLUTE = 5
19LCV_ABSOLUTE = 6
20BVCK_ABSOLUTE = 7
21MAHAL_ABSOLUTE = 8
22BLENDING_ABSOLUTE = 9
23ICV_METHOD = 10
24MAHAL_TO_CENTER_ABSOLUTE = 13
[5]25DENS_ABSOLUTE = 14
[10]26ERR_ABSOLUTE = 15
[37]27STACKING = 101
[0]28
29# Type of estimator constant
30SIGNED = 0
31ABSOLUTE = 1
32
33# Names of all the estimator methods
34METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
35               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
[37]36               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
[0]37               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
[37]38               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
39               101: "Stacking" }
[0]40
41def get_reliability_estimation_list(res, i):
[37]42    return [ result.probabilities[0].reliability_estimate[i].estimate for result in res.results], \
43        res.results[0].probabilities[0].reliability_estimate[i].signed_or_absolute, \
44        res.results[0].probabilities[0].reliability_estimate[i].method
[0]45
46def get_prediction_error_list(res):
47    return [result.actual_class - result.classes[0] for result in res.results]
48
49def get_description_list(res, i):
50    return [result.probabilities[0].reliability_estimate[i].text_description for result in res.results]
51
52def get_pearson_r(res):
53    """
54    :param res: results of evaluation, done using learners,
55        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
56    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
57
58    Return Pearson's coefficient between the prediction error and each of the
59    used reliability estimates. Also, return the p-value of each of
60    the coefficients.
61    """
62    prediction_error = get_prediction_error_list(res)
63    results = []
64    for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
65        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
66        try:
67            if signed_or_absolute == SIGNED:
68                r, p = statc.pearsonr(prediction_error, reliability_estimate)
69            else:
70                r, p = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
71        except Exception:
72            r = p = float("NaN")
73        results.append((r, p, signed_or_absolute, method))
74    return results
75
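# A minimal usage sketch (not part of the original module): score reliability
# estimators by correlating their estimates with cross-validated prediction
# errors.  The "housing" data set name is an assumption; any regression data
# set should work.
def _example_get_pearson_r():
    data = Orange.data.Table("housing")
    learner = Orange.regression.linear.LinearRegressionLearner()
    reliability = Learner(learner, estimators=[SensitivityAnalysis(e=[0.01, 0.1]), Mahalanobis()])
    res = Orange.evaluation.testing.cross_validation([reliability], data)
    for r, p, signed_or_absolute, method in get_pearson_r(res):
        # print the estimator name, its correlation with the error, and the p-value
        print "%-25s r=%.3f  p=%.3f" % (METHOD_NAME[method], r, p)
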
76def get_spearman_r(res):
77    """
78    :param res: results of evaluation, done using learners,
79        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
80    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
81
82    Return Spearman's coefficient between the prediction error and each of the
83    used reliability estimates. Also, return the p-value of each of
84    the coefficients.
85    """
86    prediction_error = get_prediction_error_list(res)
87    results = []
88    for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
89        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
90        try:
91            if signed_or_absolute == SIGNED:
92                r, p = statc.spearmanr(prediction_error, reliability_estimate)
93            else:
94                r, p = statc.spearmanr([abs(pe) for pe in prediction_error], reliability_estimate)
95        except Exception:
96            r = p = float("NaN")
97        results.append((r, p, signed_or_absolute, method))
98    return results
99
100def get_pearson_r_by_iterations(res):
101    """
102    :param res: results of evaluation, done using learners,
103        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
104    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
105
106    Return average Pearson's coefficient over all folds between prediction error
107    and each of the used estimates.
108    """
109    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
110    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
111    number_of_instances = len(res.results)
112    number_of_folds = len(results_by_fold)
113    results = [0 for _ in xrange(number_of_estimates)]
114    sig = [0 for _ in xrange(number_of_estimates)]
115    method_list = [0 for _ in xrange(number_of_estimates)]
116
117    for res in results_by_fold:
118        prediction_error = get_prediction_error_list(res)
119        for i in xrange(number_of_estimates):
120            reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
121            try:
122                if signed_or_absolute == SIGNED:
123                    r, _ = statc.pearsonr(prediction_error, reliability_estimate)
124                else:
125                    r, _ = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
126            except Exception:
127                r = float("NaN")
128            results[i] += r
129            sig[i] = signed_or_absolute
130            method_list[i] = method
131
132    # Calculate p-values
133    results = [float(res) / number_of_folds for res in results]
134    ps = [p_value_from_r(r, number_of_instances) for r in results]
135
136    return zip(results, ps, sig, method_list)
137
def p_value_from_r(r, n):
    """
    Calculate the p-value from the Pearson correlation coefficient and the sample size.
    """
    df = n - 2
    t = r * (df / ((-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30))) ** 0.5
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))
145
[5]146
147# Distances between two discrete probability distributions
148#TODO Document those.
149def normalize_both(p, q):
150    if not p.normalized:
151        p.normalize()
152    if not q.normalized:
153        q.normalize()
154    return p, q
155
156def minkowsky_dist(p, q, m=2):
157    p, q = normalize_both(p, q)
158    dist = 0
159    for i in range(len(p)):
160        dist += abs(p[i]-q[i])**m
161    return dist**(1./m)
162
163def manhattan_distance(p, q):
164    return minkowsky_dist(p, q, m=1)
165
166def euclidean_dist(p, q):
167    return minkowsky_dist(p, q, m=2)
168
169def variance_dist(p, q):
170    return euclidean_dist(p, q) ** 2
171
172def max_dist(p, q):
173    p, q = normalize_both(p, q)
174    return max([abs(p[i]-q[i]) for i in range(len(p))])
175
176def hellinger_dist(p, q):
177    p, q = normalize_both(p, q)
178    dist = 0
179    for i in range(len(p)):
180        dist += (math.sqrt(p[i])-math.sqrt(q[i])) ** 2
181    return dist
182
183def my_log(x):
184    return 0 if x == 0 else x * math.log(x)
185
def kullback_leibler(p, q):
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        # KL divergence: sum of p_i * log(p_i / q_i); zero components are skipped
        if p[i] > 0 and q[i] > 0:
            dist += p[i] * math.log(p[i] / q[i])
    return dist
192
def cosine(p, q):
    p, q = normalize_both(p, q)
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
197
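# A short illustrative sketch (not in the original source): the distance helpers
# above applied to two hand-built discrete class distributions.
def _example_distribution_distances():
    p = Orange.statistics.distribution.Discrete([0.7, 0.2, 0.1])
    q = Orange.statistics.distribution.Discrete([0.5, 0.3, 0.2])
    print "Manhattan:", manhattan_distance(p, q)
    print "Euclidean:", euclidean_dist(p, q)
    print "Hellinger (squared form):", hellinger_dist(p, q)
    print "Kullback-Leibler:", kullback_leibler(p, q)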
198
[0]199class Estimate:
200    """
201    Reliability estimate. Contains attributes that describe the results of
202    reliability estimation.
203
204    .. attribute:: estimate
205
206        A numerical reliability estimate.
207
208    .. attribute:: signed_or_absolute
209
210        Determines whether the method used gives a signed or absolute result.
211        Has a value of either :obj:`SIGNED` or :obj:`ABSOLUTE`.
212
213    .. attribute:: method
214
215        An integer ID of reliability estimation method used.
216
217    .. attribute:: method_name
218
219        Name (string) of reliability estimation method used.
220
221    """
[40]222    def __init__(self, estimate, signed_or_absolute, method):
[0]223        self.estimate = estimate
224        self.signed_or_absolute = signed_or_absolute
225        self.method = method
226        self.method_name = METHOD_NAME[method]
227        self.text_description = None
228
229class DescriptiveAnalysis:
[14]230    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66], name="da"):
[0]231        self.desc = desc
232        self.procentage = procentage
233        self.estimator = estimator
[14]234        self.name = name
[0]235
236    def __call__(self, instances, weight=None, **kwds):
237
238        # Calculate borders using cross validation
239        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
240        all_borders = []
241        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
242            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
243            sorted_estimates = sorted(abs(x) for x in estimates)
244            borders = [sorted_estimates[int(len(estimates) * p) - 1]  for p in self.procentage]
245            all_borders.append(borders)
246
247        # Learn on whole train data
248        estimator_classifier = self.estimator(instances)
249
250        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
251
252class DescriptiveAnalysisClassifier:
253    def __init__(self, estimator_classifier, all_borders, desc):
254        self.estimator_classifier = estimator_classifier
255        self.all_borders = all_borders
256        self.desc = desc
257
258    def __call__(self, instance, result_type=Orange.core.GetValue):
259        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)
260
261        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
262            estimate.text_description = self.desc[0]
263            for lower_border, text_desc in zip(borders, self.desc):
264                if estimate.estimate >= lower_border:
265                    estimate.text_description = text_desc
266
267        # Return the appropriate type of result
268        if result_type == Orange.core.GetValue:
269            return predicted
270        elif result_type == Orange.core.GetProbabilities:
271            return probabilities
272        else:
273            return predicted, probabilities
274
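# A hedged usage sketch (not part of the original module): wrap a reliability
# learner with DescriptiveAnalysis to obtain textual descriptions
# ("high"/"medium"/"low") of the estimates.  The data set and learner names are
# illustrative assumptions.
def _example_descriptive_analysis():
    data = Orange.data.Table("housing")
    learner = Orange.regression.linear.LinearRegressionLearner()
    reliability = Learner(learner, estimators=[Mahalanobis(k=3)])
    descriptive = DescriptiveAnalysis(reliability)
    classifier = descriptive(data)
    value, probabilities = classifier(data[0], Orange.core.GetBoth)
    for estimate in probabilities.reliability_estimate:
        print estimate.method_name, estimate.estimate, estimate.text_description
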
275class SensitivityAnalysis:
276    """
277   
[53]278    :param e: Values of :math:`\epsilon`.
[0]279    :type e: list of floats
280   
281    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`
282   
[53]283    To estimate the reliability of prediction for a given instance,
284    the learning set is extended with that instance with the label changes to
285    :math:`K + \epsilon (l_{max} - l_{min})` (:math:`K` is  the initial prediction,
286    :math:`\epsilon` a sensitivity parameter, and :math:`l_{min}` and
287    :math:`l_{max}` the lower and upper bounds of labels on training data)
288    Results for multiple values of :math:`\epsilon` are combined
289    into SAvar and SAbias. SAbias can be used either in a signed or absolute form.
[0]290
291    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`
292    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
293   
294   
295    """
[14]296    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0], name="sa"):
[0]297        self.e = e
[14]298        self.name = name
[0]299
300    def __call__(self, instances, learner):
301        min_value = max_value = instances[0].getclass().value
302        for ex in instances:
303            if ex.getclass().value > max_value:
304                max_value = ex.getclass().value
305            if ex.getclass().value < min_value:
306                min_value = ex.getclass().value
307        return SensitivityAnalysisClassifier(self.e, instances, min_value, max_value, learner)
308
309class SensitivityAnalysisClassifier:
310    def __init__(self, e, instances, min_value, max_value, learner):
311        self.e = e
312        self.instances = instances
313        self.max_value = max_value
314        self.min_value = min_value
315        self.learner = learner
316
317    def __call__(self, instance, predicted, probabilities):
318        # Create new dataset
319        r_data = Orange.data.Table(self.instances)
320
321        # Create new instance
322        modified_instance = Orange.data.Instance(instance)
323
324        # Append it to the data
325        r_data.append(modified_instance)
326
327        # Calculate SAvar & SAbias
328        SAvar = SAbias = 0
329
330        for eps in self.e:
331            # +epsilon
332            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
333            c = self.learner(r_data)
334            k_plus = c(instance, Orange.core.GetValue)
335
336            # -epsilon
337            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
338            c = self.learner(r_data)
339            k_minus = c(instance, Orange.core.GetValue)
340            #print len(r_data)
341            #print eps*(self.max_value - self.min_value)
342            #print k_plus
343            #print k_minus
344            # calculate part SAvar and SAbias
345            SAvar += k_plus.value - k_minus.value
346            SAbias += k_plus.value + k_minus.value - 2 * predicted.value
347
348        SAvar /= len(self.e)
349        SAbias /= 2 * len(self.e)
350
351        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
352                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
353                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
354
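# A hedged sketch (not part of the original module): reading SAvar and SAbias for a
# single instance.  The "housing" data set name is an assumption.
def _example_sensitivity_analysis():
    data = Orange.data.Table("housing")
    learner = Orange.regression.linear.LinearRegressionLearner()
    reliability = Learner(learner, estimators=[SensitivityAnalysis(e=[0.01, 0.1, 0.5])])
    classifier = reliability(data)
    value, probabilities = classifier(data[0], Orange.core.GetBoth)
    for estimate in probabilities.reliability_estimate:
        # SAvar (absolute), SAbias (signed) and SAbias (absolute)
        print estimate.method_name, estimate.estimate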
[10]355
356
357class ReferenceExpectedError:
[13]358    """
[10]359
[13]360    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`
361
[53]362    Reference estimate for classification: :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`, where :math:`\hat y` is the estimated probability of the predicted class [Pevec2011]_.
[13]363
[53]364    A greater estimate means a greater expected error.
[13]365
366    """
[10]367    def __init__(self, name="reference"):
368        self.name = name
369
370    def __call__(self, instances, learner):
371        classifier = learner(instances)
372        return ReferenceExpectedErrorClassifier(classifier)
373
374   
375class ReferenceExpectedErrorClassifier:
376
377    def __init__(self, classifier):
378        self.classifier = classifier
379
380    def __call__(self, instance, *args):
381        y_hat = max(self.classifier(instance, Orange.classification.Classifier.GetProbabilities))
382        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
383
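# A small sketch (not from the original source): with a predicted class probability
# of 0.8 the reference expected error is 2 * 0.8 * (1 - 0.8) = 0.32.  The "iris"
# data set and kNNLearner are illustrative assumptions.
def _example_reference_expected_error():
    data = Orange.data.Table("iris")
    reliability = Learner(Orange.classification.knn.kNNLearner(k=5),
                          estimators=[ReferenceExpectedError()])
    classifier = reliability(data)
    value, probabilities = classifier(data[0], Orange.core.GetBoth)
    print value, probabilities.reliability_estimate[0].estimate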
384
[0]385class BaggingVariance:
386    """
387   
388    :param m: Number of bagging models to be used with BAGV estimate
389    :type m: int
390   
    :param for_instances: Optional. If test instances
      are given as a parameter, this class can compute their reliabilities
      on the fly, which saves memory.

    :type for_instances: Orange.data.Table
396   
[0]397    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
398   
[53]399   
400    :math:`m` different bagging models are used to estimate
401    the value of dependent variable for a given instance. For regression,
402    the variance of predictions is a reliability
403    estimate:
[0]404
[53]405    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`, where
406    :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
407    predictions of individual models.
[5]408
    For classification, the BAGV estimate is 1 minus the average Euclidean
    distance between the class probability distribution predicted by the model
    and the distributions predicted by the individual bagged models. For
    classification, a greater value implies a better prediction.

    This reliability measure can run out of memory if the individual
    classifiers themselves use a lot of memory; it needs :math:`m` times the
    memory of a single classifier.
[0]418    """
[37]419    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
[53]420
[0]421        self.m = m
[9]422        self.name = name
[37]423        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
424        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
425        self.for_instances = for_instances
[0]426
427    def __call__(self, instances, learner):
428        classifiers = []
429
[5]430        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
431            classifier = learner(instances)
432        else:
433            classifier = None
434
[37]435        for_inst_class = defaultdict(list)
436        this_iteration = None
437       
438        if self.for_instances:
439            his = map(_hashable_instance, self.for_instances)
440
[0]441        # Create bagged classifiers using sampling with replacement
[37]442        for i in xrange(self.m):
443            this_iteration = set()
444            selection = self.select_with_repeat(len(instances))
[0]445            data = instances.select(selection)
[37]446            cl = learner(data)
447            if cl:
448                if self.for_instances: # predict reliability for testing instances and throw cl away
449                    for instance, hi in zip(self.for_instances, his):
450                        if hi not in this_iteration:
451                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
452                            this_iteration.add(hi)
453                else:
454                    classifiers.append(cl)
455
456        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))
[0]457
458class BaggingVarianceClassifier:
[37]459    def __init__(self, classifiers, classifier=None, for_inst_class=None):
[0]460        self.classifiers = classifiers
[8]461        self.classifier = classifier
[37]462        self.for_inst_class = for_inst_class
[5]463
464    def __call__(self, instance, *args):
[0]465        BAGV = 0
466
467        # Calculate the bagging variance
[37]468        if self.for_inst_class:
469            bagged_values = self.for_inst_class[_hashable_instance(instance)]
470        else:
471            bagged_values = [ _bagged_value(instance, c, self.classifier) for c in self.classifiers ]
472
[0]473        k = sum(bagged_values) / len(bagged_values)
474
475        BAGV = sum((bagged_value - k) ** 2 for bagged_value in bagged_values) / len(bagged_values)
[5]476        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
477            BAGV = 1 - BAGV
[0]478
479        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
480
[37]481def _hashable_instance(instance):
482    return tuple(instance[i].value for i in range(len(instance.domain.attributes)))
483
484def _bagged_value(instance, c, classifier):
485    if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
486        return c(instance, Orange.core.GetValue).value
487    elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
488        estimate = classifier(instance, Orange.core.GetProbabilities)
489        return euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate)
490
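# A hedged sketch (not in the original module): BAGV with for_instances, so each
# bagged model is discarded right after scoring the supplied test instances.  The
# data set name and the 400-instance split are assumptions.
def _example_bagging_variance():
    data = Orange.data.Table("housing")
    instances = list(data)
    train, test = Orange.data.Table(instances[:400]), instances[400:]
    bagv = BaggingVariance(m=20, for_instances=test)
    reliability = Learner(Orange.regression.linear.LinearRegressionLearner(),
                          estimators=[bagv])
    classifier = reliability(train)
    for instance in test:
        value, probabilities = classifier(instance, Orange.core.GetBoth)
        print value, probabilities.reliability_estimate[0].estimate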
491
[0]492class LocalCrossValidation:
493    """
[5]494
[53]495    :param k: Number of nearest neighbours used. Default: 0, which denotes
496        1/20 of data set size (or 5, whichever is greater).
[0]497    :type k: int
[5]498
[53]499    :param distance: Function that computes a distance between two discrete
[5]500        distributions (used only in classification problems). The default
501        is Hellinger distance.
502    :type distance: function
503
[53]504    :param distance_weighted: For classification,
[5]505        use an average distance between distributions, weighted by :math:`e^{-d}`,
506        where :math:`d` is the distance between predicted instance and the
507        neighbour.
508
[0]509    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`
[5]510
    Leave-one-out validation is
    performed on the :math:`k` nearest neighbours of the given instance.
    The reliability estimate for regression is then the distance-weighted
    absolute prediction error. For classification, it is 1 minus the average
    distance between the predicted class probability distribution and the
    (trivial) probability distributions of the nearest neighbours.
[0]517    """
[9]518    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
[0]519        self.k = k
[5]520        self.distance = distance
521        self.distance_weighted = distance_weighted
[9]522        self.name = name
[0]523
524    def __call__(self, instances, learner):
525        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
526        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()
527
528        distance_id = Orange.feature.Descriptor.new_meta_id()
529        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
530
531        if self.k == 0:
532            self.k = max(5, len(instances) / 20)
533
[5]534        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, self.k, learner,
535            distance=self.distance, distance_weighted=self.distance_weighted)
[0]536
537class LocalCrossValidationClassifier:
[5]538    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
[0]539        self.distance_id = distance_id
540        self.nearest_neighbours = nearest_neighbours
541        self.k = k
542        self.learner = learner
[5]543        for a,b in kwds.items():
544            setattr(self, a, b)
[0]545
546    def __call__(self, instance, *args):
547        LCVer = 0
548        LCVdi = 0
549
550        # Find k nearest neighbors
551
552        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]
553
554        # leave one out of prediction error
555        for i in xrange(len(knn)):
556            train = knn[:]
557            del train[i]
558
559            classifier = self.learner(Orange.data.Table(train))
560
[5]561            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
562                returned_value = classifier(knn[i], Orange.core.GetValue)
563                e = abs(knn[i].getclass().value - returned_value.value)
[0]564
[5]565            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
566                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
567                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
568                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))
[0]569
[5]570            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
571            LCVer += e * dist
572            LCVdi += dist
[0]573
574        LCV = LCVer / LCVdi if LCVdi != 0 else 0
575        if math.isnan(LCV):
576            LCV = 0.0
[5]577
578        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
579            LCV = 1 - LCV
580
[0]581        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
582
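# A hedged sketch (not part of the original source): LCV on a classification problem
# with an explicit neighbourhood size.  The "iris" data set and kNNLearner are
# assumptions used only for illustration.
def _example_local_cross_validation():
    data = Orange.data.Table("iris")
    lcv = LocalCrossValidation(k=10, distance=hellinger_dist)
    reliability = Learner(Orange.classification.knn.kNNLearner(k=5), estimators=[lcv])
    classifier = reliability(data)
    value, probabilities = classifier(data[0], Orange.core.GetBoth)
    print value, probabilities.reliability_estimate[0].estimate
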
583class CNeighbours:
584    """
585   
[53]586    :param k: Number of nearest neighbours.
[0]587    :type k: int
[5]588
589    :param distance: function that computes a distance between two discrete
590        distributions (used only in classification problems). The default
591        is Hellinger distance.
592    :type distance: function
[0]593   
594    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`
595   
    For regression, CNK is defined as the difference
    between the average label of the :math:`k` nearest neighbours and the
    prediction. CNK can be either signed or absolute. A greater value implies
    greater prediction error.

    For classification, CNK is equal to 1 minus the average distance between
    the predicted class distribution and the (trivial) class distributions of the
    :math:`k` nearest neighbours from the learning set. A greater value implies better prediction.
[0]603   
604    """
[9]605    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
[0]606        self.k = k
[5]607        self.distance = distance
[9]608        self.name = name
[0]609
610    def __call__(self, instances, learner):
611        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
612        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()
613
614        distance_id = Orange.feature.Descriptor.new_meta_id()
615        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
[5]616        return CNeighboursClassifier(nearest_neighbours, self.k, distance=self.distance)
[0]617
618class CNeighboursClassifier:
[8]619    def __init__(self, nearest_neighbours, k, distance):
[0]620        self.nearest_neighbours = nearest_neighbours
621        self.k = k
[8]622        self.distance = distance
[0]623
624    def __call__(self, instance, predicted, probabilities):
625        CNK = 0
626
627        # Find k nearest neighbors
628
629        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]
630
631        # average label of neighbors
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
633            for ex in knn:
634                CNK += ex.getclass().value
635            CNK /= self.k
636            CNK -= predicted.value
[0]637
[5]638            return [Estimate(CNK, SIGNED, CNK_SIGNED),
639                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
641            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
642            knn_c = knn_l(knn)
643            for ex in knn:
644                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
645            CNK /= self.k
646            CNK += 1
[0]647
[5]648            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
[0]649
650class Mahalanobis:
651    """
652   
653    :param k: Number of nearest neighbours used in Mahalanobis estimate.
654    :type k: int
655   
656    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`
657   
658    Mahalanobis distance reliability estimate is defined as
[42]659    `Mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
[0]660    to the evaluated instance's :math:`k` nearest neighbours.
661
662   
663    """
[14]664    def __init__(self, k=3, name="mahalanobis"):
[0]665        self.k = k
[14]666        self.name = name
[0]667
668    def __call__(self, instances, *args):
669        nnm = Orange.classification.knn.FindNearestConstructor()
670        nnm.distanceConstructor = Orange.distance.Mahalanobis()
671
672        mid = Orange.feature.Descriptor.new_meta_id()
673        nnm = nnm(instances, 0, mid)
674        return MahalanobisClassifier(self.k, nnm, mid)
675
676class MahalanobisClassifier:
677    def __init__(self, k, nnm, mid):
678        self.k = k
679        self.nnm = nnm
680        self.mid = mid
681
682    def __call__(self, instance, *args):
683        mahalanobis_distance = 0
684
685        mahalanobis_distance = sum(ex[self.mid].value for ex in self.nnm(instance, self.k))
686
687        return [ Estimate(mahalanobis_distance, ABSOLUTE, MAHAL_ABSOLUTE) ]
688
689class MahalanobisToCenter:
690    """
691    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`
692   
693    Mahalanobis distance to center reliability estimate is defined as a
[42]694    `Mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
[0]695    between the predicted instance and the centroid of the data.
696
697   
698    """
[14]699    def __init__(self, name="mahalanobis to center"):
700        self.name = name
[0]701
702    def __call__(self, instances, *args):
703        dc = Orange.core.DomainContinuizer()
704        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
705        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
706        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues
707
708        new_domain = dc(instances)
709        new_instances = instances.translate(new_domain)
710
711        X, _, _ = new_instances.to_numpy()
712        instance_avg = numpy.average(X, 0)
713
714        distance_constructor = Orange.distance.Mahalanobis()
715        distance = distance_constructor(new_instances)
716
717        average_instance = Orange.data.Instance(new_instances.domain, list(instance_avg) + ["?"])
718
719        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
720
721class MahalanobisToCenterClassifier:
722    def __init__(self, distance, average_instance, new_domain):
723        self.distance = distance
724        self.average_instance = average_instance
725        self.new_domain = new_domain
726
727    def __call__(self, instance, *args):
728
729        inst = Orange.data.Instance(self.new_domain, instance)
730
731        mahalanobis_to_center = self.distance(inst, self.average_instance)
732
733        return [ Estimate(mahalanobis_to_center, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE) ]
734
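# A hedged sketch (not from the original module): comparing the two Mahalanobis-based
# estimates for a single instance.  The data set name is an assumption.
def _example_mahalanobis_estimates():
    data = Orange.data.Table("housing")
    reliability = Learner(Orange.regression.linear.LinearRegressionLearner(),
                          estimators=[Mahalanobis(k=3), MahalanobisToCenter()])
    classifier = reliability(data)
    value, probabilities = classifier(data[0], Orange.core.GetBoth)
    for estimate in probabilities.reliability_estimate:
        print estimate.method_name, estimate.estimate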
735
736class BaggingVarianceCNeighbours:
737    """
738   
739    :param bagv: Instance of Bagging Variance estimator.
740    :type bagv: :class:`BaggingVariance`
741   
742    :param cnk: Instance of CNK estimator.
743    :type cnk: :class:`CNeighbours`
744   
745    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
746   
    BVCK is the average of the BAGV and the absolute CNK estimates (bagging
    variance and local modeling of prediction error).
749   
750    """
[37]751    def __init__(self, bagv=None, cnk=None, name="bvck"):
752        if bagv is None:
753            bagv = BaggingVariance()
754        if cnk is None:
755            cnk = CNeighbours()
[0]756        self.bagv = bagv
757        self.cnk = cnk
        self.name = name
[0]759
760    def __call__(self, instances, learner):
761        bagv_classifier = self.bagv(instances, learner)
762        cnk_classifier = self.cnk(instances, learner)
763        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
764
765class BaggingVarianceCNeighboursClassifier:
766    def __init__(self, bagv_classifier, cnk_classifier):
767        self.bagv_classifier = bagv_classifier
768        self.cnk_classifier = cnk_classifier
769
770    def __call__(self, instance, predicted, probabilities):
771        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
772        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)
773
774        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
775        bvck_estimates = [ Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE) ]
776        bvck_estimates.extend(bagv_estimates)
777        bvck_estimates.extend(cnk_estimates)
778        return bvck_estimates
779
780class ErrorPredicting:
[14]781    def __init__(self, name = "ep"):
782        self.name = name
[0]783
784    def __call__(self, instances, learner):
785        res = Orange.evaluation.testing.cross_validation([learner], instances)
786        prediction_errors = get_prediction_error_list(res)
787
788        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
789        new_dataset = Orange.data.Table(new_domain, instances)
790
791        for instance, prediction_error in izip(new_dataset, prediction_errors):
792            instance.set_class(prediction_error)
793
794        rf = Orange.ensemble.forest.RandomForestLearner()
795        rf_classifier = rf(new_dataset)
796
797        return ErrorPredictingClassification(rf_classifier, new_domain)
798
799class ErrorPredictingClassification:
800    def __init__(self, rf_classifier, new_domain):
801        self.rf_classifier = rf_classifier
802        self.new_domain = new_domain
803
804    def __call__(self, instance, predicted, probabilities):
805        new_instance = Orange.data.Instance(self.new_domain, instance)
806        value = self.rf_classifier(new_instance, Orange.core.GetValue)
807
808        return [Estimate(value.value, SIGNED, SABIAS_SIGNED)]
809
[5]810def gauss_kernel(x, sigma=1):
811    return 1./(sigma*math.sqrt(2*math.pi)) * math.exp(-1./2*(x/sigma)**2)
812
813class ParzenWindowDensityBased:
814    """
815    :param K: kernel function. Default: gaussian.
816    :type K: function
817
818    :param d_measure: distance measure for inter-instance distance.
819    :type d_measure: :class:`Orange.distance.DistanceConstructor`
820
821    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`
822
823    Returns a value that estimates a density of problem space around the
824    instance being predicted.
825    """
[9]826    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
[5]827        self.K = K
828        self.d_measure = d_measure
[9]829        self.name = name
[5]830
[11]831    def __call__(self, instances, learner):
[5]832
833        self.distance = self.d_measure(instances)
834
835        def density(x):
836            l, dens = len(instances), 0
837            for ex in instances:
838                dens += self.K(self.distance(x,ex))
839            return dens / l
840
841        max_density = max([density(ex) for ex in instances])
842
843        return ParzenWindowDensityBasedClassifier(density, max_density)
844
845class ParzenWindowDensityBasedClassifier:
846
847    def __init__(self, density, max_density):
848        self.density = density
849        self.max_density = max_density
850
851
852    def __call__(self, instance, *args):
853
854        DENS = self.max_density-self.density(instance)
855
856        return [Estimate(DENS, ABSOLUTE, DENS_ABSOLUTE)]
857
[42]858
859def _normalize(data):
860    dc = Orange.core.DomainContinuizer()
861    dc.classTreatment = Orange.core.DomainContinuizer.Ignore
862    dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeByVariance
863    domain = dc(data)
864    data = data.translate(domain)
865    return data
866
867class _NormalizedLearner(Orange.classification.Learner):
868    """
869    Wrapper for normalization.
870    """
871    def __init__(self, learner):
872        self.learner = learner
873
874    def __call__(self, data, *args, **kwargs):
875        return self.learner(_normalize(data), *args, **kwargs)
876
[37]877class Stacking:
[42]878    """
[37]879
    This method develops a model that integrates reliability estimates
    from all available reliability scoring techniques (see [Wolpert1992]_ and [Dzeroski2004]_). It
    performs internal cross-validation and therefore takes roughly the same time
    as :class:`ICV`.

    :param stack_learner: a data modelling method. Default (if None): unregularized linear regression with prior normalization.
    :type stack_learner: :obj:`Orange.classification.Learner`

    :param estimators: Reliability estimation methods to choose from. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
    :type estimators: :obj:`list` of reliability estimators

    :param folds: The number of folds for cross validation (default 10).
    :type folds: :obj:`int`

    :param save_data: If True, save the data used for training the
        integration model into the resulting classifier's ``data`` attribute (default False).
    :type save_data: :obj:`bool`
897 
898    """
899 
900    def __init__(self, 
901        stack_learner=None, 
902        estimators=None, 
903        folds=10, 
904        save_data=False):
[37]905        self.stack_learner = stack_learner
906        self.estimators = estimators
907        self.folds = folds
908        self.save_data = save_data
[42]909        if self.stack_learner is None:
910            self.stack_learner=_NormalizedLearner(Orange.regression.linear.LinearRegressionLearner(ridge_lambda=0.0))
[38]911        if self.estimators is None:
912             self.estimators = [SensitivityAnalysis(),
913                           LocalCrossValidation(),
914                           BaggingVarianceCNeighbours(),
915                           Mahalanobis(),
916                           MahalanobisToCenter()]
[37]917   
918    def __call__(self, data, learner):
919
920        newfeatures = None
921       
922        if self.folds > 1:
923
924            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
925            data_cv = [ None ] * len(data)
926            for f in set(cvi): #for each fold
927                learn = data.select(cvi, f, negate=True)
928                test = data.select(cvi, f)
929
930                #learn reliability estimates for the learning set
931                lf = Learner(learner, estimators=self.estimators)(learn)
932               
933                #pos is used to retain the order of instances
934                for ex, pos in zip(test, [ i for i,n in enumerate(cvi) if n == f ]):
935                    pred = lf(ex, Orange.core.GetBoth)
936                    re = pred[1].reliability_estimate
937                    names = [ e.method_name for e in re ]
938                    assert newfeatures is None or names == newfeatures
939                    newfeatures = names
940                    estimates = [ abs(e.estimate) for e in re ]
941                    error = ex[-1].value - pred[0].value
942                    data_cv[pos] = estimates + [ abs(error) ]
943
944        else:
945 
946            #use half of the data to learn reliability estimates
947            #and the other half for induction of a stacking classifier
948            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
949            data_cv = []
950
951            learn = data.select(cvi, 0, negate=True)
952            test = data.select(cvi, 0)
953
954            #learn reliability estimates for the learning set
955            lf = Learner(learner, estimators=self.estimators)(learn)
956           
957            for ex in test:
958                pred = lf(ex, Orange.core.GetBoth)
959                re = pred[1].reliability_estimate
960                names = [ e.method_name for e in re ]
961                assert newfeatures is None or names == newfeatures
962                newfeatures = names
963                estimates = [ abs(e.estimate) for e in re ]
964                error = ex[-1].value - pred[0].value
965                data_cv.append(estimates + [ abs(error) ])
966
967        lf = None
968
969        #induce the classifier on cross-validated reliability estimates
970        newfeatures = [ Orange.feature.Continuous(name=n) for n in newfeatures ]
971        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
972        classifier_data = Orange.data.Table(newdomain, data_cv)
973        stack_classifier = self.stack_learner(classifier_data)
974
975        #induce reliability estimates on the whole data set
976        lf = Learner(learner, estimators=self.estimators)(data)
977
[42]978        return StackingClassifier(stack_classifier, lf, newdomain, data=classifier_data if self.save_data else None)
[37]979
980
981class StackingClassifier:
982
[42]983    def __init__(self, stacking_classifier, reliability_classifier, domain, data=None):
[37]984        self.stacking_classifier = stacking_classifier
985        self.domain = domain
986        self.reliability_classifier = reliability_classifier
[42]987        self.data = data
[37]988
989    def convert(self, instance):
990        """ Return example in the space of reliability estimates. """
991        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
992        #take absolute values for all
993        tex = [ abs(e.estimate) for e in re ] + [ "?" ]
994        tex =  Orange.data.Instance(self.domain, tex)
995        return tex
996
997    def __call__(self, instance, *args):
998        tex = self.convert(instance)
999        r = self.stacking_classifier(tex)
1000        r = float(r)
1001        r = max(0., r)
1002        return [ Estimate(r, ABSOLUTE, STACKING) ]
1003
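# A hedged usage sketch (not in the original source): Stacking combines its default
# estimators through internal cross-validation.  The "housing" data set name is an
# assumption; note that this is roughly as expensive as ICV.
def _example_stacking():
    data = Orange.data.Table("housing")
    learner = Orange.regression.linear.LinearRegressionLearner()
    reliability = Learner(learner, estimators=[Stacking(folds=5)])
    classifier = reliability(data)
    value, probabilities = classifier(data[0], Orange.core.GetBoth)
    print value, probabilities.reliability_estimate[0].estimate
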
1004class ICV:
[42]1005    """ Selects the best reliability estimator for
1006    the given data with internal cross validation [Bosnic2010]_.
1007
    :param estimators: reliability estimation methods to choose from. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
    :type estimators: :obj:`list` of reliability estimators

    :param folds: The number of folds for cross validation (default 10).
    :type folds: :obj:`int`
1013 
[37]1014    """
1015 
[38]1016    def __init__(self, estimators=None, folds=10):
[37]1017        self.estimators = estimators
[38]1018        if self.estimators is None:
1019             self.estimators = [SensitivityAnalysis(),
1020                           LocalCrossValidation(),
1021                           BaggingVarianceCNeighbours(),
1022                           Mahalanobis(),
1023                           MahalanobisToCenter()]
[37]1024        self.folds = folds
1025   
1026    def __call__(self, data, learner):
1027
1028        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
1029        sum_of_rs = defaultdict(float)
[38]1030        n_rs = defaultdict(int)
[37]1031
1032        elearner = Learner(learner, estimators=self.estimators)
1033
1034        #average correlations from each fold
1035        for f in set(cvi):
1036            learn = data.select(cvi, f, negate=True)
1037            test = data.select(cvi, f)
1038
1039            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
1040            results = get_pearson_r(res)
[38]1041   
[37]1042            for r, p, sa, method in results:
[38]1043                if not math.isnan(r): #ignore NaN values
1044                    sum_of_rs[(method, sa)] += r
1045                    n_rs[(method, sa)] += 1 
[37]1046
[38]1047        avg_rs = [ (k,(sum_of_rs[k]/n_rs[k])) for k in sum_of_rs ]
1048
1049        avg_rs = sorted(avg_rs, key=lambda estimate: estimate[1], reverse=True)
1050        chosen = avg_rs[0][0]
[37]1051
1052        lf = elearner(data)
1053        return ICVClassifier(chosen, lf)
1054
1055
1056class ICVClassifier:
1057
1058    def __init__(self, chosen, reliability_classifier):
1059        self.chosen = chosen
1060        self.reliability_classifier = reliability_classifier
1061
1062    def __call__(self, instance, *args):
1063        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
1064        for e in re:
1065            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
1066                r = e.estimate
1067
1068        return [ Estimate(r, self.chosen[1], ICV_METHOD) ]
1069
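# A hedged sketch (not part of the original module): ICV picks the estimator whose
# estimates correlate best with the prediction error on internal cross-validation
# folds.  The data set name is an assumption.
def _example_icv():
    data = Orange.data.Table("housing")
    learner = Orange.regression.linear.LinearRegressionLearner()
    reliability = Learner(learner, estimators=[ICV(folds=5)])
    classifier = reliability(data)
    value, probabilities = classifier(data[0], Orange.core.GetBoth)
    print value, probabilities.reliability_estimate[0].estimate
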
[0]1070class Learner:
1071    """
[42]1072    Adds reliability estimation to any learner: multiple reliability estimation
1073    algorithms can be used simultaneously.
1074    This learner can be used as any other learner,
1075    but returns the classifier wrapped into an instance of
[0]1076    :class:`Orange.evaluation.reliability.Classifier`.
1077   
[42]1078    :param box_learner: Learner to wrap into a reliability estimation
[0]1079        classifier.
1080    :type box_learner: :obj:`~Orange.classification.Learner`
1081   
[42]1082    :param estimators: List of reliability estimation methods. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
[0]1083    :type estimators: :obj:`list` of reliability estimators
1084   
[42]1085    :param name: Name of this reliability learner.
[0]1086    :type name: string
1087   
1088    :rtype: :class:`Orange.evaluation.reliability.Learner`
1089    """
1090    def __init__(self, box_learner, name="Reliability estimation",
[38]1091                 estimators=None,
[0]1092                 **kwds):
1093        self.__dict__.update(kwds)
1094        self.name = name
1095        self.estimators = estimators
[38]1096        if self.estimators is None:
1097             self.estimators = [SensitivityAnalysis(),
1098                           LocalCrossValidation(),
1099                           BaggingVarianceCNeighbours(),
1100                           Mahalanobis(),
1101                           MahalanobisToCenter()]
1102 
[0]1103        self.box_learner = box_learner
1104        self.blending = False
1105
1106
1107    def __call__(self, instances, weight=None, **kwds):
1108        """Learn from the given table of data instances.
1109       
[42]1110        :param instances: Data to learn from.
[0]1111        :type instances: Orange.data.Table
1112        :param weight: Id of meta attribute with weights of instances
1113        :type weight: int
[42]1114
[0]1115        :rtype: :class:`Orange.evaluation.reliability.Classifier`
1116        """
1117
1118        blending_classifier = None
1119        new_domain = None
1120
1121#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
1122#            raise Exception("This method only works on data with continuous class.")
1123
1124        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)
[37]1125 
[0]1126class Classifier:
1127    """
[42]1128    A reliability estimation wrapper for classifiers.
1129    The returned probabilities contain an
1130    additional attribute :obj:`reliability_estimate`, which is a list of
1131    :class:`~Orange.evaluation.reliability.Estimate` (see :obj:`~Classifier.__call__`).
[0]1132    """
1133
1134    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
1135        self.__dict__.update(kwds)
1136        self.instances = instances
1137        self.box_learner = box_learner
1138        self.estimators = estimators
1139        self.blending = blending
1140        self.blending_domain = blending_domain
1141        self.rf_classifier = rf_classifier
1142
1143        # Train the learner with original data
1144        self.classifier = box_learner(instances)
1145
1146        # Train all the estimators and create their classifiers
1147        self.estimation_classifiers = [estimator(instances, box_learner) for estimator in estimators]
1148
1149    def __call__(self, instance, result_type=Orange.core.GetValue):
1150        """
1151        Classify and estimate reliability of estimation for a new instance.
1152        When :obj:`result_type` is set to
1153        :obj:`Orange.classification.Classifier.GetBoth` or
1154        :obj:`Orange.classification.Classifier.GetProbabilities`,
[42]1155        an additional attribute :obj:`reliability_estimate`
1156        (a list of :class:`~Orange.evaluation.reliability.Estimate`)
[0]1157        is added to the distribution object.
1158       
1159        :param instance: instance to be classified.
1160        :type instance: :class:`Orange.data.Instance`
1161        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
1162              :class:`Orange.classification.Classifier.GetProbabilities` or
1163              :class:`Orange.classification.Classifier.GetBoth`
1164       
1165        :rtype: :class:`Orange.data.Value`,
1166              :class:`Orange.statistics.Distribution` or a tuple with both
1167        """
1168        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)
1169
1170        # Create a place holder for estimates
1171        if probabilities is None:
1172            probabilities = Orange.statistics.distribution.Continuous()
1173        #with warnings.catch_warnings():
1174        #    warnings.simplefilter("ignore")
1175        probabilities.setattr('reliability_estimate', [])
1176
1177        # Calculate all the estimates and add them to the results
1178        for estimate in self.estimation_classifiers:
1179            probabilities.reliability_estimate.extend(estimate(instance, predicted, probabilities))
1180
1181        # Return the appropriate type of result
1182        if result_type == Orange.core.GetValue:
1183            return predicted
1184        elif result_type == Orange.core.GetProbabilities:
1185            return probabilities
1186        else:
1187            return predicted, probabilities
[5]1188
1189# Functions for testing and plotting
1190#TODO Document those.
1191def get_acc_rel(method, data, learner):
1192    estimators = [method]
1193    reliability = Orange.evaluation.reliability.Learner(learner, estimators=estimators)
1194    #results = Orange.evaluation.testing.leave_one_out([reliability], data)
1195    results = Orange.evaluation.testing.cross_validation([reliability], data)
1196
1197    rels, acc = [], []
1198
1199    for res in results.results:
1200        rels.append(res.probabilities[0].reliability_estimate[0].estimate)
1201        acc.append(res.probabilities[0][res.actual_class])
1202
1203    return rels, acc
1204
[12]1205
1206def rel_acc_plot(rels, acc, file_name=None, colors=None):
[5]1207
1208    import matplotlib.pylab as plt
[12]1209   
1210    if colors is None:
1211        colors = "k"
1212    plt.scatter(rels, acc, c=colors)
1213    plt.xlim(0.,1.)
1214    plt.ylim(ymin=0.)
1215    plt.xlabel("Reliability")
1216    plt.ylabel("Accuracy")
1217    if file_name is None:
1218        plt.show()
1219    else:
1220        plt.savefig(file_name)
1221
def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):

    import matplotlib.pylab as plt
    plt.clf()

    rels, acc = get_acc_rel(method, data, learner)
    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)
1228   
[5]1229
1230def acc_rel_correlation(method, data, learner):
1231    import scipy.stats
1232    rels, acc = get_acc_rel(method, data, learner)
[37]1233    return scipy.stats.spearmanr(acc, rels)[0]
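
# A hedged sketch (not in the original source): rank-correlate reliability estimates
# with prediction accuracy for a single classification estimator.  The "iris" data
# set and kNNLearner are assumptions; acc_rel_correlation also requires scipy.
def _example_acc_rel_correlation():
    data = Orange.data.Table("iris")
    learner = Orange.classification.knn.kNNLearner(k=5)
    print acc_rel_correlation(LocalCrossValidation(), data, learner)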