source: orange-reliability/orangecontrib/reliability/__init__.py @ 42:75bf74617e81

1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
12# All the estimator method constants
13SAVAR_ABSOLUTE = 0
14SABIAS_SIGNED = 1
15SABIAS_ABSOLUTE = 2
16BAGV_ABSOLUTE = 3
17CNK_SIGNED = 4
18CNK_ABSOLUTE = 5
19LCV_ABSOLUTE = 6
20BVCK_ABSOLUTE = 7
21MAHAL_ABSOLUTE = 8
22BLENDING_ABSOLUTE = 9
23ICV_METHOD = 10
24MAHAL_TO_CENTER_ABSOLUTE = 13
25DENS_ABSOLUTE = 14
26ERR_ABSOLUTE = 15
27STACKING = 101
28
29# Type of estimator constant
30SIGNED = 0
31ABSOLUTE = 1
32
33# Names of all the estimator methods
34METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
35               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
36               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
37               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
38               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
39               101: "Stacking" }
40
41def get_reliability_estimation_list(res, i):
42    return [ result.probabilities[0].reliability_estimate[i].estimate for result in res.results], \
43        res.results[0].probabilities[0].reliability_estimate[i].signed_or_absolute, \
44        res.results[0].probabilities[0].reliability_estimate[i].method
45
46def get_prediction_error_list(res):
47    return [result.actual_class - result.classes[0] for result in res.results]
48
49def get_description_list(res, i):
50    return [result.probabilities[0].reliability_estimate[i].text_description for result in res.results]
51
52def get_pearson_r(res):
53    """
54    :param res: results of evaluation, done using learners,
55        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
56    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
57
58    Return Pearson's coefficient between the prediction error and each of the
59    used reliability estimates. Also, return the p-value of each of
60    the coefficients.
61    """
62    prediction_error = get_prediction_error_list(res)
63    results = []
64    for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
65        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
66        try:
67            if signed_or_absolute == SIGNED:
68                r, p = statc.pearsonr(prediction_error, reliability_estimate)
69            else:
70                r, p = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
71        except Exception:
72            r = p = float("NaN")
73        results.append((r, p, signed_or_absolute, method))
74    return results
75
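# --- Illustrative sketch (not part of the original module) -----------------
# A hedged example of how get_pearson_r() is typically used: cross-validate a
# reliability-wrapped learner (the Learner class defined near the end of this
# module) and correlate prediction errors with the estimates.  The "housing"
# data set and the kNN regressor are assumptions for illustration only.
def _example_get_pearson_r():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn)
    res = Orange.evaluation.testing.cross_validation([reliability], housing)
    for r, p, signed_or_absolute, method in get_pearson_r(res):
        print "%s: r=%.3f, p=%.3f" % (METHOD_NAME[method], r, p)
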
76def get_spearman_r(res):
77    """
78    :param res: results of evaluation, done using learners,
79        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
80    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
81
82    Return Spearman's coefficient between the prediction error and each of the
83    used reliability estimates. Also, return the p-value of each of
84    the coefficients.
85    """
86    prediction_error = get_prediction_error_list(res)
87    results = []
88    for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
89        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
90        try:
91            if signed_or_absolute == SIGNED:
92                r, p = statc.spearmanr(prediction_error, reliability_estimate)
93            else:
94                r, p = statc.spearmanr([abs(pe) for pe in prediction_error], reliability_estimate)
95        except Exception:
96            r = p = float("NaN")
97        results.append((r, p, signed_or_absolute, method))
98    return results
99
100def get_pearson_r_by_iterations(res):
101    """
102    :param res: results of evaluation, done using learners,
103        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
104    :type res: :class:`Orange.evaluation.testing.ExperimentResults`
105
106    Return average Pearson's coefficient over all folds between prediction error
107    and each of the used estimates.
108    """
109    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
110    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
111    number_of_instances = len(res.results)
112    number_of_folds = len(results_by_fold)
113    results = [0 for _ in xrange(number_of_estimates)]
114    sig = [0 for _ in xrange(number_of_estimates)]
115    method_list = [0 for _ in xrange(number_of_estimates)]
116
117    for res in results_by_fold:
118        prediction_error = get_prediction_error_list(res)
119        for i in xrange(number_of_estimates):
120            reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
121            try:
122                if signed_or_absolute == SIGNED:
123                    r, _ = statc.pearsonr(prediction_error, reliability_estimate)
124                else:
125                    r, _ = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
126            except Exception:
127                r = float("NaN")
128            results[i] += r
129            sig[i] = signed_or_absolute
130            method_list[i] = method
131
132    # Calculate p-values
133    results = [float(res) / number_of_folds for res in results]
134    ps = [p_value_from_r(r, number_of_instances) for r in results]
135
136    return zip(results, ps, sig, method_list)
137
138def p_value_from_r(r, n):
139    """
140    Calculate the p-value from the Pearson correlation coefficient and the sample size.
141    """
142    df = n - 2
143    t = r * (df / ((-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30))) ** 0.5
144    return statc.betai (df * 0.5, 0.5, df / (df + t * t))
145
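# Hedged worked example (an assumption, not from the original docs):
# p_value_from_r() implements the usual t-test for a correlation coefficient,
# t = r * sqrt(df / (1 - r^2)) with df = n - 2, evaluated through the
# incomplete beta function.
def _example_p_value_from_r():
    # r = 0.5 on a sample of 30 instances gives a two-tailed p of roughly 0.005
    print p_value_from_r(0.5, 30)
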
146
147# Distances between two discrete probability distributions
148#TODO Document those.
149def normalize_both(p, q):
150    if not p.normalized:
151        p.normalize()
152    if not q.normalized:
153        q.normalize()
154    return p, q
155
156def minkowsky_dist(p, q, m=2):
157    p, q = normalize_both(p, q)
158    dist = 0
159    for i in range(len(p)):
160        dist += abs(p[i]-q[i])**m
161    return dist**(1./m)
162
163def manhattan_distance(p, q):
164    return minkowsky_dist(p, q, m=1)
165
166def euclidean_dist(p, q):
167    return minkowsky_dist(p, q, m=2)
168
169def variance_dist(p, q):
170    return euclidean_dist(p, q) ** 2
171
172def max_dist(p, q):
173    p, q = normalize_both(p, q)
174    return max([abs(p[i]-q[i]) for i in range(len(p))])
175
176def hellinger_dist(p, q):
177    p, q = normalize_both(p, q)
178    dist = 0
179    for i in range(len(p)):
180        dist += (math.sqrt(p[i])-math.sqrt(q[i])) ** 2
181    return dist
182
183def my_log(x):
184    return 0 if x == 0 else x * math.log(x)
185
186def kullback_leibler(p, q):
187    p, q = normalize_both(p, q)
188    dist = 0
189    for i in range(len(p)):
190        dist += p[i] * math.log(p[i] / q[i]) if p[i] > 0 and q[i] > 0 else 0  # p_i * log(p_i / q_i); zero terms skipped
191    return dist
192
193def cosine(p, q):
194    p, q = normalize_both(p, q)
195    p, q = [pp for pp in p], [qq for qq in q]
196    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
197
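# Hedged sketch (not part of the original module): the distance helpers above
# operate on discrete class distributions; constructing them directly from
# lists of probabilities, as below, is an assumption used only for illustration.
def _example_distribution_distances():
    p = Orange.statistics.distribution.Discrete([0.7, 0.2, 0.1])
    q = Orange.statistics.distribution.Discrete([0.5, 0.3, 0.2])
    print "Hellinger:", hellinger_dist(p, q)
    print "Euclidean:", euclidean_dist(p, q)
    print "KL divergence:", kullback_leibler(p, q)
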
198
199class Estimate:
200    """
201    Reliability estimate. Contains attributes that describe the results of
202    reliability estimation.
203
204    .. attribute:: estimate
205
206        A numerical reliability estimate.
207
208    .. attribute:: signed_or_absolute
209
210        Determines whether the method used gives a signed or absolute result.
211        Has a value of either :obj:`SIGNED` or :obj:`ABSOLUTE`.
212
213    .. attribute:: method
214
215        An integer ID of reliability estimation method used.
216
217    .. attribute:: method_name
218
219        Name (string) of reliability estimation method used.
220
221    """
222    def __init__(self, estimate, signed_or_absolute, method):
223        self.estimate = estimate
224        self.signed_or_absolute = signed_or_absolute
225        self.method = method
226        self.method_name = METHOD_NAME[method]
227        self.text_description = None
228
229class DescriptiveAnalysis:
230    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66], name="da"):
231        self.desc = desc
232        self.procentage = procentage
233        self.estimator = estimator
234        self.name = name
235
236    def __call__(self, instances, weight=None, **kwds):
237
238        # Calculate borders using cross validation
239        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
240        all_borders = []
241        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
242            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
243            sorted_estimates = sorted(abs(x) for x in estimates)
244            borders = [sorted_estimates[int(len(estimates) * p) - 1]  for p in self.procentage]
245            all_borders.append(borders)
246
247        # Learn on whole train data
248        estimator_classifier = self.estimator(instances)
249
250        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
251
252class DescriptiveAnalysisClassifier:
253    def __init__(self, estimator_classifier, all_borders, desc):
254        self.estimator_classifier = estimator_classifier
255        self.all_borders = all_borders
256        self.desc = desc
257
258    def __call__(self, instance, result_type=Orange.core.GetValue):
259        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)
260
261        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
262            estimate.text_description = self.desc[0]
263            for lower_border, text_desc in zip(borders, self.desc):
264                if estimate.estimate >= lower_border:
265                    estimate.text_description = text_desc
266
267        # Return the appropriate type of result
268        if result_type == Orange.core.GetValue:
269            return predicted
270        elif result_type == Orange.core.GetProbabilities:
271            return probabilities
272        else:
273            return predicted, probabilities
274
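# Hedged usage sketch (not in the original module): DescriptiveAnalysis wraps
# a reliability Learner (defined later in this module) and turns its numeric
# estimates into "high"/"medium"/"low" text descriptions.  The "housing" data
# set, the kNN regressor and the Mahalanobis estimator are assumptions.
def _example_descriptive_analysis():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    wrapped = DescriptiveAnalysis(Learner(knn, estimators=[Mahalanobis()]))
    classifier = wrapped(housing)
    value, probs = classifier(housing[0], Orange.core.GetBoth)
    for est in probs.reliability_estimate:
        print est.method_name, est.estimate, est.text_description
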
275class SensitivityAnalysis:
276    """
277   
278    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
279        reliability estimates.
280    :type e: list of floats
281   
282    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`
283   
284    To estimate the reliability of prediction for given instance,
285    the learning set is extended with this instance, labeled with
286    :math:`K + \epsilon (l_{max} - l_{min})`,
287    where :math:`K` denotes the initial prediction,
288    :math:`\epsilon` is the sensitivity parameter and :math:`l_{min}` and
289    :math:`l_{max}` denote the lower and upper bounds of the learning
290    instances' labels. After computing sensitivity predictions
291    using different values of :math:`\epsilon`, the predictions are combined
292    into SAvar and SAbias. SAbias can be used in a signed or absolute form.
293
294    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`
295
296    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
297   
298   
299    """
300    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0], name="sa"):
301        self.e = e
302        self.name = name
303
304    def __call__(self, instances, learner):
305        min_value = max_value = instances[0].getclass().value
306        for ex in instances:
307            if ex.getclass().value > max_value:
308                max_value = ex.getclass().value
309            if ex.getclass().value < min_value:
310                min_value = ex.getclass().value
311        return SensitivityAnalysisClassifier(self.e, instances, min_value, max_value, learner)
312
313class SensitivityAnalysisClassifier:
314    def __init__(self, e, instances, min_value, max_value, learner):
315        self.e = e
316        self.instances = instances
317        self.max_value = max_value
318        self.min_value = min_value
319        self.learner = learner
320
321    def __call__(self, instance, predicted, probabilities):
322        # Create new dataset
323        r_data = Orange.data.Table(self.instances)
324
325        # Create new instance
326        modified_instance = Orange.data.Instance(instance)
327
328        # Append it to the data
329        r_data.append(modified_instance)
330
331        # Calculate SAvar & SAbias
332        SAvar = SAbias = 0
333
334        for eps in self.e:
335            # +epsilon
336            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
337            c = self.learner(r_data)
338            k_plus = c(instance, Orange.core.GetValue)
339
340            # -epsilon
341            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
342            c = self.learner(r_data)
343            k_minus = c(instance, Orange.core.GetValue)
344            #print len(r_data)
345            #print eps*(self.max_value - self.min_value)
346            #print k_plus
347            #print k_minus
348            # accumulate the partial SAvar and SAbias terms
349            SAvar += k_plus.value - k_minus.value
350            SAbias += k_plus.value + k_minus.value - 2 * predicted.value
351
352        SAvar /= len(self.e)
353        SAbias /= 2 * len(self.e)
354
355        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
356                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
357                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
358
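# Hedged usage sketch (not in the original module): SensitivityAnalysis is
# meant for regression; here it is plugged into the Learner wrapper defined
# later in this module.  The "housing" data set and the kNN regressor are
# illustrative assumptions.
def _example_sensitivity_analysis():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn, estimators=[SensitivityAnalysis(e=[0.01, 0.1, 0.5])])
    classifier = reliability(housing)
    value, probs = classifier(housing[0], Orange.core.GetBoth)
    # SAvar, signed SAbias and absolute SAbias, in that order
    for est in probs.reliability_estimate:
        print est.method_name, est.estimate
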
359
360
361class ReferenceExpectedError:
362    """
363
364    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`
365
366    Reference reliability estimation method for classification [Pevec2011]_:
367
368    :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`,
369
370    where :math:`\hat y` is the estimated probability of the predicted class.
371
372    Note that for this method, in contrast with all others, a greater estimate means lower reliability (greater expected error).
373
374    """
375    def __init__(self, name="reference"):
376        self.name = name
377
378    def __call__(self, instances, learner):
379        classifier = learner(instances)
380        return ReferenceExpectedErrorClassifier(classifier)
381
382   
383class ReferenceExpectedErrorClassifier:
384
385    def __init__(self, classifier):
386        self.classifier = classifier
387
388    def __call__(self, instance, *args):
389        y_hat = max(self.classifier(instance, Orange.classification.Classifier.GetProbabilities))
390        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
391
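# Hedged usage sketch (not in the original module): ReferenceExpectedError is
# a classification method, so a discrete data set and classifier are needed.
# The "iris" data set and the naive Bayes learner are illustrative assumptions.
def _example_reference_expected_error():
    iris = Orange.data.Table("iris")
    bayes = Orange.classification.bayes.NaiveLearner()
    reliability = Learner(bayes, estimators=[ReferenceExpectedError()])
    classifier = reliability(iris)
    value, probs = classifier(iris[0], Orange.core.GetBoth)
    # a greater estimate means a greater expected error (lower reliability)
    print probs.reliability_estimate[0].estimate
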
392
393class BaggingVariance:
394    """
395   
396    :param m: Number of bagging models to be used with BAGV estimate
397    :type m: int
398   
399    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
400   
401    :math:`m` different bagging models are constructed and used to estimate
402    the value of dependent variable for a given instance. In regression,
403    the variance of those predictions is used as a prediction reliability
404    estimate.
405
406    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`
407
408    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
409    predictions of individual constructed models. Note that a greater value
410    implies greater error.
411
412    For classification, 1 minus the average Euclidean distance between class
413    probability distributions predicted by the model, and distributions
414    predicted by the individual bagged models, is used as the BAGV reliability
415    measure. Note that in this case a greater value implies a better
416    prediction.
417   
418    This reliability measure can quickly run out of memory if the individual
419    classifiers use a lot of memory, as it builds :math:`m` of them, thereby
420    using :math:`m` times the memory of a single classifier. If the instances
421    whose reliability will be estimated are given through the ``for_instances``
422    parameter, only their reliability is computed, which saves memory.
423
424    """
425    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
426        """
427        for_instances: optional table of instances whose reliability will be estimated; bagged predictions are then precomputed only for them, which saves memory.
428        """
429        self.m = m
430        self.name = name
431        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
432        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
433        self.for_instances = for_instances
434
435    def __call__(self, instances, learner):
436        classifiers = []
437
438        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
439            classifier = learner(instances)
440        else:
441            classifier = None
442
443        for_inst_class = defaultdict(list)
444        this_iteration = None
445       
446        if self.for_instances:
447            his = map(_hashable_instance, self.for_instances)
448
449        # Create bagged classifiers using sampling with replacement
450        for i in xrange(self.m):
451            this_iteration = set()
452            selection = self.select_with_repeat(len(instances))
453            data = instances.select(selection)
454            cl = learner(data)
455            if cl:
456                if self.for_instances: # predict reliability for testing instances and throw cl away
457                    for instance, hi in zip(self.for_instances, his):
458                        if hi not in this_iteration:
459                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
460                            this_iteration.add(hi)
461                else:
462                    classifiers.append(cl)
463
464        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))
465
466class BaggingVarianceClassifier:
467    def __init__(self, classifiers, classifier=None, for_inst_class=None):
468        self.classifiers = classifiers
469        self.classifier = classifier
470        self.for_inst_class = for_inst_class
471
472    def __call__(self, instance, *args):
473        BAGV = 0
474
475        # Calculate the bagging variance
476        if self.for_inst_class:
477            bagged_values = self.for_inst_class[_hashable_instance(instance)]
478        else:
479            bagged_values = [ _bagged_value(instance, c, self.classifier) for c in self.classifiers ]
480
481        k = sum(bagged_values) / len(bagged_values)
482
483        BAGV = sum((bagged_value - k) ** 2 for bagged_value in bagged_values) / len(bagged_values)
484        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
485            BAGV = 1 - BAGV
486
487        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
488
489def _hashable_instance(instance):
490    return tuple(instance[i].value for i in range(len(instance.domain.attributes)))
491
492def _bagged_value(instance, c, classifier):
493    if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
494        return c(instance, Orange.core.GetValue).value
495    elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
496        estimate = classifier(instance, Orange.core.GetProbabilities)
497        return euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate)
498
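# Hedged usage sketch (not in the original module): BAGV on a regression
# problem; m=10 keeps the example fast, and for_instances could be passed to
# precompute bagged predictions only for a fixed test set to save memory.
# The "housing" data set and the kNN regressor are illustrative assumptions.
def _example_bagging_variance():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn, estimators=[BaggingVariance(m=10)])
    classifier = reliability(housing)
    value, probs = classifier(housing[0], Orange.core.GetBoth)
    print probs.reliability_estimate[0].estimate
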
499
500class LocalCrossValidation:
501    """
502
503    :param k: Number of nearest neighbours used in LCV estimate
504    :type k: int
505
506    :param distance: function that computes a distance between two discrete
507        distributions (used only in classification problems). The default
508        is Hellinger distance.
509    :type distance: function
510
511    :param distance_weighted: for classification reliability estimation,
512        use an average distance between distributions, weighted by :math:`e^{-d}`,
513        where :math:`d` is the distance between predicted instance and the
514        neighbour.
515
516    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`
517
518    :math:`k` nearest neighbours to the given instance are found and put in
519    a separate data set. On this data set, a leave-one-out validation is
520    performed. For regression, the reliability estimate is the distance-weighted
521    absolute prediction error. For classification, it is 1 minus the average
522    distance between the predicted class probability distribution and the
523    (trivial) probability distributions of the nearest neighbours.
524
525    If a special value 0 is passed as :math:`k` (as is by default),
526    it is set as 1/20 of data set size (or 5, whichever is greater).
527
528    Summary of the algorithm for regression:
529
530    1. Determine the set of k nearest neighbours :math:`N = { (x_1, c_1),...,
531       (x_k, c_k)}`.
532    2. On this set, compute leave-one-out predictions :math:`K_i` and
533       prediction errors :math:`E_i = | C_i - K_i |`.
534    3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`
535
536    """
537    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
538        self.k = k
539        self.distance = distance
540        self.distance_weighted = distance_weighted
541        self.name = name
542
543    def __call__(self, instances, learner):
544        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
545        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()
546
547        distance_id = Orange.feature.Descriptor.new_meta_id()
548        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
549
550        if self.k == 0:
551            self.k = max(5, len(instances) / 20)
552
553        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, self.k, learner,
554            distance=self.distance, distance_weighted=self.distance_weighted)
555
556class LocalCrossValidationClassifier:
557    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
558        self.distance_id = distance_id
559        self.nearest_neighbours = nearest_neighbours
560        self.k = k
561        self.learner = learner
562        for a,b in kwds.items():
563            setattr(self, a, b)
564
565    def __call__(self, instance, *args):
566        LCVer = 0
567        LCVdi = 0
568
569        # Find k nearest neighbors
570
571        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]
572
573        # leave-one-out estimate of the prediction error
574        for i in xrange(len(knn)):
575            train = knn[:]
576            del train[i]
577
578            classifier = self.learner(Orange.data.Table(train))
579
580            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
581                returned_value = classifier(knn[i], Orange.core.GetValue)
582                e = abs(knn[i].getclass().value - returned_value.value)
583
584            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
585                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
586                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
587                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))
588
589            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
590            LCVer += e * dist
591            LCVdi += dist
592
593        LCV = LCVer / LCVdi if LCVdi != 0 else 0
594        if math.isnan(LCV):
595            LCV = 0.0
596
597        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
598            LCV = 1 - LCV
599
600        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
601
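# Hedged usage sketch (not in the original module): LCV evaluated over a full
# cross-validation run and scored with get_spearman_r().  The "housing" data
# set and the kNN regressor are illustrative assumptions; the run is slow.
def _example_local_cross_validation():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn, estimators=[LocalCrossValidation(k=10)])
    res = Orange.evaluation.testing.cross_validation([reliability], housing)
    for r, p, signed_or_absolute, method in get_spearman_r(res):
        print "%s: r=%.3f, p=%.3f" % (METHOD_NAME[method], r, p)
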
602class CNeighbours:
603    """
604   
605    :param k: Number of nearest neighbours used in CNK estimate
606    :type k: int
607
608    :param distance: function that computes a distance between two discrete
609        distributions (used only in classification problems). The default
610        is Hellinger distance.
611    :type distance: function
612   
613    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`
614   
615    For regression, CNK is defined for an unlabeled instance as a difference
616    between average label of its nearest neighbours and its prediction. CNK
617    can be used as a signed or absolute estimate.
618   
619    :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`
620   
621    where :math:`k` denotes the number of neighbours, :math:`C_i` denotes the neighbours'
622    labels and :math:`K` denotes the instance's prediction. Note that a greater
623    value implies greater prediction error.
624
625    For classification, CNK is equal to 1 minus the average distance between
626    predicted class distribution and (trivial) class distributions of the
627    :math:`k` nearest neighbours from the learning set. Note that in this case
628    a greater value implies better prediction.
629   
630    """
631    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
632        self.k = k
633        self.distance = distance
634        self.name = name
635
636    def __call__(self, instances, learner):
637        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
638        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()
639
640        distance_id = Orange.feature.Descriptor.new_meta_id()
641        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
642        return CNeighboursClassifier(nearest_neighbours, self.k, distance=self.distance)
643
644class CNeighboursClassifier:
645    def __init__(self, nearest_neighbours, k, distance):
646        self.nearest_neighbours = nearest_neighbours
647        self.k = k
648        self.distance = distance
649
650    def __call__(self, instance, predicted, probabilities):
651        CNK = 0
652
653        # Find k nearest neighbors
654
655        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]
656
657        # average label of neighbors
658        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
659            for ex in knn:
660                CNK += ex.getclass().value
661            CNK /= self.k
662            CNK -= predicted.value
663
664            return [Estimate(CNK, SIGNED, CNK_SIGNED),
665                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
666        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
667            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
668            knn_c = knn_l(knn)
669            for ex in knn:
670                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
671            CNK /= self.k
672            CNK += 1
673
674            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
675
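# Hedged usage sketch (not in the original module): on regression data CNK
# returns both a signed and an absolute estimate.  The "housing" data set and
# the kNN regressor are illustrative assumptions.
def _example_cneighbours():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn, estimators=[CNeighbours(k=5)])
    classifier = reliability(housing)
    value, probs = classifier(housing[0], Orange.core.GetBoth)
    for est in probs.reliability_estimate:
        print est.method_name, est.estimate
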
676class Mahalanobis:
677    """
678   
679    :param k: Number of nearest neighbours used in Mahalanobis estimate.
680    :type k: int
681   
682    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`
683   
684    The Mahalanobis distance reliability estimate is defined as the sum of
685    `Mahalanobis distances <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
686    from the evaluated instance to its :math:`k` nearest neighbours.
687
688   
689    """
690    def __init__(self, k=3, name="mahalanobis"):
691        self.k = k
692        self.name = name
693
694    def __call__(self, instances, *args):
695        nnm = Orange.classification.knn.FindNearestConstructor()
696        nnm.distanceConstructor = Orange.distance.Mahalanobis()
697
698        mid = Orange.feature.Descriptor.new_meta_id()
699        nnm = nnm(instances, 0, mid)
700        return MahalanobisClassifier(self.k, nnm, mid)
701
702class MahalanobisClassifier:
703    def __init__(self, k, nnm, mid):
704        self.k = k
705        self.nnm = nnm
706        self.mid = mid
707
708    def __call__(self, instance, *args):
709        mahalanobis_distance = 0
710
711        mahalanobis_distance = sum(ex[self.mid].value for ex in self.nnm(instance, self.k))
712
713        return [ Estimate(mahalanobis_distance, ABSOLUTE, MAHAL_ABSOLUTE) ]
714
715class MahalanobisToCenter:
716    """
717    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`
718   
719    The Mahalanobis distance to center reliability estimate is defined as the
720    `Mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
721    between the predicted instance and the centroid of the data.
722
723   
724    """
725    def __init__(self, name="mahalanobis to center"):
726        self.name = name
727
728    def __call__(self, instances, *args):
729        dc = Orange.core.DomainContinuizer()
730        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
731        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
732        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues
733
734        new_domain = dc(instances)
735        new_instances = instances.translate(new_domain)
736
737        X, _, _ = new_instances.to_numpy()
738        instance_avg = numpy.average(X, 0)
739
740        distance_constructor = Orange.distance.Mahalanobis()
741        distance = distance_constructor(new_instances)
742
743        average_instance = Orange.data.Instance(new_instances.domain, list(instance_avg) + ["?"])
744
745        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
746
747class MahalanobisToCenterClassifier:
748    def __init__(self, distance, average_instance, new_domain):
749        self.distance = distance
750        self.average_instance = average_instance
751        self.new_domain = new_domain
752
753    def __call__(self, instance, *args):
754
755        inst = Orange.data.Instance(self.new_domain, instance)
756
757        mahalanobis_to_center = self.distance(inst, self.average_instance)
758
759        return [ Estimate(mahalanobis_to_center, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE) ]
760
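# Hedged usage sketch (not in the original module): the two Mahalanobis-based
# estimators used together.  The "housing" data set and the kNN regressor are
# illustrative assumptions.
def _example_mahalanobis():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn, estimators=[Mahalanobis(k=3), MahalanobisToCenter()])
    classifier = reliability(housing)
    value, probs = classifier(housing[0], Orange.core.GetBoth)
    for est in probs.reliability_estimate:
        print est.method_name, est.estimate
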
761
762class BaggingVarianceCNeighbours:
763    """
764   
765    :param bagv: Instance of Bagging Variance estimator.
766    :type bagv: :class:`BaggingVariance`
767   
768    :param cnk: Instance of CNK estimator.
769    :type cnk: :class:`CNeighbours`
770   
771    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
772   
773    BVCK is the average of the BAGV estimate and the absolute CNK
774    estimate.
775   
776    """
777    def __init__(self, bagv=None, cnk=None, name="bvck"):
778        if bagv is None:
779            bagv = BaggingVariance()
780        if cnk is None:
781            cnk = CNeighbours()
782        self.bagv = bagv
783        self.cnk = cnk
784        self.name = "bvck"
785
786    def __call__(self, instances, learner):
787        bagv_classifier = self.bagv(instances, learner)
788        cnk_classifier = self.cnk(instances, learner)
789        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
790
791class BaggingVarianceCNeighboursClassifier:
792    def __init__(self, bagv_classifier, cnk_classifier):
793        self.bagv_classifier = bagv_classifier
794        self.cnk_classifier = cnk_classifier
795
796    def __call__(self, instance, predicted, probabilities):
797        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
798        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)
799
800        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
801        bvck_estimates = [ Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE) ]
802        bvck_estimates.extend(bagv_estimates)
803        bvck_estimates.extend(cnk_estimates)
804        return bvck_estimates
805
806class ErrorPredicting:
807    def __init__(self, name = "ep"):
808        self.name = name
809
810    def __call__(self, instances, learner):
811        res = Orange.evaluation.testing.cross_validation([learner], instances)
812        prediction_errors = get_prediction_error_list(res)
813
814        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
815        new_dataset = Orange.data.Table(new_domain, instances)
816
817        for instance, prediction_error in izip(new_dataset, prediction_errors):
818            instance.set_class(prediction_error)
819
820        rf = Orange.ensemble.forest.RandomForestLearner()
821        rf_classifier = rf(new_dataset)
822
823        return ErrorPredictingClassification(rf_classifier, new_domain)
824
825class ErrorPredictingClassification:
826    def __init__(self, rf_classifier, new_domain):
827        self.rf_classifier = rf_classifier
828        self.new_domain = new_domain
829
830    def __call__(self, instance, predicted, probabilities):
831        new_instance = Orange.data.Instance(self.new_domain, instance)
832        value = self.rf_classifier(new_instance, Orange.core.GetValue)
833
834        return [Estimate(value.value, SIGNED, SABIAS_SIGNED)]
835
836def gauss_kernel(x, sigma=1):
837    return 1./(sigma*math.sqrt(2*math.pi)) * math.exp(-1./2*(x/sigma)**2)
838
839class ParzenWindowDensityBased:
840    """
841    :param K: kernel function. Default: gaussian.
842    :type K: function
843
844    :param d_measure: distance measure for inter-instance distance.
845    :type d_measure: :class:`Orange.distance.DistanceConstructor`
846
847    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`
848
849    Returns a value that estimates the density of the problem space around the
850    instance being predicted; a greater value corresponds to a sparser region.
851    """
852    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
853        self.K = K
854        self.d_measure = d_measure
855        self.name = name
856
857    def __call__(self, instances, learner):
858
859        self.distance = self.d_measure(instances)
860
861        def density(x):
862            l, dens = len(instances), 0
863            for ex in instances:
864                dens += self.K(self.distance(x,ex))
865            return dens / l
866
867        max_density = max([density(ex) for ex in instances])
868
869        return ParzenWindowDensityBasedClassifier(density, max_density)
870
871class ParzenWindowDensityBasedClassifier:
872
873    def __init__(self, density, max_density):
874        self.density = density
875        self.max_density = max_density
876
877
878    def __call__(self, instance, *args):
879
880        DENS = self.max_density-self.density(instance)
881
882        return [Estimate(DENS, ABSOLUTE, DENS_ABSOLUTE)]
883
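# Hedged usage sketch (not in the original module): density-based estimation
# on a classification problem.  The "iris" data set and the naive Bayes
# learner are illustrative assumptions.
def _example_parzen_density():
    iris = Orange.data.Table("iris")
    bayes = Orange.classification.bayes.NaiveLearner()
    reliability = Learner(bayes, estimators=[ParzenWindowDensityBased()])
    classifier = reliability(iris)
    value, probs = classifier(iris[0], Orange.core.GetBoth)
    # greater values correspond to sparser regions of the input space
    print probs.reliability_estimate[0].estimate
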
884
885def _normalize(data):
886    dc = Orange.core.DomainContinuizer()
887    dc.classTreatment = Orange.core.DomainContinuizer.Ignore
888    dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeByVariance
889    domain = dc(data)
890    data = data.translate(domain)
891    return data
892
893class _NormalizedLearner(Orange.classification.Learner):
894    """
895    Wrapper for normalization.
896    """
897    def __init__(self, learner):
898        self.learner = learner
899
900    def __call__(self, data, *args, **kwargs):
901        return self.learner(_normalize(data), *args, **kwargs)
902
903class Stacking:
904    """
905
906    This method builds a model that integrates reliability estimates
907    from all available reliability scoring techniques. To build such a
908    model it performs internal cross-validation, similarly to :class:`ICV`.
909
910    :param stack_learner: a data modelling method. Default (if None): unregularized linear regression with prior normalization.
911    :type stack_learner: :obj:`Orange.classification.Learner`
912
913    :param estimators: Reliability estimation methods to choose from. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
914    :type estimators: :obj:`list` of reliability estimators
915
916    :param folds: The number of folds for internal cross-validation (default 10).
917    :type folds: :obj:`int`
918
919    :param save_data: If True, the data used for training the integration
920        model is stored in the resulting classifier's .data attribute (default False).
921    :type save_data: :obj:`bool`
922 
923    """
924 
925    def __init__(self, 
926        stack_learner=None, 
927        estimators=None, 
928        folds=10, 
929        save_data=False):
930        self.stack_learner = stack_learner
931        self.estimators = estimators
932        self.folds = folds
933        self.save_data = save_data
934        if self.stack_learner is None:
935            self.stack_learner=_NormalizedLearner(Orange.regression.linear.LinearRegressionLearner(ridge_lambda=0.0))
936        if self.estimators is None:
937             self.estimators = [SensitivityAnalysis(),
938                           LocalCrossValidation(),
939                           BaggingVarianceCNeighbours(),
940                           Mahalanobis(),
941                           MahalanobisToCenter()]
942   
943    def __call__(self, data, learner):
944
945        newfeatures = None
946       
947        if self.folds > 1:
948
949            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
950            data_cv = [ None ] * len(data)
951            for f in set(cvi): #for each fold
952                learn = data.select(cvi, f, negate=True)
953                test = data.select(cvi, f)
954
955                #learn reliability estimates for the learning set
956                lf = Learner(learner, estimators=self.estimators)(learn)
957               
958                #pos is used to retain the order of instances
959                for ex, pos in zip(test, [ i for i,n in enumerate(cvi) if n == f ]):
960                    pred = lf(ex, Orange.core.GetBoth)
961                    re = pred[1].reliability_estimate
962                    names = [ e.method_name for e in re ]
963                    assert newfeatures is None or names == newfeatures
964                    newfeatures = names
965                    estimates = [ abs(e.estimate) for e in re ]
966                    error = ex[-1].value - pred[0].value
967                    data_cv[pos] = estimates + [ abs(error) ]
968
969        else:
970 
971            #use half of the data to learn reliability estimates
972            #and the other half for induction of a stacking classifier
973            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
974            data_cv = []
975
976            learn = data.select(cvi, 0, negate=True)
977            test = data.select(cvi, 0)
978
979            #learn reliability estimates for the learning set
980            lf = Learner(learner, estimators=self.estimators)(learn)
981           
982            for ex in test:
983                pred = lf(ex, Orange.core.GetBoth)
984                re = pred[1].reliability_estimate
985                names = [ e.method_name for e in re ]
986                assert newfeatures is None or names == newfeatures
987                newfeatures = names
988                estimates = [ abs(e.estimate) for e in re ]
989                error = ex[-1].value - pred[0].value
990                data_cv.append(estimates + [ abs(error) ])
991
992        lf = None
993
994        #induce the classifier on cross-validated reliability estimates
995        newfeatures = [ Orange.feature.Continuous(name=n) for n in newfeatures ]
996        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
997        classifier_data = Orange.data.Table(newdomain, data_cv)
998        stack_classifier = self.stack_learner(classifier_data)
999
1000        #induce reliability estimates on the whole data set
1001        lf = Learner(learner, estimators=self.estimators)(data)
1002
1003        return StackingClassifier(stack_classifier, lf, newdomain, data=classifier_data if self.save_data else None)
1004
1005
1006class StackingClassifier:
1007
1008    def __init__(self, stacking_classifier, reliability_classifier, domain, data=None):
1009        self.stacking_classifier = stacking_classifier
1010        self.domain = domain
1011        self.reliability_classifier = reliability_classifier
1012        self.data = data
1013
1014    def convert(self, instance):
1015        """ Return example in the space of reliability estimates. """
1016        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
1017        #take absolute values for all
1018        tex = [ abs(e.estimate) for e in re ] + [ "?" ]
1019        tex =  Orange.data.Instance(self.domain, tex)
1020        return tex
1021
1022    def __call__(self, instance, *args):
1023        tex = self.convert(instance)
1024        r = self.stacking_classifier(tex)
1025        r = float(r)
1026        r = max(0., r)
1027        return [ Estimate(r, ABSOLUTE, STACKING) ]
1028
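# Hedged usage sketch (not in the original module): Stacking is itself used as
# an estimator inside the Learner wrapper defined below.  The "housing" data
# set and the kNN regressor are illustrative assumptions; the internal
# cross-validation makes this comparatively slow.
def _example_stacking():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn, estimators=[Stacking(folds=5)])
    classifier = reliability(housing)
    value, probs = classifier(housing[0], Orange.core.GetBoth)
    print probs.reliability_estimate[0].estimate
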
1029class ICV:
1030    """ Selects the best reliability estimator for
1031    the given data with internal cross validation [Bosnic2010]_.
1032
1033    :param estimators: reliability estimation methods to choose from. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
1034    :type estimators: :obj:`list` of reliability estimators
1035
1036    :param folds: The number of folds for internal cross-validation (default 10).
1037    :type folds: :obj:`int`
1038 
1039    """
1040 
1041    def __init__(self, estimators=None, folds=10):
1042        self.estimators = estimators
1043        if self.estimators is None:
1044             self.estimators = [SensitivityAnalysis(),
1045                           LocalCrossValidation(),
1046                           BaggingVarianceCNeighbours(),
1047                           Mahalanobis(),
1048                           MahalanobisToCenter()]
1049        self.folds = folds
1050   
1051    def __call__(self, data, learner):
1052
1053        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
1054        sum_of_rs = defaultdict(float)
1055        n_rs = defaultdict(int)
1056
1057        elearner = Learner(learner, estimators=self.estimators)
1058
1059        #average correlations from each fold
1060        for f in set(cvi):
1061            learn = data.select(cvi, f, negate=True)
1062            test = data.select(cvi, f)
1063
1064            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
1065            results = get_pearson_r(res)
1066   
1067            for r, p, sa, method in results:
1068                if not math.isnan(r): #ignore NaN values
1069                    sum_of_rs[(method, sa)] += r
1070                    n_rs[(method, sa)] += 1 
1071
1072        avg_rs = [ (k,(sum_of_rs[k]/n_rs[k])) for k in sum_of_rs ]
1073
1074        avg_rs = sorted(avg_rs, key=lambda estimate: estimate[1], reverse=True)
1075        chosen = avg_rs[0][0]
1076
1077        lf = elearner(data)
1078        return ICVClassifier(chosen, lf)
1079
1080
1081class ICVClassifier:
1082
1083    def __init__(self, chosen, reliability_classifier):
1084        self.chosen = chosen
1085        self.reliability_classifier = reliability_classifier
1086
1087    def __call__(self, instance, *args):
1088        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
1089        for e in re:
1090            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
1091                r = e.estimate
1092
1093        return [ Estimate(r, self.chosen[1], ICV_METHOD) ]
1094
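# Hedged usage sketch (not in the original module): ICV picks the single best
# estimator on internal cross-validation and reports only its estimate.  The
# "housing" data set and the kNN regressor are illustrative assumptions.
def _example_icv():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn, estimators=[ICV(folds=5)])
    classifier = reliability(housing)
    value, probs = classifier(housing[0], Orange.core.GetBoth)
    est = probs.reliability_estimate[0]
    print est.method_name, est.estimate
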
1095class Learner:
1096    """
1097    Adds reliability estimation to any learner: multiple reliability estimation
1098    algorithms can be used simultaneously.
1099    This learner can be used as any other learner,
1100    but returns the classifier wrapped into an instance of
1101    :class:`Orange.evaluation.reliability.Classifier`.
1102   
1103    :param box_learner: Learner to wrap into a reliability estimation
1104        classifier.
1105    :type box_learner: :obj:`~Orange.classification.Learner`
1106   
1107    :param estimators: List of reliability estimation methods. Default (if None): :class:`SensitivityAnalysis`, :class:`LocalCrossValidation`, :class:`BaggingVarianceCNeighbours`, :class:`Mahalanobis`, :class:`MahalanobisToCenter`.
1108    :type estimators: :obj:`list` of reliability estimators
1109   
1110    :param name: Name of this reliability learner.
1111    :type name: string
1112   
1113    :rtype: :class:`Orange.evaluation.reliability.Learner`
1114    """
1115    def __init__(self, box_learner, name="Reliability estimation",
1116                 estimators=None,
1117                 **kwds):
1118        self.__dict__.update(kwds)
1119        self.name = name
1120        self.estimators = estimators
1121        if self.estimators is None:
1122             self.estimators = [SensitivityAnalysis(),
1123                           LocalCrossValidation(),
1124                           BaggingVarianceCNeighbours(),
1125                           Mahalanobis(),
1126                           MahalanobisToCenter()]
1127 
1128        self.box_learner = box_learner
1129        self.blending = False
1130
1131
1132    def __call__(self, instances, weight=None, **kwds):
1133        """Learn from the given table of data instances.
1134       
1135        :param instances: Data to learn from.
1136        :type instances: Orange.data.Table
1137        :param weight: Id of meta attribute with weights of instances
1138        :type weight: int
1139
1140        :rtype: :class:`Orange.evaluation.reliability.Classifier`
1141        """
1142
1143        blending_classifier = None
1144        new_domain = None
1145
1146#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
1147#            raise Exception("This method only works on data with continuous class.")
1148
1149        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)
1150 
1151class Classifier:
1152    """
1153    A reliability estimation wrapper for classifiers.
1154    The returned probabilities contain an
1155    additional attribute :obj:`reliability_estimate`, which is a list of
1156    :class:`~Orange.evaluation.reliability.Estimate` (see :obj:`~Classifier.__call__`).
1157    """
1158
1159    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
1160        self.__dict__.update(kwds)
1161        self.instances = instances
1162        self.box_learner = box_learner
1163        self.estimators = estimators
1164        self.blending = blending
1165        self.blending_domain = blending_domain
1166        self.rf_classifier = rf_classifier
1167
1168        # Train the learner with original data
1169        self.classifier = box_learner(instances)
1170
1171        # Train all the estimators and create their classifiers
1172        self.estimation_classifiers = [estimator(instances, box_learner) for estimator in estimators]
1173
1174    def __call__(self, instance, result_type=Orange.core.GetValue):
1175        """
1176        Classify and estimate reliability of estimation for a new instance.
1177        When :obj:`result_type` is set to
1178        :obj:`Orange.classification.Classifier.GetBoth` or
1179        :obj:`Orange.classification.Classifier.GetProbabilities`,
1180        an additional attribute :obj:`reliability_estimate`
1181        (a list of :class:`~Orange.evaluation.reliability.Estimate`)
1182        is added to the distribution object.
1183       
1184        :param instance: instance to be classified.
1185        :type instance: :class:`Orange.data.Instance`
1186        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
1187              :class:`Orange.classification.Classifier.GetProbabilities` or
1188              :class:`Orange.classification.Classifier.GetBoth`
1189       
1190        :rtype: :class:`Orange.data.Value`,
1191              :class:`Orange.statistics.Distribution` or a tuple with both
1192        """
1193        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)
1194
1195        # Create a place holder for estimates
1196        if probabilities is None:
1197            probabilities = Orange.statistics.distribution.Continuous()
1198        #with warnings.catch_warnings():
1199        #    warnings.simplefilter("ignore")
1200        probabilities.setattr('reliability_estimate', [])
1201
1202        # Calculate all the estimates and add them to the results
1203        for estimate in self.estimation_classifiers:
1204            probabilities.reliability_estimate.extend(estimate(instance, predicted, probabilities))
1205
1206        # Return the appropriate type of result
1207        if result_type == Orange.core.GetValue:
1208            return predicted
1209        elif result_type == Orange.core.GetProbabilities:
1210            return probabilities
1211        else:
1212            return predicted, probabilities
1213
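# Hedged end-to-end sketch (not in the original module): wrap a learner with
# the default set of estimators and read all estimates for one prediction.
# The "housing" data set and the kNN regressor are illustrative assumptions.
def _example_reliability_learner():
    housing = Orange.data.Table("housing")
    knn = Orange.classification.knn.kNNLearner()
    reliability = Learner(knn)
    classifier = reliability(housing)
    value, probs = classifier(housing[0], Orange.core.GetBoth)
    print "Prediction:", value
    for est in probs.reliability_estimate:
        print est.method_name, est.estimate
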
1214# Functions for testing and plotting
1215#TODO Document those.
1216def get_acc_rel(method, data, learner):
1217    estimators = [method]
1218    reliability = Orange.evaluation.reliability.Learner(learner, estimators=estimators)
1219    #results = Orange.evaluation.testing.leave_one_out([reliability], data)
1220    results = Orange.evaluation.testing.cross_validation([reliability], data)
1221
1222    rels, acc = [], []
1223
1224    for res in results.results:
1225        rels.append(res.probabilities[0].reliability_estimate[0].estimate)
1226        acc.append(res.probabilities[0][res.actual_class])
1227
1228    return rels, acc
1229
1230
1231def rel_acc_plot(rels, acc, file_name=None, colors=None):
1232
1233    import matplotlib.pylab as plt
1234   
1235    if colors is None:
1236        colors = "k"
1237    plt.scatter(rels, acc, c=colors)
1238    plt.xlim(0.,1.)
1239    plt.ylim(ymin=0.)
1240    plt.xlabel("Reliability")
1241    plt.ylabel("Accuracy")
1242    if file_name is None:
1243        plt.show()
1244    else:
1245        plt.savefig(file_name)
1246
1247def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
1248    import matplotlib.pylab as plt
1249    plt.clf()
1250
1251    rels, acc = get_acc_rel(method, data, learner)
1252    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)
1253   
1254
1255def acc_rel_correlation(method, data, learner):
1256    import scipy.stats
1257    rels, acc = get_acc_rel(method, data, learner)
1258    return scipy.stats.spearmanr(acc, rels)[0]