source: orange-reliability/orangecontrib/reliability/__init__.py @ 38:7aabb3dd2321

Revision 38:7aabb3dd2321, 45.1 KB, checked in by markotoplak, 7 months ago

Added default estimators for ICV and Stacking. Fixed a bug in ICV if a correction is "nan".

import Orange

import random
from Orange import statc
import math
import warnings
import numpy

from collections import defaultdict
from itertools import izip

# All the estimator method constants
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14
ERR_ABSOLUTE = 15
STACKING = 101

# Type of estimator constant
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
               101: "Stacking"}

def get_reliability_estimation_list(res, i):
    return [result.probabilities[0].reliability_estimate[i].estimate for result in res.results], \
        res.results[0].probabilities[0].reliability_estimate[i].signed_or_absolute, \
        res.results[0].probabilities[0].reliability_estimate[i].method

def get_prediction_error_list(res):
    return [result.actual_class - result.classes[0] for result in res.results]

def get_description_list(res, i):
    return [result.probabilities[0].reliability_estimate[i].text_description for result in res.results]

def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return the Pearson correlation coefficient between the prediction error
    and each of the used reliability estimates, along with the p-value of
    each coefficient.
    """
    prediction_error = get_prediction_error_list(res)
    results = []
    for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        try:
            if signed_or_absolute == SIGNED:
                r, p = statc.pearsonr(prediction_error, reliability_estimate)
            else:
                r, p = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
        except Exception:
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results

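# Example (a sketch, not part of the original module): correlating reliability
# estimates with prediction errors on a regression data set. It assumes the
# "housing" data set bundled with Orange; Learner and Mahalanobis are the
# classes defined further down in this module.
#
#   import Orange
#   data = Orange.data.Table("housing")
#   knn = Orange.classification.knn.kNNLearner()
#   reliability = Learner(knn, estimators=[Mahalanobis()])
#   res = Orange.evaluation.testing.cross_validation([reliability], data)
#   for r, p, signed_or_absolute, method in get_pearson_r(res):
#       print METHOD_NAME[method], r, p
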
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return the Spearman correlation coefficient between the prediction error
    and each of the used reliability estimates, along with the p-value of
    each coefficient.
    """
    prediction_error = get_prediction_error_list(res)
    results = []
    for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
        reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        try:
            if signed_or_absolute == SIGNED:
                r, p = statc.spearmanr(prediction_error, reliability_estimate)
            else:
                r, p = statc.spearmanr([abs(pe) for pe in prediction_error], reliability_estimate)
        except Exception:
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results

def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return the Pearson correlation coefficient between the prediction error
    and each of the used estimates, averaged over all folds.
    """
    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    number_of_instances = len(res.results)
    number_of_folds = len(results_by_fold)
    results = [0 for _ in xrange(number_of_estimates)]
    sig = [0 for _ in xrange(number_of_estimates)]
    method_list = [0 for _ in xrange(number_of_estimates)]

    for fold_res in results_by_fold:
        prediction_error = get_prediction_error_list(fold_res)
        for i in xrange(number_of_estimates):
            reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(fold_res, i)
            try:
                if signed_or_absolute == SIGNED:
                    r, _ = statc.pearsonr(prediction_error, reliability_estimate)
                else:
                    r, _ = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
            except Exception:
                r = float("NaN")
            results[i] += r
            sig[i] = signed_or_absolute
            method_list[i] = method

    # Average the correlations over folds and compute their p-values
    results = [float(r_sum) / number_of_folds for r_sum in results]
    ps = [p_value_from_r(r, number_of_instances) for r in results]

    return zip(results, ps, sig, method_list)

def p_value_from_r(r, n):
    """
    Calculate the two-tailed p-value from the Pearson correlation coefficient
    and the sample size.
    """
    df = n - 2
    t = r * (df / ((-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30))) ** 0.5
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))

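# Worked example (values rounded): for r = 0.5 and n = 30 instances, df = 28
# and t = 0.5 * sqrt(28 / (1 - 0.25)) ~= 3.06, which gives a two-tailed p of
# roughly 0.005 -- a strong indication that the estimate tracks the error.
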

# Distances between two discrete probability distributions

def normalize_both(p, q):
    """Normalize both distributions in place and return them."""
    if not p.normalized:
        p.normalize()
    if not q.normalized:
        q.normalize()
    return p, q

def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order m between two distributions."""
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        dist += abs(p[i] - q[i]) ** m
    return dist ** (1. / m)

def manhattan_distance(p, q):
    return minkowsky_dist(p, q, m=1)

def euclidean_dist(p, q):
    return minkowsky_dist(p, q, m=2)

def variance_dist(p, q):
    return euclidean_dist(p, q) ** 2

def max_dist(p, q):
    """Chebyshev distance: the largest absolute componentwise difference."""
    p, q = normalize_both(p, q)
    return max([abs(p[i] - q[i]) for i in range(len(p))])

def hellinger_dist(p, q):
    """Squared Hellinger-type distance (without the usual 1/2 factor)."""
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        dist += (math.sqrt(p[i]) - math.sqrt(q[i])) ** 2
    return dist

def my_log(x):
    """x * log(x), with the convention 0 * log(0) = 0."""
    return 0 if x == 0 else x * math.log(x)

def kullback_leibler(p, q):
    """Kullback-Leibler divergence D(p || q); terms with p[i] == 0 contribute 0."""
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        if p[i] > 0 and q[i] > 0:
            dist += p[i] * math.log(p[i] / q[i])
    return dist

def cosine(p, q):
    """Cosine distance: 1 minus the cosine of the angle between p and q."""
    p, q = normalize_both(p, q)
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))


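# Example (sketch): comparing two discrete class distributions. It assumes
# that distributions built this way expose the normalize()/normalized
# interface used by normalize_both above.
#
#   p = Orange.statistics.distribution.Discrete([0.7, 0.3])
#   q = Orange.statistics.distribution.Discrete([0.5, 0.5])
#   print manhattan_distance(p, q)   # |0.7-0.5| + |0.3-0.5| = 0.4
#   print hellinger_dist(p, q)       # small positive value
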
class Estimate:
    """
    Reliability estimate. Contains attributes that describe the results of
    reliability estimation.

    .. attribute:: estimate

        A numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Determines whether the method used gives a signed or absolute result.
        Has a value of either :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        An integer ID of the reliability estimation method used.

    .. attribute:: method_name

        Name (string) of the reliability estimation method used.

    .. attribute:: icv_method

        An integer ID of the reliability estimation method that performed
        best, as determined by ICV, and whose estimate is stored in the
        :obj:`estimate` field. (-1 when ICV was not used.)

    .. attribute:: icv_method_name

        Name (string) of the reliability estimation method that performed
        best, as determined by ICV. (An empty string when ICV was not used.)

    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method=-1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        self.method_name = METHOD_NAME[method]
        self.icv_method = icv_method
        self.icv_method_name = METHOD_NAME[icv_method] if icv_method != -1 else ""
        self.text_description = None

class DescriptiveAnalysis:
    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66], name="da"):
        self.desc = desc
        self.procentage = procentage
        self.estimator = estimator
        self.name = name

    def __call__(self, instances, weight=None, **kwds):

        # Calculate borders using cross validation
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            sorted_estimates = sorted(abs(x) for x in estimates)
            borders = [sorted_estimates[int(len(estimates) * p) - 1] for p in self.procentage]
            all_borders.append(borders)

        # Learn on the whole training data
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)

class DescriptiveAnalysisClassifier:
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            estimate.text_description = self.desc[0]
            for lower_border, text_desc in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    estimate.text_description = text_desc

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        elif result_type == Orange.core.GetProbabilities:
            return probabilities
        else:
            return predicted, probabilities

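# Example (sketch): attaching textual descriptions to reliability estimates.
# The estimator passed to DescriptiveAnalysis is itself a reliability Learner
# (defined at the bottom of this module); data is any Orange Table.
#
#   knn = Orange.classification.knn.kNNLearner()
#   reliability = Learner(knn, estimators=[Mahalanobis()])
#   da = DescriptiveAnalysis(reliability)
#   classifier = da(data)
#   value, probs = classifier(data[0], Orange.core.GetBoth)
#   print [e.text_description for e in probs.reliability_estimate]
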
class SensitivityAnalysis:
    """

    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats

    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`

    To estimate the reliability of a prediction for a given instance,
    the learning set is extended with this instance, labeled with
    :math:`K + \epsilon (l_{max} - l_{min})`,
    where :math:`K` denotes the initial prediction,
    :math:`\epsilon` is the sensitivity parameter, and :math:`l_{min}` and
    :math:`l_{max}` denote the lower and upper bounds of the learning
    instances' labels. After computing sensitivity predictions
    for different values of :math:`\epsilon`, the predictions are combined
    into SAvar and SAbias. SAbias can be used in a signed or absolute form.

    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \\frac{\sum_{\epsilon \in E} ((K_{\epsilon} - K) + (K_{-\epsilon} - K))}{2 |E|}`

    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0], name="sa"):
        self.e = e
        self.name = name

    def __call__(self, instances, learner):
        min_value = max_value = instances[0].getclass().value
        for ex in instances:
            if ex.getclass().value > max_value:
                max_value = ex.getclass().value
            if ex.getclass().value < min_value:
                min_value = ex.getclass().value
        return SensitivityAnalysisClassifier(self.e, instances, min_value, max_value, learner)

class SensitivityAnalysisClassifier:
    def __init__(self, e, instances, min_value, max_value, learner):
        self.e = e
        self.instances = instances
        self.max_value = max_value
        self.min_value = min_value
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        # Copy the learning data and append the instance to be predicted
        r_data = Orange.data.Table(self.instances)
        modified_instance = Orange.data.Instance(instance)
        r_data.append(modified_instance)

        # Calculate SAvar & SAbias
        SAvar = SAbias = 0

        for eps in self.e:
            # +epsilon
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # -epsilon
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # Accumulate this epsilon's contribution to SAvar and SAbias
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]


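# Example (sketch): SAvar/SAbias for a single instance. Learner is the
# reliability wrapper defined at the bottom of this module; "housing" is
# assumed to be a regression data set available to Orange.
#
#   data = Orange.data.Table("housing")
#   rel = Learner(Orange.classification.knn.kNNLearner(),
#                 estimators=[SensitivityAnalysis()])
#   classifier = rel(data)
#   value, probs = classifier(data[0], Orange.core.GetBoth)
#   for e in probs.reliability_estimate:
#       print e.method_name, e.estimate
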
class ReferenceExpectedError:
    """

    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`

    Reference reliability estimation method for classification, as used in
    "Evaluating Reliability of Single Classifications of Neural Networks",
    Darko Pevec, 2011.

    :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`

    where :math:`\hat y` is the estimated probability of the predicted class.

    Note that for this method, in contrast with all others, a greater estimate
    means lower reliability (greater expected error).

    """
    def __init__(self, name="reference"):
        self.name = name

    def __call__(self, instances, learner):
        classifier = learner(instances)
        return ReferenceExpectedErrorClassifier(classifier)


class ReferenceExpectedErrorClassifier:

    def __init__(self, classifier):
        self.classifier = classifier

    def __call__(self, instance, *args):
        # The highest class probability belongs to the predicted class
        y_hat = max(self.classifier(instance, Orange.classification.Classifier.GetProbabilities))
        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]


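# Worked example: if the classifier assigns probability 0.8 to the predicted
# class, the reference expected error is 2 * 0.8 * (1 - 0.8) = 0.32. The
# estimate peaks at 0.5 for a maximally uncertain prediction (y_hat = 0.5)
# and approaches 0 as the prediction becomes confident.
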
class BaggingVariance:
    """

    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`

    :math:`m` different bagging models are constructed and used to estimate
    the value of the dependent variable for a given instance. In regression,
    the variance of those predictions is used as the prediction reliability
    estimate.

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
    predictions of the individual models. Note that a greater value
    implies greater error.

    For classification, 1 minus the average Euclidean distance between the
    class probability distribution predicted by the model and the
    distributions predicted by the individual bagged models is used as the
    BAGV reliability measure. Note that in this case a greater value implies
    a better prediction.

    This reliability measure can quickly run out of memory if the individual
    classifiers use a lot of memory, as it builds :math:`m` of them and thus
    uses :math:`m` times the memory of a single classifier. If the instances
    whose reliability will be measured are given in advance (parameter
    ``for_instances``), only their reliabilities can be computed, and the
    bagged models are discarded as they are used, which needs less memory.

    """
    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
        """
        :param for_instances: if given, a table of instances for which
            reliability will later be requested; bagged models are then
            discarded right after predicting them.
        """
        self.m = m
        self.name = name
        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
        self.for_instances = for_instances

    def __call__(self, instances, learner):
        classifiers = []

        # For classification, keep a reference classifier trained on all data
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        for_inst_class = defaultdict(list)
        this_iteration = None

        if self.for_instances:
            his = map(_hashable_instance, self.for_instances)

        # Create bagged classifiers using sampling with replacement
        for i in xrange(self.m):
            this_iteration = set()
            selection = self.select_with_repeat(len(instances))
            data = instances.select(selection)
            cl = learner(data)
            if cl:
                if self.for_instances:  # predict the given instances and discard cl
                    for instance, hi in zip(self.for_instances, his):
                        if hi not in this_iteration:
                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
                            this_iteration.add(hi)
                else:
                    classifiers.append(cl)

        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))

class BaggingVarianceClassifier:
    def __init__(self, classifiers, classifier=None, for_inst_class=None):
        self.classifiers = classifiers
        self.classifier = classifier
        self.for_inst_class = for_inst_class

    def __call__(self, instance, *args):
        # Calculate the bagging variance
        if self.for_inst_class:
            bagged_values = self.for_inst_class[_hashable_instance(instance)]
        else:
            bagged_values = [_bagged_value(instance, c, self.classifier) for c in self.classifiers]

        k = sum(bagged_values) / len(bagged_values)

        BAGV = sum((bagged_value - k) ** 2 for bagged_value in bagged_values) / len(bagged_values)
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]

def _hashable_instance(instance):
    return tuple(instance[i].value for i in range(len(instance.domain.attributes)))

def _bagged_value(instance, c, classifier):
    if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
        return c(instance, Orange.core.GetValue).value
    elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
        # Distance between the bagged model's distribution and the reference one
        estimate = classifier(instance, Orange.core.GetProbabilities)
        return euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate)


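# Example (sketch): BAGV with a reduced memory footprint. Passing the test
# instances via for_instances makes each bagged model predict them and then
# be discarded, instead of keeping all m models alive. The split below uses
# SubsetIndices2 in the same direct-call style as SubsetIndicesCV elsewhere
# in this module.
#
#   indices = Orange.data.sample.SubsetIndices2(data, 0.7)
#   train, test = data.select(indices, 0), data.select(indices, 1)
#   bagv = BaggingVariance(m=20, for_instances=test)
#   rel = Learner(Orange.classification.knn.kNNLearner(), estimators=[bagv])
#   classifier = rel(train)
#   print classifier(test[0], Orange.core.GetProbabilities).reliability_estimate[0].estimate
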
class LocalCrossValidation:
    """

    :param k: Number of nearest neighbours used in LCV estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification reliability estimation,
        use an average distance between distributions, weighted by :math:`e^{-d}`,
        where :math:`d` is the distance between the predicted instance and the
        neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    The :math:`k` nearest neighbours of the given instance are found and put
    in a separate data set. On this data set, leave-one-out validation is
    performed. The reliability estimate for regression is then the
    distance-weighted absolute prediction error. In classification, it is
    1 minus the average distance between the predicted class probability
    distribution and the (trivial) probability distributions of the nearest
    neighbours.

    If the special value 0 is passed as :math:`k` (as it is by default),
    :math:`k` is set to 1/20 of the data set size (or 5, whichever is greater).

    Summary of the algorithm for regression:

    1. Determine the set of k nearest neighbours :math:`N = \{ (x_1, c_1),...,
       (x_k, c_k) \}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # Use a local variable so repeated calls do not overwrite self.k
        k = self.k
        if k == 0:
            k = max(5, len(instances) / 20)

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)

class LocalCrossValidationClassifier:
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        self.distance_id = distance_id
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        for a, b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        LCVer = 0
        LCVdi = 0

        # Find k nearest neighbours
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Leave-one-out prediction errors on the neighbourhood
        for i in xrange(len(knn)):
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE)]

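# Example (sketch): local cross-validation with an automatic neighbourhood
# size. With k=0 (the default) the estimator uses max(5, |data| / 20)
# neighbours.
#
#   rel = Learner(Orange.classification.knn.kNNLearner(),
#                 estimators=[LocalCrossValidation(k=0)])
#   classifier = rel(data)
#   print classifier(data[0], Orange.core.GetProbabilities).reliability_estimate[0].estimate
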
class CNeighbours:
    """

    :param k: Number of nearest neighbours used in CNK estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`

    For regression, CNK is defined for an unlabeled instance as the difference
    between the average label of its nearest neighbours and its prediction.
    CNK can be used as a signed or absolute estimate.

    :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`

    where :math:`k` denotes the number of neighbours, :math:`C_i` denotes the
    neighbours' labels and :math:`K` denotes the instance's prediction. Note
    that a greater value implies greater prediction error.

    For classification, CNK is equal to 1 minus the average distance between
    the predicted class distribution and the (trivial) class distributions of
    the :math:`k` nearest neighbours from the learning set. Note that in this
    case a greater value implies better prediction.

    """
    def __init__(self, k=5, distance=hellinger_dist, name="cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
        return CNeighboursClassifier(nearest_neighbours, self.k, distance=self.distance)

class CNeighboursClassifier:
    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        CNK = 0

        # Find k nearest neighbours
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Difference between the average neighbour label and the prediction
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # 1 minus the average distance to the neighbours' distributions
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(Orange.data.Table(knn))
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]

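# Worked example (hypothetical numbers): with k=5 neighbours labelled
# 20, 21, 22, 20, 21 (average 20.8) and a prediction of 18.5, the signed
# estimate is CNK = 20.8 - 18.5 = 2.3 and the absolute estimate is 2.3;
# the model appears to underestimate in this region.
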
class Mahalanobis:
    """

    :param k: Number of nearest neighbours used in Mahalanobis estimate.
    :type k: int

    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`

    The Mahalanobis reliability estimate is defined as the sum of
    `Mahalanobis distances <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    to the evaluated instance's :math:`k` nearest neighbours.

    """
    def __init__(self, k=3, name="mahalanobis"):
        self.k = k
        self.name = name

    def __call__(self, instances, *args):
        nnm = Orange.classification.knn.FindNearestConstructor()
        nnm.distanceConstructor = Orange.distance.Mahalanobis()

        mid = Orange.feature.Descriptor.new_meta_id()
        nnm = nnm(instances, 0, mid)
        return MahalanobisClassifier(self.k, nnm, mid)

class MahalanobisClassifier:
    def __init__(self, k, nnm, mid):
        self.k = k
        self.nnm = nnm
        self.mid = mid

    def __call__(self, instance, *args):
        # Sum of Mahalanobis distances to the k nearest neighbours
        mahalanobis_distance = sum(ex[self.mid].value for ex in self.nnm(instance, self.k))

        return [Estimate(mahalanobis_distance, ABSOLUTE, MAHAL_ABSOLUTE)]

class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`

    The Mahalanobis-distance-to-center reliability estimate is defined as the
    `Mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.

    """
    def __init__(self, name="mahalanobis to center"):
        self.name = name

    def __call__(self, instances, *args):
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = dc(instances)
        new_instances = instances.translate(new_domain)

        X, _, _ = new_instances.to_numpy()
        instance_avg = numpy.average(X, 0)

        distance_constructor = Orange.distance.Mahalanobis()
        distance = distance_constructor(new_instances)

        average_instance = Orange.data.Instance(new_instances.domain, list(instance_avg) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)

class MahalanobisToCenterClassifier:
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance
        self.average_instance = average_instance
        self.new_domain = new_domain

    def __call__(self, instance, *args):
        inst = Orange.data.Instance(self.new_domain, instance)

        mahalanobis_to_center = self.distance(inst, self.average_instance)

        return [Estimate(mahalanobis_to_center, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE)]


class BaggingVarianceCNeighbours:
    """

    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`

    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`

    BVCK is a combination (average) of bagging variance and local modeling of
    prediction error.

    """
    def __init__(self, bagv=None, cnk=None, name="bvck"):
        if bagv is None:
            bagv = BaggingVariance()
        if cnk is None:
            cnk = CNeighbours()
        self.bagv = bagv
        self.cnk = cnk
        self.name = name

    def __call__(self, instances, learner):
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)

class BaggingVarianceCNeighboursClassifier:
    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # Average BAGV with the absolute CNK estimate
        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        bvck_estimates = [Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE)]
        bvck_estimates.extend(bagv_estimates)
        bvck_estimates.extend(cnk_estimates)
        return bvck_estimates

class ErrorPredicting:
    def __init__(self, name="ep"):
        self.name = name

    def __call__(self, instances, learner):
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Repackage the data with the cross-validated prediction error as class
        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        new_dataset = Orange.data.Table(new_domain, instances)

        for instance, prediction_error in izip(new_dataset, prediction_errors):
            instance.set_class(prediction_error)

        # Train a random forest to predict the error of new instances
        rf = Orange.ensemble.forest.RandomForestLearner()
        rf_classifier = rf(new_dataset)

        return ErrorPredictingClassification(rf_classifier, new_domain)

class ErrorPredictingClassification:
    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier
        self.new_domain = new_domain

    def __call__(self, instance, predicted, probabilities):
        new_instance = Orange.data.Instance(self.new_domain, instance)
        value = self.rf_classifier(new_instance, Orange.core.GetValue)

        # Note: reuses the SAbias signed method ID for the predicted error
        return [Estimate(value.value, SIGNED, SABIAS_SIGNED)]

def gauss_kernel(x, sigma=1):
    """Gaussian kernel with standard deviation sigma."""
    return 1. / (sigma * math.sqrt(2 * math.pi)) * math.exp(-1. / 2 * (x / sigma) ** 2)

class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Returns a value that estimates the density of the problem space around the
    instance being predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances, learner):
        self.distance = self.d_measure(instances)

        def density(x):
            l, dens = len(instances), 0
            for ex in instances:
                dens += self.K(self.distance(x, ex))
            return dens / l

        max_density = max([density(ex) for ex in instances])

        return ParzenWindowDensityBasedClassifier(density, max_density)

class ParzenWindowDensityBasedClassifier:

    def __init__(self, density, max_density):
        self.density = density
        self.max_density = max_density

    def __call__(self, instance, *args):
        # Sparser regions (lower density) get larger estimates
        DENS = self.max_density - self.density(instance)

        return [Estimate(DENS, ABSOLUTE, DENS_ABSOLUTE)]

class Stacking:
    """
    Stacking of reliability estimates: the chosen estimators are
    cross-validated on the training data, and ``stack_learner`` is trained
    to predict the absolute prediction error from their (absolute) estimates.
    """

    def __init__(self, stack_learner, estimators=None, folds=10, save_data=False):
        self.stack_learner = stack_learner
        self.estimators = estimators
        self.folds = folds
        self.save_data = save_data
        if self.estimators is None:
            self.estimators = [SensitivityAnalysis(),
                               LocalCrossValidation(),
                               BaggingVarianceCNeighbours(),
                               Mahalanobis(),
                               MahalanobisToCenter()]

    def __call__(self, data, learner):

        newfeatures = None

        if self.folds > 1:

            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
            data_cv = [None] * len(data)
            for f in set(cvi):  # for each fold
                learn = data.select(cvi, f, negate=True)
                test = data.select(cvi, f)

                # learn reliability estimates on the learning set
                lf = Learner(learner, estimators=self.estimators)(learn)

                # pos is used to retain the order of instances
                for ex, pos in zip(test, [i for i, n in enumerate(cvi) if n == f]):
                    pred = lf(ex, Orange.core.GetBoth)
                    re = pred[1].reliability_estimate
                    names = [e.method_name for e in re]
                    assert newfeatures is None or names == newfeatures
                    newfeatures = names
                    estimates = [abs(e.estimate) for e in re]
                    error = ex[-1].value - pred[0].value
                    data_cv[pos] = estimates + [abs(error)]

        else:

            # use half of the data to learn reliability estimates
            # and the other half for induction of the stacking classifier
            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
            data_cv = []

            learn = data.select(cvi, 0, negate=True)
            test = data.select(cvi, 0)

            # learn reliability estimates on the learning set
            lf = Learner(learner, estimators=self.estimators)(learn)

            for ex in test:
                pred = lf(ex, Orange.core.GetBoth)
                re = pred[1].reliability_estimate
                names = [e.method_name for e in re]
                assert newfeatures is None or names == newfeatures
                newfeatures = names
                estimates = [abs(e.estimate) for e in re]
                error = ex[-1].value - pred[0].value
                data_cv.append(estimates + [abs(error)])

        lf = None  # release the fold classifier

        # induce the stacking classifier on cross-validated reliability estimates
        newfeatures = [Orange.feature.Continuous(name=n) for n in newfeatures]
        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
        classifier_data = Orange.data.Table(newdomain, data_cv)
        stack_classifier = self.stack_learner(classifier_data)

        # induce reliability estimates on the whole data set
        lf = Learner(learner, estimators=self.estimators)(data)

        if self.save_data:
            self.classifier_data = classifier_data

        return StackingClassifier(stack_classifier, lf, newdomain)


class StackingClassifier:

    def __init__(self, stacking_classifier, reliability_classifier, domain):
        self.stacking_classifier = stacking_classifier
        self.domain = domain
        self.reliability_classifier = reliability_classifier

    def convert(self, instance):
        """ Return the instance in the space of reliability estimates. """
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        # take absolute values of all estimates
        tex = [abs(e.estimate) for e in re] + ["?"]
        tex = Orange.data.Instance(self.domain, tex)
        return tex

    def __call__(self, instance, *args):
        tex = self.convert(instance)
        r = self.stacking_classifier(tex)
        r = float(r)
        r = max(0., r)
        return [Estimate(r, ABSOLUTE, STACKING)]

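# Example (sketch): stacking the default estimators into a single meta
# estimate of the absolute error. The stack learner shown is a placeholder;
# any regression learner can play that role.
#
#   stack = Stacking(Orange.regression.linear.LinearRegressionLearner())
#   rel = Learner(Orange.classification.knn.kNNLearner(), estimators=[stack])
#   classifier = rel(data)
#   print classifier(data[0], Orange.core.GetProbabilities).reliability_estimate[0].estimate
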
class ICV:
    """ Perform internal cross-validation (as in "Automatic selection of
    reliability estimates for individual regression predictions",
    Zoran Bosnic, 2010) and return the id of the method
    that scored best on this data.
    """

    def __init__(self, estimators=None, folds=10):
        self.estimators = estimators
        if self.estimators is None:
            self.estimators = [SensitivityAnalysis(),
                               LocalCrossValidation(),
                               BaggingVarianceCNeighbours(),
                               Mahalanobis(),
                               MahalanobisToCenter()]
        self.folds = folds

    def __call__(self, data, learner):

        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
        sum_of_rs = defaultdict(float)
        n_rs = defaultdict(int)

        elearner = Learner(learner, estimators=self.estimators)

        # average the correlations over the folds
        for f in set(cvi):
            learn = data.select(cvi, f, negate=True)
            test = data.select(cvi, f)

            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
            results = get_pearson_r(res)

            for r, p, sa, method in results:
                if not math.isnan(r):  # ignore NaN values
                    sum_of_rs[(method, sa)] += r
                    n_rs[(method, sa)] += 1

        avg_rs = [(k, sum_of_rs[k] / n_rs[k]) for k in sum_of_rs]

        avg_rs = sorted(avg_rs, key=lambda estimate: estimate[1], reverse=True)
        chosen = avg_rs[0][0]

        lf = elearner(data)
        return ICVClassifier(chosen, lf)


class ICVClassifier:

    def __init__(self, chosen, reliability_classifier):
        self.chosen = chosen
        self.reliability_classifier = reliability_classifier

    def __call__(self, instance, *args):
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        for e in re:
            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
                r = e.estimate

        # record which method won so Estimate.icv_method(_name) is filled in
        return [Estimate(r, self.chosen[1], ICV_METHOD, icv_method=self.chosen[0])]

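# Example (sketch): letting internal cross-validation pick the estimator.
# The returned estimate records which method won via icv_method_name.
#
#   rel = Learner(Orange.classification.knn.kNNLearner(), estimators=[ICV()])
#   classifier = rel(data)
#   e = classifier(data[0], Orange.core.GetProbabilities).reliability_estimate[0]
#   print e.estimate, e.icv_method_name
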
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner behaves like any other learner,
    but returns the classifier wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.

    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`

    :param estimators: List of different reliability estimation methods we
                       want to use on the chosen learner.
    :type estimators: :obj:`list` of reliability estimators

    :param name: Name of this reliability learner
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        self.estimators = estimators
        if self.estimators is None:
            self.estimators = [SensitivityAnalysis(),
                               LocalCrossValidation(),
                               BaggingVarianceCNeighbours(),
                               Mahalanobis(),
                               MahalanobisToCenter()]

        self.box_learner = box_learner
        self.blending = False

    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.

        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """

        blending_classifier = None
        new_domain = None

#        if instances.domain.class_var.var_type != Orange.feature.Descriptor.Continuous:
#            raise Exception("This method only works on data with continuous class.")

        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)

class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is a list of
    :class:`~Orange.evaluation.reliability.Estimate` instances.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Train the learner with original data
        self.classifier = box_learner(instances)

        # Train all the estimators and create their classifiers
        self.estimation_classifiers = [estimator(instances, box_learner) for estimator in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate the reliability of the prediction for a new
        instance. When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is a list of
        :class:`~Orange.evaluation.reliability.Estimate` instances,
        is added to the distribution object.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Create a placeholder for estimates
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Calculate all the estimates and add them to the results
        for estimate in self.estimation_classifiers:
            probabilities.reliability_estimate.extend(estimate(instance, predicted, probabilities))

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        elif result_type == Orange.core.GetProbabilities:
            return probabilities
        else:
            return predicted, probabilities

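# Example (sketch): end-to-end use of the reliability wrapper with the
# default estimator set. "housing" is assumed to be a regression data set
# bundled with Orange.
#
#   import Orange
#   data = Orange.data.Table("housing")
#   knn = Orange.classification.knn.kNNLearner()
#   reliability = Learner(knn)
#   classifier = reliability(data)
#   value, probs = classifier(data[0], Orange.core.GetBoth)
#   for estimate in probs.reliability_estimate:
#       print estimate.method_name, estimate.estimate
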
# Functions for testing and plotting
def get_acc_rel(method, data, learner):
    """Cross-validate a single estimator and collect (reliability, accuracy) pairs."""
    estimators = [method]
    reliability = Orange.evaluation.reliability.Learner(learner, estimators=estimators)
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels, acc = [], []

    for res in results.results:
        rels.append(res.probabilities[0].reliability_estimate[0].estimate)
        acc.append(res.probabilities[0][res.actual_class])

    return rels, acc


def rel_acc_plot(rels, acc, file_name=None, colors=None):
    """Scatter-plot accuracy against reliability; show it or save to file_name."""
    import matplotlib.pylab as plt

    if colors is None:
        colors = "k"
    plt.scatter(rels, acc, c=colors)
    plt.xlim(0., 1.)
    plt.ylim(ymin=0.)
    plt.xlabel("Reliability")
    plt.ylabel("Accuracy")
    if file_name is None:
        plt.show()
    else:
        plt.savefig(file_name)

def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
    """Compute (reliability, accuracy) pairs for a method and plot them."""
    import matplotlib.pylab as plt

    plt.clf()

    rels, acc = get_acc_rel(method, data, learner)
    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)


def acc_rel_correlation(method, data, learner):
    """Spearman correlation between accuracy and reliability of a single estimator."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    return scipy.stats.spearmanr(acc, rels)[0]