source: orange-reliability/_reliability/__init__.py @ 5:f50b356a019c

Revision 5:f50b356a019c, 37.8 KB checked in by Matija Polajnar <matija.polajnar@…>, 23 months ago (diff)

Merge in Lan Umek's implementations of reliability estimation for classification.

Line 
1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
12# Labels and final variables
13labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
14
15"""
16# All the estimators calculation constants
17DO_SA = 0
18DO_BAGV = 1
19DO_CNK = 2
20DO_LCV = 3
21DO_BVCK = 4
22DO_MAHAL = 5
23"""
24
25# All the estimator method constants
26SAVAR_ABSOLUTE = 0
27SABIAS_SIGNED = 1
28SABIAS_ABSOLUTE = 2
29BAGV_ABSOLUTE = 3
30CNK_SIGNED = 4
31CNK_ABSOLUTE = 5
32LCV_ABSOLUTE = 6
33BVCK_ABSOLUTE = 7
34MAHAL_ABSOLUTE = 8
35BLENDING_ABSOLUTE = 9
36ICV_METHOD = 10
37MAHAL_TO_CENTER_ABSOLUTE = 13
38DENS_ABSOLUTE = 14
39
40# Type of estimator constant
41SIGNED = 0
42ABSOLUTE = 1
43
44# Names of all the estimator methods
45METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
46               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
47               6: "LCV absolute", 7: "BVCK_absolute", 8: "Mahalanobis absolute",
48               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
49               13: "Mahalanobis to center", 14: "Density based"}
50
51select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
52select_with_repeat.random_generator = Orange.misc.Random()
53
def get_reliability_estimation_list(res, i):
    """Collect the i-th reliability estimate of every tested instance.

    Returns a 3-tuple ``(estimates, signed_or_absolute, method)`` where the
    flag and the method id are taken from the first result.
    """
    estimates = [r.probabilities[0].reliability_estimate[i].estimate
                 for r in res.results]
    first = res.results[0].probabilities[0].reliability_estimate[i]
    return estimates, first.signed_or_absolute, first.method
56
def get_prediction_error_list(res):
    """Signed prediction error (actual - predicted) for each tested instance."""
    return [r.actual_class - r.classes[0] for r in res.results]
59
def get_description_list(res, i):
    """Textual description of the i-th reliability estimate, per instance."""
    return [r.probabilities[0].reliability_estimate[i].text_description
            for r in res.results]
62
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of
    the used reliability estimates, together with the p-value of each
    coefficient.
    """
    errors = get_prediction_error_list(res)
    # Absolute estimators correlate against the magnitude of the error.
    abs_errors = [abs(e) for e in errors]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for i in xrange(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        target = errors if signed_or_absolute == SIGNED else abs_errors
        try:
            r, p = statc.pearsonr(target, estimates)
        except Exception:
            # Degenerate input (e.g. constant estimates) -- report NaN.
            r = p = float("NaN")
        scores.append((r, p, signed_or_absolute, method))
    return scores
86
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of
    the used reliability estimates, together with the p-value of each
    coefficient.
    """
    errors = get_prediction_error_list(res)
    # Absolute estimators correlate against the magnitude of the error.
    abs_errors = [abs(e) for e in errors]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for i in xrange(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        target = errors if signed_or_absolute == SIGNED else abs_errors
        try:
            r, p = statc.spearmanr(target, estimates)
        except Exception:
            # Degenerate input (e.g. constant estimates) -- report NaN.
            r = p = float("NaN")
        scores.append((r, p, signed_or_absolute, method))
    return scores
110
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return the Pearson coefficient between prediction error and each of the
    used estimates, averaged over all folds.
    """
    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    number_of_instances = len(res.results)
    number_of_folds = len(results_by_fold)

    r_sums = [0] * number_of_estimates
    sig = [0] * number_of_estimates
    method_list = [0] * number_of_estimates

    # Accumulate per-fold correlation coefficients.
    for fold_res in results_by_fold:
        prediction_error = get_prediction_error_list(fold_res)
        abs_error = [abs(pe) for pe in prediction_error]
        for i in xrange(number_of_estimates):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(fold_res, i)
            try:
                if signed_or_absolute == SIGNED:
                    r, _ = statc.pearsonr(prediction_error, estimates)
                else:
                    r, _ = statc.pearsonr(abs_error, estimates)
            except Exception:
                r = float("NaN")
            r_sums[i] += r
            sig[i] = signed_or_absolute
            method_list[i] = method

    # Average over folds and derive p-values from the averaged r.
    avg_rs = [float(r_sum) / number_of_folds for r_sum in r_sums]
    ps = [p_value_from_r(r, number_of_instances) for r in avg_rs]

    return zip(avg_rs, ps, sig, method_list)
148
def p_value_from_r(r, n):
    """
    Compute the p-value for a Pearson correlation coefficient ``r``
    obtained from a sample of size ``n`` (via Student's t and the
    incomplete beta function).
    """
    df = n - 2
    # The 1e-30 terms guard against division by zero when |r| == 1.
    denominator = (-r + 1.0 + 1e-30) * (r + 1.0 + 1e-30)
    t = r * (df / denominator) ** 0.5
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))
156
157
158# Distances between two discrete probability distributions
159#TODO Document those.
def normalize_both(p, q):
    """Normalize the two distributions in place (when needed) and return them."""
    for distribution in (p, q):
        if not distribution.normalized:
            distribution.normalize()
    return p, q
166
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order ``m`` between discrete distributions."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)
173
def manhattan_distance(p, q):
    """Manhattan (L1) distance between discrete distributions."""
    return minkowsky_dist(p, q, m=1)
176
def euclidean_dist(p, q):
    """Euclidean (L2) distance between discrete distributions."""
    return minkowsky_dist(p, q, m=2)
179
def variance_dist(p, q):
    """Squared Euclidean distance between discrete distributions."""
    d = euclidean_dist(p, q)
    return d * d
182
def max_dist(p, q):
    """Chebyshev (maximum per-component) distance between distributions."""
    p, q = normalize_both(p, q)
    return max(abs(p[i] - q[i]) for i in range(len(p)))
186
def hellinger_dist(p, q):
    """Hellinger-style distance between discrete distributions.

    NOTE(review): this returns the raw sum of squared sqrt-differences,
    without the final square root (or 1/2 factor) of the textbook Hellinger
    distance -- kept as-is because other estimators rely on this scale.
    """
    p, q = normalize_both(p, q)
    return sum((math.sqrt(p[i]) - math.sqrt(q[i])) ** 2 for i in range(len(p)))
193
def my_log(x):
    """Return x*log(x), using the continuous-limit convention 0*log(0) = 0."""
    if x == 0:
        return 0
    return x * math.log(x)
196
def kullback_leibler(p, q):
    """Kullback-Leibler divergence D(p || q) between discrete distributions.

    Computes sum_i p_i * log(p_i / q_i). Terms with p_i == 0 contribute 0
    (the 0*log 0 convention); terms with q_i == 0 and p_i > 0 are skipped
    as well so the result stays finite.

    Note: the original implementation accumulated my_log(p[i] - q[i]),
    i.e. (p-q)*log(p-q), which raises a math domain error whenever
    p[i] < q[i] and is not the KL divergence; this version computes the
    standard definition.
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        if p[i] > 0 and q[i] > 0:
            dist += p[i] * math.log(p[i] / q[i])
    return dist
203
def cosine(p, q):
    """Cosine distance (1 - cosine similarity) between discrete distributions.

    Fixes the original implementation, which computed ``numpy.dot(x, y)``
    with undefined names ``x`` and ``y`` (a NameError at runtime); the dot
    product must be taken over ``p`` and ``q`` themselves.
    """
    p, q = normalize_both(p, q)
    # Materialize the distributions as plain lists for numpy.
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
208
209
class Estimate:
    """
    A single reliability estimate produced for one prediction.

    .. attribute:: estimate

        The numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Whether the estimating method yields a signed or an absolute value;
        one of :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        Integer ID of the reliability estimation method used.

    .. attribute:: method_name

        Human-readable name of the method (looked up in METHOD_NAME).

    .. attribute:: icv_method

        Integer ID of the method chosen as best by internal cross
        validation, whose value is stored in :obj:`estimate`.
        (:obj:`None`/-1 when ICV was not used.)

    .. attribute:: icv_method_name

        Name of the ICV-selected method; empty string when ICV was not used.
    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method= -1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        self.method_name = METHOD_NAME[method]
        self.icv_method = icv_method
        if icv_method != -1:
            self.icv_method_name = METHOD_NAME[icv_method]
        else:
            self.icv_method_name = ""
        # Filled in later by DescriptiveAnalysisClassifier, if used.
        self.text_description = None
252
class DescriptiveAnalysis:
    """Wrap a reliability estimator so its numeric estimates get mapped to
    textual labels (e.g. "high"/"medium"/"low") via percentile borders
    learned by cross validation."""
    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66]):
        self.desc = desc
        self.procentage = procentage
        self.estimator = estimator

    def __call__(self, instances, weight=None, **kwds):
        # Derive per-estimator borders from a cross-validation run.
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
        all_borders = []
        for i in xrange(n_estimates):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            ordered = sorted(abs(x) for x in estimates)
            all_borders.append([ordered[int(len(estimates) * p) - 1] for p in self.procentage])

        # Train the wrapped estimator on the complete training data.
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
274
class DescriptiveAnalysisClassifier:
    """Classifier companion of DescriptiveAnalysis: predicts with the wrapped
    classifier and attaches a textual description to every estimate."""
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # Pick the label of the highest border the estimate still exceeds.
        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            estimate.text_description = self.desc[0]
            for lower_border, text_desc in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    estimate.text_description = text_desc

        # Return the result shape the caller asked for.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
297
class SensitivityAnalysis:
    """
    :param e: List of possible :math:`\\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats

    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`

    The learning set is extended with the instance to be predicted, labeled
    with :math:`K + \\epsilon (l_{max} - l_{min})`, where :math:`K` is the
    initial prediction, :math:`\\epsilon` a sensitivity parameter, and
    :math:`l_{min}`, :math:`l_{max}` the bounds of the learning instances'
    labels. Predictions obtained for the different :math:`\\epsilon` values
    are combined into SAvar and SAbias; SAbias exists in signed and
    absolute form.

    :math:`SAvar = \\frac{\\sum_{\\epsilon \\in E}(K_{\\epsilon} - K_{-\\epsilon})}{|E|}`

    :math:`SAbias = \\frac{\\sum_{\\epsilon \\in E} (K_{\\epsilon} - K ) + (K_{-\\epsilon} - K)}{2 |E|}`
    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0]):
        self.e = e

    def __call__(self, instances, learner):
        # The label span is needed to scale the epsilon perturbations.
        values = [ex.getclass().value for ex in instances]
        return SensitivityAnalysisClassifier(self.e, instances,
                                             min(values), max(values), learner)
334
class SensitivityAnalysisClassifier:
    """Computes SAvar/SAbias by retraining on data extended with the
    predicted instance, relabeled with perturbed predictions."""
    def __init__(self, e, instances, min_value, max_value, learner):
        self.e = e
        self.instances = instances
        self.max_value = max_value
        self.min_value = min_value
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        # Work on a copy of the training data, extended with the instance.
        extended = Orange.data.Table(self.instances)
        extended.append(Orange.data.Instance(instance))

        label_span = self.max_value - self.min_value
        SAvar = SAbias = 0

        for eps in self.e:
            # Relabel with K + eps*(lmax - lmin), retrain, repredict.
            extended[-1].setclass(predicted.value + eps * label_span)
            k_plus = self.learner(extended)(instance, Orange.core.GetValue)

            # Relabel with K - eps*(lmax - lmin), retrain, repredict.
            extended[-1].setclass(predicted.value - eps * label_span)
            k_minus = self.learner(extended)(instance, Orange.core.GetValue)

            # Accumulate the SAvar and SAbias contributions of this eps.
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
380
class BaggingVariance:
    """
    :param m: Number of bagged models used for the BAGV estimate.
    :type m: int

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`

    Builds :math:`m` models on bootstrap samples. For regression the
    variance of their predictions is the reliability estimate:

    :math:`BAGV = \\frac{1}{m} \\sum_{i=1}^{m} (K_i - K)^2`

    with :math:`K = \\frac{\\sum_{i=1}^{m} K_i}{m}` the mean of the
    individual predictions :math:`K_i`; greater values mean greater error.

    For classification, 1 minus the average Euclidean distance between the
    class distribution predicted by the full model and the distributions
    predicted by the bagged models is used; there a greater value means a
    better prediction.
    """
    def __init__(self, m=50):
        self.m = m

    def __call__(self, instances, learner):
        # Classification additionally needs a model trained on all data,
        # against whose distribution the bagged ones are compared.
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        # m models, each trained on a bootstrap sample (with replacement).
        classifiers = []
        for _ in xrange(self.m):
            selection = select_with_repeat(len(instances))
            classifiers.append(learner(instances.select(selection)))

        return BaggingVarianceClassifier(classifiers, classifier)
424
class BaggingVarianceClassifier:
    """Computes the BAGV estimate from a list of bagged classifiers.

    Note: the original revision contained two interleaved copies of
    ``__init__``/``__call__`` (a botched merge): the surviving one-argument
    ``__init__`` could not accept the ``(classifiers, classifier)`` pair
    that ``BaggingVariance.__call__`` passes, and the surviving ``__call__``
    body consisted solely of a dead nested ``__init__`` definition. This is
    the reconstructed single definition.
    """
    def __init__(self, classifiers, classifier=None):
        # classifiers: models trained on bootstrap samples.
        # classifier: model trained on all data (classification only).
        self.classifiers = classifiers
        self.classifier = classifier

    def __call__(self, instance, *args):
        # Per-model values: raw predictions for regression, distances of the
        # bagged distributions to the full model's distribution otherwise.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            bagged_values = [c(instance, Orange.core.GetValue).value
                             for c in self.classifiers if c is not None]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            estimate = self.classifier(instance, Orange.core.GetProbabilities)
            bagged_values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate)
                             for c in self.classifiers if c is not None]

        # Variance of the bagged values around their mean.
        k = sum(bagged_values) / len(bagged_values)
        BAGV = sum((v - k) ** 2 for v in bagged_values) / len(bagged_values)

        # For classification a greater value means a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
450
class LocalCrossValidation:
    """
    :param k: Number of nearest neighbours used in the LCV estimate.
    :type k: int

    :param distance: function computing a distance between two discrete
        distributions (classification only). Default: Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification, weight the averaged
        distribution distances by :math:`e^{-d}`, with :math:`d` the
        distance between the predicted instance and the neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    The :math:`k` nearest neighbours of the instance form a local data set
    on which leave-one-out validation is run. For regression the estimate
    is the distance-weighted absolute prediction error; for classification
    it is 1 minus the average distance between the predicted class
    distribution and the (trivial) neighbour distributions.

    If :math:`k` is 0 (the default), it is set to 1/20 of the data set
    size, but at least 5.

    Regression algorithm:

    1. Find the k nearest neighbours :math:`N = { (x_1, c_1),...,
       (x_k, c_k)}`.
    2. Compute leave-one-out predictions :math:`K_i` and prediction errors
       :math:`E_i = | C_i - K_i |` on this set.
    3. :math:`LCV(x) = \\frac{ \\sum_{(x_i, c_i) \\in N} d(x_i, x) * E_i }{ \\sum_{(x_i, c_i) \\in N} d(x_i, x) }`
    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted

    def __call__(self, instances, learner):
        finder_constructor = Orange.classification.knn.FindNearestConstructor()
        finder_constructor.distanceConstructor = Orange.distance.Euclidean()

        # Neighbour distances are stored under this meta attribute.
        distance_id = Orange.feature.Descriptor.new_meta_id()
        finder = finder_constructor(instances, 0, distance_id)

        if self.k == 0:
            # NOTE(review): this permanently overwrites self.k, so later
            # calls reuse the size derived from the first data set.
            self.k = max(5, len(instances) / 20)

        return LocalCrossValidationClassifier(distance_id, finder, self.k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
505
class LocalCrossValidationClassifier:
    # Computes the LCV estimate: distance-weighted leave-one-out error over
    # the k nearest neighbours of the predicted instance.
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        self.distance_id = distance_id  # meta attribute id holding the neighbour distance
        self.nearest_neighbours = nearest_neighbours  # FindNearest over the training data
        self.k = k
        self.learner = learner
        # Extra options set by LocalCrossValidation: 'distance' (distribution
        # distance function) and 'distance_weighted' (bool).
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        # Weighted error sum and weight sum for the final ratio.
        LCVer = 0
        LCVdi = 0

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            # Train on all neighbours except the i-th.
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute leave-one-out prediction error.
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the neighbour's trivial (one-hot) class distribution.
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight by e^{-d} (d = neighbour distance), or uniformly.
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        # Weighted mean error; guard against an all-zero weight sum.
        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # For classification a greater value means a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
551
class CNeighbours:
    """
    :param k: Number of nearest neighbours used in the CNK estimate.
    :type k: int

    :param distance: function computing a distance between two discrete
        distributions (classification only). Default: Hellinger distance.
    :type distance: function

    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`

    For regression, CNK is the difference between the average label of the
    instance's nearest neighbours and its prediction; it can be used signed
    or absolute:

    :math:`CNK = \\frac{\\sum_{i=1}^{k}C_i}{k} - K`

    where :math:`k` is the number of neighbours, :math:`C_i` the neighbour
    labels and :math:`K` the prediction. Greater values mean greater error.

    For classification, CNK equals 1 minus the average distance between the
    predicted class distribution and the (trivial) class distributions of
    the :math:`k` nearest neighbours; there greater values mean a better
    prediction.
    """
    def __init__(self, k=5, distance=hellinger_dist):
        self.k = k
        self.distance = distance

    def __call__(self, instances, learner):
        finder_constructor = Orange.classification.knn.FindNearestConstructor()
        finder_constructor.distanceConstructor = Orange.distance.Euclidean()

        # Neighbour distances are stored under this meta attribute.
        distance_id = Orange.feature.Descriptor.new_meta_id()
        finder = finder_constructor(instances, 0, distance_id)
        return CNeighboursClassifier(finder, self.k, distance=self.distance)
592
class CNeighboursClassifier:
    """Computes CNK estimates from the k nearest neighbours of an instance.

    Fixes two defects in the original:

    * ``__init__`` did not accept the ``distance`` keyword argument that
      ``CNeighbours.__call__`` always passes, so constructing this
      classifier raised a TypeError.
    * ``__call__`` tested ``ex.domain`` before ``ex`` was bound in any
      loop, relying on the Python 2 quirk of list-comprehension variable
      leakage; the class variable check now uses ``instance`` directly.
    """
    def __init__(self, nearest_neighbours, k, distance=None):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        # Distribution distance used by the classification branch.
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        CNK = 0

        # Find k nearest neighbors
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Regression: average neighbour label minus the prediction.
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # Classification: 1 minus the average distance between the
            # predicted distribution and a kNN model's per-neighbour ones.
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
623
class Mahalanobis:
    """
    :param k: Number of nearest neighbours used in the Mahalanobis estimate.
    :type k: int

    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`

    The estimate is the `Mahalanobis distance
    <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_ to the evaluated
    instance's :math:`k` nearest neighbours.
    """
    def __init__(self, k=3):
        self.k = k

    def __call__(self, instances, *args):
        constructor = Orange.classification.knn.FindNearestConstructor()
        constructor.distanceConstructor = Orange.distance.Mahalanobis()

        # Neighbour distances are stored under this meta attribute.
        mid = Orange.feature.Descriptor.new_meta_id()
        finder = constructor(instances, 0, mid)
        return MahalanobisClassifier(self.k, finder, mid)
648
class MahalanobisClassifier:
    """Sums the Mahalanobis distances to the k nearest neighbours."""
    def __init__(self, k, nnm, mid):
        self.k = k
        self.nnm = nnm      # FindNearest over the training data
        self.mid = mid      # meta attribute id holding the distance

    def __call__(self, instance, *args):
        neighbours = self.nnm(instance, self.k)
        total = sum(ex[self.mid].value for ex in neighbours)
        return [Estimate(total, ABSOLUTE, MAHAL_ABSOLUTE)]
661
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`

    The estimate is the `Mahalanobis distance
    <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_ between the
    predicted instance and the centroid of the data.
    """
    def __init__(self):
        pass

    def __call__(self, instances, *args):
        # Continuize the domain so a centroid and distance are well defined.
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = dc(instances)
        new_instances = instances.translate(new_domain)

        # Centroid of the continuized attribute matrix.
        X, _, _ = new_instances.to_numpy()
        centroid = numpy.average(X, 0)

        distance = Orange.distance.Mahalanobis()(new_instances)

        # Class value is unknown ("?") for the synthetic centroid instance.
        average_instance = Orange.data.Instance(new_instances.domain, list(centroid) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
693
class MahalanobisToCenterClassifier:
    """Measures the Mahalanobis distance from an instance to the centroid."""
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance
        self.average_instance = average_instance
        self.new_domain = new_domain

    def __call__(self, instance, *args):
        # Project the instance into the continuized domain first.
        projected = Orange.data.Instance(self.new_domain, instance)
        d = self.distance(projected, self.average_instance)
        return [Estimate(d, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE)]
707
708
class BaggingVarianceCNeighbours:
    """
    :param bagv: Instance of the Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`

    :param cnk: Instance of the CNK estimator.
    :type cnk: :class:`CNeighbours`

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`

    BVCK is the average of the bagging variance and the local modeling of
    prediction error (CNK) estimates.
    """
    def __init__(self, bagv=BaggingVariance(), cnk=CNeighbours()):
        # NOTE(review): the default estimator instances are created once at
        # class-definition time and shared by every user of the defaults.
        self.bagv = bagv
        self.cnk = cnk

    def __call__(self, instances, learner):
        return BaggingVarianceCNeighboursClassifier(
            self.bagv(instances, learner),
            self.cnk(instances, learner))
732
class BaggingVarianceCNeighboursClassifier:
    """Averages the BAGV and absolute CNK estimates into BVCK, returning the
    combined estimate followed by its two components."""
    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # BVCK = mean of BAGV and the absolute CNK (second CNK estimate).
        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        return ([Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE)]
                + bagv_estimates + cnk_estimates)
747
class ErrorPredicting:
    """Learns a random forest that predicts the prediction error itself,
    using errors collected by cross-validating the wrapped learner."""
    def __init__(self):
        pass

    def __call__(self, instances, learner):
        # Collect per-instance prediction errors via cross validation.
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Rebuild the data with the prediction error as a continuous class.
        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        new_dataset = Orange.data.Table(new_domain, instances)
        for instance, prediction_error in izip(new_dataset, prediction_errors):
            instance.set_class(prediction_error)

        # Train a random forest on the error-labeled data.
        rf_classifier = Orange.ensemble.forest.RandomForestLearner()(new_dataset)

        return ErrorPredictingClassification(rf_classifier, new_domain)
766
class ErrorPredictingClassification:
    """Predicts the expected (signed) error of a prediction with a model
    trained on the error-labeled domain."""
    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier
        self.new_domain = new_domain

    def __call__(self, instance, predicted, probabilities):
        # Project the instance into the error-labeled domain and predict.
        projected = Orange.data.Instance(self.new_domain, instance)
        value = self.rf_classifier(projected, Orange.core.GetValue)
        return [Estimate(value.value, SIGNED, SABIAS_SIGNED)]
777
def gauss_kernel(x, sigma=1):
    """Gaussian kernel with standard deviation ``sigma`` evaluated at ``x``."""
    norm = 1. / (sigma * math.sqrt(2 * math.pi))
    return norm * math.exp(-0.5 * (x / sigma) ** 2)
780
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Estimates the density of the problem space around the instance being
    predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean()):
        self.K = K
        self.d_measure = d_measure

    def __call__(self, instances):
        self.distance = self.d_measure(instances)

        def density(x):
            # Mean kernel-weighted distance of x to every training instance.
            total = 0
            for ex in instances:
                total += self.K(self.distance(x, ex))
            return total / len(instances)

        # Reference density: the densest point among the training instances.
        max_density = max(density(ex) for ex in instances)

        return ParzenWindowDensityBasedClassifier(density, max_density)
811
class ParzenWindowDensityBasedClassifier:
    """Reports the gap between the maximal training-set density and the
    density around the given instance as an absolute reliability estimate
    (sparser neighbourhood -> larger value)."""

    def __init__(self, density, max_density):
        self.density = density
        self.max_density = max_density

    def __call__(self, instance, *args):
        # Distance from the densest region of the training data.
        estimate = self.max_density - self.density(instance)
        return [Estimate(estimate, ABSOLUTE, DENS_ABSOLUTE)]
824
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.

    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`

    :param estimators: List of different reliability estimation methods we
                       want to use on the chosen learner.
    :type estimators: :obj:`list` of reliability estimators

    :param name: Name of this reliability learner
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        # Build the default estimator list here instead of using a mutable
        # default argument: a list default would be created once at import
        # time and shared (and possibly mutated) across all Learner instances.
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        self.blending = False


    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.

        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """
        # Blending is not set up here; the Classifier receives placeholders.
        blending_classifier = None
        new_domain = None

        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)

    def internal_cross_validation(self, instances, folds=10):
        """ Perform the internal cross validation for getting the best
        reliability estimate. It uses the reliability estimators defined in
        estimators attribute.

        Returns the id of the method that scored the best.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        res = Orange.evaluation.testing.cross_validation([self], instances, folds=folds)
        results = get_pearson_r(res)
        # Tuples sort by their first element (the correlation); the method id
        # is the fourth element of the best-scoring tuple.
        sorted_results = sorted(results)
        return sorted_results[-1][3]

    def internal_cross_validation_testing(self, instances, folds=10):
        """ Perform internal cross validation (as in Automatic selection of
        reliability estimates for individual regression predictions,
        Zoran Bosnic, 2010) and return id of the method
        that scored best on this data.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        cv_indices = Orange.core.MakeRandomIndicesCV(instances, folds)

        sum_of_rs = defaultdict(float)

        # Accumulate each method's correlation score over all folds; small
        # folds fall back to leave-one-out so the inner evaluation still runs.
        for fold in xrange(folds):
            data = instances.select(cv_indices, fold)
            if len(data) < 10:
                res = Orange.evaluation.testing.leave_one_out([self], data)
            else:
                res = Orange.evaluation.testing.cross_validation([self], data)
            results = get_pearson_r(res)
            for r, _, _, method in results:
                sum_of_rs[method] += r
        sorted_sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
        return sorted_sum_of_rs[0][0]

    # Short labels for the supported estimation methods.
    labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
929
class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Fit the wrapped learner on the full training data.
        self.classifier = box_learner(instances)

        # Turn every estimator into its own classifier over the same data.
        self.estimation_classifiers = [est(instances, box_learner)
                                       for est in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is an instance of
        :class:`~Orange.evaluation.reliability.Estimate`,
        is added to the distribution object.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Ensure there is a distribution object to hang the estimates on.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Collect the estimates of every configured reliability method.
        for estimation_classifier in self.estimation_classifiers:
            estimates = estimation_classifier(instance, predicted, probabilities)
            probabilities.reliability_estimate.extend(estimates)

        # Return whichever combination the caller asked for.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
997
# Functions for testing and plotting
# TODO: Document these functions.
def get_acc_rel(method, data, learner):
    """Cross-validate *learner* wrapped with the single reliability estimator
    *method* and return parallel lists: reliability estimates and the
    probability assigned to the true class of each tested instance."""
    reliability = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels, acc = [], []
    for tested in results.results:
        probs = tested.probabilities[0]
        rels.append(probs.reliability_estimate[0].estimate)
        acc.append(probs[tested.actual_class])

    return rels, acc
1013
1014def acc_rel_plot(method, data, learner, file_name="acc_rel_plot.png", colors=None):
1015
1016    import matplotlib.pylab as plt
1017
1018    plt.clf()
1019
1020    rels, acc = get_acc_rel(method, data, learner)
1021    print "rels", rels
1022    print "acc", acc
1023
1024    if colors is None:
1025        colors = "k"
1026    plt.scatter(acc, rels, c=colors)
1027    plt.xlim(0.,1.)
1028    plt.ylim(ymin=0.)
1029    plt.savefig(file_name)
1030
def acc_rel_correlation(method, data, learner):
    """Spearman rank correlation between the true-class probabilities and the
    reliability estimates produced by *method* under cross-validation."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    rho, _ = scipy.stats.spearmanr(acc, rels)
    return rho
# Note: See TracBrowser for help on using the repository browser.