source: orange-reliability/_reliability/__init__.py @ 9:9679301ca484

Revision 9:9679301ca484, 37.8 KB checked in by Matija Polajnar <matija.polajnar@…>, 22 months ago (diff)

Add names to reliability estimation learners.

Line 
1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
# Labels and final variables
labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]

"""
# All the estimators calculation constants
DO_SA = 0
DO_BAGV = 1
DO_CNK = 2
DO_LCV = 3
DO_BVCK = 4
DO_MAHAL = 5
"""

# All the estimator method constants
# (integer IDs stored in Estimate.method; names are in METHOD_NAME below)
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
# IDs 11 and 12 (RF Variance / RF Std) exist only as METHOD_NAME entries
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14

# Type of estimator constant (value of Estimate.signed_or_absolute)
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods, keyed by method ID
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK_absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based"}

# Shared bootstrap sampler (sampling with replacement), used by BaggingVariance
select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
select_with_repeat.random_generator = Orange.misc.Random()
53
def get_reliability_estimation_list(res, i):
    """
    Collect the i-th reliability estimate of every evaluated instance.

    Returns a tuple ``(estimates, signed_or_absolute, method)`` where the
    latter two are taken from the first result (they are the same for all
    results of one estimator).
    """
    estimates = [r.probabilities[0].reliability_estimate[i].estimate
                 for r in res.results]
    first = res.results[0].probabilities[0].reliability_estimate[i]
    return estimates, first.signed_or_absolute, first.method
56
def get_prediction_error_list(res):
    """Signed prediction errors (actual class minus the first learner's
    prediction) for every evaluated instance."""
    return [r.actual_class - r.classes[0] for r in res.results]
59
def get_description_list(res, i):
    """Textual descriptions of the i-th reliability estimate for every
    evaluated instance."""
    return [r.probabilities[0].reliability_estimate[i].text_description
            for r in res.results]
62
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of
    the used reliability estimates, together with the p-value of each
    coefficient, as a list of ``(r, p, signed_or_absolute, method)`` tuples.
    """
    errors = get_prediction_error_list(res)
    abs_errors = [abs(e) for e in errors]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for idx in xrange(n_estimates):
        estimates, sign_type, method = get_reliability_estimation_list(res, idx)
        # Signed estimates correlate with signed errors, absolute with |error|
        observed = errors if sign_type == SIGNED else abs_errors
        try:
            r, p = statc.pearsonr(observed, estimates)
        except Exception:
            # Degenerate input (e.g. constant estimates) - report NaN
            r = p = float("NaN")
        scores.append((r, p, sign_type, method))
    return scores
86
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of
    the used reliability estimates, together with the p-value of each
    coefficient, as a list of ``(r, p, signed_or_absolute, method)`` tuples.
    """
    errors = get_prediction_error_list(res)
    abs_errors = [abs(e) for e in errors]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for idx in xrange(n_estimates):
        estimates, sign_type, method = get_reliability_estimation_list(res, idx)
        # Signed estimates correlate with signed errors, absolute with |error|
        observed = errors if sign_type == SIGNED else abs_errors
        try:
            r, p = statc.spearmanr(observed, estimates)
        except Exception:
            # Degenerate input (e.g. constant estimates) - report NaN
            r = p = float("NaN")
        scores.append((r, p, sign_type, method))
    return scores
110
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return the Pearson's coefficient between prediction error and each of
    the used estimates, averaged over all folds, with p-values derived
    from the averaged coefficients.
    """
    folds = Orange.evaluation.scoring.split_by_iterations(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    n_instances = len(res.results)
    n_folds = len(folds)
    r_sums = [0 for _ in xrange(n_estimates)]
    sign_types = [0 for _ in xrange(n_estimates)]
    methods = [0 for _ in xrange(n_estimates)]

    for fold in folds:
        errors = get_prediction_error_list(fold)
        abs_errors = [abs(e) for e in errors]
        for idx in xrange(n_estimates):
            estimates, sign_type, method = get_reliability_estimation_list(fold, idx)
            observed = errors if sign_type == SIGNED else abs_errors
            try:
                r, _ = statc.pearsonr(observed, estimates)
            except Exception:
                r = float("NaN")
            r_sums[idx] += r
            sign_types[idx] = sign_type
            methods[idx] = method

    # Average the per-fold coefficients, then derive p-values from them
    avg_r = [float(s) / n_folds for s in r_sums]
    ps = [p_value_from_r(r, n_instances) for r in avg_r]

    return zip(avg_r, ps, sign_types, methods)
148
def p_value_from_r(r, n):
    """
    Calculate the p-value for a Pearson correlation coefficient ``r``
    obtained from a sample of size ``n`` (t distribution with n - 2
    degrees of freedom, via the incomplete beta function).
    """
    df = n - 2
    # The 1e-30 epsilons guard against division by zero when |r| == 1
    t = r * (df / ((1.0 - r + 1e-30) * (1.0 + r + 1e-30))) ** 0.5
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))
156
157
158# Distances between two discrete probability distributions
159#TODO Document those.
def normalize_both(p, q):
    """Normalize (in place) whichever of the two discrete distributions is
    not yet normalized, and return both."""
    for dist in (p, q):
        if not dist.normalized:
            dist.normalize()
    return p, q
166
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order ``m`` between two discrete distributions."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)
173
def manhattan_distance(p, q):
    """Manhattan (L1) distance between two discrete distributions."""
    return minkowsky_dist(p, q, 1)
176
def euclidean_dist(p, q):
    """Euclidean (L2) distance between two discrete distributions."""
    return minkowsky_dist(p, q)
179
def variance_dist(p, q):
    """Squared Euclidean distance between two discrete distributions."""
    d = euclidean_dist(p, q)
    return d * d
182
def max_dist(p, q):
    """Chebyshev (L-infinity) distance between two discrete distributions."""
    p, q = normalize_both(p, q)
    return max(abs(p[i] - q[i]) for i in range(len(p)))
186
def hellinger_dist(p, q):
    """
    Hellinger-style distance between two discrete distributions:
    sum of squared differences of the square roots of the probabilities.

    NOTE(review): unlike the textbook Hellinger distance there is no final
    square root or 1/2 factor here - kept as in the original.
    """
    p, q = normalize_both(p, q)
    total = 0
    for i in range(len(p)):
        diff = math.sqrt(p[i]) - math.sqrt(q[i])
        total += diff * diff
    return total
193
def my_log(x):
    """Return ``x * log(x)``, taking the x -> 0 limit as 0."""
    if x == 0:
        return 0
    return x * math.log(x)
196
def kullback_leibler(p, q):
    """
    Kullback-Leibler divergence D(p || q) between two discrete
    distributions: sum_i p_i * log(p_i / q_i).

    Bug fix: the previous implementation summed ``my_log(p[i] - q[i])``,
    i.e. (p-q)*log(p-q), which raises for p < q and is not the KL
    divergence.
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        # Terms with p_i == 0 contribute 0 in the limit; terms with
        # q_i == 0 (and p_i > 0) would be infinite and are skipped.
        if p[i] > 0 and q[i] > 0:
            dist += p[i] * math.log(p[i] / q[i])
    return dist
203
def cosine(p, q):
    """
    Cosine distance (1 - cosine similarity) between two discrete
    distributions.

    Bug fix: the dot product previously referenced undefined names ``x``
    and ``y`` (a guaranteed NameError); it now uses ``p`` and ``q``.
    """
    p, q = normalize_both(p, q)
    # Materialize as plain lists so numpy can consume them
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
208
209
class Estimate:
    """
    Reliability estimate. Contains attributes that describe the results of
    reliability estimation.

    .. attribute:: estimate

        A numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Determines whether the method used gives a signed or absolute result.
        Has a value of either :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        An integer ID of reliability estimation method used.

    .. attribute:: method_name

        Name (string) of reliability estimation method used.

    .. attribute:: icv_method

        An integer ID of reliability estimation method that performed best,
        as determined by ICV, and of which estimate is stored in the
        :obj:`estimate` field. (-1 when ICV was not used.)

    .. attribute:: icv_method_name

        Name (string) of reliability estimation method that performed best,
        as determined by ICV. (Empty string when ICV was not used.)

    .. attribute:: text_description

        Textual description of the estimate, filled in by
        :obj:`DescriptiveAnalysisClassifier` when used; :obj:`None` otherwise.

    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method= -1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        # Human-readable name looked up from the method ID table
        self.method_name = METHOD_NAME[method]
        # -1 marks an estimate that does not come from ICV
        self.icv_method = icv_method
        self.icv_method_name = METHOD_NAME[icv_method] if icv_method != -1 else ""
        # Set later by DescriptiveAnalysisClassifier, if descriptive analysis is used
        self.text_description = None
252
class DescriptiveAnalysis:
    """
    Wraps a reliability estimation learner and attaches textual
    descriptions ("high"/"medium"/"low" by default) to its numerical
    estimates, with thresholds calibrated by internal cross validation.

    :param estimator: reliability estimation learner to wrap.
    :param desc: textual labels, from the one used for the smallest
        estimates to the one used for the largest.
    :type desc: list of strings
    :param procentage: lower percentile borders (fractions in [0, 1]) at
        which each label from ``desc`` starts to apply.
    :type procentage: list of floats
    """
    def __init__(self, estimator, desc=None, procentage=None):
        # Use None defaults instead of shared mutable default arguments.
        self.desc = desc if desc is not None else ["high", "medium", "low"]
        self.procentage = procentage if procentage is not None else [0.00, 0.33, 0.66]
        self.estimator = estimator

    def __call__(self, instances, weight=None, **kwds):

        # Calibrate the borders for every estimate with a cross validation
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            sorted_estimates = sorted(abs(x) for x in estimates)
            # NOTE(review): for p == 0.0 the index is -1, i.e. the *largest*
            # estimate becomes the first border - looks unintended, but it is
            # kept for backward compatibility.
            borders = [sorted_estimates[int(len(estimates) * p) - 1]  for p in self.procentage]
            all_borders.append(borders)

        # Learn on whole train data
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
274
class DescriptiveAnalysisClassifier:
    # Attaches a textual description to each reliability estimate, using
    # the per-estimate borders pre-computed by DescriptiveAnalysis.
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # Label each estimate with the last description whose lower border
        # it reaches; the first description is the fallback.
        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            description = self.desc[0]
            for lower_border, text_desc in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    description = text_desc
            estimate.text_description = description

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
297
class SensitivityAnalysis:
    r"""
    Sensitivity analysis (SAvar, SAbias) reliability estimation.

    :param e: list of sensitivity parameters :math:`\epsilon` used for the
        SAvar and SAbias estimates.
    :type e: list of floats

    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`

    The learning set is extended with the predicted instance, labeled with
    :math:`K + \epsilon (l_{max} - l_{min})`, where :math:`K` is the
    initial prediction, :math:`\epsilon` a sensitivity parameter and
    :math:`l_{min}`, :math:`l_{max}` the bounds of the learning instances'
    labels. Sensitivity predictions obtained with different values of
    :math:`\epsilon` are combined into:

    :math:`SAvar = \frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`

    SAbias can be used in a signed or absolute form.
    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0]):
        self.e = e

    def __call__(self, instances, learner):
        # The label range of the training data bounds the perturbations
        min_value = max_value = instances[0].getclass().value
        for ex in instances:
            value = ex.getclass().value
            min_value = min(min_value, value)
            max_value = max(max_value, value)
        return SensitivityAnalysisClassifier(self.e, instances, min_value, max_value, learner)
334
class SensitivityAnalysisClassifier:
    # Computes SAvar and SAbias estimates by re-learning on the training
    # data extended with the predicted instance under perturbed labels.
    def __init__(self, e, instances, min_value, max_value, learner):
        self.e = e                    # list of epsilon sensitivity parameters
        self.instances = instances    # original training data
        self.max_value = max_value    # upper bound of training labels
        self.min_value = min_value    # lower bound of training labels
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        # Create new dataset (a copy, so the original data is untouched)
        r_data = Orange.data.Table(self.instances)

        # Create new instance (copy of the one being estimated)
        modified_instance = Orange.data.Instance(instance)

        # Append it to the data; its class is overwritten below per epsilon
        r_data.append(modified_instance)

        # Calculate SAvar & SAbias
        SAvar = SAbias = 0

        for eps in self.e:
            # +epsilon: label the appended instance K + eps * label range
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # -epsilon: label the appended instance K - eps * label range
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # calculate part SAvar and SAbias:
            # SAvar accumulates the spread, SAbias the signed shift from K
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
380
class BaggingVariance:
    r"""
    Bagging variance (BAGV) reliability estimation.

    :param m: number of bagged models to construct.
    :type m: int

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`

    For regression, the variance of the :math:`m` bagged models' predictions
    is used as the estimate:

    :math:`BAGV = \frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    with :math:`K = \frac{\sum_{i=1}^{m} K_i}{m}` the mean of the individual
    predictions :math:`K_i`; greater value implies greater error.

    For classification, 1 minus the variance of the Euclidean distances
    between the class distribution of a model trained on all data and the
    distributions of the bagged models is used; there a greater value
    implies a better prediction.
    """
    def __init__(self, m=50, name="bv"):
        self.m = m
        self.name = name

    def __call__(self, instances, learner):
        # For classification also learn a reference model on all data; its
        # predicted distribution anchors the per-model distances.
        reference = None
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            reference = learner(instances)

        # Bootstrap: m models learned on samples drawn with replacement
        bagged = []
        for _ in xrange(self.m):
            sample = instances.select(select_with_repeat(len(instances)))
            bagged.append(learner(sample))
        return BaggingVarianceClassifier(bagged, reference)
425
class BaggingVarianceClassifier:
    def __init__(self, classifiers, classifier=None):
        self.classifiers = classifiers   # the bagged models
        self.classifier = classifier     # reference model (classification only)

    def __call__(self, instance, *args):
        """Return the BAGV reliability estimate for the given instance."""
        var_type = instance.domain.class_var.var_type
        if var_type == Orange.feature.Descriptor.Continuous:
            # Regression: point predictions of the bagged models
            values = [c(instance, Orange.core.GetValue).value
                      for c in self.classifiers if c is not None]
        elif var_type == Orange.feature.Descriptor.Discrete:
            # Classification: distances of each bagged model's distribution
            # to the reference model's distribution
            reference = self.classifier(instance, Orange.core.GetProbabilities)
            values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), reference)
                      for c in self.classifiers if c is not None]
        mean = sum(values) / len(values)

        BAGV = sum((v - mean) ** 2 for v in values) / len(values)
        # For classification a greater value means a better prediction
        if var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
447
class LocalCrossValidation:
    r"""
    Local cross validation (LCV) reliability estimation.

    :param k: number of nearest neighbours used; a special value of 0 (the
        default) means 1/20 of the data set size (but at least 5).
    :type k: int

    :param distance: function computing a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification, weight the average
        distance between distributions by :math:`e^{-d}`, where :math:`d`
        is the distance between the predicted instance and the neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    The k nearest neighbours of the instance are put into a separate data
    set on which a leave-one-out validation is performed. For regression
    the estimate is the distance-weighted absolute prediction error:

    1. Determine the set of k nearest neighours
       :math:`N = { (x_1, c_1),..., (x_k, c_k)}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

    For classification, the estimate is 1 minus the average distance
    between the predicted class distribution and the (trivial) distributions
    of the nearest neighbours.
    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # Bug fix: compute the effective k locally instead of overwriting
        # self.k - previously a configured k of 0 was permanently replaced
        # by a value derived from the first training set, which then stuck
        # for every later call of this learner.
        k = self.k
        if k == 0:
            # // keeps the original Python 2 floor-division behaviour
            k = max(5, len(instances) // 20)

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
503
class LocalCrossValidationClassifier:
    # Estimates reliability as the distance-weighted leave-one-out error
    # over the k nearest neighbours of the predicted instance (LCV).
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        self.distance_id = distance_id                # meta id storing distance to the query instance
        self.nearest_neighbours = nearest_neighbours  # FindNearest over the training data
        self.k = k
        self.learner = learner
        # kwds carries `distance` and `distance_weighted` from the learner
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        LCVer = 0   # weighted sum of leave-one-out errors
        LCVdi = 0   # sum of the weights

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute leave-one-out prediction error
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the neighbour's trivial (one-hot) class distribution
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight each error by e^-d, d = distance to the query instance
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # For classification a greater value means a better prediction
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
549
class CNeighbours:
    r"""
    CNK reliability estimation.

    :param k: number of nearest neighbours used in the CNK estimate.
    :type k: int

    :param distance: function computing a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`

    For regression, CNK is the difference between the average label of the
    k nearest neighbours and the prediction (usable signed or absolute):

    :math:`CNK = \frac{\sum_{i=1}^{k}C_i}{k} - K`

    where :math:`C_i` are the neighbours' labels and :math:`K` the
    instance's prediction; a greater value implies greater error.

    For classification, CNK is 1 minus the average distance between the
    predicted class distribution and the (trivial) class distributions of
    the k nearest neighbours; a greater value implies better prediction.
    """
    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        # Index the training data for nearest-neighbour queries
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Euclidean()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, meta_id)
        return CNeighboursClassifier(neighbours, self.k, distance=self.distance)
591
class CNeighboursClassifier:
    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours  # FindNearest over the training data
        self.k = k
        self.distance = distance  # distribution distance (classification only)

    def __call__(self, instance, predicted, probabilities):
        """Return the CNK reliability estimate(s) for the given instance."""
        CNK = 0

        # Find k nearest neighbors
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Bug fix: branch on the class variable of the *predicted* instance.
        # The original tested `ex.domain`, which only worked through the
        # loop variable leaking out of the list comprehension above
        # (a Python 2 artefact; a NameError on Python 3).
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Regression: neighbours' average label minus the prediction
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # Classification: 1 - average distance between the predicted
            # distribution and a kNN model's distributions for the neighbours
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
623
class Mahalanobis:
    """
    :param k: number of nearest neighbours used in the Mahalanobis estimate.
    :type k: int

    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`

    The reliability estimate is the sum of
    `mahalanobis distances <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    from the evaluated instance to its :math:`k` nearest neighbours.
    """
    def __init__(self, k=3):
        self.k = k

    def __call__(self, instances, *args):
        # Nearest-neighbour finder using the Mahalanobis metric
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Mahalanobis()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        return MahalanobisClassifier(self.k, finder(instances, 0, meta_id), meta_id)
648
class MahalanobisClassifier:
    def __init__(self, k, nnm, mid):
        self.k = k      # number of neighbours to sum over
        self.nnm = nnm  # nearest-neighbour finder (Mahalanobis metric)
        self.mid = mid  # meta attribute id holding each neighbour's distance

    def __call__(self, instance, *args):
        """Sum of Mahalanobis distances to the k nearest neighbours."""
        total = 0
        for ex in self.nnm(instance, self.k):
            total += ex[self.mid].value

        return [ Estimate(total, ABSOLUTE, MAHAL_ABSOLUTE) ]
661
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`

    The reliability estimate is the
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.
    """
    def __init__(self):
        pass

    def __call__(self, instances, *args):
        # Continuize the domain so a centroid is well defined
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = dc(instances)
        new_instances = instances.translate(new_domain)

        # Centroid of the continuized data (class left unknown)
        X, _, _ = new_instances.to_numpy()
        centroid = numpy.average(X, 0)
        average_instance = Orange.data.Instance(new_instances.domain, list(centroid) + ["?"])

        distance = Orange.distance.Mahalanobis()(new_instances)

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
693
class MahalanobisToCenterClassifier:
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance                  # Mahalanobis distance over the continuized domain
        self.average_instance = average_instance  # centroid of the training data
        self.new_domain = new_domain              # continuized domain

    def __call__(self, instance, *args):
        """Mahalanobis distance from the instance to the data centroid."""
        converted = Orange.data.Instance(self.new_domain, instance)
        d = self.distance(converted, self.average_instance)
        return [ Estimate(d, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE) ]
707
708
class BaggingVarianceCNeighbours:
    """
    BVCK reliability estimation: the average of the Bagging variance
    (BAGV) and the local modeling of prediction error (CNK) estimates.

    :param bagv: Bagging Variance estimator
        (a fresh :class:`BaggingVariance` when not given).
    :type bagv: :class:`BaggingVariance`

    :param cnk: CNK estimator (a fresh :class:`CNeighbours` when not given).
    :type cnk: :class:`CNeighbours`

    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
    """
    def __init__(self, bagv=None, cnk=None):
        # Build fresh defaults per instance: the previous defaults
        # (bagv=BaggingVariance(), cnk=CNeighbours()) were single objects
        # shared by every BaggingVarianceCNeighbours instance
        # (mutable-default-argument pitfall).
        self.bagv = bagv if bagv is not None else BaggingVariance()
        self.cnk = cnk if cnk is not None else CNeighbours()

    def __call__(self, instances, learner):
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
732
class BaggingVarianceCNeighboursClassifier:
    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        """BVCK = mean of BAGV and absolute CNK; the individual component
        estimates are appended after it."""
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # NOTE(review): cnk_estimates[1] is the absolute CNK of regression;
        # the classification CNK returns a single estimate, so this would
        # raise IndexError there - presumably BVCK is regression-only; confirm.
        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        return [ Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE) ] + bagv_estimates + cnk_estimates
747
class ErrorPredicting:
    # Learns a random forest that predicts the (signed) prediction error of
    # the wrapped learner, as measured by cross validation.
    def __init__(self):
        pass

    def __call__(self, instances, learner):
        # Cross-validated signed prediction errors of the wrapped learner
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Same attributes, but with the prediction error ("pe") as the class
        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        new_dataset = Orange.data.Table(new_domain, instances)
        for instance, prediction_error in izip(new_dataset, prediction_errors):
            instance.set_class(prediction_error)

        rf_classifier = Orange.ensemble.forest.RandomForestLearner()(new_dataset)

        return ErrorPredictingClassification(rf_classifier, new_domain)
766
class ErrorPredictingClassification:
    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier  # forest predicting the signed error
        self.new_domain = new_domain        # domain with "pe" as the class

    def __call__(self, instance, predicted, probabilities):
        """Predicted signed error of the wrapped learner for the instance."""
        converted = Orange.data.Instance(self.new_domain, instance)
        error = self.rf_classifier(converted, Orange.core.GetValue)
        return [Estimate(error.value, SIGNED, SABIAS_SIGNED)]
777
def gauss_kernel(x, sigma=1):
    """Gaussian kernel: density of N(0, sigma) evaluated at ``x``."""
    coeff = 1. / (sigma * math.sqrt(2 * math.pi))
    return coeff * math.exp(-1. / 2 * (x / sigma) ** 2)
780
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Estimates how dense the problem space is around the predicted instance;
    the returned estimate is the maximum training density minus the density
    at the instance, so a greater value implies a sparser neighbourhood.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances):

        self.distance = self.d_measure(instances)

        def density(x):
            # Average kernel value of the distance to every training instance
            total = 0
            for ex in instances:
                total += self.K(self.distance(x, ex))
            return total / len(instances)

        max_density = max(density(ex) for ex in instances)

        return ParzenWindowDensityBasedClassifier(density, max_density)
812
class ParzenWindowDensityBasedClassifier:
    """
    Turns the density function estimated by
    :class:`ParzenWindowDensityBased` into a reliability estimate:
    sparser neighbourhoods (lower density) give larger, i.e. less
    reliable, values.
    """

    def __init__(self, density, max_density):
        self.density = density
        self.max_density = max_density

    def __call__(self, instance, *args):
        # Shortfall from the densest training region; 0 means the
        # neighbourhood is as dense as the best-covered training instance.
        sparseness = self.max_density - self.density(instance)
        return [Estimate(sparseness, ABSOLUTE, DENS_ABSOLUTE)]
825
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.

    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`

    :param estimators: List of different reliability estimation methods we
                       want to use on the chosen learner.
    :type estimators: :obj:`list` of reliability estimators

    :param name: Name of this reliability learner
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        # Build the default estimator list here instead of as a mutable
        # default argument: that list (and the estimator instances inside
        # it) would be created once at definition time and shared by every
        # Learner constructed without an explicit ``estimators`` argument.
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        self.blending = False


    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.

        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """

        blending_classifier = None
        new_domain = None

#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
#            raise Exception("This method only works on data with continuous class.")

        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)

    def internal_cross_validation(self, instances, folds=10):
        """ Perform the internal cross validation for getting the best
        reliability estimate. It uses the reliability estimators defined in
        estimators attribute.

        Returns the id of the method that scored the best.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        res = Orange.evaluation.testing.cross_validation([self], instances, folds=folds)
        results = get_pearson_r(res)
        # Each result is a 4-tuple with the correlation first and the
        # method id last (cf. the unpacking below in
        # internal_cross_validation_testing); best correlation sorts last.
        sorted_results = sorted(results)
        return sorted_results[-1][3]

    def internal_cross_validation_testing(self, instances, folds=10):
        """ Perform internal cross validation (as in Automatic selection of
        reliability estimates for individual regression predictions,
        Zoran Bosnic, 2010) and return id of the method
        that scored best on this data.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        cv_indices = Orange.core.MakeRandomIndicesCV(instances, folds)

        sum_of_rs = defaultdict(float)

        for fold in xrange(folds):
            data = instances.select(cv_indices, fold)
            # Small folds are scored with leave-one-out instead of CV.
            if len(data) < 10:
                res = Orange.evaluation.testing.leave_one_out([self], data)
            else:
                res = Orange.evaluation.testing.cross_validation([self], data)
            results = get_pearson_r(res)
            for r, _, _, method in results:
                sum_of_rs[method] += r
        sorted_sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
        return sorted_sum_of_rs[0][0]

    labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
930
class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # The wrapped learner is trained on the original data ...
        self.classifier = box_learner(instances)
        # ... and so is every reliability estimator.
        self.estimation_classifiers = [est(instances, box_learner)
                                       for est in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is an instance of
        :class:`~Orange.evaluation.reliability.Estimate`,
        is added to the distribution object.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Regression classifiers may give no distribution; make one so the
        # reliability estimates always have somewhere to live.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Run every estimator and collect its estimates on the distribution.
        estimates = probabilities.reliability_estimate
        for estimation_classifier in self.estimation_classifiers:
            estimates.extend(estimation_classifier(instance, predicted, probabilities))

        # Hand back whatever shape of result the caller asked for.
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        if result_type == Orange.core.GetValue:
            return predicted
        return predicted, probabilities
998
# Functions for testing and plotting.
# TODO: document these functions.
def get_acc_rel(method, data, learner):
    """Cross-validate ``learner`` wrapped with the single reliability
    estimator ``method`` and return two parallel lists: reliability
    estimates and per-instance accuracies (the probability the model
    assigned to the actual class)."""
    reliability = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels = [r.probabilities[0].reliability_estimate[0].estimate
            for r in results.results]
    acc = [r.probabilities[0][r.actual_class] for r in results.results]

    return rels, acc
1014
1015def acc_rel_plot(method, data, learner, file_name="acc_rel_plot.png", colors=None):
1016
1017    import matplotlib.pylab as plt
1018
1019    plt.clf()
1020
1021    rels, acc = get_acc_rel(method, data, learner)
1022    print "rels", rels
1023    print "acc", acc
1024
1025    if colors is None:
1026        colors = "k"
1027    plt.scatter(acc, rels, c=colors)
1028    plt.xlim(0.,1.)
1029    plt.ylim(ymin=0.)
1030    plt.savefig(file_name)
1031
def acc_rel_correlation(method, data, learner):
    """Spearman rank correlation between per-instance accuracy and the
    reliability estimates produced by ``method``."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    rho, _ = scipy.stats.spearmanr(acc, rels)
    return rho
# Note: See TracBrowser for help on using the repository browser.