source: orange-reliability/_reliability/__init__.py @ 8:4295fbc71fb2

Revision 8:4295fbc71fb2, 37.7 KB checked in by Matija Polajnar <matija.polajnar@…>, 22 months ago (diff)

Fix incorrect (incomplete) merge.

Line 
1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
# Labels and final variables
labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]

"""
# All the estimators calculation constants
DO_SA = 0
DO_BAGV = 1
DO_CNK = 2
DO_LCV = 3
DO_BVCK = 4
DO_MAHAL = 5
"""

# All the estimator method constants.
# These integer IDs identify which estimation method produced an
# :class:`Estimate`; they index into METHOD_NAME below.
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
# IDs 11 and 12 (RF Variance / RF Std) are named in METHOD_NAME but have
# no constant here; they appear to be defined elsewhere — do not reuse.
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14

# Type of estimator constant: whether an estimate carries a sign
# (can indicate direction of the error) or only a magnitude.
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods, keyed by the method constants above.
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK_absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based"}

# Shared sampler used for bootstrap sampling (selection with replacement)
# in BaggingVariance.
select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
select_with_repeat.random_generator = Orange.misc.Random()
53
def get_reliability_estimation_list(res, i):
    """
    Collect the i-th reliability estimate of every tested instance.

    Returns a triple ``(estimates, signed_or_absolute, method)``: the list
    of estimate values, plus the signed/absolute flag and method id taken
    from the first instance (they are the same for all instances).
    """
    first = res.results[0].probabilities[0].reliability_estimate[i]
    estimates = [r.probabilities[0].reliability_estimate[i].estimate
                 for r in res.results]
    return estimates, first.signed_or_absolute, first.method
56
def get_prediction_error_list(res):
    """Return (actual - predicted) class value for every tested instance."""
    errors = []
    for result in res.results:
        errors.append(result.actual_class - result.classes[0])
    return errors
59
def get_description_list(res, i):
    """Return the text description of the i-th estimate for every instance."""
    descriptions = []
    for result in res.results:
        estimate = result.probabilities[0].reliability_estimate[i]
        descriptions.append(estimate.text_description)
    return descriptions
62
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of the
    used reliability estimates. Also, return the p-value of each of
    the coefficients.
    """
    prediction_error = get_prediction_error_list(res)
    absolute_error = [abs(pe) for pe in prediction_error]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    results = []
    for i in xrange(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        # Signed estimates correlate with the signed error, absolute ones
        # with the error's magnitude.
        errors = prediction_error if signed_or_absolute == SIGNED else absolute_error
        try:
            r, p = statc.pearsonr(errors, estimates)
        except Exception:
            # statc raises on degenerate inputs (e.g. constant series).
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results
86
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of the
    used reliability estimates. Also, return the p-value of each of
    the coefficients.
    """
    prediction_error = get_prediction_error_list(res)
    absolute_error = [abs(pe) for pe in prediction_error]
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    results = []
    for i in xrange(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        # Signed estimates correlate with the signed error, absolute ones
        # with the error's magnitude.
        errors = prediction_error if signed_or_absolute == SIGNED else absolute_error
        try:
            r, p = statc.spearmanr(errors, estimates)
        except Exception:
            # statc raises on degenerate inputs (e.g. constant series).
            r = p = float("NaN")
        results.append((r, p, signed_or_absolute, method))
    return results
110
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return average Pearson's coefficient over all folds between prediction error
    and each of the used estimates.
    """
    folds = Orange.evaluation.scoring.split_by_iterations(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    n_instances = len(res.results)
    n_folds = len(folds)
    r_sums = [0] * n_estimates
    sig = [0] * n_estimates
    methods = [0] * n_estimates

    # Accumulate per-fold correlation coefficients for each estimate.
    for fold in folds:
        prediction_error = get_prediction_error_list(fold)
        absolute_error = [abs(pe) for pe in prediction_error]
        for i in xrange(n_estimates):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(fold, i)
            errors = prediction_error if signed_or_absolute == SIGNED else absolute_error
            try:
                r, _ = statc.pearsonr(errors, estimates)
            except Exception:
                r = float("NaN")
            r_sums[i] += r
            sig[i] = signed_or_absolute
            methods[i] = method

    # Average r over folds and derive p-values from the averaged r.
    averages = [float(s) / n_folds for s in r_sums]
    ps = [p_value_from_r(r, n_instances) for r in averages]

    return zip(averages, ps, sig, methods)
148
def p_value_from_r(r, n):
    """
    Calculate the p-value for a Pearson correlation coefficient ``r``
    obtained on a sample of size ``n`` (t-statistic with n - 2 degrees
    of freedom, evaluated through the incomplete beta function).
    """
    df = n - 2
    # The tiny epsilons keep the denominator strictly positive when |r| == 1.
    t = r * (df / ((1.0 - r + 1e-30) * (1.0 + r + 1e-30))) ** 0.5
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))
156
157
158# Distances between two discrete probability distributions
159#TODO Document those.
def normalize_both(p, q):
    """Normalize both distributions in place (if not already) and return them."""
    for distribution in (p, q):
        if not distribution.normalized:
            distribution.normalize()
    return p, q
166
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order ``m`` between two discrete distributions."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)
173
def manhattan_distance(p, q):
    """Manhattan (L1) distance between two discrete distributions."""
    return minkowsky_dist(p, q, m=1)
176
def euclidean_dist(p, q):
    """Euclidean (L2) distance between two discrete distributions."""
    return minkowsky_dist(p, q, m=2)
179
def variance_dist(p, q):
    """Squared Euclidean distance between two discrete distributions."""
    return euclidean_dist(p, q) ** 2
182
def max_dist(p, q):
    """Chebyshev (L-infinity) distance between two discrete distributions."""
    p, q = normalize_both(p, q)
    return max(abs(p[i] - q[i]) for i in range(len(p)))
186
def hellinger_dist(p, q):
    """
    Hellinger-style distance between two discrete distributions.

    NOTE(review): this returns ``sum((sqrt(p_i) - sqrt(q_i)) ** 2)``, i.e.
    twice the *squared* Hellinger distance (no final sqrt, no 1/2 factor).
    Callers appear to depend on this exact scale — confirm before changing.
    """
    p, q = normalize_both(p, q)
    return sum((math.sqrt(p[i]) - math.sqrt(q[i])) ** 2 for i in range(len(p)))
193
def my_log(x):
    """Return x * log(x), using the continuity convention 0 * log(0) == 0."""
    if x == 0:
        return 0
    return x * math.log(x)
196
def kullback_leibler(p, q):
    """
    Kullback-Leibler divergence KL(p || q) between two discrete
    distributions: ``sum_i p_i * log(p_i / q_i)``.

    Terms with ``p_i == 0`` contribute 0 (the usual continuity
    convention); terms where ``q_i == 0`` while ``p_i > 0`` are skipped
    as well to avoid a division by zero (the result is then a lower
    bound of the true, infinite divergence).
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        # Bug fix: the previous code summed my_log(p[i] - q[i]), which is
        # not the KL divergence and could call log on a negative argument.
        if p[i] > 0 and q[i] > 0:
            dist += p[i] * math.log(float(p[i]) / q[i])
    return dist
203
def cosine(p, q):
    """
    Cosine distance (1 - cosine similarity) between two discrete
    distributions, treated as plain numeric vectors.
    """
    p, q = normalize_both(p, q)
    p, q = [pp for pp in p], [qq for qq in q]
    # Bug fix: the dot product was computed on undefined names ``x`` and
    # ``y`` (a NameError at runtime); it must use ``p`` and ``q``.
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
208
209
class Estimate:
    """
    Reliability estimate. Contains attributes that describe the results of
    reliability estimation.

    .. attribute:: estimate

        A numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Determines whether the method used gives a signed or absolute result.
        Has a value of either :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        An integer ID of reliability estimation method used.

    .. attribute:: method_name

        Name (string) of reliability estimation method used.

    .. attribute:: icv_method

        An integer ID of reliability estimation method that performed best,
        as determined by ICV, and of which estimate is stored in the
        :obj:`estimate` field. (-1 when ICV was not used.)

    .. attribute:: icv_method_name

        Name (string) of reliability estimation method that performed best,
        as determined by ICV. (An empty string when ICV was not used.)

    .. attribute:: text_description

        Textual description of the estimate; :obj:`None` unless filled in
        by :class:`DescriptiveAnalysisClassifier`.
    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method=-1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        self.method_name = METHOD_NAME[method]
        self.icv_method = icv_method
        if icv_method == -1:
            self.icv_method_name = ""
        else:
            self.icv_method_name = METHOD_NAME[icv_method]
        self.text_description = None
252
class DescriptiveAnalysis:
    """
    Learner wrapper that attaches textual descriptions (e.g. "high",
    "medium", "low") to the reliability estimates of the wrapped
    estimator; the borders between descriptions are determined with an
    internal cross validation on the training data.
    """
    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66]):
        # NOTE: the list defaults are shared between instances, but they
        # are only read here, never mutated, so the sharing is harmless.
        self.desc = desc
        self.procentage = procentage
        self.estimator = estimator

    def __call__(self, instances, weight=None, **kwds):
        # Estimate the distribution of estimate values with a cross
        # validation, producing one list of borders per estimate.
        cv_results = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        n_estimates = len(cv_results.results[0].probabilities[0].reliability_estimate)
        for i in xrange(n_estimates):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(cv_results, i)
            sorted_estimates = sorted(abs(x) for x in estimates)
            # NOTE(review): for p == 0.0 the index below is -1, i.e. the
            # *largest* sorted estimate; presumably index 0 was intended.
            # Behaviour kept as-is — confirm before changing.
            borders = [sorted_estimates[int(len(estimates) * p) - 1] for p in self.procentage]
            all_borders.append(borders)

        # Train the wrapped estimator on the whole training set.
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
274
class DescriptiveAnalysisClassifier:
    """
    Classifier companion of :class:`DescriptiveAnalysis`: labels every
    reliability estimate with the description of the highest border its
    value reaches.
    """
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # For each estimate, walk the (ascending) borders and keep the
        # description of the last border the estimate is >= to.
        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            estimate.text_description = self.desc[0]
            for lower_border, text_desc in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    estimate.text_description = text_desc

        # Return the appropriate type of result.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
297
class SensitivityAnalysis:
    """
   
    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats
   
    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`
   
    To estimate the reliability of prediction for given instance,
    the learning set is extended with this instance, labeled with
    :math:`K + \epsilon (l_{max} - l_{min})`,
    where :math:`K` denotes the initial prediction,
    :math:`\epsilon` is sensitivity parameter and :math:`l_{min}` and
    :math:`l_{max}` denote lower and the upper bound of the learning
    instances' labels. After computing different sensitivity predictions
    using different values of :math:`\epsilon`, the prediction are combined
    into SAvar and SAbias. SAbias can be used in a signed or absolute form.

    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
   
   
    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0]):
        # NOTE: the default list is shared between instances but never
        # mutated here.
        self.e = e

    def __call__(self, instances, learner):
        # The label span (min/max) is needed to scale epsilon.
        class_values = [ex.getclass().value for ex in instances]
        return SensitivityAnalysisClassifier(self.e, instances,
                                             min(class_values), max(class_values),
                                             learner)
334
class SensitivityAnalysisClassifier:
    # Computes the SAvar and SAbias estimates: for every epsilon in e, the
    # training data is extended with the predicted instance relabelled to
    # predicted +/- eps * (max - min), a model is re-learned each time, and
    # the perturbed predictions are aggregated.
    def __init__(self, e, instances, min_value, max_value, learner):
        # e: list of epsilon values; min_value/max_value: label bounds of
        # the training data; learner: the wrapped learner to re-train.
        self.e = e
        self.instances = instances
        self.max_value = max_value
        self.min_value = min_value
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        # Work on a copy of the training data so self.instances stays intact.
        r_data = Orange.data.Table(self.instances)

        # Copy of the instance being predicted; its class will be relabelled.
        modified_instance = Orange.data.Instance(instance)

        # Append it to the data; r_data[-1] refers to it below.
        r_data.append(modified_instance)

        # Accumulators for the two estimates.
        SAvar = SAbias = 0

        for eps in self.e:
            # +epsilon: relabel, re-learn, predict.
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # -epsilon: relabel, re-learn, predict.
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # Accumulate this epsilon's contribution to SAvar and SAbias.
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        # Average over all epsilons (SAbias counts two predictions per eps).
        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
380
class BaggingVariance:
    """
   
    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
   
    :math:`m` different bagging models are constructed and used to estimate
    the value of dependent variable for a given instance. In regression,
    the variance of those predictions is used as a prediction reliability
    estimate.

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
    predictions of individual constructed models. Note that a greater value
    implies greater error.

    For classification, 1 minus the average Euclidean distance between class
    probability distributions predicted by the model, and distributions
    predicted by the individual bagged models, is used as the BAGV reliability
    measure. Note that in this case a greater value implies a better
    prediction.
   
    """
    def __init__(self, m=50):
        self.m = m

    def __call__(self, instances, learner):
        # For classification, a model trained on the full data serves as
        # the reference distribution the bagged models are compared with.
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            full_classifier = learner(instances)
        else:
            full_classifier = None

        # Bootstrap: train m models on samples drawn with replacement.
        bagged = []
        for _ in xrange(self.m):
            selection = select_with_repeat(len(instances))
            sample = instances.select(selection)
            bagged.append(learner(sample))

        return BaggingVarianceClassifier(bagged, full_classifier)
424
class BaggingVarianceClassifier:
    """Computes the BAGV estimate from a list of bootstrap models."""
    def __init__(self, classifiers, classifier=None):
        self.classifiers = classifiers
        self.classifier = classifier

    def __call__(self, instance, *args):
        class_type = instance.domain.class_var.var_type

        if class_type == Orange.feature.Descriptor.Continuous:
            # Regression: spread of the bagged point predictions.
            values = [c(instance, Orange.core.GetValue).value
                      for c in self.classifiers if c is not None]
        elif class_type == Orange.feature.Descriptor.Discrete:
            # Classification: distances of the bagged distributions from
            # the full model's predicted distribution.
            reference = self.classifier(instance, Orange.core.GetProbabilities)
            values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), reference)
                      for c in self.classifiers if c is not None]

        mean = sum(values) / len(values)
        BAGV = sum((v - mean) ** 2 for v in values) / len(values)

        # For classification a *larger* value means a better prediction.
        if class_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
446
class LocalCrossValidation:
    """

    :param k: Number of nearest neighbours used in LCV estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification reliability estimation,
        use an average distance between distributions, weighted by :math:`e^{-d}`,
        where :math:`d` is the distance between predicted instance and the
        neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    :math:`k` nearest neighbours to the given instance are found and put in
    a separate data set. On this data set, a leave-one-out validation is
    performed. Reliability estimate for regression is then the distance
    weighted absolute prediction error. In classification, 1 minus the average
    distance between the predicted class probability distribution and the
    (trivial) probability distributions of the nearest neighbour.

    If a special value 0 is passed as :math:`k` (as is by default),
    it is set as 1/20 of data set size (or 5, whichever is greater).

    Summary of the algorithm for regression:

    1. Determine the set of k nearest neighours :math:`N = { (x_1, c_1),...,
       (x_k, c_k)}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        # Neighbour distances are stored in a fresh meta attribute.
        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # Bug fix: resolve the automatic k into a *local* variable instead
        # of overwriting self.k; previously the first data set's size was
        # baked into the estimator, so reusing it on another data set
        # silently kept the stale k.
        k = self.k
        if k == 0:
            k = max(5, len(instances) / 20)

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
501
class LocalCrossValidationClassifier:
    # Performs leave-one-out validation on the k nearest neighbours of the
    # predicted instance and aggregates the (optionally distance-weighted)
    # per-neighbour errors into the LCV estimate.
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        # distance_id: meta attribute holding each neighbour's distance;
        # nearest_neighbours: Orange FindNearest callable; k: neighbourhood
        # size; learner: wrapped learner re-trained per left-out neighbour.
        self.distance_id = distance_id
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        # Extra options passed through keywords: ``distance`` (distribution
        # distance function) and ``distance_weighted`` (bool).
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        LCVer = 0  # weighted sum of leave-one-out errors
        LCVdi = 0  # sum of weights

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            # Train on all neighbours except the i-th.
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute error of the held-out prediction.
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the held-out neighbour's trivial (one-hot) distribution.
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight by e^-d (d = distance to the neighbour, stored in the
            # meta attribute) when distance weighting is requested.
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        # Guard against an empty/degenerate neighbourhood.
        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # For classification a *larger* value means a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
547
class CNeighbours:
    """
   
    :param k: Number of nearest neighbours used in CNK estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function
   
    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`
   
    For regression, CNK is defined for an unlabeled instance as a difference
    between average label of its nearest neighbours and its prediction. CNK
    can be used as a signed or absolute estimate.
   
    :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`
   
    where :math:`k` denotes number of neighbors, C :sub:`i` denotes neighbours'
    labels and :math:`K` denotes the instance's prediction. Note that a greater
    value implies greater prediction error.

    For classification, CNK is equal to 1 minus the average distance between
    predicted class distribution and (trivial) class distributions of the
    $k$ nearest neighbours from the learning set. Note that in this case
    a greater value implies better prediction.
   
    """
    def __init__(self, k=5, distance=hellinger_dist):
        self.k = k
        self.distance = distance

    def __call__(self, instances, learner):
        # Build a nearest-neighbour finder over the training data; neighbour
        # distances are stored in a fresh meta attribute.
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Euclidean()
        meta_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, meta_id)
        return CNeighboursClassifier(neighbours, self.k, distance=self.distance)
588
class CNeighboursClassifier:
    """Computes the CNK estimate from the k nearest neighbours."""
    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        CNK = 0

        # Find the k nearest neighbours of the predicted instance.
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Bug fix: branch on the *predicted* instance's class variable.
        # The previous code read ``ex.domain``, which only worked because
        # the list-comprehension loop variable leaks in Python 2, and
        # raised a NameError when there were no neighbours at all.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Regression: difference between the neighbours' average label
            # and the prediction, in signed and absolute form.
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # Classification: 1 minus the average distance between the
            # predicted distribution and a kNN model's distributions on
            # the neighbours.
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
620
class Mahalanobis:
    """
   
    :param k: Number of nearest neighbours used in Mahalanobis estimate.
    :type k: int
   
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`
   
    Mahalanobis distance reliability estimate is defined as
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    to the evaluated instance's :math:`k` nearest neighbours.

   
    """
    def __init__(self, k=3):
        self.k = k

    def __call__(self, instances, *args):
        # Nearest-neighbour finder using Mahalanobis distance; the
        # distances are stored in a fresh meta attribute.
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Mahalanobis()
        meta_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, meta_id)
        return MahalanobisClassifier(self.k, neighbours, meta_id)
645
class MahalanobisClassifier:
    """Sums the Mahalanobis distances to the k nearest neighbours."""
    def __init__(self, k, nnm, mid):
        self.k = k
        self.nnm = nnm
        self.mid = mid

    def __call__(self, instance, *args):
        # Each neighbour carries its distance in the ``mid`` meta attribute.
        total = 0
        for neighbour in self.nnm(instance, self.k):
            total += neighbour[self.mid].value
        return [ Estimate(total, ABSOLUTE, MAHAL_ABSOLUTE) ]
658
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`
   
    Mahalanobis distance to center reliability estimate is defined as a
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.

   
    """
    def __init__(self):
        pass

    def __call__(self, instances, *args):
        # Continuize the domain so all attributes are numeric: class is
        # ignored, continuous attributes are normalized by span, and
        # multinomial attributes are expanded to N values.
        continuizer = Orange.core.DomainContinuizer()
        continuizer.classTreatment = Orange.core.DomainContinuizer.Ignore
        continuizer.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        continuizer.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = continuizer(instances)
        new_instances = instances.translate(new_domain)

        # Centroid of the continuized data (class column is ignored).
        X, _, _ = new_instances.to_numpy()
        centroid = numpy.average(X, 0)

        distance_constructor = Orange.distance.Mahalanobis()
        distance = distance_constructor(new_instances)

        # The centroid as an instance with an unknown ("?") class value.
        average_instance = Orange.data.Instance(new_instances.domain, list(centroid) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
690
class MahalanobisToCenterClassifier:
    """Measures the Mahalanobis distance of an instance to the data centroid."""
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance
        self.average_instance = average_instance
        self.new_domain = new_domain

    def __call__(self, instance, *args):
        # Translate the instance into the continuized domain first.
        translated = Orange.data.Instance(self.new_domain, instance)
        d = self.distance(translated, self.average_instance)
        return [ Estimate(d, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE) ]
704
705
class BaggingVarianceCNeighbours:
    """
   
    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`
   
    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
   
    BVCK is a combination (average) of Bagging variance and local modeling of
    prediction error.
   
    """
    def __init__(self, bagv=None, cnk=None):
        # Fix of the mutable-default-argument pitfall: the defaults used
        # to be single BaggingVariance()/CNeighbours() objects created at
        # class-definition time and shared by every instance; build fresh
        # defaults per instance instead.
        self.bagv = bagv if bagv is not None else BaggingVariance()
        self.cnk = cnk if cnk is not None else CNeighbours()

    def __call__(self, instances, learner):
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
729
class BaggingVarianceCNeighboursClassifier:
    """Averages the BAGV and absolute CNK estimates into the BVCK estimate."""
    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # NOTE(review): cnk_estimates[1] is the absolute CNK, which only
        # exists for regression (classification CNK returns one estimate),
        # so this estimator appears regression-only — confirm.
        bvck_value = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2

        estimates = [ Estimate(bvck_value, ABSOLUTE, BVCK_ABSOLUTE) ]
        estimates += bagv_estimates
        estimates += cnk_estimates
        return estimates
744
class ErrorPredicting:
    """Learns a random forest that predicts the signed prediction error."""
    def __init__(self):
        pass

    def __call__(self, instances, learner):
        # Collect per-instance prediction errors with a cross validation.
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        prediction_errors = get_prediction_error_list(res)

        # Relabel the data: same attributes, class = prediction error ("pe").
        new_domain = Orange.data.Domain(instances.domain.attributes, Orange.core.FloatVariable("pe"))
        new_dataset = Orange.data.Table(new_domain, instances)
        for instance, prediction_error in izip(new_dataset, prediction_errors):
            instance.set_class(prediction_error)

        # Fit a random forest to the errors.
        rf = Orange.ensemble.forest.RandomForestLearner()
        rf_classifier = rf(new_dataset)

        return ErrorPredictingClassification(rf_classifier, new_domain)
763
class ErrorPredictingClassification:
    """Predicts the signed error of another model for a given instance."""
    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier
        self.new_domain = new_domain

    def __call__(self, instance, predicted, probabilities):
        # Translate into the error-labelled domain, then query the forest.
        translated = Orange.data.Instance(self.new_domain, instance)
        value = self.rf_classifier(translated, Orange.core.GetValue)
        return [Estimate(value.value, SIGNED, SABIAS_SIGNED)]
774
def gauss_kernel(x, sigma=1):
    """Gaussian (normal) kernel with standard deviation ``sigma``."""
    normalization = sigma * math.sqrt(2 * math.pi)
    return math.exp(-0.5 * (x / sigma) ** 2) / normalization
777
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Returns a value that estimates a density of problem space around the
    instance being predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean()):
        self.K = K
        self.d_measure = d_measure

    def __call__(self, instances):
        distance = self.d_measure(instances)
        # Kept for backward compatibility with code that read this attribute.
        self.distance = distance

        def density(x):
            # Average kernel value of the distances to all training
            # instances. Bug fix: the closure binds the *local* distance;
            # previously it read self.distance, so calling the estimator
            # on a second data set silently changed the behaviour of
            # classifiers returned by earlier calls.
            l, dens = len(instances), 0
            for ex in instances:
                dens += self.K(distance(x, ex))
            return dens / l

        max_density = max([density(ex) for ex in instances])

        return ParzenWindowDensityBasedClassifier(density, max_density)
808
class ParzenWindowDensityBasedClassifier:
    """Turns a fitted density function into an absolute reliability
    estimate: the sparser the region around the instance, the larger
    (less reliable) the returned value.
    """

    def __init__(self, density, max_density):
        self.density = density
        self.max_density = max_density

    def __call__(self, instance, *args):
        # Invert the density so that sparse regions score high.
        sparseness = self.max_density - self.density(instance)

        return [Estimate(sparseness, ABSOLUTE, DENS_ABSOLUTE)]
821
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.

    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`

    :param estimators: List of different reliability estimation methods we
                       want to use on the chosen learner.
    :type estimators: :obj:`list` of reliability estimators

    :param name: Name of this reliability learner
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        # A mutable default argument would be evaluated once at class
        # definition time, so every Learner created with the default would
        # share the same list object and the same (possibly stateful)
        # estimator instances; mutating one learner's estimator list would
        # silently affect all others.  Build the default set per instance
        # instead (backward compatible: passing an explicit list behaves
        # exactly as before).
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        self.blending = False


    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.

        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
            (currently unused by this wrapper).
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """

        blending_classifier = None
        new_domain = None

        # NOTE(review): the continuous-class guard below is disabled;
        # re-enable it if classification data must be rejected here.
#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
#            raise Exception("This method only works on data with continuous class.")

        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)

    def internal_cross_validation(self, instances, folds=10):
        """ Perform the internal cross validation for getting the best
        reliability estimate. It uses the reliability estimators defined in
        estimators attribute.

        Returns the id of the method that scored the best.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        res = Orange.evaluation.testing.cross_validation([self], instances, folds=folds)
        results = get_pearson_r(res)
        # Each result tuple carries the method id at index 3 (see
        # internal_cross_validation_testing); the tuple with the highest
        # correlation coefficient sorts last.
        sorted_results = sorted(results)
        return sorted_results[-1][3]

    def internal_cross_validation_testing(self, instances, folds=10):
        """ Perform internal cross validation (as in Automatic selection of
        reliability estimates for individual regression predictions,
        Zoran Bosnic, 2010) and return id of the method
        that scored best on this data.

        :param instances: Data instances to use for ICV.
        :type instances: :class:`Orange.data.Table`
        :param folds: number of folds for ICV.
        :type folds: int
        :rtype: int

        """
        cv_indices = Orange.core.MakeRandomIndicesCV(instances, folds)

        list_of_rs = []

        # Accumulate each method's Pearson r across all the outer folds.
        sum_of_rs = defaultdict(float)

        for fold in xrange(folds):
            data = instances.select(cv_indices, fold)
            # Small folds get leave-one-out so the inner evaluation still
            # has enough test predictions to correlate.
            if len(data) < 10:
                res = Orange.evaluation.testing.leave_one_out([self], data)
            else:
                res = Orange.evaluation.testing.cross_validation([self], data)
            results = get_pearson_r(res)
            for r, _, _, method in results:
                sum_of_rs[method] += r
        sorted_sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
        # Id of the method with the largest summed correlation.
        return sorted_sum_of_rs[0][0]

    labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
926
class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Fit the wrapped learner on the full training data.
        self.classifier = box_learner(instances)

        # Fit one reliability-estimation classifier per estimator.
        self.estimation_classifiers = [estimator(instances, box_learner)
                                       for estimator in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is an instance of
        :class:`~Orange.evaluation.reliability.Estimate`,
        is added to the distribution object.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Regression classifiers may return no distribution; create an
        # empty one so the estimates have somewhere to live.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Collect the estimates from every fitted estimation classifier.
        estimates = probabilities.reliability_estimate
        for estimation_classifier in self.estimation_classifiers:
            estimates.extend(estimation_classifier(instance, predicted, probabilities))

        # Return whichever combination the caller asked for.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
994
995# Functions for testing and plotting
# TODO: Document these functions.
def get_acc_rel(method, data, learner):
    """Cross-validate *learner* wrapped with the single reliability
    estimator *method* and return two parallel lists: the reliability
    estimates and the predicted probability of each actual class."""
    reliability = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels = []
    acc = []
    for result in results.results:
        probs = result.probabilities[0]
        rels.append(probs.reliability_estimate[0].estimate)
        acc.append(probs[result.actual_class])

    return rels, acc
1010
1011def acc_rel_plot(method, data, learner, file_name="acc_rel_plot.png", colors=None):
1012
1013    import matplotlib.pylab as plt
1014
1015    plt.clf()
1016
1017    rels, acc = get_acc_rel(method, data, learner)
1018    print "rels", rels
1019    print "acc", acc
1020
1021    if colors is None:
1022        colors = "k"
1023    plt.scatter(acc, rels, c=colors)
1024    plt.xlim(0.,1.)
1025    plt.ylim(ymin=0.)
1026    plt.savefig(file_name)
1027
def acc_rel_correlation(method, data, learner):
    """Spearman rank correlation between accuracy and the reliability
    estimates produced by *method*."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    correlation = scipy.stats.spearmanr(acc, rels)[0]
    return correlation
# Note: See TracBrowser for help on using the repository browser.