source: orange-reliability/orangecontrib/reliability/__init__.py @ 40:d1cc6a016f2c

Revision 40:d1cc6a016f2c, 44.6 KB checked in by markotoplak, 7 months ago (diff)

Added stubs of ICV and Stacking to the documentation

RevLine 
[0]1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
# All the estimator method constants
# (integer ids used to tag Estimate objects; see METHOD_NAME below)
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14
ERR_ABSOLUTE = 15
STACKING = 101

# Type of estimator constant
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods
# NOTE: ids 11 and 12 ("RF Variance" / "RF Std") have names here but no
# matching constant above.
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
               101: "Stacking" }
[0]40
def get_reliability_estimation_list(res, i):
    """Collect the i-th reliability estimate over all results.

    Returns a triple: (list of estimate values, signed_or_absolute flag,
    method id), the last two taken from the first result.
    """
    values = [r.probabilities[0].reliability_estimate[i].estimate for r in res.results]
    first = res.results[0].probabilities[0].reliability_estimate[i]
    return values, first.signed_or_absolute, first.method
[0]45
def get_prediction_error_list(res):
    """Signed prediction errors (actual minus predicted) for every result."""
    return [r.actual_class - r.classes[0] for r in res.results]
48
def get_description_list(res, i):
    """Textual descriptions of the i-th reliability estimate, one per result."""
    return [r.probabilities[0].reliability_estimate[i].text_description for r in res.results]
51
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of the
    used reliability estimates, together with the p-value of each coefficient.
    """
    errors = get_prediction_error_list(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for i in range(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        try:
            # Signed estimates correlate with the raw error, absolute
            # estimates with its magnitude.
            if signed_or_absolute == SIGNED:
                r, p = statc.pearsonr(errors, estimates)
            else:
                r, p = statc.pearsonr([abs(e) for e in errors], estimates)
        except Exception:
            r = p = float("NaN")
        scores.append((r, p, signed_or_absolute, method))
    return scores
75
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of the
    used reliability estimates, together with the p-value of each coefficient.
    """
    errors = get_prediction_error_list(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for i in range(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        try:
            # Signed estimates correlate with the raw error, absolute
            # estimates with its magnitude.
            if signed_or_absolute == SIGNED:
                r, p = statc.spearmanr(errors, estimates)
            else:
                r, p = statc.spearmanr([abs(e) for e in errors], estimates)
        except Exception:
            r = p = float("NaN")
        scores.append((r, p, signed_or_absolute, method))
    return scores
99
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return average Pearson's coefficient over all folds between prediction error
    and each of the used estimates.
    """
    results_by_fold = Orange.evaluation.scoring.split_by_iterations(res)
    number_of_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    number_of_instances = len(res.results)
    number_of_folds = len(results_by_fold)
    # Per-estimate accumulators: summed r over folds, sign flag, method id.
    results = [0 for _ in xrange(number_of_estimates)]
    sig = [0 for _ in xrange(number_of_estimates)]
    method_list = [0 for _ in xrange(number_of_estimates)]

    # NOTE: this loop rebinds the parameter name `res` to each fold's results.
    for res in results_by_fold:
        prediction_error = get_prediction_error_list(res)
        for i in xrange(number_of_estimates):
            reliability_estimate, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            try:
                if signed_or_absolute == SIGNED:
                    r, _ = statc.pearsonr(prediction_error, reliability_estimate)
                else:
                    r, _ = statc.pearsonr([abs(pe) for pe in prediction_error], reliability_estimate)
            except Exception:
                # Failed correlation (e.g. constant input) poisons the fold
                # average with NaN rather than being skipped.
                r = float("NaN")
            results[i] += r
            sig[i] = signed_or_absolute
            method_list[i] = method

    # Average r over folds, then derive p-values from the averaged r.
    results = [float(res) / number_of_folds for res in results]
    ps = [p_value_from_r(r, number_of_instances) for r in results]

    return zip(results, ps, sig, method_list)
137
def p_value_from_r(r, n):
    """
    Calculate the two-tailed p-value for a Pearson coefficient ``r``
    computed from a sample of size ``n``.
    """
    df = n - 2
    # Tiny epsilons guard the denominator when r is exactly +/-1.
    denom = (1.0 - r + 1e-30) * (1.0 + r + 1e-30)
    t = r * (df / denom) ** 0.5
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))
145
[5]146
147# Distances between two discrete probability distributions
148#TODO Document those.
def normalize_both(p, q):
    """Normalize both discrete distributions (in place) and return them."""
    for dist in (p, q):
        if not dist.normalized:
            dist.normalize()
    return p, q
155
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order ``m`` between two discrete distributions."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)
162
def manhattan_distance(p, q):
    """Manhattan (L1) distance: Minkowski distance of order 1."""
    return minkowsky_dist(p, q, 1)
165
def euclidean_dist(p, q):
    """Euclidean (L2) distance: Minkowski distance of order 2."""
    return minkowsky_dist(p, q, 2)
168
def variance_dist(p, q):
    """Squared Euclidean distance between two discrete distributions."""
    d = euclidean_dist(p, q)
    return d * d
171
def max_dist(p, q):
    """Chebyshev (maximum) distance between two discrete distributions."""
    p, q = normalize_both(p, q)
    return max(abs(p[i] - q[i]) for i in range(len(p)))
175
def hellinger_dist(p, q):
    """Hellinger-style distance between two discrete distributions.

    NOTE(review): this returns sum((sqrt(p_i) - sqrt(q_i))**2) without the
    conventional final sqrt / 1/sqrt(2) normalization; callers in this
    module rely on this squared form.
    """
    p, q = normalize_both(p, q)
    return sum((math.sqrt(p[i]) - math.sqrt(q[i])) ** 2 for i in range(len(p)))
182
def my_log(x):
    """Return x * log(x), using the conventional limit value 0 at x == 0."""
    if x == 0:
        return 0
    return x * math.log(x)
185
def kullback_leibler(p, q):
    """Kullback-Leibler divergence KL(p || q) between two discrete distributions.

    BUGFIX: the previous implementation accumulated ``my_log(p[i] - q[i])``,
    i.e. (p-q)*log(p-q), which is not the KL divergence and raises
    ValueError whenever p[i] < q[i]. This computes the standard
    sum_i p_i * log(p_i / q_i), with 0*log(0) treated as 0 and an infinite
    divergence when q has a zero where p does not.
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        if p[i] > 0:
            if q[i] == 0:
                # p puts mass where q has none: divergence is infinite.
                return float("inf")
            dist += p[i] * math.log(p[i] / q[i])
    return dist
192
def cosine(p, q):
    """Cosine distance (1 - cosine similarity) between two discrete distributions.

    BUGFIX: the previous version called ``numpy.dot(x, y)`` with undefined
    names ``x`` and ``y`` (NameError on every call); the dot product is
    taken over ``p`` and ``q``.
    """
    p, q = normalize_both(p, q)
    # Materialize as plain lists so numpy can consume the distributions.
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
197
198
class Estimate:
    """
    A single reliability estimate and its metadata.

    .. attribute:: estimate

        The numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Whether the estimation method yields a signed or an absolute
        result; one of :obj:`SIGNED` or :obj:`ABSOLUTE`.

    .. attribute:: method

        Integer ID of the reliability estimation method used.

    .. attribute:: method_name

        Human-readable name of the estimation method (from METHOD_NAME).

    """
    def __init__(self, estimate, signed_or_absolute, method):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        self.method_name = METHOD_NAME[method]
        # Filled in later by DescriptiveAnalysisClassifier, if used.
        self.text_description = None
228
class DescriptiveAnalysis:
    """Wrap a reliability estimator so its numeric estimates also get
    textual labels (``desc``), with label borders derived from the
    estimate distribution observed in a cross validation."""
    def __init__(self, estimator, desc=["high", "medium", "low"], procentage=[0.00, 0.33, 0.66], name="da"):
        self.desc = desc
        self.procentage = procentage
        self.estimator = estimator
        self.name = name

    def __call__(self, instances, weight=None, **kwds):
        # Derive the label borders from a cross validation on the data.
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
        for i in range(n_estimates):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            ordered = sorted(abs(e) for e in estimates)
            # For each percentage share, take the value at that quantile.
            all_borders.append([ordered[int(len(ordered) * share) - 1] for share in self.procentage])

        # Train the wrapped estimator on the full data.
        return DescriptiveAnalysisClassifier(self.estimator(instances), all_borders, self.desc)
251
class DescriptiveAnalysisClassifier:
    """Attaches a textual description to each reliability estimate of a
    prediction, based on precomputed quantile borders."""
    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # Label each estimate with the last description whose lower border
        # its value reaches (borders and desc are aligned).
        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            estimate.text_description = self.desc[0]
            for lower, label in zip(borders, self.desc):
                if estimate.estimate >= lower:
                    estimate.text_description = label

        # Return the appropriate type of result
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
274
class SensitivityAnalysis:
    """

    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats

    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`

    To estimate the reliability of a prediction, the learning set is
    extended with the given instance labeled
    :math:`K + \epsilon (l_{max} - l_{min})`,
    where :math:`K` is the initial prediction, :math:`\epsilon` the
    sensitivity parameter, and :math:`l_{min}`, :math:`l_{max}` the lower
    and upper bound of the learning instances' labels. The sensitivity
    predictions obtained for the different :math:`\epsilon` values are
    combined into SAvar and SAbias; SAbias can be used signed or absolute.

    :math:`SAvar = \\frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \\frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`


    """
    def __init__(self, e=[0.01, 0.1, 0.5, 1.0, 2.0], name="sa"):
        self.e = e
        self.name = name

    def __call__(self, instances, learner):
        # Find the range of the class labels in the training data.
        labels = [ex.getclass().value for ex in instances]
        lo = hi = labels[0]
        for value in labels:
            if value > hi:
                hi = value
            if value < lo:
                lo = value
        return SensitivityAnalysisClassifier(self.e, instances, lo, hi, learner)
312
class SensitivityAnalysisClassifier:
    # Computes the SAvar / SAbias sensitivity-analysis estimates for a
    # single instance by retraining the learner with perturbed labels.
    def __init__(self, e, instances, min_value, max_value, learner):
        self.e = e                    # epsilon values to try
        self.instances = instances    # original training data
        self.max_value = max_value    # upper bound of training labels
        self.min_value = min_value    # lower bound of training labels
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        """Return [SAvar (absolute), SAbias (signed), |SAbias| (absolute)]."""
        # Create new dataset
        r_data = Orange.data.Table(self.instances)

        # Create new instance
        modified_instance = Orange.data.Instance(instance)

        # Append it to the data
        r_data.append(modified_instance)

        # Calculate SAvar & SAbias
        SAvar = SAbias = 0

        for eps in self.e:
            # +epsilon: relabel the appended instance above the prediction
            # (scaled by the label range) and retrain.
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # -epsilon: same perturbation below the prediction.
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # calculate part SAvar and SAbias
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
358
[10]359
360
class ReferenceExpectedError:
    """

    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`

    Reference reliability estimation method for classification, as used in
    Evaluating Reliability of Single Classifications of Neural Networks,
    Darko Pevec, 2011.

    :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`

    where :math:`\hat y` is the estimated probability of the predicted class.

    Note that for this method, unlike all the others, a greater estimate
    means a lower reliability (a greater expected error).

    """
    def __init__(self, name="reference"):
        self.name = name

    def __call__(self, instances, learner):
        return ReferenceExpectedErrorClassifier(learner(instances))
383
384   
class ReferenceExpectedErrorClassifier:
    """Computes the reference expected error 2*y*(1-y) for the predicted class."""

    def __init__(self, classifier):
        self.classifier = classifier

    def __call__(self, instance, *args):
        # Probability of the most likely (i.e. predicted) class.
        probabilities = self.classifier(instance, Orange.classification.Classifier.GetProbabilities)
        y_hat = max(probabilities)
        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
393
394
class BaggingVariance:
    """
   
    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
   
    :math:`m` different bagging models are constructed and used to estimate
    the value of dependent variable for a given instance. In regression,
    the variance of those predictions is used as a prediction reliability
    estimate.

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
    predictions of individual constructed models. Note that a greater value
    implies greater error.

    For classification, 1 minus the average Euclidean distance between class
    probability distributions predicted by the model, and distributions
    predicted by the individual bagged models, is used as the BAGV reliability
    measure. Note that in this case a greater value implies a better
    prediction.
   
    This reliability measure can run out of memory fast if individual classifiers
    use a lot of memory, as it build m of them, thereby using :math:`m` times memory
    for a single classifier. If instances for measuring predictions
    are given as a parameter, this class can only compute their reliability,
    which saves memory.

    """
    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
        """
        :param for_instances: if given, the bagged predictions for exactly
            these instances are precomputed during training and the bagged
            models are discarded immediately, saving memory.
        """
        self.m = m
        self.name = name
        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
        self.for_instances = for_instances

    def __call__(self, instances, learner):
        classifiers = []

        # For classification, a model trained on the full data serves as the
        # reference distribution against which bagged models are compared.
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        for_inst_class = defaultdict(list)
        this_iteration = None
       
        if self.for_instances:
            # Hashable keys for the precomputed-prediction cache.
            his = map(_hashable_instance, self.for_instances)

        # Create bagged classifiers using sampling with replacement
        for i in xrange(self.m):
            # Tracks keys already predicted this round, so duplicate
            # instances contribute only one prediction per bagged model.
            this_iteration = set()
            selection = self.select_with_repeat(len(instances))
            data = instances.select(selection)
            cl = learner(data)
            if cl:
                if self.for_instances: # predict reliability for testing instances and throw cl away
                    for instance, hi in zip(self.for_instances, his):
                        if hi not in this_iteration:
                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
                            this_iteration.add(hi)
                else:
                    classifiers.append(cl)

        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))
[0]467
class BaggingVarianceClassifier:
    """Computes the BAGV estimate for single instances from bagged predictions."""
    def __init__(self, classifiers, classifier=None, for_inst_class=None):
        self.classifiers = classifiers          # bagged models (empty when predictions were precomputed)
        self.classifier = classifier            # reference model (classification only)
        self.for_inst_class = for_inst_class    # precomputed bagged predictions keyed by _hashable_instance

    def __call__(self, instance, *args):
        """Return a one-element list with the BAGV estimate for ``instance``."""
        # Use precomputed per-instance predictions when available; otherwise
        # query every stored bagged model.
        # (Removed a dead `BAGV = 0` initialization that was immediately
        # overwritten below.)
        if self.for_inst_class:
            bagged_values = self.for_inst_class[_hashable_instance(instance)]
        else:
            bagged_values = [_bagged_value(instance, c, self.classifier) for c in self.classifiers]

        k = sum(bagged_values) / len(bagged_values)

        # Variance of the bagged predictions around their mean.
        BAGV = sum((bagged_value - k) ** 2 for bagged_value in bagged_values) / len(bagged_values)
        # For classification a greater value means a better prediction, so invert.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
490
[37]491def _hashable_instance(instance):
492    return tuple(instance[i].value for i in range(len(instance.domain.attributes)))
493
def _bagged_value(instance, c, classifier):
    """Prediction of bagged model ``c`` on ``instance``: the raw value for
    regression, or the Euclidean distance between the bagged model's class
    distribution and the reference ``classifier``'s for classification."""
    var_type = instance.domain.class_var.var_type
    if var_type == Orange.feature.Descriptor.Continuous:
        return c(instance, Orange.core.GetValue).value
    elif var_type == Orange.feature.Descriptor.Discrete:
        reference = classifier(instance, Orange.core.GetProbabilities)
        return euclidean_dist(c(instance, Orange.core.GetProbabilities), reference)
500
501
class LocalCrossValidation:
    """

    :param k: Number of nearest neighbours used in LCV estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification reliability estimation,
        use an average distance between distributions, weighted by :math:`e^{-d}`,
        where :math:`d` is the distance between predicted instance and the
        neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    :math:`k` nearest neighbours to the given instance are found and put in
    a separate data set. On this data set, a leave-one-out validation is
    performed. Reliability estimate for regression is then the distance
    weighted absolute prediction error. In classification, 1 minus the average
    distance between the predicted class probability distribution and the
    (trivial) probability distributions of the nearest neighbour.

    If a special value 0 is passed as :math:`k` (as is by default),
    it is set as 1/20 of data set size (or 5, whichever is greater).

    Summary of the algorithm for regression:

    1. Determine the set of k nearest neighours :math:`N = { (x_1, c_1),...,
       (x_k, c_k)}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # BUGFIX: use a local k instead of overwriting self.k; the old code
        # made the learner stateful, so a k derived from the first data set's
        # size leaked into later calls on different data sets.
        k = self.k if self.k != 0 else max(5, len(instances) / 20)

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
[0]557
class LocalCrossValidationClassifier:
    # Performs leave-one-out over the k nearest neighbours of a given
    # instance and aggregates the (distance-weighted) errors into LCV.
    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        self.distance_id = distance_id              # meta id holding neighbour distance
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        # `distance` and `distance_weighted` arrive through kwds.
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        """Return a one-element list with the LCV estimate for the instance."""
        LCVer = 0   # weighted error sum
        LCVdi = 0   # weight sum

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute error on the held-out neighbour.
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the neighbour's trivial (0/1) class distribution.
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight each error by e^-d, d = distance to the neighbour.
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # For classification a greater value should mean a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
603
class CNeighbours:
    """

    :param k: number of nearest neighbours used in the CNK estimate
    :type k: int

    :param distance: function computing a distance between two discrete
        distributions (used only in classification problems); the default
        is the Hellinger distance.
    :type distance: function

    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`

    For regression, CNK is the difference between the average label of an
    instance's nearest neighbours and its prediction; it can be used as a
    signed or an absolute estimate:

    :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`

    where :math:`k` is the number of neighbours, C :sub:`i` are the
    neighbours' labels and :math:`K` is the instance's prediction. A
    greater value implies a greater prediction error.

    For classification, CNK equals 1 minus the average distance between the
    predicted class distribution and the (trivial) class distributions of
    the :math:`k` nearest neighbours from the learning set; here a greater
    value implies a better prediction.

    """
    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Euclidean()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, meta_id)
        return CNeighboursClassifier(neighbours, self.k, distance=self.distance)
[0]645
class CNeighboursClassifier:
    """Computes the CNK estimate(s) for single instances."""
    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        """Return [CNK signed, CNK absolute] for regression or
        [CNK absolute] for classification."""
        CNK = 0

        # Find k nearest neighbors
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # BUGFIX: branch on the class variable of the predicted instance.
        # The original read `ex.domain...` before any loop, relying on the
        # Python 2 list-comprehension variable leak; it raised NameError
        # whenever knn was empty and breaks outright on Python 3.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Average neighbour label minus the prediction.
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # 1 minus the average distance between the predicted distribution
            # and a kNN model's distributions over the neighbours.
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
[0]677
class Mahalanobis:
    """

    :param k: number of nearest neighbours used in the Mahalanobis estimate.
    :type k: int

    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`

    The Mahalanobis reliability estimate is the sum of
    `mahalanobis distances <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    from the evaluated instance to its :math:`k` nearest neighbours.


    """
    def __init__(self, k=3, name="mahalanobis"):
        self.k = k
        self.name = name

    def __call__(self, instances, *args):
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Mahalanobis()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        return MahalanobisClassifier(self.k, finder(instances, 0, meta_id), meta_id)
703
class MahalanobisClassifier:
    """Sums Mahalanobis distances to the k nearest neighbours of an instance."""
    def __init__(self, k, nnm, mid):
        self.k = k      # number of neighbours
        self.nnm = nnm  # nearest-neighbour finder
        self.mid = mid  # meta attribute id holding the distance

    def __call__(self, instance, *args):
        neighbours = self.nnm(instance, self.k)
        total = sum(ex[self.mid].value for ex in neighbours)
        return [Estimate(total, ABSOLUTE, MAHAL_ABSOLUTE)]
716
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`
   
    Mahalanobis distance to center reliability estimate is defined as a
    `mahalanobis distance <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_
    between the predicted instance and the centroid of the data.

   
    """
    def __init__(self, name="mahalanobis to center"):
        self.name = name

    def __call__(self, instances, *args):
        # Continuize the domain: ignore the class, normalize continuous
        # features by their span, expand multinomial features to N values.
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = dc(instances)
        new_instances = instances.translate(new_domain)

        # Centroid of the continuized data.
        X, _, _ = new_instances.to_numpy()
        instance_avg = numpy.average(X, 0)

        distance_constructor = Orange.distance.Mahalanobis()
        distance = distance_constructor(new_instances)

        # Centroid as an instance with an unknown ("?") class value.
        average_instance = Orange.data.Instance(new_instances.domain, list(instance_avg) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
748
class MahalanobisToCenterClassifier:
    """Measures the Mahalanobis distance from an instance to the data centroid."""
    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance                  # Mahalanobis distance functor
        self.average_instance = average_instance  # centroid of the training data
        self.new_domain = new_domain              # continuized domain

    def __call__(self, instance, *args):
        # Translate the instance into the continuized domain before measuring.
        translated = Orange.data.Instance(self.new_domain, instance)
        d = self.distance(translated, self.average_instance)
        return [Estimate(d, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE)]
762
763
class BaggingVarianceCNeighbours:
    """
   
    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`
   
    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
   
    BVCK is a combination (average) of Bagging variance and local modeling of
    prediction error.
   
    """
    def __init__(self, bagv=None, cnk=None, name="bvck"):
        if bagv is None:
            bagv = BaggingVariance()
        if cnk is None:
            cnk = CNeighbours()
        self.bagv = bagv
        self.cnk = cnk
        # BUGFIX: the name parameter was previously ignored
        # (self.name was hard-coded to "bvck").
        self.name = name

    def __call__(self, instances, learner):
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
792
class BaggingVarianceCNeighboursClassifier:
    """Averages BAGV's absolute estimate with CNK's absolute estimate and
    returns the combined estimate followed by the individual ones."""

    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        bagv = self.bagv_classifier(instance, predicted, probabilities)
        cnk = self.cnk_classifier(instance, predicted, probabilities)

        # cnk[1] is CNK's absolute estimate (cnk[0] is the signed one).
        combined = (bagv[0].estimate + cnk[1].estimate) / 2

        estimates = [Estimate(combined, ABSOLUTE, BVCK_ABSOLUTE)]
        estimates.extend(bagv)
        estimates.extend(cnk)
        return estimates
807
class ErrorPredicting:
    """Learns a random forest that predicts the signed prediction error of
    the wrapped learner; errors are obtained by cross-validation on the
    training data."""

    def __init__(self, name="ep"):
        self.name = name

    def __call__(self, instances, learner):
        # Signed errors (actual - predicted) from cross-validating the learner.
        res = Orange.evaluation.testing.cross_validation([learner], instances)
        errors = get_prediction_error_list(res)

        # Same attributes, but the class becomes the continuous error "pe".
        error_domain = Orange.data.Domain(instances.domain.attributes,
                                          Orange.core.FloatVariable("pe"))
        error_data = Orange.data.Table(error_domain, instances)
        for inst, err in izip(error_data, errors):
            inst.set_class(err)

        forest = Orange.ensemble.forest.RandomForestLearner()(error_data)
        return ErrorPredictingClassification(forest, error_domain)
826
class ErrorPredictingClassification:
    """Predicts the expected signed error of a new instance using the
    random forest built by :class:`ErrorPredicting`."""

    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier
        self.new_domain = new_domain

    def __call__(self, instance, predicted, probabilities):
        converted = Orange.data.Instance(self.new_domain, instance)
        prediction = self.rf_classifier(converted, Orange.core.GetValue)
        # NOTE(review): the estimate is tagged SABIAS_SIGNED rather than a
        # dedicated error-prediction method id -- confirm this is intentional.
        return [Estimate(prediction.value, SIGNED, SABIAS_SIGNED)]
837
def gauss_kernel(x, sigma=1):
    """Gaussian (normal) kernel with standard deviation ``sigma``,
    evaluated at ``x``."""
    normalization = 1. / (sigma * math.sqrt(2 * math.pi))
    return normalization * math.exp(-(x / sigma) ** 2 / 2.)
840
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
        Default: Euclidean.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Returns a value that estimates a density of problem space around the
    instance being predicted.
    """
    def __init__(self, K=gauss_kernel, d_measure=None, name="density"):
        # Bug fix: the Euclidean distance constructor used to be created
        # once at import time (in the default-argument expression) and
        # shared by every ParzenWindowDensityBased instance; create one
        # per instance instead.  Passing d_measure explicitly behaves as
        # before.
        if d_measure is None:
            d_measure = Orange.distance.Euclidean()
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances, learner):

        # Bind the fitted distance locally so the density closure keeps
        # using it even if this estimator is later re-trained; the
        # attribute is kept for backward compatibility.
        distance = self.d_measure(instances)
        self.distance = distance

        def density(x):
            # Parzen-window density: mean kernel value of the distances
            # from x to every training instance.
            total = 0
            for ex in instances:
                total += self.K(distance(x, ex))
            return total / len(instances)

        # Densest point of the training data; used later to convert a
        # density into a reliability estimate.
        max_density = max(density(ex) for ex in instances)

        return ParzenWindowDensityBasedClassifier(density, max_density)
872
class ParzenWindowDensityBasedClassifier:
    """Turns a fitted density function into a reliability estimate: the
    sparser the neighbourhood of the instance, the higher (worse) the
    estimate."""

    def __init__(self, density, max_density):
        self.density = density          # callable: instance -> density
        self.max_density = max_density  # highest density on training data

    def __call__(self, instance, *args):
        # Distance from the maximal training density; larger value means
        # a sparser region of the problem space.
        estimate = self.max_density - self.density(instance)
        return [Estimate(estimate, ABSOLUTE, DENS_ABSOLUTE)]
885
class Stacking:
    """Stacking reliability estimation.

    Reliability estimates of the ``estimators`` are computed with internal
    cross-validation and used as features for ``stack_learner``, which is
    trained to predict the absolute prediction error.

    :param stack_learner: learner trained on the reliability estimates.
    :param estimators: reliability estimators whose outputs become the
        features of the stacking model; defaults to SA, LCV, BVCK,
        Mahalanobis and Mahalanobis-to-center.
    :param folds: number of internal cross-validation folds; ``folds <= 1``
        uses a single half/half split instead.
    :param save_data: if True, the internal table of cross-validated
        estimates is kept in ``self.classifier_data``.
    """

    def __init__(self, stack_learner, estimators=None, folds=10, save_data=False):
        self.stack_learner = stack_learner
        self.estimators = estimators
        self.folds = folds
        self.save_data = save_data
        if self.estimators is None:
             self.estimators = [SensitivityAnalysis(),
                           LocalCrossValidation(),
                           BaggingVarianceCNeighbours(),
                           Mahalanobis(),
                           MahalanobisToCenter()]
   
    def __call__(self, data, learner):

        newfeatures = None
       
        if self.folds > 1:

            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
            data_cv = [ None ] * len(data)
            for f in set(cvi): #for each fold
                learn = data.select(cvi, f, negate=True)
                test = data.select(cvi, f)

                #learn reliability estimates for the learning set
                lf = Learner(learner, estimators=self.estimators)(learn)
               
                #pos is used to retain the order of instances
                for ex, pos in zip(test, [ i for i,n in enumerate(cvi) if n == f ]):
                    pred = lf(ex, Orange.core.GetBoth)
                    re = pred[1].reliability_estimate
                    names = [ e.method_name for e in re ]
                    # all instances must yield estimates in the same order
                    assert newfeatures is None or names == newfeatures
                    newfeatures = names
                    estimates = [ abs(e.estimate) for e in re ]
                    error = ex[-1].value - pred[0].value
                    data_cv[pos] = estimates + [ abs(error) ]

        else:
 
            #use half of the data to learn reliability estimates
            #and the other half for induction of a stacking classifier
            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
            data_cv = []

            learn = data.select(cvi, 0, negate=True)
            test = data.select(cvi, 0)

            #learn reliability estimates for the learning set
            lf = Learner(learner, estimators=self.estimators)(learn)
           
            for ex in test:
                pred = lf(ex, Orange.core.GetBoth)
                re = pred[1].reliability_estimate
                names = [ e.method_name for e in re ]
                assert newfeatures is None or names == newfeatures
                newfeatures = names
                estimates = [ abs(e.estimate) for e in re ]
                error = ex[-1].value - pred[0].value
                data_cv.append(estimates + [ abs(error) ])

            # Bug fix: removed a leftover debug print of len(data_cv).

        #induce the classifier on cross-validated reliability estimates
        newfeatures = [ Orange.feature.Continuous(name=n) for n in newfeatures ]
        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
        classifier_data = Orange.data.Table(newdomain, data_cv)
        stack_classifier = self.stack_learner(classifier_data)

        #induce reliability estimates on the whole data set
        lf = Learner(learner, estimators=self.estimators)(data)

        if self.save_data:
            self.classifier_data = classifier_data

        return StackingClassifier(stack_classifier, lf, newdomain)
966
967
class StackingClassifier:
    """Predicts reliability as the stacked model's estimate of the
    absolute prediction error, computed from the reliability estimates
    of the underlying methods."""

    def __init__(self, stacking_classifier, reliability_classifier, domain):
        # Bug fix: removed a leftover debug print of the stacking
        # classifier that ran on every construction.
        self.stacking_classifier = stacking_classifier
        self.domain = domain
        self.reliability_classifier = reliability_classifier

    def convert(self, instance):
        """ Return example in the space of reliability estimates. """
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        #take absolute values for all
        tex = [ abs(e.estimate) for e in re ] + [ "?" ]
        tex = Orange.data.Instance(self.domain, tex)
        return tex

    def __call__(self, instance, *args):
        tex = self.convert(instance)
        r = self.stacking_classifier(tex)
        # Clamp the predicted error at zero: a negative absolute error
        # makes no sense as a reliability estimate.
        r = max(0., float(r))
        return [ Estimate(r, ABSOLUTE, STACKING) ]
990
class ICV:
    """ Perform internal cross validation (as in Automatic selection of
    reliability estimates for individual regression predictions,
    Zoran Bosnic, 2010) and return id of the method
    that scored best on this data.
    """
 
    def __init__(self, estimators=None, folds=10):
        # Candidate reliability estimators to choose among; defaults to
        # SA, LCV, BVCK, Mahalanobis and Mahalanobis-to-center.
        self.estimators = estimators
        if self.estimators is None:
             self.estimators = [SensitivityAnalysis(),
                           LocalCrossValidation(),
                           BaggingVarianceCNeighbours(),
                           Mahalanobis(),
                           MahalanobisToCenter()]
        # Number of internal cross-validation folds.
        self.folds = folds
   
    def __call__(self, data, learner):

        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
        # Per (method, signed_or_absolute) pair: sum of the per-fold
        # Pearson correlations and the number of folds contributing.
        sum_of_rs = defaultdict(float)
        n_rs = defaultdict(int)

        elearner = Learner(learner, estimators=self.estimators)

        #average correlations from each fold
        for f in set(cvi):
            learn = data.select(cvi, f, negate=True)
            test = data.select(cvi, f)

            # Correlate each estimator's reliability estimates with the
            # actual prediction errors on this fold's test set.
            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
            results = get_pearson_r(res)
    
            for r, p, sa, method in results:
                if not math.isnan(r): #ignore NaN values
                    sum_of_rs[(method, sa)] += r
                    n_rs[(method, sa)] += 1 

        # Mean correlation per method; folds that produced NaN are
        # excluded from both the sum and the count.
        avg_rs = [ (k,(sum_of_rs[k]/n_rs[k])) for k in sum_of_rs ]

        # Select the method whose estimates correlate best with the
        # actual prediction errors.
        avg_rs = sorted(avg_rs, key=lambda estimate: estimate[1], reverse=True)
        chosen = avg_rs[0][0]

        # Train the chosen configuration on the full data set.
        lf = elearner(data)
        return ICVClassifier(chosen, lf)
1036
1037
class ICVClassifier:
    """Returns the reliability estimate of the single method that
    internal cross-validation (:class:`ICV`) selected as best.

    ``chosen`` is a ``(method id, signed_or_absolute)`` pair.
    """

    def __init__(self, chosen, reliability_classifier):
        self.chosen = chosen
        self.reliability_classifier = reliability_classifier

    def __call__(self, instance, *args):
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        r = None
        for e in re:
            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
                r = e.estimate
        # Bug fix: previously a missing estimate for the chosen method
        # raised an obscure UnboundLocalError; fail with a clear message.
        if r is None:
            raise ValueError("no reliability estimate found for the chosen method %r" % (self.chosen,))

        return [ Estimate(r, self.chosen[1], ICV_METHOD) ]
1051
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.
   
    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`
   
    :param estimators: List of different reliability estimation methods we
                       want to use on the chosen learner. Defaults to SA,
                       LCV, BVCK, Mahalanobis and Mahalanobis-to-center.
    :type estimators: :obj:`list` of reliability estimators
   
    :param name: Name of this reliability learner
    :type name: string
   
    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        # Blending is currently never activated through this entry point.
        self.blending = False

    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.
       
        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """
        # No blending classifier or blending domain is produced here;
        # the Classifier receives None for both.
        return Classifier(instances, self.box_learner, self.estimators,
                          self.blending, None, None)
[37]1107 
class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) contain an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.

    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Fit the wrapped learner on the original data.
        self.classifier = box_learner(instances)

        # Fit every reliability estimator on the same data and learner.
        self.estimation_classifiers = [est(instances, box_learner)
                                       for est in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify and estimate reliability of estimation for a new instance.
        When :obj:`result_type` is set to
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`,
        an additional attribute :obj:`reliability_estimate`,
        which is an instance of
        :class:`~Orange.evaluation.reliability.Estimate`,
        is added to the distribution object.
       
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`
       
        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Regression classifiers may return no distribution; create an
        # empty one so the estimates have somewhere to live.
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Collect the estimates of every trained estimation classifier.
        for estimation_classifier in self.estimation_classifiers:
            probabilities.reliability_estimate.extend(
                estimation_classifier(instance, predicted, probabilities))

        # Shape the result according to the requested type.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
[5]1175
1176# Functions for testing and plotting
1177#TODO Document those.
def get_acc_rel(method, data, learner):
    """Cross-validate ``learner`` wrapped with the single reliability
    estimator ``method`` and return two parallel lists: the reliability
    estimates and the predicted probabilities of the actual class."""
    reliability = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels, acc = [], []
    for res in results.results:
        probs = res.probabilities[0]
        rels.append(probs.reliability_estimate[0].estimate)
        acc.append(probs[res.actual_class])

    return rels, acc
1191
[12]1192
def rel_acc_plot(rels, acc, file_name=None, colors=None):
    """Scatter-plot accuracy against reliability; show the figure
    interactively, or save it when ``file_name`` is given."""
    import matplotlib.pylab as plt

    plt.scatter(rels, acc, c="k" if colors is None else colors)
    plt.xlim(0., 1.)
    plt.ylim(ymin=0.)
    plt.xlabel("Reliability")
    plt.ylabel("Accuracy")
    if file_name is not None:
        plt.savefig(file_name)
    else:
        plt.show()
1208
def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
    """Compute reliability/accuracy pairs for ``method`` on ``data`` with
    ``learner`` and plot them via :func:`rel_acc_plot`.

    Bug fixes: ``plt`` was used without being imported in this function's
    scope (the import in ``rel_acc_plot`` is function-local), the helper
    name was misspelled (``el_acc_plot``), and the rels/acc arguments
    were passed in swapped order.
    """
    import matplotlib.pylab as plt

    plt.clf()

    rels, acc = get_acc_rel(method, data, learner)
    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)
[5]1216
def acc_rel_correlation(method, data, learner):
    """Spearman rank correlation between accuracies and reliability
    estimates obtained by cross-validation (see :func:`get_acc_rel`)."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    rho = scipy.stats.spearmanr(acc, rels)[0]
    return rho
Note: See TracBrowser for help on using the repository browser.