source: orange-reliability/orangecontrib/reliability/__init__.py @ 37:11955f57b3ae

Revision 37:11955f57b3ae, 44.3 KB checked in by markotoplak, 7 months ago (diff)

Rewrote ICV, added Stacking, and a memory efficient option to compute BAGV.

RevLine 
[0]1import Orange
2
3import random
4from Orange import statc
5import math
6import warnings
7import numpy
8
9from collections import defaultdict
10from itertools import izip
11
# All the estimator method constants.
# Each reliability estimation method is identified by a small integer ID;
# these IDs are the keys of METHOD_NAME below and are stored in
# Estimate.method.
SAVAR_ABSOLUTE = 0
SABIAS_SIGNED = 1
SABIAS_ABSOLUTE = 2
BAGV_ABSOLUTE = 3
CNK_SIGNED = 4
CNK_ABSOLUTE = 5
LCV_ABSOLUTE = 6
BVCK_ABSOLUTE = 7
MAHAL_ABSOLUTE = 8
BLENDING_ABSOLUTE = 9
ICV_METHOD = 10
# NOTE: IDs 11 and 12 ("RF Variance"/"RF Std") have names below but no
# constant here — presumably defined elsewhere in the package; verify.
MAHAL_TO_CENTER_ABSOLUTE = 13
DENS_ABSOLUTE = 14
ERR_ABSOLUTE = 15
STACKING = 101

# Type of estimator constant: whether an estimate carries a sign
# (direction of the error) or only a magnitude.
SIGNED = 0
ABSOLUTE = 1

# Names of all the estimator methods, keyed by method ID.
METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
               3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
               9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
               101: "Stacking" }
[0]40
def get_reliability_estimation_list(res, i):
    """Collect the i-th reliability estimate of every result.

    Returns a triple: the list of estimate values across all results,
    plus the signed_or_absolute flag and method ID taken from the first
    result's i-th estimate.
    """
    estimates = []
    for result in res.results:
        estimates.append(result.probabilities[0].reliability_estimate[i].estimate)
    first = res.results[0].probabilities[0].reliability_estimate[i]
    return estimates, first.signed_or_absolute, first.method
[0]45
def get_prediction_error_list(res):
    """Return the signed prediction error (actual - predicted) of every result."""
    errors = []
    for result in res.results:
        errors.append(result.actual_class - result.classes[0])
    return errors
48
def get_description_list(res, i):
    """Return the textual description of the i-th reliability estimate
    for every result."""
    descriptions = []
    for result in res.results:
        descriptions.append(result.probabilities[0].reliability_estimate[i].text_description)
    return descriptions
51
def get_pearson_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Pearson's coefficient between the prediction error and each of
    the used reliability estimates, together with the p-value of each
    coefficient.
    """
    prediction_error = get_prediction_error_list(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for i in xrange(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        # Signed estimates are correlated with the signed error, absolute
        # ones with the magnitude of the error.
        errors = prediction_error if signed_or_absolute == SIGNED \
            else [abs(pe) for pe in prediction_error]
        try:
            r, p = statc.pearsonr(errors, estimates)
        except Exception:
            # statc raises on degenerate input (e.g. constant series).
            r = p = float("NaN")
        scores.append((r, p, signed_or_absolute, method))
    return scores
75
def get_spearman_r(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return Spearman's coefficient between the prediction error and each of
    the used reliability estimates, together with the p-value of each
    coefficient.
    """
    prediction_error = get_prediction_error_list(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    scores = []
    for i in xrange(n_estimates):
        estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
        # Signed estimates are correlated with the signed error, absolute
        # ones with the magnitude of the error.
        errors = prediction_error if signed_or_absolute == SIGNED \
            else [abs(pe) for pe in prediction_error]
        try:
            r, p = statc.spearmanr(errors, estimates)
        except Exception:
            # statc raises on degenerate input (e.g. constant series).
            r = p = float("NaN")
        scores.append((r, p, signed_or_absolute, method))
    return scores
99
def get_pearson_r_by_iterations(res):
    """
    :param res: results of evaluation, done using learners,
        wrapped into :class:`Orange.evaluation.reliability.Classifier`.
    :type res: :class:`Orange.evaluation.testing.ExperimentResults`

    Return the average Pearson's coefficient over all folds between the
    prediction error and each of the used estimates.
    """
    folds = Orange.evaluation.scoring.split_by_iterations(res)
    n_estimates = len(res.results[0].probabilities[0].reliability_estimate)
    n_instances = len(res.results)

    r_sums = [0 for _ in xrange(n_estimates)]
    sig = [0 for _ in xrange(n_estimates)]
    methods = [0 for _ in xrange(n_estimates)]

    for fold in folds:
        errors = get_prediction_error_list(fold)
        abs_errors = [abs(e) for e in errors]
        for i in xrange(n_estimates):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(fold, i)
            try:
                if signed_or_absolute == SIGNED:
                    r, _ = statc.pearsonr(errors, estimates)
                else:
                    r, _ = statc.pearsonr(abs_errors, estimates)
            except Exception:
                r = float("NaN")
            r_sums[i] += r
            sig[i] = signed_or_absolute
            methods[i] = method

    # Average per-fold coefficients, then derive p-values from the
    # averages and the total sample size.
    avg_r = [float(s) / len(folds) for s in r_sums]
    ps = [p_value_from_r(r, n_instances) for r in avg_r]

    return zip(avg_r, ps, sig, methods)
137
def p_value_from_r(r, n):
    """
    Calculate the p-value for a Pearson coefficient ``r`` obtained from a
    sample of size ``n`` (via the t-statistic and the incomplete beta
    function).
    """
    df = n - 2
    # The small epsilons guard against division by zero when |r| == 1.
    t = r * math.sqrt(df / ((1.0 - r + 1e-30) * (1.0 + r + 1e-30)))
    return statc.betai(df * 0.5, 0.5, df / (df + t * t))
145
[5]146
147# Distances between two discrete probability distributions
148#TODO Document those.
def normalize_both(p, q):
    """Normalize both distributions in place (when needed) and return them."""
    for dist in (p, q):
        if not dist.normalized:
            dist.normalize()
    return p, q
155
def minkowsky_dist(p, q, m=2):
    """Minkowski distance of order ``m`` between two discrete distributions."""
    p, q = normalize_both(p, q)
    total = sum(abs(p[i] - q[i]) ** m for i in range(len(p)))
    return total ** (1. / m)
162
def manhattan_distance(p, q):
    # Manhattan (city-block) distance: Minkowski distance of order 1.
    return minkowsky_dist(p, q, m=1)
165
def euclidean_dist(p, q):
    # Euclidean distance: Minkowski distance of order 2.
    return minkowsky_dist(p, q, m=2)
168
def variance_dist(p, q):
    # Squared Euclidean distance between the two distributions.
    return euclidean_dist(p, q) ** 2
171
def max_dist(p, q):
    """Chebyshev distance: the largest per-component absolute difference."""
    p, q = normalize_both(p, q)
    return max(abs(p[i] - q[i]) for i in range(len(p)))
175
def hellinger_dist(p, q):
    """Hellinger-style distance between two discrete distributions.

    NOTE(review): this is the sum of squared differences of square roots,
    i.e. twice the squared Hellinger distance (no 1/2 factor and no final
    square root) — kept as-is since other estimators depend on it.
    """
    p, q = normalize_both(p, q)
    return sum((math.sqrt(p[i]) - math.sqrt(q[i])) ** 2 for i in range(len(p)))
182
def my_log(x):
    """Return x*log(x), using the conventional limit value 0 at x == 0."""
    if x == 0:
        return 0
    return x * math.log(x)
185
def kullback_leibler(p, q):
    """Kullback-Leibler divergence D(p || q) between two discrete
    distributions.

    Terms with p[i] == 0 contribute 0 (the limit of x*log(x/q)); if
    q[i] == 0 while p[i] > 0 the divergence is infinite.

    Fix: the original summed my_log(p[i] - q[i]), i.e. (p-q)*log(p-q),
    which is not the KL divergence and is undefined whenever p[i] < q[i];
    it now computes sum(p[i] * log(p[i] / q[i])).
    """
    p, q = normalize_both(p, q)
    dist = 0
    for i in range(len(p)):
        if p[i] == 0:
            continue
        if q[i] == 0:
            return float("inf")
        dist += p[i] * math.log(p[i] / q[i])
    return dist
192
def cosine(p, q):
    """Cosine distance (1 - cosine similarity) between two discrete
    distributions.

    Fix: the original computed numpy.dot(x, y) with undefined names
    ``x`` and ``y`` (a NameError at runtime); the two materialized
    distributions ``p`` and ``q`` are used instead.
    """
    p, q = normalize_both(p, q)
    p, q = [pp for pp in p], [qq for qq in q]
    return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
197
198
class Estimate:
    """
    A single reliability estimate, describing the result of one
    reliability estimation method.

    .. attribute:: estimate

        A numerical reliability estimate.

    .. attribute:: signed_or_absolute

        Either :obj:`SIGNED` or :obj:`ABSOLUTE`, depending on the method
        used.

    .. attribute:: method

        An integer ID of the reliability estimation method used.

    .. attribute:: method_name

        Name (string) of the reliability estimation method used.

    .. attribute:: icv_method

        An integer ID of the reliability estimation method that performed
        best, as determined by ICV, and of which estimate is stored in the
        :obj:`estimate` field (-1 when ICV was not used).

    .. attribute:: icv_method_name

        Name (string) of the reliability estimation method that performed
        best, as determined by ICV ("" when ICV was not used).
    """
    def __init__(self, estimate, signed_or_absolute, method, icv_method=-1):
        self.estimate = estimate
        self.signed_or_absolute = signed_or_absolute
        self.method = method
        self.method_name = METHOD_NAME[method]
        # ICV bookkeeping; -1 is the sentinel for "ICV not used".
        self.icv_method = icv_method
        self.icv_method_name = "" if icv_method == -1 else METHOD_NAME[icv_method]
        # Filled in later by DescriptiveAnalysisClassifier.
        self.text_description = None
241
class DescriptiveAnalysis:
    """Wrap a reliability estimator so that its numerical estimates get
    textual descriptions (e.g. "high"/"medium"/"low"), with the borders
    between categories determined by cross-validation on the training
    data.

    :param estimator: the wrapped reliability estimator.
    :param desc: category labels, from the lowest border up.
    :param procentage: quantiles (0..1) at which category borders are
        placed; must be parallel to ``desc``.
    :param name: estimator name.
    """

    def __init__(self, estimator, desc=None, procentage=None, name="da"):
        # Fix: avoid mutable default arguments (the lists were shared
        # between all instances). The effective defaults are unchanged.
        self.desc = ["high", "medium", "low"] if desc is None else desc
        self.procentage = [0.00, 0.33, 0.66] if procentage is None else procentage
        self.estimator = estimator
        self.name = name

    def __call__(self, instances, weight=None, **kwds):
        # Calculate borders using cross validation.
        res = Orange.evaluation.testing.cross_validation([self.estimator], instances)
        all_borders = []
        for i in xrange(len(res.results[0].probabilities[0].reliability_estimate)):
            estimates, signed_or_absolute, method = get_reliability_estimation_list(res, i)
            sorted_estimates = sorted(abs(x) for x in estimates)
            # Border for quantile p is the estimate at that rank
            # (NOTE(review): p == 0 indexes -1, i.e. the largest estimate —
            # kept as in the original; verify intent).
            borders = [sorted_estimates[int(len(estimates) * p) - 1] for p in self.procentage]
            all_borders.append(borders)

        # Learn on the whole training data.
        estimator_classifier = self.estimator(instances)

        return DescriptiveAnalysisClassifier(estimator_classifier, all_borders, self.desc)
264
class DescriptiveAnalysisClassifier:
    """Classifier wrapper that attaches a textual description to every
    reliability estimate, based on precomputed per-estimate borders."""

    def __init__(self, estimator_classifier, all_borders, desc):
        self.estimator_classifier = estimator_classifier
        self.all_borders = all_borders
        self.desc = desc

    def __call__(self, instance, result_type=Orange.core.GetValue):
        predicted, probabilities = self.estimator_classifier(instance, Orange.core.GetBoth)

        # For each estimate pick the description belonging to the highest
        # border that the estimate still reaches.
        for borders, estimate in zip(self.all_borders, probabilities.reliability_estimate):
            estimate.text_description = self.desc[0]
            for lower_border, text_desc in zip(borders, self.desc):
                if estimate.estimate >= lower_border:
                    estimate.text_description = text_desc

        # Return the appropriate type of result.
        if result_type == Orange.core.GetValue:
            return predicted
        if result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
287
class SensitivityAnalysis:
    r"""
    :param e: List of possible :math:`\epsilon` values for SAvar and SAbias
        reliability estimates.
    :type e: list of floats

    :rtype: :class:`Orange.evaluation.reliability.SensitivityAnalysisClassifier`

    To estimate the reliability of prediction for a given instance,
    the learning set is extended with this instance, labeled with
    :math:`K + \epsilon (l_{max} - l_{min})`,
    where :math:`K` denotes the initial prediction,
    :math:`\epsilon` is the sensitivity parameter and :math:`l_{min}` and
    :math:`l_{max}` denote the lower and upper bound of the learning
    instances' labels. After computing different sensitivity predictions
    using different values of :math:`\epsilon`, the predictions are
    combined into SAvar and SAbias. SAbias can be used in a signed or
    absolute form.

    :math:`SAvar = \frac{\sum_{\epsilon \in E}(K_{\epsilon} - K_{-\epsilon})}{|E|}`

    :math:`SAbias = \frac{\sum_{\epsilon \in E} (K_{\epsilon} - K ) + (K_{-\epsilon} - K)}{2 |E|}`
    """
    def __init__(self, e=None, name="sa"):
        # Fix: avoid a mutable default argument (the epsilon list was
        # shared between all instances). The effective default is
        # unchanged.
        self.e = [0.01, 0.1, 0.5, 1.0, 2.0] if e is None else e
        self.name = name

    def __call__(self, instances, learner):
        # Bounds of the learning instances' labels.
        labels = [ex.getclass().value for ex in instances]
        return SensitivityAnalysisClassifier(self.e, instances,
                                             min(labels), max(labels), learner)
325
class SensitivityAnalysisClassifier:
    """Compute the SAvar and SAbias reliability estimates for a single
    instance by retraining the learner on the data extended with the
    instance labeled K +/- eps*(max-min) for each epsilon in ``e``."""

    def __init__(self, e, instances, min_value, max_value, learner):
        self.e = e                    # list of epsilon values
        self.instances = instances    # original learning data
        self.max_value = max_value    # upper bound of training labels
        self.min_value = min_value    # lower bound of training labels
        self.learner = learner

    def __call__(self, instance, predicted, probabilities):
        # Create new dataset (a copy, so the original data is untouched)
        r_data = Orange.data.Table(self.instances)

        # Create new instance
        modified_instance = Orange.data.Instance(instance)

        # Append it to the data; its class is rewritten below on each pass
        r_data.append(modified_instance)

        # Calculate SAvar & SAbias
        SAvar = SAbias = 0

        for eps in self.e:
            # +epsilon: label the appended instance above the prediction
            r_data[-1].setclass(predicted.value + eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_plus = c(instance, Orange.core.GetValue)

            # -epsilon: label it symmetrically below the prediction
            r_data[-1].setclass(predicted.value - eps * (self.max_value - self.min_value))
            c = self.learner(r_data)
            k_minus = c(instance, Orange.core.GetValue)

            # accumulate the SAvar and SAbias contributions of this epsilon
            SAvar += k_plus.value - k_minus.value
            SAbias += k_plus.value + k_minus.value - 2 * predicted.value

        SAvar /= len(self.e)
        SAbias /= 2 * len(self.e)

        return [Estimate(SAvar, ABSOLUTE, SAVAR_ABSOLUTE),
                Estimate(SAbias, SIGNED, SABIAS_SIGNED),
                Estimate(abs(SAbias), ABSOLUTE, SABIAS_ABSOLUTE)]
371
[10]372
373
class ReferenceExpectedError:
    r"""
    :rtype: :class:`Orange.evaluation.reliability.ReferenceExpectedErrorClassifier`

    Reference reliability estimation method for classification, as used in
    "Evaluating Reliability of Single Classifications of Neural Networks",
    Darko Pevec, 2011:

    :math:`O_{ref} = 2 (\hat y - \hat y ^2) = 2 \hat y (1-\hat y)`

    where :math:`\hat y` is the estimated probability of the predicted
    class.

    Note that for this method, in contrast with all others, a greater
    estimate means lower reliability (greater expected error).
    """
    def __init__(self, name="reference"):
        self.name = name

    def __call__(self, instances, learner):
        return ReferenceExpectedErrorClassifier(learner(instances))
396
397   
class ReferenceExpectedErrorClassifier:
    """Return the reference expected-error estimate 2*y*(1-y), where y is
    the wrapped classifier's probability of the predicted class."""

    def __init__(self, classifier):
        self.classifier = classifier

    def __call__(self, instance, *args):
        # The probability of the predicted class is the maximum of the
        # returned distribution.
        probs = self.classifier(instance, Orange.classification.Classifier.GetProbabilities)
        y_hat = max(probs)
        return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]
406
407
class BaggingVariance:
    """
   
    :param m: Number of bagging models to be used with BAGV estimate
    :type m: int
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceClassifier`
   
    :math:`m` different bagging models are constructed and used to estimate
    the value of dependent variable for a given instance. In regression,
    the variance of those predictions is used as a prediction reliability
    estimate.

    :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

    where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
    predictions of individual constructed models. Note that a greater value
    implies greater error.

    For classification, 1 minus the average Euclidean distance between class
    probability distributions predicted by the model, and distributions
    predicted by the individual bagged models, is used as the BAGV reliability
    measure. Note that in this case a greater value implies a better
    prediction.
   
    This reliability measure can run out of memory fast if individual classifiers
    use a lot of memory, as it build m of them, thereby using :math:`m` times memory
    for a single classifier. If instances for measuring predictions
    are given as a parameter, this class can only compute their reliability,
    which allows less memory use.

    """
    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
        """
        :param m: number of bagged models to build.
        :param name: estimator name.
        :param randseed: seed for the bootstrap-sampling random generator.
        :param for_instances: optional table of instances whose reliability
            will later be queried; when given, each bagged model is used to
            predict those instances and then discarded, so only one model
            is kept in memory at a time.
        """
        self.m = m
        self.name = name
        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
        self.for_instances = for_instances

    def __call__(self, instances, learner):
        classifiers = []

        # For classification a reference classifier trained on all data is
        # needed, to compare the bagged distributions against.
        if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            classifier = learner(instances)
        else:
            classifier = None

        for_inst_class = defaultdict(list)
        this_iteration = None
        
        if self.for_instances:
            # NOTE(review): relies on Python 2 map() returning a list,
            # since `his` is iterated once per bagging round.
            his = map(_hashable_instance, self.for_instances)

        # Create bagged classifiers using sampling with replacement
        for i in xrange(self.m):
            # Instances already predicted in this round (guards against
            # duplicate instances in for_instances).
            this_iteration = set()
            selection = self.select_with_repeat(len(instances))
            data = instances.select(selection)
            cl = learner(data)
            if cl:
                if self.for_instances: # predict reliability for testing instances and throw cl away
                    for instance, hi in zip(self.for_instances, his):
                        if hi not in this_iteration:
                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
                            this_iteration.add(hi)
                else:
                    classifiers.append(cl)

        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))
[0]480
class BaggingVarianceClassifier:
    """Compute the BAGV reliability estimate of an instance from bagged
    predictions (either precomputed or obtained from kept classifiers)."""

    def __init__(self, classifiers, classifier=None, for_inst_class=None):
        self.classifiers = classifiers          # bagged models (memory-heavy path)
        self.classifier = classifier            # reference model (classification only)
        self.for_inst_class = for_inst_class    # precomputed per-instance bagged values

    def __call__(self, instance, *args):
        # Gather the bagged predictions for this instance.
        if self.for_inst_class:
            bagged_values = self.for_inst_class[_hashable_instance(instance)]
        else:
            bagged_values = [_bagged_value(instance, c, self.classifier)
                             for c in self.classifiers]

        # Variance of the bagged predictions around their mean.
        mean = sum(bagged_values) / len(bagged_values)
        BAGV = sum((v - mean) ** 2 for v in bagged_values) / len(bagged_values)

        # For classification a greater value implies a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            BAGV = 1 - BAGV

        return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
503
def _hashable_instance(instance):
    # Key an instance by the tuple of its attribute values (class value
    # excluded), so equal instances map to the same precomputed results.
    return tuple(instance[i].value for i in range(len(instance.domain.attributes)))
506
def _bagged_value(instance, c, classifier):
    # Regression: the bagged model's predicted value itself.
    if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
        return c(instance, Orange.core.GetValue).value
    # Classification: distance between the bagged model's class
    # distribution and the reference classifier's distribution.
    elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
        estimate = classifier(instance, Orange.core.GetProbabilities)
        return euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate)
513
514
class LocalCrossValidation:
    r"""

    :param k: Number of nearest neighbours used in LCV estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function

    :param distance_weighted: for classification reliability estimation,
        use an average distance between distributions, weighted by :math:`e^{-d}`,
        where :math:`d` is the distance between predicted instance and the
        neighbour.

    :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

    :math:`k` nearest neighbours to the given instance are found and put in
    a separate data set. On this data set, a leave-one-out validation is
    performed. Reliability estimate for regression is then the distance
    weighted absolute prediction error. In classification, 1 minus the average
    distance between the predicted class probability distribution and the
    (trivial) probability distributions of the nearest neighbour.

    If a special value 0 is passed as :math:`k` (as is by default),
    it is set as 1/20 of data set size (or 5, whichever is greater).

    Summary of the algorithm for regression:

    1. Determine the set of k nearest neighours :math:`N = { (x_1, c_1),...,
       (x_k, c_k)}`.
    2. On this set, compute leave-one-out predictions :math:`K_i` and
       prediction errors :math:`E_i = | C_i - K_i |`.
    3. :math:`LCV(x) = \frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

    """
    def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True, name="lcv"):
        self.k = k
        self.distance = distance
        self.distance_weighted = distance_weighted
        self.name = name

    def __call__(self, instances, learner):
        nearest_neighbours_constructor = Orange.classification.knn.FindNearestConstructor()
        nearest_neighbours_constructor.distanceConstructor = Orange.distance.Euclidean()

        distance_id = Orange.feature.Descriptor.new_meta_id()
        nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)

        # Fix: compute the data-derived k locally instead of assigning it
        # to self.k; the original permanently overwrote the configured
        # k == 0 on the first call, so later calls on different data sets
        # silently reused the first data set's k.
        k = max(5, len(instances) / 20) if self.k == 0 else self.k

        return LocalCrossValidationClassifier(distance_id, nearest_neighbours, k, learner,
            distance=self.distance, distance_weighted=self.distance_weighted)
[0]570
class LocalCrossValidationClassifier:
    """Compute the LCV reliability estimate for an instance via
    leave-one-out validation on its k nearest neighbours."""

    def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
        self.distance_id = distance_id            # meta id holding neighbour distance
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.learner = learner
        # Extra configuration (distance, distance_weighted) arrives as
        # keyword arguments and is stored as attributes.
        for a,b in kwds.items():
            setattr(self, a, b)

    def __call__(self, instance, *args):
        LCVer = 0   # weighted error sum (numerator)
        LCVdi = 0   # weight sum (denominator)

        # Find k nearest neighbors

        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # leave one out of prediction error
        for i in xrange(len(knn)):
            train = knn[:]
            del train[i]

            classifier = self.learner(Orange.data.Table(train))

            if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
                # Regression: absolute leave-one-out prediction error.
                returned_value = classifier(knn[i], Orange.core.GetValue)
                e = abs(knn[i].getclass().value - returned_value.value)

            elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
                # Classification: distance between the predicted distribution
                # and the neighbour's trivial (one-hot) class distribution.
                returned_value = classifier(knn[i], Orange.core.GetProbabilities)
                probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
                e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))

            # Weight each neighbour's error by exp(-distance) when requested.
            dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
            LCVer += e * dist
            LCVdi += dist

        LCV = LCVer / LCVdi if LCVdi != 0 else 0
        if math.isnan(LCV):
            LCV = 0.0

        # For classification a greater value implies a better prediction.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            LCV = 1 - LCV

        return [ Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE) ]
616
class CNeighbours:
    r"""
   
    :param k: Number of nearest neighbours used in CNK estimate
    :type k: int

    :param distance: function that computes a distance between two discrete
        distributions (used only in classification problems). The default
        is Hellinger distance.
    :type distance: function
   
    :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`
   
    For regression, CNK is defined for an unlabeled instance as a difference
    between average label of its nearest neighbours and its prediction. CNK
    can be used as a signed or absolute estimate.
   
    :math:`CNK = \frac{\sum_{i=1}^{k}C_i}{k} - K`
   
    where :math:`k` denotes number of neighbors, C :sub:`i` denotes neighbours'
    labels and :math:`K` denotes the instance's prediction. Note that a greater
    value implies greater prediction error.

    For classification, CNK is equal to 1 minus the average distance between
    predicted class distribution and (trivial) class distributions of the
    $k$ nearest neighbours from the learning set. Note that in this case
    a greater value implies better prediction.
   
    """
    def __init__(self, k=5, distance=hellinger_dist, name = "cnk"):
        self.k = k
        self.distance = distance
        self.name = name

    def __call__(self, instances, learner):
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Euclidean()

        meta_id = Orange.feature.Descriptor.new_meta_id()
        neighbours = finder(instances, 0, meta_id)
        return CNeighboursClassifier(neighbours, self.k, distance=self.distance)
[0]658
class CNeighboursClassifier:
    """Compute the CNK reliability estimate of an instance from its k
    nearest neighbours."""

    def __init__(self, nearest_neighbours, k, distance):
        self.nearest_neighbours = nearest_neighbours
        self.k = k
        self.distance = distance

    def __call__(self, instance, predicted, probabilities):
        CNK = 0

        # Find k nearest neighbors
        knn = [ex for ex in self.nearest_neighbours(instance, self.k)]

        # Fix: the original branched on `ex.domain...`, which only worked
        # because the list-comprehension variable leaks in Python 2 (and
        # would raise NameError for empty knn); use the queried instance's
        # domain instead.
        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
            # Regression: difference between the neighbours' average label
            # and the prediction; usable signed or absolute.
            for ex in knn:
                CNK += ex.getclass().value
            CNK /= self.k
            CNK -= predicted.value

            return [Estimate(CNK, SIGNED, CNK_SIGNED),
                    Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
            # Classification: 1 minus the average distance between the
            # predicted distribution and the neighbours' kNN-predicted
            # distributions.
            knn_l = Orange.classification.knn.kNNLearner(k=self.k)
            knn_c = knn_l(knn)
            for ex in knn:
                CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
            CNK /= self.k
            CNK += 1

            return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
[0]690
class Mahalanobis:
    """
    :param k: Number of nearest neighbours used in Mahalanobis estimate.
    :type k: int

    :rtype: :class:`Orange.evaluation.reliability.MahalanobisClassifier`

    The reliability estimate is the sum of `Mahalanobis distances
    <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_ from the
    evaluated instance to its :math:`k` nearest neighbours.
    """
    def __init__(self, k=3, name="mahalanobis"):
        self.k = k
        self.name = name

    def __call__(self, instances, *args):
        finder = Orange.classification.knn.FindNearestConstructor()
        finder.distanceConstructor = Orange.distance.Mahalanobis()

        mid = Orange.feature.Descriptor.new_meta_id()
        return MahalanobisClassifier(self.k, finder(instances, 0, mid), mid)
716
class MahalanobisClassifier:
    """Sum the Mahalanobis distances to an instance's k nearest neighbours."""

    def __init__(self, k, nnm, mid):
        self.k = k      # number of neighbours
        self.nnm = nnm  # nearest-neighbour finder
        self.mid = mid  # meta attribute id holding the distance

    def __call__(self, instance, *args):
        neighbours = self.nnm(instance, self.k)
        total = sum(ex[self.mid].value for ex in neighbours)
        return [Estimate(total, ABSOLUTE, MAHAL_ABSOLUTE)]
729
class MahalanobisToCenter:
    """
    :rtype: :class:`Orange.evaluation.reliability.MahalanobisToCenterClassifier`

    The reliability estimate is the `Mahalanobis distance
    <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_ between the
    predicted instance and the centroid of the (continuized) data.
    """
    def __init__(self, name="mahalanobis to center"):
        self.name = name

    def __call__(self, instances, *args):
        # Continuize the domain: ignore the class, normalize continuous
        # features by span, expand multinomial features into N indicators.
        continuizer = Orange.core.DomainContinuizer()
        continuizer.classTreatment = Orange.core.DomainContinuizer.Ignore
        continuizer.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        continuizer.multinomialTreatment = Orange.core.DomainContinuizer.NValues

        new_domain = continuizer(instances)
        new_instances = instances.translate(new_domain)

        # Centroid of the continuized data.
        X, _, _ = new_instances.to_numpy()
        centroid = numpy.average(X, 0)

        distance = Orange.distance.Mahalanobis()(new_instances)

        average_instance = Orange.data.Instance(new_instances.domain, list(centroid) + ["?"])

        return MahalanobisToCenterClassifier(distance, average_instance, new_domain)
761
class MahalanobisToCenterClassifier:
    """Measure the Mahalanobis distance between an instance (translated to
    the continuized domain) and the precomputed data centroid."""

    def __init__(self, distance, average_instance, new_domain):
        self.distance = distance                  # Mahalanobis distance measure
        self.average_instance = average_instance  # data centroid
        self.new_domain = new_domain              # continuized domain

    def __call__(self, instance, *args):
        # Translate the instance into the continuized domain first.
        translated = Orange.data.Instance(self.new_domain, instance)
        d = self.distance(translated, self.average_instance)
        return [Estimate(d, ABSOLUTE, MAHAL_TO_CENTER_ABSOLUTE)]
775
776
class BaggingVarianceCNeighbours:
    """
   
    :param bagv: Instance of Bagging Variance estimator.
    :type bagv: :class:`BaggingVariance`
   
    :param cnk: Instance of CNK estimator.
    :type cnk: :class:`CNeighbours`
   
    :rtype: :class:`Orange.evaluation.reliability.BaggingVarianceCNeighboursClassifier`
   
    BVCK is a combination (average) of Bagging variance and local modeling of
    prediction error.
   
    """
    def __init__(self, bagv=None, cnk=None, name="bvck"):
        if bagv is None:
            bagv = BaggingVariance()
        if cnk is None:
            cnk = CNeighbours()
        self.bagv = bagv
        self.cnk = cnk
        # Fix: honour the name argument; it was previously ignored and
        # hard-coded to "bvck".
        self.name = name

    def __call__(self, instances, learner):
        bagv_classifier = self.bagv(instances, learner)
        cnk_classifier = self.cnk(instances, learner)
        return BaggingVarianceCNeighboursClassifier(bagv_classifier, cnk_classifier)
805
class BaggingVarianceCNeighboursClassifier:
    """Combines BAGV and CNK: the BVCK estimate is the average of the BAGV
    estimate and the absolute CNK estimate; the component estimates are
    returned as well."""

    def __init__(self, bagv_classifier, cnk_classifier):
        self.bagv_classifier = bagv_classifier
        self.cnk_classifier = cnk_classifier

    def __call__(self, instance, predicted, probabilities):
        bagv_estimates = self.bagv_classifier(instance, predicted, probabilities)
        cnk_estimates = self.cnk_classifier(instance, predicted, probabilities)

        # cnk_estimates[1] is the absolute CNK variant; average it with BAGV.
        combined = (bagv_estimates[0].estimate + cnk_estimates[1].estimate) / 2
        return ([Estimate(combined, ABSOLUTE, BVCK_ABSOLUTE)]
                + bagv_estimates + cnk_estimates)
820
class ErrorPredicting:
    """Builds a random forest that predicts the signed prediction error of
    the wrapped learner, trained on cross-validated errors over the data."""

    def __init__(self, name="ep"):
        self.name = name

    def __call__(self, instances, learner):
        results = Orange.evaluation.testing.cross_validation([learner], instances)
        errors = get_prediction_error_list(results)

        # Re-target the data: same attributes, continuous "pe" class
        # holding each instance's cross-validated prediction error.
        error_domain = Orange.data.Domain(instances.domain.attributes,
                                          Orange.core.FloatVariable("pe"))
        error_data = Orange.data.Table(error_domain, instances)
        for inst, err in izip(error_data, errors):
            inst.set_class(err)

        forest = Orange.ensemble.forest.RandomForestLearner()(error_data)

        return ErrorPredictingClassification(forest, error_domain)
839
class ErrorPredictingClassification:
    """Predicts the expected signed error of a new instance with the
    error-predicting random forest."""

    def __init__(self, rf_classifier, new_domain):
        self.rf_classifier = rf_classifier
        self.new_domain = new_domain

    def __call__(self, instance, predicted, probabilities):
        converted = Orange.data.Instance(self.new_domain, instance)
        predicted_error = self.rf_classifier(converted, Orange.core.GetValue)

        # NOTE(review): reported under SABIAS_SIGNED, not a dedicated
        # error-prediction method id — confirm this is intentional.
        return [Estimate(predicted_error.value, SIGNED, SABIAS_SIGNED)]
850
def gauss_kernel(x, sigma=1):
    """Gaussian (normal) kernel with standard deviation *sigma*, at *x*."""
    normalization = 1. / (sigma * math.sqrt(2 * math.pi))
    return normalization * math.exp(-0.5 * (x / sigma) ** 2)
853
class ParzenWindowDensityBased:
    """
    :param K: kernel function. Default: gaussian.
    :type K: function

    :param d_measure: distance measure for inter-instance distance.
    :type d_measure: :class:`Orange.distance.DistanceConstructor`

    :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`

    Estimates the density of the problem space around the instance being
    predicted using a Parzen window with kernel *K*.
    """
    def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean(), name="density"):
        self.K = K
        self.d_measure = d_measure
        self.name = name

    def __call__(self, instances, learner):

        self.distance = self.d_measure(instances)

        def density(x):
            # Mean kernel value over distances to all training instances.
            total = sum(self.K(self.distance(x, ex)) for ex in instances)
            return total / len(instances)

        # Highest density observed on the training data itself; used by the
        # classifier as the reference point.
        max_density = max(density(ex) for ex in instances)

        return ParzenWindowDensityBasedClassifier(density, max_density)
885
class ParzenWindowDensityBasedClassifier:
    """Reliability from density: sparse neighbourhoods (low density) give
    high estimates, measured as the gap below the training maximum."""

    def __init__(self, density, max_density):
        self.density = density
        self.max_density = max_density

    def __call__(self, instance, *args):
        dens_estimate = self.max_density - self.density(instance)
        return [Estimate(dens_estimate, ABSOLUTE, DENS_ABSOLUTE)]
898
class Stacking:
    """Stacking of reliability estimates.

    Induces a model (with ``stack_learner``) that predicts the absolute
    prediction error from the outputs of the given reliability
    ``estimators``; the training table of estimates is obtained with
    internal cross-validation so it is unbiased.

    :param stack_learner: learner for the model mapping reliability
        estimates to the absolute prediction error.
    :param estimators: list of reliability estimators used as features.
    :param folds: number of internal cross-validation folds. With
        ``folds=1``, half of the data builds the estimators and the other
        half induces the stacking model.
    :param save_data: if True, keep the internal training table as
        ``self.classifier_data``.
    """

    def __init__(self, stack_learner, estimators, folds=10, save_data=False):
        self.stack_learner = stack_learner
        self.estimators = estimators
        self.folds = folds
        self.save_data = save_data

    def __call__(self, data, learner):

        newfeatures = None

        if self.folds > 1:

            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
            data_cv = [ None ] * len(data)
            for f in set(cvi): #for each fold
                learn = data.select(cvi, f, negate=True)
                test = data.select(cvi, f)

                #learn reliability estimates for the learning set
                lf = Learner(learner, estimators=self.estimators)(learn)

                #pos is used to retain the order of instances
                for ex, pos in zip(test, [ i for i,n in enumerate(cvi) if n == f ]):
                    pred = lf(ex, Orange.core.GetBoth)
                    re = pred[1].reliability_estimate
                    names = [ e.method_name for e in re ]
                    #all folds must yield the same estimators, in the same order
                    assert newfeatures is None or names == newfeatures
                    newfeatures = names
                    estimates = [ abs(e.estimate) for e in re ]
                    error = ex[-1].value - pred[0].value
                    data_cv[pos] = estimates + [ abs(error) ]

        else:

            #use half of the data to learn reliability estimates
            #and the other half for induction of a stacking classifier
            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
            data_cv = []

            learn = data.select(cvi, 0, negate=True)
            test = data.select(cvi, 0)

            #learn reliability estimates for the learning set
            lf = Learner(learner, estimators=self.estimators)(learn)

            for ex in test:
                pred = lf(ex, Orange.core.GetBoth)
                re = pred[1].reliability_estimate
                names = [ e.method_name for e in re ]
                assert newfeatures is None or names == newfeatures
                newfeatures = names
                estimates = [ abs(e.estimate) for e in re ]
                error = ex[-1].value - pred[0].value
                data_cv.append(estimates + [ abs(error) ])

        #release the last per-fold reliability classifier
        lf = None

        #induce the classifier on cross-validated reliability estimates
        newfeatures = [ Orange.feature.Continuous(name=n) for n in newfeatures ]
        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
        classifier_data = Orange.data.Table(newdomain, data_cv)
        stack_classifier = self.stack_learner(classifier_data)

        #induce reliability estimates on the whole data set
        lf = Learner(learner, estimators=self.estimators)(data)

        if self.save_data:
            self.classifier_data = classifier_data

        return StackingClassifier(stack_classifier, lf, newdomain)
973
974
class StackingClassifier:
    """Predicts reliability by mapping the vector of reliability estimates
    for an instance through the stacked regression model."""

    def __init__(self, stacking_classifier, reliability_classifier, domain):
        # Fix: removed leftover debug print of the stacking classifier.
        self.stacking_classifier = stacking_classifier
        self.domain = domain
        self.reliability_classifier = reliability_classifier

    def convert(self, instance):
        """ Return example in the space of reliability estimates. """
        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        #take absolute values for all; the stacked model was trained on them
        tex = [ abs(e.estimate) for e in re ] + [ "?" ]
        tex = Orange.data.Instance(self.domain, tex)
        return tex

    def __call__(self, instance, *args):
        tex = self.convert(instance)
        r = float(self.stacking_classifier(tex))
        #predicted error cannot be negative
        r = max(0., r)
        return [ Estimate(r, ABSOLUTE, STACKING) ]
997
class ICV:
    """ Perform internal cross validation (as in Automatic selection of
    reliability estimates for individual regression predictions,
    Zoran Bosnic, 2010) and return id of the method
    that scored best on this data.

    :param estimators: candidate reliability estimators.
    :param folds: number of internal cross-validation folds.
    """

    def __init__(self, estimators, folds=10):
        self.estimators = estimators
        self.folds = folds

    def __call__(self, data, learner):

        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
        sum_of_rs = defaultdict(float)

        elearner = Learner(learner, estimators=self.estimators)

        #sum correlations (estimate vs. error) for each method over the folds
        for f in set(cvi):
            learn = data.select(cvi, f, negate=True)
            test = data.select(cvi, f)

            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
            results = get_pearson_r(res)
            for r, p, sa, method in results:
                sum_of_rs[(method, sa)] += r

        #pick the (method, signed_or_absolute) pair with the best summed correlation
        #(debug prints of the ranking removed)
        sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
        chosen = sum_of_rs[0][0]

        #build the reliability estimators on the complete data set
        lf = elearner(data)
        return ICVClassifier(chosen, lf)
1035
1036
class ICVClassifier:
    """Reports, under the ICV method id, the estimate produced by the
    method that internal cross-validation selected."""

    def __init__(self, chosen, reliability_classifier):
        self.chosen = chosen
        self.reliability_classifier = reliability_classifier

    def __call__(self, instance, *args):
        estimates = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
        method, signed_or_absolute = self.chosen
        # Take the estimate of the chosen (method, signed/absolute) pair.
        for est in estimates:
            if est.method == method and est.signed_or_absolute == signed_or_absolute:
                r = est.estimate

        return [ Estimate(r, signed_or_absolute, ICV_METHOD) ]
1050
class Learner:
    """
    Reliability estimation wrapper around a learner we want to test.
    Different reliability estimation algorithms can be used on the
    chosen learner. This learner works as any other and can be used as one,
    but it returns the classifier, wrapped into an instance of
    :class:`Orange.evaluation.reliability.Classifier`.

    :param box_learner: Learner we want to wrap into a reliability estimation
        classifier.
    :type box_learner: :obj:`~Orange.classification.Learner`

    :param estimators: List of different reliability estimation methods we
                       want to use on the chosen learner. If None (default),
                       a fresh default set of estimators is created.
    :type estimators: :obj:`list` of reliability estimators

    :param name: Name of this reliability learner
    :type name: string

    :rtype: :class:`Orange.evaluation.reliability.Learner`
    """
    def __init__(self, box_learner, name="Reliability estimation",
                 estimators=None,
                 **kwds):
        self.__dict__.update(kwds)
        self.name = name
        # Fix: the default estimator list was a mutable default argument,
        # evaluated once and shared (together with its stateful estimator
        # instances) by every Learner created without explicit estimators.
        # Build a fresh list per instance instead.
        if estimators is None:
            estimators = [SensitivityAnalysis(),
                          LocalCrossValidation(),
                          BaggingVarianceCNeighbours(),
                          Mahalanobis(),
                          MahalanobisToCenter()]
        self.estimators = estimators
        self.box_learner = box_learner
        self.blending = False


    def __call__(self, instances, weight=None, **kwds):
        """Learn from the given table of data instances.

        :param instances: Data instances to learn from.
        :type instances: Orange.data.Table
        :param weight: Id of meta attribute with weights of instances
        :type weight: int
        :rtype: :class:`Orange.evaluation.reliability.Classifier`
        """

        blending_classifier = None
        new_domain = None

#        if instances.domain.class_var.var_type != Orange.feature.Continuous.Continuous:
#            raise Exception("This method only works on data with continuous class.")

        return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)
[37]1103 
class Classifier:
    """
    A reliability estimation wrapper for classifiers.

    What distinguishes this classifier is that the returned probabilities (if
    :obj:`Orange.classification.Classifier.GetProbabilities` or
    :obj:`Orange.classification.Classifier.GetBoth` is passed) carry an
    additional attribute :obj:`reliability_estimate`, which is an instance of
    :class:`~Orange.evaluation.reliability.Estimate`.
    """

    def __init__(self, instances, box_learner, estimators, blending, blending_domain, rf_classifier, **kwds):
        self.__dict__.update(kwds)
        self.instances = instances
        self.box_learner = box_learner
        self.estimators = estimators
        self.blending = blending
        self.blending_domain = blending_domain
        self.rf_classifier = rf_classifier

        # Fit the wrapped learner on the original data.
        self.classifier = box_learner(instances)

        # Fit every reliability estimator; each yields its own classifier.
        self.estimation_classifiers = [estimator(instances, box_learner)
                                       for estimator in estimators]

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        Classify a new instance and estimate the reliability of the
        prediction. When :obj:`result_type` is
        :obj:`Orange.classification.Classifier.GetBoth` or
        :obj:`Orange.classification.Classifier.GetProbabilities`, the
        returned distribution object carries an additional attribute
        :obj:`reliability_estimate` — a list of
        :class:`~Orange.evaluation.reliability.Estimate` instances.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type: :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        predicted, probabilities = self.classifier(instance, Orange.core.GetBoth)

        # Ensure there is a distribution object to hang the estimates on
        # (regression classifiers may return None for probabilities).
        if probabilities is None:
            probabilities = Orange.statistics.distribution.Continuous()
        probabilities.setattr('reliability_estimate', [])

        # Collect the estimates from every estimation classifier.
        for estimation_classifier in self.estimation_classifiers:
            probabilities.reliability_estimate.extend(
                estimation_classifier(instance, predicted, probabilities))

        # Return whichever combination the caller asked for.
        if result_type == Orange.core.GetValue:
            return predicted
        elif result_type == Orange.core.GetProbabilities:
            return probabilities
        return predicted, probabilities
[5]1171
1172# Functions for testing and plotting
1173#TODO Document those.
def get_acc_rel(method, data, learner):
    """Cross-validate *learner* wrapped with reliability *method* and return
    parallel lists of reliability estimates and accuracies."""
    reliability = Orange.evaluation.reliability.Learner(learner, estimators=[method])
    #results = Orange.evaluation.testing.leave_one_out([reliability], data)
    results = Orange.evaluation.testing.cross_validation([reliability], data)

    rels, acc = [], []
    for res in results.results:
        rels.append(res.probabilities[0].reliability_estimate[0].estimate)
        acc.append(res.probabilities[0][res.actual_class])
    return rels, acc
1187
[12]1188
def rel_acc_plot(rels, acc, file_name=None, colors=None):
    """Scatter accuracy (y) against reliability (x).

    Shows the plot interactively unless *file_name* is given, in which
    case the figure is saved there instead.
    """
    import matplotlib.pylab as plt

    plt.scatter(rels, acc, c="k" if colors is None else colors)
    plt.xlim(0., 1.)
    plt.ylim(ymin=0.)
    plt.xlabel("Reliability")
    plt.ylabel("Accuracy")
    if file_name is not None:
        plt.savefig(file_name)
    else:
        plt.show()
1204
def rel_acc_compute_plot(method, data, learner, file_name=None, colors=None):
    """Compute reliability/accuracy pairs for *method* on *data* and plot them.

    Fixes three bugs: ``plt`` was used without being imported in this
    function, the plotting call named a nonexistent ``el_acc_plot`` (typo
    for ``rel_acc_plot``), and its first two arguments were swapped
    relative to the ``rel_acc_plot(rels, acc, ...)`` signature.
    """
    import matplotlib.pylab as plt

    plt.clf()

    rels, acc = get_acc_rel(method, data, learner)
    rel_acc_plot(rels, acc, file_name=file_name, colors=colors)
1211   
[5]1212
def acc_rel_correlation(method, data, learner):
    """Spearman rank correlation between accuracy and reliability estimates
    of *method* on *data*."""
    import scipy.stats
    rels, acc = get_acc_rel(method, data, learner)
    rho = scipy.stats.spearmanr(acc, rels)[0]
    return rho
Note: See TracBrowser for help on using the repository browser.