Changeset 37:11955f57b3ae in orange-reliability


Timestamp: 09/30/13 18:50:34
Author: markotoplak
Branch: default
Message: Rewrote ICV, added Stacking, and a memory-efficient option to compute BAGV.
File: 1 edited

Legend: lines prefixed with "-" were removed in r37, lines prefixed with "+" were added, and unprefixed lines are unchanged context; hunk headers give the starting line numbers in r35 and r37.
  • orangecontrib/reliability/__init__.py

orangecontrib/reliability/__init__.py: r35 → r37

@@ r35:9  r37:9 @@
 from collections import defaultdict
 from itertools import izip
-
-# Labels and final variables
-labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
-
-"""
-# All the estimators calculation constants
-DO_SA = 0
-DO_BAGV = 1
-DO_CNK = 2
-DO_LCV = 3
-DO_BVCK = 4
-DO_MAHAL = 5
-"""

 # All the estimator method constants

@@ r35:38  r37:25 @@
 DENS_ABSOLUTE = 14
 ERR_ABSOLUTE = 15
+STACKING = 101

 # Type of estimator constant

@@ r35:46  r37:34 @@
 METHOD_NAME = {0: "SAvar absolute", 1: "SAbias signed", 2: "SAbias absolute",
                3: "BAGV absolute", 4: "CNK signed", 5: "CNK absolute",
-               6: "LCV absolute", 7: "BVCK_absolute", 8: "Mahalanobis absolute",
+               6: "LCV absolute", 7: "BVCK absolute", 8: "Mahalanobis absolute",
                9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
-               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error"}
-
-select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
-select_with_repeat.random_generator = Orange.misc.Random()
+               13: "Mahalanobis to center", 14: "Density based", 15: "Reference expected error",
+               101: "Stacking" }

 def get_reliability_estimation_list(res, i):
-    return [result.probabilities[0].reliability_estimate[i].estimate for result in res.results], res.results[0].probabilities[0].reliability_estimate[i].signed_or_absolute, res.results[0].probabilities[0].reliability_estimate[i].method
+    return [ result.probabilities[0].reliability_estimate[i].estimate for result in res.results], \
+        res.results[0].probabilities[0].reliability_estimate[i].signed_or_absolute, \
+        res.results[0].probabilities[0].reliability_estimate[i].method

 def get_prediction_error_list(res):

@@ r35:417  r37:405 @@
         return [Estimate(2 * y_hat * (1 - y_hat), ABSOLUTE, ERR_ABSOLUTE)]

-

 class BaggingVariance:
     
@@ r35:444  r37:431 @@
     prediction.

-    """
-    def __init__(self, m=50, name="bv"):
+    This reliability measure can quickly run out of memory if the individual
+    classifiers use a lot of memory, since it builds :math:`m` of them and thus
+    needs :math:`m` times the memory of a single classifier. If the instances
+    whose predictions will be assessed are passed in advance, only their
+    reliability is computed, which uses much less memory.
+
+    """
+    def __init__(self, m=50, name="bv", randseed=0, for_instances=None):
+        """
+        for_instances: if given, reliability estimates are precomputed only
+            for these instances and the bagged models are discarded.
+        """
         self.m = m
         self.name = name
+        self.select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
+        self.select_with_repeat.random_generator = Orange.misc.Random(randseed)
+        self.for_instances = for_instances

     def __call__(self, instances, learner):

@@ r35:457  r37:456 @@
             classifier = None

+        for_inst_class = defaultdict(list)
+        this_iteration = None
+
+        if self.for_instances:
+            his = map(_hashable_instance, self.for_instances)
+
         # Create bagged classifiers using sampling with replacement
-        for _ in xrange(self.m):
-            selection = select_with_repeat(len(instances))
+        for i in xrange(self.m):
+            this_iteration = set()
+            selection = self.select_with_repeat(len(instances))
             data = instances.select(selection)
-            classifiers.append(learner(data))
-        return BaggingVarianceClassifier(classifiers, classifier)
+            cl = learner(data)
+            if cl:
+                if self.for_instances: # predict reliability for testing instances and throw cl away
+                    for instance, hi in zip(self.for_instances, his):
+                        if hi not in this_iteration:
+                            for_inst_class[hi].append(_bagged_value(instance, cl, classifier))
+                            this_iteration.add(hi)
+                else:
+                    classifiers.append(cl)
+
+        return BaggingVarianceClassifier(classifiers, classifier, for_inst_class=dict(for_inst_class))

 class BaggingVarianceClassifier:
-    def __init__(self, classifiers, classifier=None):
+    def __init__(self, classifiers, classifier=None, for_inst_class=None):
         self.classifiers = classifiers
         self.classifier = classifier
+        self.for_inst_class = for_inst_class

     def __call__(self, instance, *args):

@@ r35:473  r37:489 @@

         # Calculate the bagging variance
-        if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
-            bagged_values = [c(instance, Orange.core.GetValue).value for c in self.classifiers if c is not None]
-        elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
-            estimate = self.classifier(instance, Orange.core.GetProbabilities)
-            bagged_values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate) for c in self.classifiers if c is not None]
+        if self.for_inst_class:
+            bagged_values = self.for_inst_class[_hashable_instance(instance)]
+        else:
+            bagged_values = [ _bagged_value(instance, c, self.classifier) for c in self.classifiers ]
+
         k = sum(bagged_values) / len(bagged_values)


@@ r35:485  r37:501 @@

         return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
+
+def _hashable_instance(instance):
+    return tuple(instance[i].value for i in range(len(instance.domain.attributes)))
+
+def _bagged_value(instance, c, classifier):
+    if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
+        return c(instance, Orange.core.GetValue).value
+    elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
+        estimate = classifier(instance, Orange.core.GetProbabilities)
+        return euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate)
+

 class LocalCrossValidation:
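The hunks above are the memory-saving BAGV path: when the instances to be scored are supplied through for_instances, each of the m bagged models is applied to them right away and then dropped, so only lists of per-instance predictions are kept. A minimal usage sketch, assuming Orange 2.x with the bundled housing data set; the kNN base learner, the SubsetIndices2 split and the 70/30 ratio are illustrative choices, not part of this changeset:

    import Orange
    import orangecontrib.reliability as reliability

    data = Orange.data.Table("housing")
    indices = Orange.data.sample.SubsetIndices2(p0=0.7)(data)
    train, test = data.select(indices, 0), data.select(indices, 1)

    learner = Orange.classification.knn.kNNLearner()

    # Passing the evaluation instances up front lets BAGV keep only their
    # per-model predictions instead of all 50 bagged classifiers.
    bagv = reliability.BaggingVariance(m=50, for_instances=test)
    rel_classifier = reliability.Learner(learner, estimators=[bagv])(train)

    for inst in test[:5]:
        value, probs = rel_classifier(inst, Orange.core.GetBoth)
        print value, probs.reliability_estimate[0].estimate

In this mode the returned classifier can only estimate reliability for the instances given in for_instances; any other instance misses the precomputed lookup table.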
     
@@ r35:763  r37:790 @@

     """
-    def __init__(self, bagv=BaggingVariance(), cnk=CNeighbours(), name="bvck"):
+    def __init__(self, bagv=None, cnk=None, name="bvck"):
+        if bagv is None:
+            bagv = BaggingVariance()
+        if cnk is None:
+            cnk = CNeighbours()
         self.bagv = bagv
         self.cnk = cnk
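This BVCK constructor change swaps the stateful default arguments for None plus lazy construction. Python evaluates default values once, when the def statement runs, so a single BaggingVariance (which now owns a random generator and an optional for_instances list) would otherwise be shared by every BVCK created without an explicit bagv. A generic illustration of that pitfall, not orange-reliability code:

    class Helper(object):
        def __init__(self):
            self.calls = 0

    def shared_default(h=Helper()):     # one Helper instance for every call
        h.calls += 1
        return h.calls

    def fresh_default(h=None):          # a fresh Helper per call
        if h is None:
            h = Helper()
        h.calls += 1
        return h.calls

    print shared_default(), shared_default()   # 1 2 -- state leaks across calls
    print fresh_default(), fresh_default()     # 1 1 -- independent instances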
     
@@ r35:865  r37:896 @@

         return [Estimate(DENS, ABSOLUTE, DENS_ABSOLUTE)]
+
+class Stacking:
+
+    def __init__(self, stack_learner, estimators, folds=10, save_data=False):
+        self.stack_learner = stack_learner
+        self.estimators = estimators
+        self.folds = folds
+        self.save_data = save_data
+
+    def __call__(self, data, learner):
+
+        newfeatures = None
+
+        if self.folds > 1:
+
+            cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
+            data_cv = [ None ] * len(data)
+            for f in set(cvi): #for each fold
+                learn = data.select(cvi, f, negate=True)
+                test = data.select(cvi, f)
+
+                #learn reliability estimates for the learning set
+                lf = Learner(learner, estimators=self.estimators)(learn)
+
+                #pos is used to retain the order of instances
+                for ex, pos in zip(test, [ i for i,n in enumerate(cvi) if n == f ]):
+                    pred = lf(ex, Orange.core.GetBoth)
+                    re = pred[1].reliability_estimate
+                    names = [ e.method_name for e in re ]
+                    assert newfeatures is None or names == newfeatures
+                    newfeatures = names
+                    estimates = [ abs(e.estimate) for e in re ]
+                    error = ex[-1].value - pred[0].value
+                    data_cv[pos] = estimates + [ abs(error) ]
+
+        else:
+
+            #use half of the data to learn reliability estimates
+            #and the other half for induction of a stacking classifier
+            cvi = Orange.data.sample.SubsetIndicesCV(data, 2)
+            data_cv = []
+
+            learn = data.select(cvi, 0, negate=True)
+            test = data.select(cvi, 0)
+
+            #learn reliability estimates for the learning set
+            lf = Learner(learner, estimators=self.estimators)(learn)
+
+            for ex in test:
+                pred = lf(ex, Orange.core.GetBoth)
+                re = pred[1].reliability_estimate
+                names = [ e.method_name for e in re ]
+                assert newfeatures is None or names == newfeatures
+                newfeatures = names
+                estimates = [ abs(e.estimate) for e in re ]
+                error = ex[-1].value - pred[0].value
+                data_cv.append(estimates + [ abs(error) ])
+
+            print "DCV", len(data_cv)
+
+        lf = None
+
+        #induce the classifier on cross-validated reliability estimates
+        newfeatures = [ Orange.feature.Continuous(name=n) for n in newfeatures ]
+        newdomain = Orange.data.Domain(newfeatures, Orange.feature.Continuous(name="error"))
+        classifier_data = Orange.data.Table(newdomain, data_cv)
+        stack_classifier = self.stack_learner(classifier_data)
+
+        #induce reliability estimates on the whole data set
+        lf = Learner(learner, estimators=self.estimators)(data)
+
+        if self.save_data:
+            self.classifier_data = classifier_data
+
+        return StackingClassifier(stack_classifier, lf, newdomain)
+
+
+class StackingClassifier:
+
+    def __init__(self, stacking_classifier, reliability_classifier, domain):
+        self.stacking_classifier = stacking_classifier
+        print self.stacking_classifier
+        self.domain = domain
+        self.reliability_classifier = reliability_classifier
+
+    def convert(self, instance):
+        """ Return example in the space of reliability estimates. """
+        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
+        #take absolute values for all
+        tex = [ abs(e.estimate) for e in re ] + [ "?" ]
+        tex = Orange.data.Instance(self.domain, tex)
+        return tex
+
+    def __call__(self, instance, *args):
+        tex = self.convert(instance)
+        r = self.stacking_classifier(tex)
+        r = float(r)
+        r = max(0., r)
+        return [ Estimate(r, ABSOLUTE, STACKING) ]
+
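The new Stacking estimator treats the absolute values of the other reliability estimates as features and fits stack_learner on cross-validated data to predict the absolute prediction error; at prediction time the fitted model's output, clipped at zero, is returned as the STACKING estimate. A usage sketch, assuming Orange 2.x; the housing data set, the kNN base learner and the linear-regression stack learner are illustrative choices, not part of this changeset:

    import Orange
    import orangecontrib.reliability as reliability

    data = Orange.data.Table("housing")
    base = Orange.classification.knn.kNNLearner()

    # Internal estimators whose absolute estimates become the features
    # on which the stack learner is trained to predict |error|.
    inner = [reliability.BaggingVariance(m=20),
             reliability.CNeighbours(),
             reliability.LocalCrossValidation()]

    stacking = reliability.Stacking(
        stack_learner=Orange.regression.linear.LinearRegressionLearner(),
        estimators=inner, folds=10)

    rel_classifier = reliability.Learner(base, estimators=[stacking])(data)
    value, probs = rel_classifier(data[0], Orange.core.GetBoth)
    print value, probs.reliability_estimate[0].estimate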
@@ r37:998 @@ (continuation of the additions above)
+class ICV:
+    """ Perform internal cross validation (as in Automatic selection of
+    reliability estimates for individual regression predictions,
+    Zoran Bosnic, 2010) and return id of the method
+    that scored best on this data.
+    """
+
+    def __init__(self, estimators, folds=10):
+        self.estimators = estimators
+        self.folds = folds
+
+    def __call__(self, data, learner):
+
+        cvi = Orange.data.sample.SubsetIndicesCV(data, self.folds)
+        sum_of_rs = defaultdict(float)
+
+        elearner = Learner(learner, estimators=self.estimators)
+
+        #average correlations from each fold
+        for f in set(cvi):
+            learn = data.select(cvi, f, negate=True)
+            test = data.select(cvi, f)
+
+            res = Orange.evaluation.testing.learn_and_test_on_test_data([elearner], learn, test)
+            results = get_pearson_r(res)
+            for r, p, sa, method in results:
+                sum_of_rs[(method, sa)] += r
+
+        sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
+        chosen = sum_of_rs[0][0]
+        print "chosen", chosen
+        print sum_of_rs
+
+        lf = elearner(data)
+        return ICVClassifier(chosen, lf)
+
+
+class ICVClassifier:
+
+    def __init__(self, chosen, reliability_classifier):
+        self.chosen = chosen
+        self.reliability_classifier = reliability_classifier
+
+    def __call__(self, instance, *args):
+        re = self.reliability_classifier(instance, Orange.core.GetProbabilities).reliability_estimate
+        for e in re:
+            if e.method == self.chosen[0] and e.signed_or_absolute == self.chosen[1]:
+                r = e.estimate
+
+        return [ Estimate(r, self.chosen[1], ICV_METHOD) ]

 class Learner:
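The rewritten ICV replaces the internal_cross_validation* methods removed from Learner below: it cross-validates the candidate estimators on the training data, sums the per-fold Pearson correlations between each estimate and the actual errors, and afterwards exposes only the single best (method, signed/absolute) combination. A usage sketch under the same illustrative assumptions as above (Orange 2.x, housing data, kNN base learner):

    import Orange
    import orangecontrib.reliability as reliability

    data = Orange.data.Table("housing")
    base = Orange.classification.knn.kNNLearner()

    candidates = [reliability.BaggingVariance(m=20),
                  reliability.CNeighbours(),
                  reliability.LocalCrossValidation()]

    # ICV picks the candidate whose estimates correlate best with the errors
    # over the internal folds and returns only that estimate afterwards.
    icv = reliability.ICV(estimators=candidates, folds=10)
    rel_classifier = reliability.Learner(base, estimators=[icv])(data)

    value, probs = rel_classifier(data[0], Orange.core.GetBoth)
    print value, probs.reliability_estimate[0].estimate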
     
@@ r35:918  r37:1101 @@

         return Classifier(instances, self.box_learner, self.estimators, self.blending, new_domain, blending_classifier)
-
-    def internal_cross_validation(self, instances, folds=10):
-        """ Perform the internal cross validation for getting the best
-        reliability estimate. It uses the reliability estimators defined in
-        estimators attribute.
-
-        Returns the id of the method that scored the best.
-
-        :param instances: Data instances to use for ICV.
-        :type instances: :class:`Orange.data.Table`
-        :param folds: number of folds for ICV.
-        :type folds: int
-        :rtype: int
-
-        """
-        res = Orange.evaluation.testing.cross_validation([self], instances, folds=folds)
-        results = get_pearson_r(res)
-        sorted_results = sorted(results)
-        return sorted_results[-1][3]
-
-    def internal_cross_validation_testing(self, instances, folds=10):
-        """ Perform internal cross validation (as in Automatic selection of
-        reliability estimates for individual regression predictions,
-        Zoran Bosnic, 2010) and return id of the method
-        that scored best on this data.
-
-        :param instances: Data instances to use for ICV.
-        :type instances: :class:`Orange.data.Table`
-        :param folds: number of folds for ICV.
-        :type folds: int
-        :rtype: int
-
-        """
-        cv_indices = Orange.core.MakeRandomIndicesCV(instances, folds)
-
-        list_of_rs = []
-
-        sum_of_rs = defaultdict(float)
-
-        for fold in xrange(folds):
-            data = instances.select(cv_indices, fold)
-            if len(data) < 10:
-                res = Orange.evaluation.testing.leave_one_out([self], data)
-            else:
-                res = Orange.evaluation.testing.cross_validation([self], data)
-            results = get_pearson_r(res)
-            for r, _, _, method in results:
-                sum_of_rs[method] += r
-        sorted_sum_of_rs = sorted(sum_of_rs.items(), key=lambda estimate: estimate[1], reverse=True)
-        return sorted_sum_of_rs[0][0]
-
-    labels = ["SAvar", "SAbias", "BAGV", "CNK", "LCV", "BVCK", "Mahalanobis", "ICV"]
-
+
 class Classifier:
     """