Changeset 5:f50b356a019c in orange-reliability for _reliability/__init__.py


Timestamp:
06/09/12 13:54:12 (23 months ago)
Author:
Matija Polajnar <matija.polajnar@…>
Branch:
default
Message:

Merge in Lan Umek's implementations of reliability estimation for classification.

File:
1 edited

Legend:

  unchanged
+ added
- removed
… unchanged lines omitted
  • _reliability/__init__.py

r0 → r5

  ICV_METHOD = 10
  MAHAL_TO_CENTER_ABSOLUTE = 13
+ DENS_ABSOLUTE = 14

  # Type of estimator constant

…

                 6: "LCV absolute", 7: "BVCK_absolute", 8: "Mahalanobis absolute",
                 9: "BLENDING absolute", 10: "ICV", 11: "RF Variance", 12: "RF Std",
-                13: "Mahalanobis to center"}
+                13: "Mahalanobis to center", 14: "Density based"}

  select_with_repeat = Orange.core.MakeRandomIndicesMultiple()
     
…

      return statc.betai(df * 0.5, 0.5, df / (df + t * t))

+
+ # Distances between two discrete probability distributions
+ # TODO: document these.
+ def normalize_both(p, q):
+     if not p.normalized:
+         p.normalize()
+     if not q.normalized:
+         q.normalize()
+     return p, q
+
+ def minkowsky_dist(p, q, m=2):
+     p, q = normalize_both(p, q)
+     dist = 0
+     for i in range(len(p)):
+         dist += abs(p[i] - q[i]) ** m
+     return dist ** (1. / m)
+
+ def manhattan_distance(p, q):
+     return minkowsky_dist(p, q, m=1)
+
+ def euclidean_dist(p, q):
+     return minkowsky_dist(p, q, m=2)
+
+ def variance_dist(p, q):
+     return euclidean_dist(p, q) ** 2
+
+ def max_dist(p, q):
+     p, q = normalize_both(p, q)
+     return max([abs(p[i] - q[i]) for i in range(len(p))])
+
+ def hellinger_dist(p, q):
+     # Note: returns the squared Hellinger distance (no final square root).
+     p, q = normalize_both(p, q)
+     dist = 0
+     for i in range(len(p)):
+         dist += (math.sqrt(p[i]) - math.sqrt(q[i])) ** 2
+     return dist
+
+ def my_log(x):
+     return 0 if x == 0 else x * math.log(x)
+ def kullback_leibler(p, q):
+     p, q = normalize_both(p, q)
+     dist = 0
+     for i in range(len(p)):
+         # KL(p || q) = sum_i p_i * log(p_i / q_i); terms with p_i == 0
+         # contribute 0, and components with q_i == 0 are skipped.
+         if p[i] > 0 and q[i] > 0:
+             dist += p[i] * math.log(p[i] / q[i])
+     return dist
+
+ def cosine(p, q):
+     p, q = normalize_both(p, q)
+     p, q = [pp for pp in p], [qq for qq in q]
+     return 1 - numpy.dot(p, q) / (numpy.linalg.norm(p) * numpy.linalg.norm(q))
+
+
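A quick illustration of these helpers (editor's sketch, not part of the changeset): the Dist stand-in class below is hypothetical and only mimics the normalized/normalize interface that normalize_both() expects from Orange distributions.

    class Dist(list):
        # minimal stand-in for an Orange discrete distribution
        normalized = False
        def normalize(self):
            s = float(sum(self))
            for i in range(len(self)):
                self[i] /= s
            self.normalized = True

    p, q = Dist([0.2, 0.8]), Dist([0.5, 0.5])
    print euclidean_dist(p, q)    # ~0.4243
    print hellinger_dist(p, q)    # ~0.1026 (squared Hellinger)
    print kullback_leibler(p, q)  # ~0.1927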
  class Estimate:
      """

…

      :math:`m` different bagging models are constructed and used to estimate
-     the value of dependent variable for a given instance. The variance of
-     those predictions is used as a prediction reliability estimate.
+     the value of the dependent variable for a given instance. In regression,
+     the variance of those predictions is used as the prediction reliability
+     estimate.

      :math:`BAGV = \\frac{1}{m} \sum_{i=1}^{m} (K_i - K)^2`

      where :math:`K = \\frac{\sum_{i=1}^{m} K_i}{m}` and :math:`K_i` are
-     predictions of individual constructed models.
+     predictions of the individual constructed models. Note that a greater
+     value implies a greater error.
+
+     For classification, the BAGV measure is 1 minus the average Euclidean
+     distance between the class probability distribution predicted by the
+     model and the distributions predicted by the individual bagged models.
+     Note that in this case a greater value implies a better prediction.

      """
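To make the formula concrete, a small worked example (editor's illustration; the numbers are made up):

    K_i = [4.9, 5.1, 5.0]                              # m = 3 bagged predictions
    K = sum(K_i) / len(K_i)                            # 5.0
    BAGV = sum((k - K) ** 2 for k in K_i) / len(K_i)   # ~0.0067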
     
…

      def __call__(self, instances, learner):
          classifiers = []
+
+         if instances.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
+             classifier = learner(instances)
+         else:
+             classifier = None

          # Create bagged classifiers using sampling with replacement

…

              data = instances.select(selection)
              classifiers.append(learner(data))
-         return BaggingVarianceClassifier(classifiers)
+         return BaggingVarianceClassifier(classifiers, classifier)

  class BaggingVarianceClassifier:

…
-     def __init__(self, classifiers):
-         self.classifiers = classifiers
+     def __init__(self, classifiers, classifier=None):
+         self.classifiers = classifiers
+         self.classifier = classifier

      def __call__(self, instance, *args):
          BAGV = 0

          # Calculate the bagging variance
-         bagged_values = [c(instance, Orange.core.GetValue).value for c in self.classifiers if c is not None]
+         if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
+             bagged_values = [c(instance, Orange.core.GetValue).value for c in self.classifiers if c is not None]
+         elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
+             estimate = self.classifier(instance, Orange.core.GetProbabilities)
+             bagged_values = [euclidean_dist(c(instance, Orange.core.GetProbabilities), estimate) for c in self.classifiers if c is not None]

          k = sum(bagged_values) / len(bagged_values)

          BAGV = sum((bagged_value - k) ** 2 for bagged_value in bagged_values) / len(bagged_values)
+         if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
+             BAGV = 1 - BAGV

          return [Estimate(BAGV, ABSOLUTE, BAGV_ABSOLUTE)]
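Below is a minimal usage sketch (editor's addition, assuming Orange 2.x with this add-on installed as Orange.evaluation.reliability and the bundled iris data set):

    import Orange

    data = Orange.data.Table("iris")
    learner = Orange.classification.bayes.NaiveLearner()
    bagv = Orange.evaluation.reliability.BaggingVariance()
    rel_learner = Orange.evaluation.reliability.Learner(learner, estimators=[bagv])

    classifier = rel_learner(data)
    value, probs = classifier(data[0], Orange.core.GetBoth)
    print probs.reliability_estimate[0].estimate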
     
…

  class LocalCrossValidation:
      """

      :param k: Number of nearest neighbours used in LCV estimate
      :type k: int

+     :param distance: function that computes a distance between two discrete
+         distributions (used only in classification problems). The default
+         is the Hellinger distance.
+     :type distance: function
+
+     :param distance_weighted: for classification reliability estimation,
+         use the average distance between distributions, weighted by :math:`e^{-d}`,
+         where :math:`d` is the distance between the predicted instance and the
+         neighbour.
+
      :rtype: :class:`Orange.evaluation.reliability.LocalCrossValidationClassifier`

      :math:`k` nearest neighbours to the given instance are found and put in
      a separate data set. On this data set, a leave-one-out validation is
-     performed. Reliability estimate is then the distance weighted absolute
-     prediction error.
+     performed. For regression, the reliability estimate is the distance
+     weighted absolute prediction error. For classification, it is 1 minus
+     the average distance between the predicted class probability distribution
+     and the (trivial) class distributions of the nearest neighbours.

      If a special value 0 is passed as :math:`k` (as is by default),
      it is set to 1/20 of the data set size (or 5, whichever is greater).

+     Summary of the algorithm for regression:
+
      1. Determine the set of k nearest neighbours :math:`N = { (x_1, c_1),...,
         (x_k, c_k)}`.

…

         prediction errors :math:`E_i = | C_i - K_i |`.
      3. :math:`LCV(x) = \\frac{ \sum_{(x_i, c_i) \in N} d(x_i, x) * E_i }{ \sum_{(x_i, c_i) \in N} d(x_i, x) }`

      """
-     def __init__(self, k=0):
+     def __init__(self, k=0, distance=hellinger_dist, distance_weighted=True):
          self.k = k
+         self.distance = distance
+         self.distance_weighted = distance_weighted

      def __call__(self, instances, learner):

…

              self.k = max(5, len(instances) / 20)

-         return LocalCrossValidationClassifier(distance_id, nearest_neighbours, self.k, learner)
+         return LocalCrossValidationClassifier(distance_id, nearest_neighbours, self.k, learner,
+             distance=self.distance, distance_weighted=self.distance_weighted)

  class LocalCrossValidationClassifier:
-     def __init__(self, distance_id, nearest_neighbours, k, learner):
+     def __init__(self, distance_id, nearest_neighbours, k, learner, **kwds):
          self.distance_id = distance_id
          self.nearest_neighbours = nearest_neighbours
          self.k = k
          self.learner = learner
+         for a, b in kwds.items():
+             setattr(self, a, b)

      def __call__(self, instance, *args):

…

              classifier = self.learner(Orange.data.Table(train))

-             returned_value = classifier(knn[i], Orange.core.GetValue)
-
-             e = abs(knn[i].getclass().value - returned_value.value)
-
-             LCVer += e * math.exp(-knn[i][self.distance_id])
-             LCVdi += math.exp(-knn[i][self.distance_id])
+             if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
+                 returned_value = classifier(knn[i], Orange.core.GetValue)
+                 e = abs(knn[i].getclass().value - returned_value.value)
+
+             elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
+                 returned_value = classifier(knn[i], Orange.core.GetProbabilities)
+                 # trivial (0/1) distribution of the neighbour's actual class
+                 probabilities = [knn[i].get_class() == val for val in instance.domain.class_var.values]
+                 e = self.distance(returned_value, Orange.statistics.distribution.Discrete(probabilities))
+
+             dist = math.exp(-knn[i][self.distance_id]) if self.distance_weighted else 1.0
+             LCVer += e * dist
+             LCVdi += dist

          LCV = LCVer / LCVdi if LCVdi != 0 else 0
          if math.isnan(LCV):
              LCV = 0.0
+
+         if instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
+             LCV = 1 - LCV
+
          return [Estimate(LCV, ABSOLUTE, LCV_ABSOLUTE)]
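The distance weighting used in the code (:math:`e^{-d}`) can be illustrated with made-up numbers (editor's sketch; the errors and neighbour distances are hypothetical):

    import math

    errors = [0.2, 0.5, 0.1]       # per-neighbour leave-one-out errors E_i
    distances = [0.1, 1.0, 2.5]    # distances from the predicted instance
    weights = [math.exp(-d) for d in distances]
    lcv = sum(e * w for e, w in zip(errors, weights)) / sum(weights)
    print lcv                      # ~0.275; the nearest neighbours dominate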

…

      :param k: Number of nearest neighbours used in CNK estimate
      :type k: int

+     :param distance: function that computes a distance between two discrete
+         distributions (used only in classification problems). The default
+         is the Hellinger distance.
+     :type distance: function

      :rtype: :class:`Orange.evaluation.reliability.CNeighboursClassifier`

-     CNK is defined for an unlabeled instance as a difference between average
-     label of its nearest neighbours and its prediction. CNK can be used as a
-     signed or absolute estimate.
+     For regression, CNK is defined for an unlabeled instance as the difference
+     between the average label of its nearest neighbours and its prediction. CNK
+     can be used as a signed or absolute estimate.

      :math:`CNK = \\frac{\sum_{i=1}^{k}C_i}{k} - K`

      where :math:`k` denotes the number of neighbours, C :sub:`i` denotes the neighbours'
-     labels and :math:`K` denotes the instance's prediction.
+     labels and :math:`K` denotes the instance's prediction. Note that a greater
+     value implies a greater prediction error.
+
+     For classification, CNK is equal to 1 minus the average distance between
+     the predicted class distribution and the (trivial) class distributions of the
+     :math:`k` nearest neighbours from the learning set. Note that in this case
+     a greater value implies a better prediction.

      """
-     def __init__(self, k=5):
+     def __init__(self, k=5, distance=hellinger_dist):
          self.k = k
+         self.distance = distance

      def __call__(self, instances, learner):

…

          distance_id = Orange.feature.Descriptor.new_meta_id()
          nearest_neighbours = nearest_neighbours_constructor(instances, 0, distance_id)
-         return CNeighboursClassifier(nearest_neighbours, self.k)
+         return CNeighboursClassifier(nearest_neighbours, self.k, distance=self.distance)

  class CNeighboursClassifier:

…


          # regression: average label of the neighbours
-         for ex in knn:
-             CNK += ex.getclass().value
-
-         CNK /= self.k
-         CNK -= predicted.value
-
-         return [Estimate(CNK, SIGNED, CNK_SIGNED),
-                 Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
+         if instance.domain.class_var.var_type == Orange.feature.Descriptor.Continuous:
+             for ex in knn:
+                 CNK += ex.getclass().value
+             CNK /= self.k
+             CNK -= predicted.value
+
+             return [Estimate(CNK, SIGNED, CNK_SIGNED),
+                     Estimate(abs(CNK), ABSOLUTE, CNK_ABSOLUTE)]
+         elif instance.domain.class_var.var_type == Orange.feature.Descriptor.Discrete:
+             knn_l = Orange.classification.knn.kNNLearner(k=self.k)
+             knn_c = knn_l(knn)
+             for ex in knn:
+                 CNK -= self.distance(probabilities, knn_c(ex, Orange.classification.Classifier.GetProbabilities))
+             CNK /= self.k
+             CNK += 1
+
+             return [Estimate(CNK, ABSOLUTE, CNK_ABSOLUTE)]
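For the regression branch, a quick numeric check of the CNK definition (editor's illustration with made-up labels):

    neighbour_labels = [4.0, 5.0, 6.0, 5.5, 4.5]   # labels of the k = 5 neighbours
    prediction = 5.4
    cnk = sum(neighbour_labels) / len(neighbour_labels) - prediction
    print cnk, abs(cnk)                            # -0.4 (signed), 0.4 (absolute)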

  class Mahalanobis:

…


          return [Estimate(value.value, SIGNED, SABIAS_SIGNED)]
+
+ def gauss_kernel(x, sigma=1):
+     return 1. / (sigma * math.sqrt(2 * math.pi)) * math.exp(-1. / 2 * (x / sigma) ** 2)
+
+ class ParzenWindowDensityBased:
+     """
+     :param K: kernel function. Default: Gaussian.
+     :type K: function
+
+     :param d_measure: distance measure for inter-instance distance.
+     :type d_measure: :class:`Orange.distance.DistanceConstructor`
+
+     :rtype: :class:`Orange.evaluation.reliability.ParzenWindowDensityBasedClassifier`
+
+     Returns a value that estimates the density of the problem space around the
+     instance being predicted.
+     """
+     def __init__(self, K=gauss_kernel, d_measure=Orange.distance.Euclidean()):
+         self.K = K
+         self.d_measure = d_measure
+
+     def __call__(self, instances, learner=None):
+         # learner is unused; accepted for interface compatibility with the
+         # other estimators, which are called as estimator(instances, learner)
+
+         self.distance = self.d_measure(instances)
+
+         def density(x):
+             l, dens = len(instances), 0
+             for ex in instances:
+                 dens += self.K(self.distance(x, ex))
+             return dens / l
+
+         max_density = max([density(ex) for ex in instances])
+
+         return ParzenWindowDensityBasedClassifier(density, max_density)
+
+ class ParzenWindowDensityBasedClassifier:
+
+     def __init__(self, density, max_density):
+         self.density = density
+         self.max_density = max_density
+
+     def __call__(self, instance, *args):
+
+         DENS = self.max_density - self.density(instance)
+
+         return [Estimate(DENS, ABSOLUTE, DENS_ABSOLUTE)]
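A quick sanity check of the kernel (editor's sketch; the expected values follow directly from the Gaussian density formula above):

    print gauss_kernel(0.0)            # 1/sqrt(2*pi) ~ 0.3989
    print gauss_kernel(1.0)            # ~0.2420
    print gauss_kernel(1.0, sigma=2)   # ~0.1760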

  class Learner:

…

          else:
              return predicted, probabilities
+
+ # Functions for testing and plotting
+ # TODO: document these.
+ def get_acc_rel(method, data, learner):
+     estimators = [method]
+     reliability = Orange.evaluation.reliability.Learner(learner, estimators=estimators)
+     #results = Orange.evaluation.testing.leave_one_out([reliability], data)
+     results = Orange.evaluation.testing.cross_validation([reliability], data)
+
+     rels, acc = [], []
+
+     for res in results.results:
+         rels.append(res.probabilities[0].reliability_estimate[0].estimate)
+         acc.append(res.probabilities[0][res.actual_class])
+
+     return rels, acc
+
+ def acc_rel_plot(method, data, learner, file_name="acc_rel_plot.png", colors=None):
+
+     import matplotlib.pylab as plt
+
+     plt.clf()
+
+     rels, acc = get_acc_rel(method, data, learner)
+     print "rels", rels
+     print "acc", acc
+
+     if colors is None:
+         colors = "k"
+     plt.scatter(acc, rels, c=colors)
+     plt.xlim(0., 1.)
+     plt.ylim(ymin=0.)
+     plt.savefig(file_name)
+
+ def acc_rel_correlation(method, data, learner):
+     import scipy.stats
+     rels, acc = get_acc_rel(method, data, learner)
+     return scipy.stats.spearmanr(acc, rels)[0]
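A usage sketch for these helpers (editor's addition; assumes Orange 2.x, the bundled iris data set, and matplotlib/scipy being installed):

    import Orange

    data = Orange.data.Table("iris")
    learner = Orange.classification.knn.kNNLearner()
    method = Orange.evaluation.reliability.BaggingVariance()

    print acc_rel_correlation(method, data, learner)   # Spearman rho
    acc_rel_plot(method, data, learner, file_name="bagv_iris.png")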