Changeset 3671:2695d18bf2e0 in orange


Ignore:
Timestamp:
05/18/07 18:24:20 (7 years ago)
Author:
Gregor <Gregor@…>
Branch:
default
Convert:
8b2dbdaef01d6b521ecebdef5bcf899f45c5636c
Message:
  • added MDL and distance measure for measuring quality of attributes
File:
1 edited

Legend:

Unmodified
Added
Removed
  • orange/orngEvalAttr.py

    r1 r3671  
    11### Janez 03-02-14: Added weights 
    22### Inform Blaz and remove this comment 
     3import orange 
     4 
     5class OrderAttributesByMeasure: 
     6    def __init__(self, measure=None): 
     7        self.measure=measure 
     8 
     9    def __call__(self, data, weight): 
     10        if self.measure: 
     11            measure=self.measure 
     12        else: 
     13            measure=orange.MeasureAttribute_relief(m=5,k=10) 
     14 
     15        measured=[(attr, measure(attr, data, None, weight)) for attr in data.domain.attributes] 
     16        measured.sort(lambda x, y: cmp(x[1], y[1])) 
     17        return [x[0] for x in measured] 
     18 
     19def MeasureAttribute_Distance(attr = None, data = None): 
     20    m = MeasureAttribute_DistanceClass() 
     21    if attr != None and data != None: 
     22        return m(attr, data) 
     23    else: 
     24        return m 
    325 
    426 
    5 class OrderAttributesByMeasure: 
    6   def __init__(self, measure=None): 
    7     self.measure=measure 
     27# measure 1-D as described in Strojno ucenje (Kononenko) 
     28class MeasureAttribute_DistanceClass(orange.MeasureAttribute): 
     29    def __call__(self, attr, data, aprioriDist = None, weightID = None): 
     30        import numpy 
     31        from orngContingency import Entropy 
     32        if attr in data.domain:  # if we receive attr as string we have to convert to variable 
     33            attr = data.domain[attr] 
     34        attrClassCont = orange.ContingencyAttrClass(attr, data) 
     35        dist = [] 
     36        for vals in attrClassCont.values(): 
     37            dist += list(vals) 
     38        classAttrEntropy = Entropy(numpy.array(dist)) 
     39        infoGain = orange.MeasureAttribute_info(attr, data) 
     40        if classAttrEntropy > 0: 
     41            return float(infoGain) / classAttrEntropy 
     42        else: 
     43            return 0 
    844 
    9   def __call__(self, data, weight): 
    10     if self.measure: 
    11       measure=self.measure 
     45# attribute quality measure based on the minimum description length principle 
     46def MeasureAttribute_MDL(attr = None, data = None): 
     47    m = MeasureAttribute_MDLClass() 
     48    if attr != None and data != None: 
     49        return m(attr, data) 
    1250    else: 
    13       measure=orange.MeasureAttribute_relief(m=5,k=10) 
    14        
    15     measured=[(attr, measure(attr, data, None, weight)) for attr in data.domain.attributes] 
    16     measured.sort(lambda x, y: cmp(x[1], y[1])) 
    17     return [x[0] for x in measured] 
     51        return m 
     52 
     53class MeasureAttribute_MDLClass(orange.MeasureAttribute): 
     54    def __call__(self, attr, data, aprioriDist = None, weightID = None): 
     55        attrClassCont = orange.ContingencyAttrClass(attr, data) 
     56        classDist = orange.Distribution(data.domain.classVar, data).values() 
     57        nCls = len(classDist) 
     58        nEx = len(data) 
     59        priorMDL = logMultipleCombs(nEx, classDist) + logMultipleCombs(nEx+nCls-1, [nEx, nCls-1]) 
     60        postPart1 = [logMultipleCombs(sum(attrClassCont[key]), attrClassCont[key].values()) for key in attrClassCont.keys()] 
     61        postPart2 = [logMultipleCombs(sum(attrClassCont[key])+nCls-1, [sum(attrClassCont[key]), nCls-1]) for key in attrClassCont.keys()] 
     62        ret = priorMDL 
     63        for val in postPart1 + postPart2: 
     64            ret -= val 
     65        return ret / max(1, nEx) 
    1866 
    1967 
     68# compute log(n! / (k1! * k2! * k3! * ... * kc!))
     69# ks = [k1, k2, ...] 
     70def logMultipleCombs(n, ks): 
     71    import math 
     72    m = max(ks) 
     73    ks.remove(m) 
     74    resArray = [] 
     75    for (start, end) in [(m+1, n+1)] + [(1, k+1) for k in ks]: 
     76        ret = 0 
     77        curr = 1 
     78        for val in range(int(start), int(end)): 
     79            curr *= val 
     80            if curr > 1e40: 
     81                ret += math.log(curr) 
     82                curr = 1 
     83        ret += math.log(curr) 
     84        resArray.append(ret) 
     85    ret = resArray[0] 
     86    for val in resArray[1:]: 
     87        ret -= val 
     88    return ret 
     89 
     90 
     91 
     92##if __name__=="__main__": 
     93##    data = orange.ExampleTable(r"E:\Development\Orange Datasets\UCI\zoo.tab") 
     94##    #newFeature, quality = FeatureByCartesianProduct(data, ["sex", "age"]) 
     95##    #MeasureAttribute_Distance()(newFeature, data) 
     96##    #print logMultipleCombs(200, [70,30,100]) 
     97##    #import orngCI 
     98##    #newFeature, quality = orngCI.FeatureByIM(data, ["milk", "airborne"], binary = 0, measure = MeasureAttribute_MDL()) 
     99     
Note: See TracChangeset for help on using the changeset viewer.