Changeset 7199:0e112d39e1de in orange


Ignore:
Timestamp:
02/02/11 16:42:51 (3 years ago)
Author:
tomazc <tomaz.curk@…>
Branch:
default
Convert:
9746ac406e4d71d2404ef531098cf4aec00595e1
Message:
 
Location:
orange
Files:
8 added
4 edited

Legend:

Unmodified
Added
Removed
  • orange/doc/Orange/rst/index.rst

    r7198 r7199  
    1818   orange.classification.rules 
    1919 
     20   orange.feature 
     21 
    2022Indices and tables 
    2123================== 
  • orange/orngDisc.py

    r6538 r7199  
    1 import orange 
    2  
    3 def entropyDiscretization(data): 
    4   """ 
    5   Discretizes continuous attributes using the entropy based discretization. 
    6   It removes the attributes discretized to a single interval and prints their names. 
    7   Arguments: data 
    8   Returns:   table of examples with discretized atributes. Attributes that are 
    9              categorized to a single value (constant) are removed. 
    10   """ 
    11   orange.setrandseed(0) 
    12   tablen=orange.Preprocessor_discretize(data, method=orange.EntropyDiscretization()) 
    13  
    14   attrlist=[] 
    15   nrem=0 
    16   for i in tablen.domain.attributes: 
    17     if (len(i.values)>1): 
    18       attrlist.append(i) 
    19     else: 
    20       nrem=nrem+1 
    21  
    22   attrlist.append(tablen.domain.classVar) 
    23   return tablen.select(attrlist) 
    24  
    25  
    26 class EntropyDiscretization: 
    27   def __call__(self, data): 
    28     return entropyDiscretization(data) 
    29  
    30  
    31 def DiscretizedLearner(baseLearner, examples=None, weight=0, **kwds): 
    32   learner = apply(DiscretizedLearner_Class, [baseLearner], kwds) 
    33   if examples: return learner(examples, weight) 
    34   else: return learner 
    35  
    36 class DiscretizedLearner_Class: 
    37   def __init__(self, baseLearner, discretizer=EntropyDiscretization(), **kwds): 
    38     self.baseLearner = baseLearner 
    39     if hasattr(baseLearner, "name"): 
    40       self.name = baseLearner.name 
    41     self.discretizer = discretizer 
    42     self.__dict__.update(kwds) 
    43   def __call__(self, data, weight=None): 
    44     # filter the data and then learn 
    45     ddata = self.discretizer(data) 
    46     if weight<>None: 
    47       model = self.baseLearner(ddata, weight) 
    48     else: 
    49       model = self.baseLearner(ddata) 
    50     dcl = DiscretizedClassifier(classifier = model) 
    51     if hasattr(model, "domain"): 
    52       dcl.domain = model.domain 
    53     if hasattr(model, "name"): 
    54       dcl.name = model.name 
    55     return dcl 
    56  
    57 class DiscretizedClassifier: 
    58   def __init__(self, **kwds): 
    59     self.__dict__.update(kwds) 
    60   def __call__(self, example, resultType = orange.GetValue): 
    61     return self.classifier(example, resultType) 
     1from Orange.discretization import * 
  • orange/orngEvalAttr.py

    r6538 r7199  
    11### Janez 03-02-14: Added weights 
    22### Inform Blaz and remove this comment 
    3 import orange 
    43 
    5 class OrderAttributesByMeasure: 
    6     def __init__(self, measure=None): 
    7         self.measure=measure 
    8  
    9     def __call__(self, data, weight): 
    10         if self.measure: 
    11             measure=self.measure 
    12         else: 
    13             measure=orange.MeasureAttribute_relief(m=5,k=10) 
    14  
    15         measured=[(attr, measure(attr, data, None, weight)) for attr in data.domain.attributes] 
    16         measured.sort(lambda x, y: cmp(x[1], y[1])) 
    17         return [x[0] for x in measured] 
    18  
    19 def MeasureAttribute_Distance(attr = None, data = None): 
    20     m = MeasureAttribute_DistanceClass() 
    21     if attr != None and data != None: 
    22         return m(attr, data) 
    23     else: 
    24         return m 
    25  
    26  
    27 # measure 1-D as described in Strojno ucenje (Kononenko) 
    28 class MeasureAttribute_DistanceClass(orange.MeasureAttribute): 
    29     def __call__(self, attr, data, aprioriDist = None, weightID = None): 
    30         import numpy 
    31         from orngContingency import Entropy 
    32         if attr in data.domain:  # if we receive attr as string we have to convert to variable 
    33             attr = data.domain[attr] 
    34         attrClassCont = orange.ContingencyAttrClass(attr, data) 
    35         dist = [] 
    36         for vals in attrClassCont.values(): 
    37             dist += list(vals) 
    38         classAttrEntropy = Entropy(numpy.array(dist)) 
    39         infoGain = orange.MeasureAttribute_info(attr, data) 
    40         if classAttrEntropy > 0: 
    41             return float(infoGain) / classAttrEntropy 
    42         else: 
    43             return 0 
    44  
    45 # attribute quality measure based on the minimum description length principle 
    46 def MeasureAttribute_MDL(attr = None, data = None): 
    47     m = MeasureAttribute_MDLClass() 
    48     if attr != None and data != None: 
    49         return m(attr, data) 
    50     else: 
    51         return m 
    52  
    53 class MeasureAttribute_MDLClass(orange.MeasureAttribute): 
    54     def __call__(self, attr, data, aprioriDist = None, weightID = None): 
    55         attrClassCont = orange.ContingencyAttrClass(attr, data) 
    56         classDist = orange.Distribution(data.domain.classVar, data).values() 
    57         nCls = len(classDist) 
    58         nEx = len(data) 
    59         priorMDL = logMultipleCombs(nEx, classDist) + logMultipleCombs(nEx+nCls-1, [nEx, nCls-1]) 
    60         postPart1 = [logMultipleCombs(sum(attrClassCont[key]), attrClassCont[key].values()) for key in attrClassCont.keys()] 
    61         postPart2 = [logMultipleCombs(sum(attrClassCont[key])+nCls-1, [sum(attrClassCont[key]), nCls-1]) for key in attrClassCont.keys()] 
    62         ret = priorMDL 
    63         for val in postPart1 + postPart2: 
    64             ret -= val 
    65         return ret / max(1, nEx) 
    66  
    67  
    68 # compute n! / k1! * k2! * k3! * ... kc! 
    69 # ks = [k1, k2, ...] 
    70 def logMultipleCombs(n, ks): 
    71     import math 
    72     m = max(ks) 
    73     ks.remove(m) 
    74     resArray = [] 
    75     for (start, end) in [(m+1, n+1)] + [(1, k+1) for k in ks]: 
    76         ret = 0 
    77         curr = 1 
    78         for val in range(int(start), int(end)): 
    79             curr *= val 
    80             if curr > 1e40: 
    81                 ret += math.log(curr) 
    82                 curr = 1 
    83         ret += math.log(curr) 
    84         resArray.append(ret) 
    85     ret = resArray[0] 
    86     for val in resArray[1:]: 
    87         ret -= val 
    88     return ret 
    89  
    90  
    91  
def mergeAttrValues(data, attrList, attrMeasure, removeUnusedValues = 1):
    # Greedily merge values of the cartesian product of the attributes in
    # attrList for as long as the merged attribute scores at least as well
    # by attrMeasure.  Returns the merged attribute; when removeUnusedValues
    # is true, values that never occur in the data are dropped from it.
    import orngCI
    #data = data.select([data.domain[attr] for attr in attrList] + [data.domain.classVar])
    newData = data.select(attrList + [data.domain.classVar])
    newAttr = orngCI.FeatureByCartesianProduct(newData, attrList)[0]
    dist = orange.Distribution(newAttr, newData)
    # only values that actually occur take part in the merging
    activeValues = []
    for i in range(len(newAttr.values)):
        if dist[newAttr.values[i]] > 0: activeValues.append(i)
    currScore = attrMeasure(newAttr, newData)
    while 1:
        # try every ordered pair of active values: tentatively redirect
        # ind1 onto ind2 in the lookup table, score, then restore
        bestScore, bestMerge = currScore, None
        for i1, ind1 in enumerate(activeValues):
            oldInd1 = newAttr.getValueFrom.lookupTable[ind1]
            for ind2 in activeValues[:i1]:
                newAttr.getValueFrom.lookupTable[ind1] = ind2
                score = attrMeasure(newAttr, newData)
                if score >= bestScore:
                    bestScore, bestMerge = score, (ind1, ind2)
                newAttr.getValueFrom.lookupTable[ind1] = oldInd1

        if bestMerge:
            # commit the best merge: every table entry mapping to ind1 is
            # redirected onto ind2 and the value names are joined with '+'
            ind1, ind2 = bestMerge
            currScore = bestScore
            for i, l in enumerate(newAttr.getValueFrom.lookupTable):
                if not l.isSpecial() and int(l) == ind1:
                    newAttr.getValueFrom.lookupTable[i] = ind2
            newAttr.values[ind2] = newAttr.values[ind2] + "+" + newAttr.values[ind1]
            del activeValues[activeValues.index(ind1)]
        else:
            break

    if not removeUnusedValues:
        return newAttr

    # build a reduced attribute containing only the surviving values
    reducedAttr = orange.EnumVariable(newAttr.name, values = [newAttr.values[i] for i in activeValues])
    reducedAttr.getValueFrom = newAttr.getValueFrom
    reducedAttr.getValueFrom.classVar = reducedAttr
    return reducedAttr
    131  
    132  
    133  
    134 ##if __name__=="__main__": 
    135 ##    data = orange.ExampleTable(r"E:\Development\Orange Datasets\UCI\zoo.tab") 
    136 ##    #newFeature, quality = FeatureByCartesianProduct(data, ["sex", "age"]) 
    137 ##    #MeasureAttribute_Distance()(newFeature, data) 
    138 ##    #print logMultipleCombs(200, [70,30,100]) 
    139 ##    #import orngCI 
    140 ##    #newFeature, quality = orngCI.FeatureByIM(data, ["milk", "airborne"], binary = 0, measure = MeasureAttribute_MDL()) 
    141  
     4from Orange.feature.scoring import * 
  • orange/orngFSS.py

    r6538 r7199  
    1 import orange 
    2  
    3 ############################################################################## 
    4 # utility functions 
    5  
    6 def attMeasure(data, measure = orange.MeasureAttribute_relief(k=20, m=50)): 
    7   """ 
    8   Assesses the quality of attributes using the given measure, outputs the results and 
    9   returns a sorted list of tuples (attribute name, measure) 
    10   Arguments: data       example table 
    11              measure    an attribute scoring function (derived from orange.MeasureAttribute) 
    12   Result:    a sorted list of tuples (attribute name, measure) 
    13   """ 
    14   measl=[] 
    15   for i in data.domain.attributes: 
    16     measl.append((i.name, measure(i, data))) 
    17   measl.sort(lambda x,y:cmp(y[1], x[1])) 
    18     
    19 #  for i in measl: 
    20 #    print "%25s, %6.3f" % (i[0], i[1]) 
    21   return measl 
    22  
    23 def bestNAtts(scores, N): 
    24   """ 
    25   Returns the first N attributes from the list returned by function attMeasure. 
    26   Arguments: scores   a list such as one returned by "attMeasure" 
    27              N             the number of attributes 
    28   Result: the first N attributes (without measures) 
    29   """ 
    30   return map(lambda x:x[0], scores[:N]) 
    31  
    32 def attsAbovethreshold(scores, threshold=0.0): 
    33   """ 
    34   Returns attributes from the list returned by function attMeasure that 
    35   have the score above or equal to a specified threshold 
    36   Arguments: scores   a list such as one returned by "attMeasure" 
    37              threshold      threshold, default is 0.0 
    38   Result: the first N attributes (without measures) 
    39   """ 
    40   pairs = filter(lambda x, t=threshold: x[1] > t, scores) 
    41   return map(lambda x:x[0], pairs) 
    42  
    43 def selectBestNAtts(data, scores, N): 
    44   """ 
    45   Constructs and returns a new set of examples that includes a 
    46   class and only N best attributes from a list scores 
    47   Arguments: data          an example table 
    48              scores   a list such as one returned by "attMeasure" 
    49              N             the number of attributes 
    50   Result: data with the first N attributes (without measures) 
    51   """ 
    52   return data.select(bestNAtts(scores, N)+[data.domain.classVar.name]) 
    53  
    54  
    55 def selectAttsAboveThresh(data, scores, threshold=0.0): 
    56   """ 
    57   Constructs and returns a new set of examples that includes a 
    58   class and attributes from the list returned by function attMeasure that 
    59   have the score above or equal to a specified threshold 
    60   Arguments: data          an example table 
    61              scores      a list such as one returned by "attMeasure" 
    62              threshold      threshold, default is 0.0 
    63   Result: the first N attributes (without measures) 
    64   """ 
    65   return data.select(attsAbovethreshold(scores, threshold)+[data.domain.classVar.name]) 
    66  
    67  
    68 def filterRelieff(data, measure = orange.MeasureAttribute_relief(k=20, m=50), margin=0): 
    69   """ 
    70   Takes the data set and an attribute measure (Relief by default). Estimates 
    71   attibute score by the measure, removes worst attribute if its measure 
    72   is below the margin. Repeats, until no attribute has negative or zero score. 
    73   Arguments: data          an example table 
    74              measure       an attribute measure (derived from mlpy.MeasureAttribute) 
    75              margin        if score is higher than margin, attribute is not removed 
    76   """ 
    77   measl = attMeasure(data, measure) 
    78    
    79   while len(data.domain.attributes)>0 and measl[-1][1]<margin: 
    80     data = selectBestNAtts(data, measl, len(data.domain.attributes)-1) 
    81 #    print 'remaining ', len(data.domain.attributes) 
    82     measl = attMeasure(data, measure) 
    83   return data 
    84  
    85 ############################################################################## 
    86 # wrappers 
    87  
    88 def FilterAttsAboveThresh(data=None, **kwds): 
    89   filter = apply(FilterAttsAboveThresh_Class, (), kwds) 
    90   if data: return filter(data) 
    91   else: return filter 
    92    
    93 class FilterAttsAboveThresh_Class: 
    94   def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50), threshold=0.0): 
    95     self.measure = measure 
    96     self.threshold = threshold 
    97   def __call__(self, data): 
    98     ma = attMeasure(data, self.measure) 
    99     return selectAttsAboveThresh(data, ma, self.threshold) 
    100  
    101 # 
    102  
    103 def FilterBestNAtts(data=None, **kwds): 
    104   filter = apply(FilterBestNAtts_Class, (), kwds) 
    105   if data: return filter(data) 
    106   else: return filter 
    107    
    108 class FilterBestNAtts_Class: 
    109   def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50), n=5): 
    110     self.measure = measure 
    111     self.n = n 
    112   def __call__(self, data): 
    113     ma = attMeasure(data, self.measure) 
    114     self.n = min(self.n, len(data.domain.attributes)) 
    115     return selectBestNAtts(data, ma, self.n) 
    116  
    117 # 
    118  
    119 def FilterRelief(data=None, **kwds): 
    120   filter = apply(FilterRelief_Class, (), kwds) 
    121   if data: return filter(data) 
    122   else: return filter 
    123    
    124 class FilterRelief_Class: 
    125   def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50), margin=0): 
    126     self.measure = measure 
    127     self.margin = margin 
    128   def __call__(self, data): 
    129     return filterRelieff(data, self.measure, self.margin) 
    130  
    131 ############################################################################## 
    132 # wrapped learner 
    133  
    134 def FilteredLearner(baseLearner, examples = None, weight = None, **kwds): 
    135   learner = apply(FilteredLearner_Class, [baseLearner], kwds) 
    136   if examples: return learner(examples, weight) 
    137   else: return learner 
    138  
    139 class FilteredLearner_Class: 
    140   def __init__(self, baseLearner, filter=FilterAttsAboveThresh(), name='filtered'): 
    141     self.baseLearner = baseLearner 
    142     self.filter = filter 
    143     self.name = name 
    144   def __call__(self, data, weight=0): 
    145     # filter the data and then learn 
    146     fdata = self.filter(data) 
    147     model = self.baseLearner(fdata, weight) 
    148     return FilteredClassifier(classifier = model, domain = model.domain) 
    149  
    150 class FilteredClassifier: 
    151   def __init__(self, **kwds): 
    152     self.__dict__.update(kwds) 
    153   def __call__(self, example, resultType = orange.GetValue): 
    154     return self.classifier(example, resultType) 
    155   def atts(self): 
    156     return self.domain.attributes   
    157  
    158 class StepwiseLearner_Class: 
    159   def __init__(self, **kwds): 
    160     import orngStat 
    161     self.removeThreshold = 0.3 
    162     self.addThreshold = 0.2 
    163     self.stat, self.statsign = orngStat.CA, 1 
    164     self.__dict__.update(kwds) 
    165  
    166   def __call__(self, examples, weightID = 0, **kwds): 
    167     import orngTest, orngStat, statc 
    168      
    169     self.__dict__.update(kwds) 
    170  
    171     if self.removeThreshold < self.addThreshold: 
    172         raise "'removeThreshold' should be larger or equal to 'addThreshold'" 
    173  
    174     classVar = examples.domain.classVar 
    175      
    176     indices = orange.MakeRandomIndicesCV(examples, folds = getattr(self, "folds", 10)) 
    177     domain = orange.Domain([], classVar) 
    178  
    179     res = orngTest.testWithIndices([self.learner], orange.ExampleTable(domain, examples), indices) 
    180      
    181     oldStat = self.stat(res)[0] 
    182     oldStats = [self.stat(x)[0] for x in orngStat.splitByIterations(res)] 
    183     print ".", oldStat, domain 
    184     stop = False 
    185     while not stop: 
    186         stop = True 
    187         if len(domain.attributes)>=2: 
    188             bestStat = None 
    189             for attr in domain.attributes: 
    190                 newdomain = orange.Domain(filter(lambda x: x!=attr, domain.attributes), classVar) 
    191                 res = orngTest.testWithIndices([self.learner], (orange.ExampleTable(newdomain, examples), weightID), indices) 
    192                  
    193                 newStat = self.stat(res)[0] 
    194                 newStats = [self.stat(x)[0] for x in orngStat.splitByIterations(res)]  
    195                 print "-", newStat, newdomain 
    196                 ## If stat has increased (ie newStat is better than bestStat) 
    197                 if not bestStat or cmp(newStat, bestStat) == self.statsign: 
    198                     if cmp(newStat, oldStat) == self.statsign: 
    199                         bestStat, bestStats, bestAttr = newStat, newStats, attr 
    200                     elif statc.wilcoxont(oldStats, newStats)[1] > self.removeThreshold: 
    201                             bestStat, bestAttr, bestStats = newStat, newStats, attr 
    202             if bestStat: 
    203                 domain = orange.Domain(filter(lambda x: x!=bestAttr, domain.attributes), classVar) 
    204                 oldStat, oldStats = bestStat, bestStats 
    205                 stop = False 
    206                 print "removed", bestAttr.name 
    207  
    208         bestStat, bestAttr = oldStat, None 
    209         for attr in examples.domain.attributes: 
    210             if not attr in domain.attributes: 
    211                 newdomain = orange.Domain(domain.attributes + [attr], classVar) 
    212                 res = orngTest.testWithIndices([self.learner], (orange.ExampleTable(newdomain, examples), weightID), indices) 
    213                  
    214                 newStat = self.stat(res)[0] 
    215                 newStats = [self.stat(x)[0] for x in orngStat.splitByIterations(res)]  
    216                 print "+", newStat, newdomain 
    217  
    218                 ## If stat has increased (ie newStat is better than bestStat) 
    219                 if cmp(newStat, bestStat) == self.statsign and statc.wilcoxont(oldStats, newStats)[1] < self.addThreshold: 
    220                     bestStat, bestStats, bestAttr = newStat, newStats, attr 
    221         if bestAttr: 
    222             domain = orange.Domain(domain.attributes + [bestAttr], classVar) 
    223             oldStat, oldStats = bestStat, bestStats 
    224             stop = False 
    225             print "added", bestAttr.name 
    226  
    227     return self.learner(orange.ExampleTable(domain, examples), weightID) 
    228  
    229 def StepwiseLearner(examples = None, weightID = None, **argkw): 
    230     sl = apply(StepwiseLearner_Class, (), argkw) 
    231     if examples: 
    232         return sl(examples, weightID) 
    233     else: 
    234         return sl 
     1from Orange.feature.scoring import * 
     2from Orange.feature.selection import * 
     3from Orange.classification.wrapper import StepwiseLearner_Class, StepwiseLearner 
Note: See TracChangeset for help on using the changeset viewer.