Changeset 7290:f71f1a06ef5d in orange


Ignore:
Timestamp:
02/03/11 10:11:52 (3 years ago)
Author:
tomazc <tomaz.curk@…>
Branch:
default
Convert:
0491059e33e7af60821400a733ccb2029e73c90f
Message:
 
Location:
orange/Orange/feature
Files:
3 added
4 edited

  • orange/Orange/feature/__init__.py

    r7224 r7290  
    11""" 
    2  
    32.. index:: feature 
    43 
     
    1110.. index:: feature scoring 
    1211 
    13 .. autoclass:: scoring 
     12.. automodule:: Orange.feature.scoring 
    1413   :members: 
    15  
    16 Examples 
    17 ======== 
    18  
    1914 
    2015================== 
     
    2419.. index:: feature selection 
    2520 
    26 .. autoclass:: selection 
    27    :members: 
    28  
    29 Examples 
    30 ======== 
    31  
     21.. automodule:: Orange.feature.selection 
    3222 
    3323================== 
     
    3727.. index:: discretization 
    3828 
    39 .. autoclass:: discretization 
     29.. automodule:: Orange.feature.discretization 
    4030   :members: 
    41  
    42 Examples 
    43 ======== 
    44  
    4531 
    4632================== 
     
    5036.. index:: continuization 
    5137 
    52 .. autoclass:: continuization 
     38.. automodule:: Orange.feature.continuization 
    5339   :members: 
    54  
    55 Examples 
    56 ======== 
    57  
    5840 
    5941================== 
     
    6345.. index:: imputation 
    6446 
    65 .. autoclass:: imputation 
     47.. automodule:: Orange.feature.imputation 
    6648   :members: 
    67  
    68 Examples 
    69 ======== 
    7049 
    7150""" 
  • orange/Orange/feature/discretization.py

    r7216 r7290  
    1818  Returns:   table of examples with discretized attributes. Attributes that are 
    1919             categorized to a single value (constant) are removed. 
     20 
    2021  """ 
    2122  orange.setrandseed(0) 
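
The docstring above describes discretization that drops features reduced to a
single value. A minimal sketch of that behaviour, assuming the classic
``orange`` API (``EntropyDiscretization``, ``Preprocessor_discretize``) and the
bundled ``iris`` data set rather than this module's exact code::

    import orange

    data = orange.ExampleTable("iris")
    # entropy-based (MDL) discretization of all continuous features
    disc = orange.Preprocessor_discretize(data,
        method=orange.EntropyDiscretization())
    # drop features discretized to a single interval (constants)
    keep = [a for a in disc.domain.attributes if len(a.values) > 1]
    reduced = disc.select(keep + [disc.domain.classVar])
    print len(disc.domain.attributes), "->", len(reduced.domain.attributes)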
  • orange/Orange/feature/scoring.py

    r7199 r7290  
     1""" 
     2Feature scoring is normally used in feature subset selection for classification 
     3problems. 
     4 
     5Let us start with a simple script that reads the data, uses :obj:`attMeasure` to 
     6derive attribute scores, and prints the scores of the three best-scored 
     7attributes. The same scoring function is then used to select and report 
     8the three best attributes. 
     9 
     10`fss1.py`_ (uses `voting.tab`_):: 
     11 
     12    import orange, orngFSS 
     13    data = orange.ExampleTable("voting") 
     14 
     15    print 'Score estimate for first three attributes:' 
     16    ma = orngFSS.attMeasure(data) 
     17    for m in ma[:3]: 
     18        print "%5.3f %s" % (m[1], m[0]) 
     19 
     20    n = 3 
     21    best = orngFSS.bestNAtts(ma, n) 
     22    print '\\nBest %d attributes:' % n 
     23    for s in best: 
     24        print s 
     25 
     26The script should output this:: 
     27 
     28    Score estimate for first three attributes: 
     30    0.728 physician-fee-freeze 
     31    0.329 adoption-of-the-budget-resolution 
     32    0.321 synfuels-corporation-cutback 
     33 
     34    Best 3 attributes: 
     35    physician-fee-freeze 
     36    adoption-of-the-budget-resolution 
     37    synfuels-corporation-cutback 
     38 
     39Functions and classes for feature scoring: 
     40 
     41.. autoclass:: Orange.feature.scoring.OrderAttributesByMeasure 
     42   :members: 
     43 
     44.. autofunction:: Orange.feature.scoring.MeasureAttribute_Distance 
     45 
     46.. autoclass:: Orange.feature.scoring.MeasureAttribute_DistanceClass 
     47   :members: 
     48    
     49.. autofunction:: Orange.feature.scoring.MeasureAttribute_MDL 
     50 
     51.. autoclass:: Orange.feature.scoring.MeasureAttribute_MDLClass 
     52   :members: 
     53 
     54.. autofunction:: Orange.feature.scoring.mergeAttrValues 
     55 
     56.. autofunction:: Orange.feature.scoring.attMeasure 
     57 
     58 
     59======================== 
     60Different Score Measures 
     61======================== 
     62 
     63.. note: add links to gain ratio, relief and other feature scores 
     64 
     65The following script reports on gain ratio and relief attribute 
     66scores. Notice that for our data set the attribute ranks produced 
     67by the two measures match rather well! 
     68 
     69`fss2.py`_ (uses `voting.tab`_):: 
     70 
     71    import orange, orngFSS 
     72    data = orange.ExampleTable("voting") 
     73 
     74    print 'Relief GainRt Attribute' 
     75    ma_def = orngFSS.attMeasure(data) 
     76    gainRatio = orange.MeasureAttribute_gainRatio() 
     77    ma_gr  = orngFSS.attMeasure(data, gainRatio) 
     78    for i in range(5): 
     79        print "%5.3f  %5.3f  %s" % (ma_def[i][1], ma_gr[i][1], ma_def[i][0]) 
     80 
     81========== 
     82References 
     83========== 
     84 
     85* Kononenko: Strojno ucenje. 
     86 
     87""" 
     88 
    189import Orange.core as orange 
    290 
     
    492# from orngEvalAttr.py 
    593class OrderAttributesByMeasure: 
     94    """Construct an instance that orders features by their scores. 
     95     
     96    :param measure: an attribute measure, derived from  
     97      :obj:`orange.MeasureAttribute`. 
     98     
     99    """ 
    6100    def __init__(self, measure=None): 
    7         self.measure=measure 
     101        self.measure = measure 
    8102 
    9103    def __call__(self, data, weight): 
     104        """Take :obj:`Orange.data.table` data table and an instance of 
     105        :obj:`orange.MeasureAttribute` to score and order features.   
     106 
     107        :param data: a data table used to score features 
     108        :type data: Orange.data.table 
     109 
     110        :param weight: meta feature that stores weights of individual data 
     111          instances 
     112        :type weight: Orange.data.feature 
     113 
     114        """ 
    10115        if self.measure: 
    11116            measure=self.measure 
     
    18123 
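
A minimal usage sketch for the class above, assuming the import path
introduced by this changeset and that the call returns the ordered features::

    import orange
    from Orange.feature import scoring

    data = orange.ExampleTable("voting")
    # order features by gain ratio; weight 0 means unweighted instances
    order = scoring.OrderAttributesByMeasure(orange.MeasureAttribute_gainRatio())
    for attr in order(data, 0):
        print attr.name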
    19124def MeasureAttribute_Distance(attr = None, data = None): 
     125    """Instantiate :obj:`MeasureAttribute_DistanceClass` and use it to return 
     126    the score of a given feature on given data. 
     127     
     128    :param attr: feature to score 
     129    :type attr: Orange.data.feature 
     130     
     131    :param data: data table used for feature scoring 
     132    :type data: Orange.data.table  
     133     
     134    """ 
    20135    m = MeasureAttribute_DistanceClass() 
    21136    if attr != None and data != None: 
     
    24139        return m 
    25140 
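
A minimal sketch of both calling conventions of the function above (one-shot
scoring versus obtaining a reusable measure object), assuming this changeset's
import path::

    import orange
    from Orange.feature import scoring

    data = orange.ExampleTable("voting")
    attr = data.domain.attributes[0]
    print scoring.MeasureAttribute_Distance(attr, data)  # one-shot score
    m = scoring.MeasureAttribute_Distance()              # reusable measure
    print m(attr, data)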
    26 # measure 1-D as described in Strojno ucenje (Kononenko) 
    27141class MeasureAttribute_DistanceClass(orange.MeasureAttribute): 
     142    """Implement the 1-D feature distance measure described in Kononenko.""" 
    28143    def __call__(self, attr, data, aprioriDist = None, weightID = None): 
     144        """Take :obj:`Orange.data.table` data table and score the given  
     145        :obj:`Orange.data.feature`. 
     146 
     147        :param attr: feature to score 
     148        :type attr: Orange.data.feature 
     149 
     150        :param data: a data table used to score features 
     151        :type data: Orange.data.table 
     152 
     153        :param aprioriDist:  
     154        :type aprioriDist: 
     155         
     156        :param weightID: meta feature used to weight individual data instances 
     157        :type weightID: Orange.data.feature 
     158 
     159        """ 
    29160        import numpy 
    30161        from orngContingency import Entropy 
     
    42173            return 0 
    43174 
    44 # attribute quality measure based on the minimum description length principle 
    45175def MeasureAttribute_MDL(attr = None, data = None): 
     176    """Instantiate :obj:`MeasureAttribute_MDLClass` and use it n given data to 
     177    return the feature's score.""" 
    46178    m = MeasureAttribute_MDLClass() 
    47179    if attr != None and data != None: 
     
    51183 
    52184class MeasureAttribute_MDLClass(orange.MeasureAttribute): 
     185    """Score feature based on the minimum description length principle.""" 
    53186    def __call__(self, attr, data, aprioriDist = None, weightID = None): 
     187        """Take :obj:`Orange.data.table` data table and score the given  
     188        :obj:`Orange.data.feature`. 
     189 
     190        :param attr: feature to score 
     191        :type attr: Orange.data.feature 
     192 
     193        :param data: a data table used to score the feature 
     194        :type data: Orange.data.table 
     195 
     196        :param aprioriDist:  
     197        :type aprioriDist: 
     198         
     199        :param weightID: meta feature used to weight individual data instances 
     200        :type weightID: Orange.data.feature 
     201 
     202        """ 
    54203        attrClassCont = orange.ContingencyAttrClass(attr, data) 
    55204        classDist = orange.Distribution(data.domain.classVar, data).values() 
    56205        nCls = len(classDist) 
    57206        nEx = len(data) 
    58         priorMDL = logMultipleCombs(nEx, classDist) + logMultipleCombs(nEx+nCls-1, [nEx, nCls-1]) 
    59         postPart1 = [logMultipleCombs(sum(attrClassCont[key]), attrClassCont[key].values()) for key in attrClassCont.keys()] 
    60         postPart2 = [logMultipleCombs(sum(attrClassCont[key])+nCls-1, [sum(attrClassCont[key]), nCls-1]) for key in attrClassCont.keys()] 
     207        priorMDL = _logMultipleCombs(nEx, classDist) + _logMultipleCombs(nEx+nCls-1, [nEx, nCls-1]) 
     208        postPart1 = [_logMultipleCombs(sum(attrClassCont[key]), attrClassCont[key].values()) for key in attrClassCont.keys()] 
     209        postPart2 = [_logMultipleCombs(sum(attrClassCont[key])+nCls-1, [sum(attrClassCont[key]), nCls-1]) for key in attrClassCont.keys()] 
    61210        ret = priorMDL 
    62211        for val in postPart1 + postPart2: 
     
    66215# compute n! / (k1! * k2! * ... * kc!) 
    67216# ks = [k1, k2, ...] 
    68 def logMultipleCombs(n, ks): 
     217def _logMultipleCombs(n, ks): 
    69218    import math 
    70219    m = max(ks) 
     
    129278# from orngFSS 
    130279def attMeasure(data, measure = orange.MeasureAttribute_relief(k=20, m=50)): 
    131   """ 
    132   Assesses the quality of attributes using the given measure, outputs the results and 
    133   returns a sorted list of tuples (attribute name, measure) 
    134   Arguments: data       example table 
    135              measure    an attribute scoring function (derived from orange.MeasureAttribute) 
    136   Result:    a sorted list of tuples (attribute name, measure) 
    137   """ 
    138   measl=[] 
    139   for i in data.domain.attributes: 
    140     measl.append((i.name, measure(i, data))) 
    141   measl.sort(lambda x,y:cmp(y[1], x[1])) 
     280    """Assess the quality of attributes using the given measure and return 
     281    a sorted list of tuples (attribute name, measure). 
     282 
     283    :param data: a data table; it should include a discrete class. 
     284    :type data: Orange.data.table 
     285    :param measure: attribute scoring function, derived from 
     286      :obj:`orange.MeasureAttribute`. Defaults to 
     287      :obj:`orange.MeasureAttribute_relief` with k=20 and m=50. 
     288    :type measure: :obj:`orange.MeasureAttribute`  
     289    :rtype: :obj:`list` a sorted list of tuples (attribute name, score) 
     290 
     291    """ 
     292    measl=[] 
     293    for i in data.domain.attributes: 
     294        measl.append((i.name, measure(i, data))) 
     295    measl.sort(lambda x,y:cmp(y[1], x[1])) 
    142296    
    143297#  for i in measl: 
    144298#    print "%25s, %6.3f" % (i[0], i[1]) 
    145   return measl 
     299    return measl 
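
The relocated scoring interface can be exercised directly from its new home; a
minimal sketch, assuming the import path introduced by this changeset::

    import orange
    from Orange.feature import scoring

    data = orange.ExampleTable("voting")
    # attMeasure returns (attribute name, score) pairs, best first
    for name, score in scoring.attMeasure(data)[:3]:
        print "%5.3f %s" % (score, name)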
  • orange/Orange/feature/selection.py

    r7199 r7290  
     1""" 
     2.. index:: feature selection 
     3 
     4Some machine learning methods may perform better if they learn only from a  
     5selected subset of "best" features. Here we mostly implement filter approaches, 
     6where feature scores are estimated prior to modelling, that is, without 
     7knowing which machine learning method will be used to construct a predictive 
     8model. 
     9 
     10.. autofunction:: Orange.feature.selection.FilterAttsAboveThresh 
     11 
     12.. autoclass:: Orange.feature.selection.FilterAttsAboveThresh_Class 
     13   :members: 
     14 
     15.. autofunction:: Orange.feature.selection.FilterBestNAtts 
     16 
     17.. autoclass:: Orange.feature.selection.FilterBestNAtts_Class 
     18   :members: 
     19 
     20.. autofunction:: Orange.feature.selection.FilterRelief 
     21 
     22.. autoclass:: Orange.feature.selection.FilterRelief_Class 
     23   :members: 
     24 
     25.. autofunction:: Orange.feature.selection.FilteredLearner 
     26 
     27.. autoclass:: Orange.feature.selection.FilteredLearner_Class 
     28   :members: 
     29 
     30.. autoclass:: Orange.feature.selection.FilteredClassifier 
     31   :members: 
     32 
     33These functions support the design of feature subset selection for 
     34classification problems. 
     35 
     36.. autofunction:: Orange.feature.selection.bestNAtts 
     37 
     38.. autofunction:: Orange.feature.selection.attsAboveThreshold 
     39 
     40.. autofunction:: Orange.feature.selection.selectBestNAtts 
     41 
     42.. autofunction:: Orange.feature.selection.selectAttsAboveThresh 
     43 
     44.. autofunction:: Orange.feature.selection.filterRelieff 
     45 
     46 
     47==================================== 
     48Filter Approach for Machine Learning 
     49==================================== 
     50 
     51Attribute scoring has at least two potential uses. One is 
     52informative (or descriptive): the data analyst can use attribute 
     53scoring to find "good" features and those that are irrelevant for a 
     54given classification task. The other use is in improving the 
     55performance of machine learning by learning only from the data set 
     56that includes the most informative features. This so-called filter 
     57approach can boost the learner's performance in terms of 
     58predictive accuracy, speed of induction, and simplicity of 
     59resulting models. 
     60 
     61The following script defines a new classifier based on naive Bayes 
     62that, prior to learning, selects the five best features from 
     63the data set. The new classifier is wrapped up in a special class (see 
     64the `Building your own learner <../ofb/c_pythonlearner.htm>`_ lesson 
     65in `Orange for Beginners <../ofb/default.htm>`_). The 
     66script compares this filtered learner with naive Bayes that uses the 
     67complete set of features. 
     68 
     69`fss3.py`_ (uses `voting.tab`_):: 
     70 
     71    import orange, orngFSS 
     72 
     73    class BayesFSS(object): 
     74        def __new__(cls, examples=None, **kwds): 
     75            learner = object.__new__(cls, **kwds) 
     76            if examples: 
     77                return learner(examples) 
     78            else: 
     79                return learner 
     80     
     81        def __init__(self, name='Naive Bayes with FSS', N=5): 
     82            self.name = name 
     83            self.N = N 
     84     
     85        def __call__(self, data, weight=None): 
     86            ma = orngFSS.attMeasure(data) 
     87            filtered = orngFSS.selectBestNAtts(data, ma, self.N) 
     88            model = orange.BayesLearner(filtered) 
     89            return BayesFSS_Classifier(classifier=model, N=self.N, name=self.name) 
     90     
     91    class BayesFSS_Classifier: 
     92        def __init__(self, **kwds): 
     93            self.__dict__.update(kwds) 
     94     
     95        def __call__(self, example, resultType = orange.GetValue): 
     96            return self.classifier(example, resultType) 
     97     
     98    # test the above wrapper on a data set 
     99    import orngStat, orngTest 
     100    data = orange.ExampleTable("voting") 
     101    learners = (orange.BayesLearner(name='Naive Bayes'), BayesFSS(name="with FSS")) 
     102    results = orngTest.crossValidation(learners, data) 
     103     
     104    # output the results 
     105    print "Learner      CA" 
     106    for i in range(len(learners)): 
     107        print "%-12s %5.3f" % (learners[i].name, orngStat.CA(results)[i]) 
     108 
     109Interestingly, though somewhat expectedly, feature subset selection 
     110helps. This is the output we get:: 
     111 
     112    Learner      CA 
     113    Naive Bayes  0.903 
     114    with FSS     0.940 
     115 
     116====================== 
     117And a Much Simpler One 
     118====================== 
     119 
     120Although perhaps educational, we can do all of the above by 
     121wrapping the learner using ``FilteredLearner``, thus creating 
     122an object that is assembled from a data filter and a base learner. When 
     123given the data, this learner uses the attribute filter to construct a new 
     124data set and the base learner to construct a corresponding 
     125classifier. Attribute filters should be of a type like 
     126``orngFSS.FilterAttsAboveThresh`` or 
     127``orngFSS.FilterBestNAtts`` that can be initialized with 
     128arguments and later presented with data, returning a new, reduced data 
     129set. 
     130 
     131The following code fragment essentially replaces the bulk of code 
     132from the previous example, and compares the naive Bayesian classifier 
     133to the same classifier when only the single most important attribute is 
     134used. 
     135 
     136`fss4.py`_ (uses `voting.tab`_):: 
     137 
     138    nb = orange.BayesLearner() 
     139    learners = (orange.BayesLearner(name='bayes'), 
     140                FilteredLearner(nb, filter=FilterBestNAtts(n=1), name='filtered')) 
     141    results = orngEval.CrossValidation(learners, data) 
     142 
     143Now, let's decide to retain three features (change the code in 
     144`fss4.py`_ accordingly!), but observe how many times 
     145an attribute was used. Remember, 10-fold cross validation constructs 
     146ten instances for each classifier, and each time we run 
     147``FilteredLearner`` a different set of features may be 
     148selected. ``orngEval.CrossValidation`` stores classifiers in the 
     149``results`` variable, and ``FilteredLearner`` 
     150returns a classifier that can tell which features it used (how 
     151convenient!), so the code to do all this is quite short. 
     152 
     153`fss4.py`_ (uses `voting.tab`_):: 
     154 
     155    print "\\nNumber of times features were used in cross-validation:\\n" 
     156    attsUsed = {} 
     157    for i in range(10): 
     158        for a in results.classifiers[i][1].atts(): 
     159            if a.name in attsUsed.keys(): 
     160                attsUsed[a.name] += 1 
     161            else: 
     162                attsUsed[a.name] = 1 
     163    for k in attsUsed.keys(): 
     164        print "%2d x %s" % (attsUsed[k], k) 
     165 
     166Running `fss4.py`_ with three features selected each time a learner is run 
     167gives the following result:: 
     168 
     169    Learner      CA 
     170    bayes        0.903 
     171    filtered     0.956 
     172 
     173    Number of times features were used in cross-validation: 
     174     3 x el-salvador-aid 
     175     6 x synfuels-corporation-cutback 
     176     7 x adoption-of-the-budget-resolution 
     177    10 x physician-fee-freeze 
     178     4 x crime 
     179 
     180Experiment yourself to see which attribute is selected most 
     181frequently over all ten cross-validation runs when only one 
     182attribute is retained for the classifier! 
     183 
     184References 
     185---------- 
     186 
     187* K. Kira and L. Rendell. A practical approach to feature 
     188  selection. In D. Sleeman and P. Edwards, editors, *Proc. 9th Int'l 
     189  Conf. on Machine Learning*, pages 249-256, Aberdeen, 1992. Morgan 
     190  Kaufmann Publishers. 
     191 
     192* I. Kononenko. Estimating attributes: Analysis and extensions of 
     193  RELIEF. In F. Bergadano and L. De Raedt, editors, *Proc. European 
     194  Conf. on Machine Learning (ECML-94)*, pages 
     195  171-182. Springer-Verlag, 1994. 
     196 
     197* R. Kohavi, G. John: Wrappers for Feature Subset Selection, 
     198  *Artificial Intelligence*, 97 (1-2), pages 273-324, 1997 
     199 
     200.. _fss1.py: code/fss1.py 
     201.. _fss2.py: code/fss2.py 
     202.. _fss3.py: code/fss3.py 
     203.. _fss4.py: code/fss4.py 
     204.. _voting.tab: code/voting.tab 
     205 
     206""" 
     207 
     208__docformat__ = 'restructuredtext' 
     209 
    1210import Orange.core as orange 
    2211 
    3212# from orngFSS 
    4213def bestNAtts(scores, N): 
    5   """ 
    6   Returns the first N attributes from the list returned by function attMeasure. 
    7   Arguments: scores   a list such as one returned by "attMeasure" 
    8              N             the number of attributes 
    9   Result: the first N attributes (without measures) 
    10   """ 
    11   return map(lambda x:x[0], scores[:N]) 
    12  
    13 def attsAbovethreshold(scores, threshold=0.0): 
    14   """ 
    15   Returns attributes from the list returned by function attMeasure that 
    16   have the score above or equal to a specified threshold 
    17   Arguments: scores   a list such as one returned by "attMeasure" 
    18              threshold      threshold, default is 0.0 
    19   Result: the first N attributes (without measures) 
    20   """ 
    21   pairs = filter(lambda x, t=threshold: x[1] > t, scores) 
    22   return map(lambda x:x[0], pairs) 
     214    """Return the best N features (without scores) from the list returned 
     215    by function :obj:`Orange.feature.scoring.attMeasure`. 
     216     
     217    :param scores: a list such as one returned by  
     218      :obj:`Orange.feature.scoring.attMeasure` 
     219    :type scores: list 
     220    :param N: number of best features to select.  
     221    :type N: int 
     222    :rtype: :obj:`list` 
     223 
     224    """ 
     225    return map(lambda x:x[0], scores[:N]) 
     226 
     227def attsAboveThreshold(scores, threshold=0.0): 
     228    """Return features (without scores) from the list returned by 
     229    :obj:`Orange.feature.scoring.attMeasure` with score above or 
     230    equal to a specified threshold. 
     231     
     232    :param scores: a list such as one returned by 
     233      :obj:`Orange.feature.scoring.attMeasure` 
     234    :type scores: list 
     235    :param threshold: score threshold for attribute selection. Defaults to 0. 
     236    :type threshold: float 
     237    :rtype: :obj:`list` 
     238 
     239    """ 
     240    pairs = filter(lambda x, t=threshold: x[1] > t, scores) 
     241    return map(lambda x:x[0], pairs) 
    23242 
    24243def selectBestNAtts(data, scores, N): 
    25   """ 
    26   Constructs and returns a new set of examples that includes a 
    27   class and only N best attributes from a list scores 
    28   Arguments: data          an example table 
    29              scores   a list such as one returned by "attMeasure" 
    30              N             the number of attributes 
    31   Result: data with the first N attributes (without measures) 
    32   """ 
    33   return data.select(bestNAtts(scores, N)+[data.domain.classVar.name]) 
     244    """Construct and return a new set of examples that includes a 
     245    class and only the N best features from a list of scores. 
     246     
     247    :param data: an example table 
     248    :type data: Orange.data.table 
     249    :param scores: a list such as one returned by  
     250      :obj:`Orange.feature.scoring.attMeasure` 
     251    :type scores: list 
     252    :param N: number of features to select 
     253    :type N: int 
     254    :rtype: :class:`Orange.data.table` holding N best features 
     255 
     256    """ 
     257    return data.select(bestNAtts(scores, N)+[data.domain.classVar.name]) 
    34258 
    35259 
    36260def selectAttsAboveThresh(data, scores, threshold=0.0): 
    37   """ 
    38   Constructs and returns a new set of examples that includes a 
    39   class and attributes from the list returned by function attMeasure that 
    40   have the score above or equal to a specified threshold 
    41   Arguments: data          an example table 
    42              scores      a list such as one returned by "attMeasure" 
    43              threshold      threshold, default is 0.0 
    44   Result: the first N attributes (without measures) 
    45   """ 
    46   return data.select(attsAbovethreshold(scores, threshold)+[data.domain.classVar.name]) 
     261    """Construct and return a new set of examples that includes a class and  
     262    features from the list returned by  
     263    :obj:`Orange.feature.scoring.attMeasure` that have a score above 
     264    a specified threshold. 
     265     
     266    :param data: an example table 
     267    :type data: Orange.data.table 
     268    :param scores: a list such as one returned by 
     269      :obj:`Orange.feature.scoring.attMeasure`     
     270    :type scores: list 
     271    :param threshold: score threshold for attribute selection. Defaults to 0. 
     272    :type threshold: float 
     273    :rtype: :class:`Orange.data.table` holding the selected features 
     275   
     276    """ 
     277    return data.select(attsAboveThreshold(scores, threshold)+[data.domain.classVar.name]) 
    47278 
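
A minimal sketch combining the selection helpers above with the scoring
module, assuming this changeset's import paths::

    import orange
    from Orange.feature import scoring, selection

    data = orange.ExampleTable("voting")
    ma = scoring.attMeasure(data)
    print selection.bestNAtts(ma, 3)             # names of the 3 best features
    small = selection.selectBestNAtts(data, ma, 3)
    print len(small.domain.attributes)           # 3 features plus the class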
    48279def filterRelieff(data, measure = orange.MeasureAttribute_relief(k=20, m=50), margin=0): 
    49   """ 
    50   Takes the data set and an attribute measure (Relief by default). Estimates 
    51   attibute score by the measure, removes worst attribute if its measure 
    52   is below the margin. Repeats, until no attribute has negative or zero score. 
    53   Arguments: data          an example table 
    54              measure       an attribute measure (derived from mlpy.MeasureAttribute) 
    55              margin        if score is higher than margin, attribute is not removed 
    56   """ 
    57   measl = attMeasure(data, measure) 
    58    
    59   while len(data.domain.attributes)>0 and measl[-1][1]<margin: 
    60     data = selectBestNAtts(data, measl, len(data.domain.attributes)-1) 
    61 #    print 'remaining ', len(data.domain.attributes) 
     280    """Take the data set and use an attribute measure to removes the worst  
     281    scored attribute (those below the margin). Repeats, until no attribute has 
     282    negative or zero score. 
     283     
     284    .. note:: Notice that this filter procedure was originally designed for \ 
     285    measures such as Relief, which are context dependent, i.e., removal of \ 
     286    features may change the scores of other remaining features. Hence the \ 
     287    need to re-estimate score every time an attribute is removed. 
     288 
     289    :param data: a data table 
     290    :type data: Orange.data.table 
     291    :param measure: an attribute measure (derived from  
     292      :obj:`Orange.MeasureAttribute`). Defaults to  
     293      :obj:`Orange.MeasureAttribute_relief` with k=20 and m=50. 
     294    :param margin: if score is higher than margin, attribute is not removed. 
     295      Defaults to 0. 
     296    :type margin: float 
     297     
     298    """ 
    62299    measl = attMeasure(data, measure) 
    63   return data 
     300    while len(data.domain.attributes)>0 and measl[-1][1]<margin: 
     301        data = selectBestNAtts(data, measl, len(data.domain.attributes)-1) 
     302#        print 'remaining ', len(data.domain.attributes) 
     303        measl = attMeasure(data, measure) 
     304    return data 
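
A minimal usage sketch for the filter above, again assuming this changeset's
import paths and that ``attMeasure`` resolves inside the selection module::

    import orange
    from Orange.feature import selection

    data = orange.ExampleTable("voting")
    # iteratively drops the worst feature while it scores below the margin
    reduced = selection.filterRelieff(data)
    print len(data.domain.attributes), "->", len(reduced.domain.attributes)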
    64305 
    65306############################################################################## 
     
    67308 
    68309def FilterAttsAboveThresh(data=None, **kwds): 
    69   filter = apply(FilterAttsAboveThresh_Class, (), kwds) 
    70   if data: return filter(data) 
    71   else: return filter 
     310    filter = apply(FilterAttsAboveThresh_Class, (), kwds) 
     311    if data: 
     312        return filter(data) 
     313    else: 
     314        return filter 
    72315   
    73316class FilterAttsAboveThresh_Class: 
    74   def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50), threshold=0.0): 
    75     self.measure = measure 
    76     self.threshold = threshold 
    77   def __call__(self, data): 
    78     ma = attMeasure(data, self.measure) 
    79     return selectAttsAboveThresh(data, ma, self.threshold) 
    80  
    81 # 
     317    """FilterAttsAboveThresh([<em>measure</em>[<em>, threshold</em>]])</dt> 
     318    <dd class="ddfun">This is simply a wrapper around the function 
     319    <code>selectAttsAboveThresh</code>. It allows to create an object 
     320    which stores filter's parameters and can be later called with the data 
     321    to return the data set that includes only the selected 
     322    features. <em>measure</em> is a function that returns a list of 
     323    couples (attribute name, score), and it defaults to 
     324    <code>orange.MeasureAttribute_relief(k=20, m=50)</code>. The default 
     325    threshold is 0.0. Some examples of how to use this class are:: 
     326 
     327        filter = orngFSS.FilterAttsAboveThresh(threshold=.15) 
     328        new_data = filter(data) 
     329        new_data = orngFSS.FilterAttsAboveThresh(data) 
     330        new_data = orngFSS.FilterAttsAboveThresh(data, threshold=.1) 
     331        new_data = orngFSS.FilterAttsAboveThresh(data, threshold=.1, 
     332                     measure=orange.MeasureAttribute_gini()) 
     333 
     334    """ 
     335    def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50),  
     336               threshold=0.0): 
     337        self.measure = measure 
     338        self.threshold = threshold 
     339 
     340    def __call__(self, data): 
     341        ma = attMeasure(data, self.measure) 
     342        return selectAttsAboveThresh(data, ma, self.threshold) 
    82343 
    83344def FilterBestNAtts(data=None, **kwds): 
    84   filter = apply(FilterBestNAtts_Class, (), kwds) 
    85   if data: return filter(data) 
    86   else: return filter 
     345    """FilterBestNAtts</b>([<em>measure</em>[<em>, n</em>]])</dt> 
     346    <dd class="ddfun">Similarly to <code>FilterAttsAboveThresh</code>, 
     347    this is a wrapper around the function 
     348    <code>selectBestNAtts</code>. Measure and the number of features to 
     349    retain are optional (the latter defaults to 5). 
     350 
     351    """ 
     352    filter = apply(FilterBestNAtts_Class, (), kwds) 
     353    if data: return filter(data) 
     354    else: return filter 
    87355   
    88356class FilterBestNAtts_Class: 
    89   def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50), n=5): 
    90     self.measure = measure 
    91     self.n = n 
    92   def __call__(self, data): 
    93     ma = attMeasure(data, self.measure) 
    94     self.n = min(self.n, len(data.domain.attributes)) 
    95     return selectBestNAtts(data, ma, self.n) 
    96  
    97 # 
     357    def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50), n=5): 
     358        self.measure = measure 
     359        self.n = n 
     360    def __call__(self, data): 
     361        ma = attMeasure(data, self.measure) 
     362        self.n = min(self.n, len(data.domain.attributes)) 
     363        return selectBestNAtts(data, ma, self.n) 
    98364 
    99365def FilterRelief(data=None, **kwds): 
    100   filter = apply(FilterRelief_Class, (), kwds) 
    101   if data: return filter(data) 
    102   else: return filter 
     366    """FilterRelieff</b>([<em>measure</em>[<em>, margin</em>]])</dt> 
     367    <dd class="ddfun">Similarly to <code>FilterBestNAtts</code>, this is a 
     368    wrapper around the function 
     369    <code>filterRelieff</code>. <em>measure</em> and <em>margin</em> are 
     370    optional attributes, where <em>measure</em> defaults to 
     371    <code>orange.MeasureAttribute_relief(k=20, m=50)</code> and 
     372    <em>margin</em> to 0.0. 
     373 
     374    """     
     375    filter = apply(FilterRelief_Class, (), kwds) 
     376    if data: 
     377        return filter(data) 
     378    else: 
     379        return filter 
    103380   
    104381class FilterRelief_Class: 
    105   def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50), margin=0): 
    106     self.measure = measure 
    107     self.margin = margin 
    108   def __call__(self, data): 
    109     return filterRelieff(data, self.measure, self.margin) 
     382    def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50), margin=0): 
     383        self.measure = measure 
     384        self.margin = margin 
     385    def __call__(self, data): 
     386        return filterRelieff(data, self.measure, self.margin) 
    110387 
    111388############################################################################## 
     
    113390 
    114391def FilteredLearner(baseLearner, examples = None, weight = None, **kwds): 
    115   learner = apply(FilteredLearner_Class, [baseLearner], kwds) 
    116   if examples: return learner(examples, weight) 
    117   else: return learner 
     392    """FilteredLearner</b>([<em>baseLearner</em>[<em>, 
     393    examples</em>[<em>, filter</em>[<em>, name</em>]]]])</dt> <dd>Wraps a 
     394    <em>baseLearner</em> using a data <em>filter</em>, and returns the 
     395    corresponding learner. When such learner is presented a data set, data 
     396    is first filtered and then passed to 
     397    <em>baseLearner</em>. <em>FilteredLearner</em> comes handy when one 
     398    wants to test the schema of feature-subset-selection-and-learning by 
     399    some repetitive evaluation method, e.g., cross validation. Filter 
     400    defaults to orngFSS.FilterAttsAboveThresh with default 
     401    attributes. Here is an example of how to set such learner (build a 
     402    wrapper around naive Bayesian learner) and use it on a data set:: 
     403 
     404        nb = orange.BayesLearner() 
     405        learner = orngFSS.FilteredLearner(nb, filter=orngFSS.FilterBestNAtts(n=5), name='filtered') 
     406        classifier = learner(data) 
     407 
     408    """ 
     409    learner = apply(FilteredLearner_Class, [baseLearner], kwds) 
     410    if examples: 
     411        return learner(examples, weight) 
     412    else: 
     413        return learner 
    118414 
    119415class FilteredLearner_Class: 
    120   def __init__(self, baseLearner, filter=FilterAttsAboveThresh(), name='filtered'): 
    121     self.baseLearner = baseLearner 
    122     self.filter = filter 
    123     self.name = name 
    124   def __call__(self, data, weight=0): 
    125     # filter the data and then learn 
    126     fdata = self.filter(data) 
    127     model = self.baseLearner(fdata, weight) 
    128     return FilteredClassifier(classifier = model, domain = model.domain) 
     416    def __init__(self, baseLearner, filter=FilterAttsAboveThresh(), name='filtered'): 
     417        self.baseLearner = baseLearner 
     418        self.filter = filter 
     419        self.name = name 
     420    def __call__(self, data, weight=0): 
     421        # filter the data and then learn 
     422        fdata = self.filter(data) 
     423        model = self.baseLearner(fdata, weight) 
     424        return FilteredClassifier(classifier = model, domain = model.domain) 
    129425 
    130426class FilteredClassifier: 
    131   def __init__(self, **kwds): 
    132     self.__dict__.update(kwds) 
    133   def __call__(self, example, resultType = orange.GetValue): 
    134     return self.classifier(example, resultType) 
    135   def atts(self): 
    136     return self.domain.attributes   
     427    def __init__(self, **kwds): 
     428        self.__dict__.update(kwds) 
     429    def __call__(self, example, resultType = orange.GetValue): 
     430        return self.classifier(example, resultType) 
     431    def atts(self): 
     432        return self.domain.attributes   
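
A minimal end-to-end sketch of the wrapper classes above, assuming this
changeset's import paths and that the selection module's internal helpers
resolve as intended::

    import orange
    from Orange.feature import selection

    data = orange.ExampleTable("voting")
    learner = selection.FilteredLearner(orange.BayesLearner(),
                  filter=selection.FilterBestNAtts(n=3), name='filtered')
    classifier = learner(data)
    # the classifier can report which features survived the filter
    print [a.name for a in classifier.atts()]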