source: orange/Orange/feature/selection.py @ 11647:dfa6d31c2fc2

Revision 11647:dfa6d31c2fc2, 8.8 KB checked in by Ales Erjavec <ales.erjavec@…>, 9 months ago (diff)

Preserve the domain's meta attributes and class_vars.

Line 
1__docformat__ = 'restructuredtext'
2
3from operator import itemgetter
4
5import Orange.data
6import Orange.core as orange
7
8from Orange.feature.scoring import score_all
9
10
def _select_features_subset(data, features):
    """Build a new table from `data` that keeps only `features`.

    The new domain reuses the class variable and the ``class_vars`` of
    ``data.domain``, and the meta attributes of ``data.domain`` are added
    to it before the table is constructed.

    .. note::
        The `features` must be a subset of the `data.domain.features`.

    """
    # Look every entry up in the source domain so that each one is a
    # descriptor taken from `data.domain`.
    descriptors = [data.domain[feature] for feature in features]

    source_domain = data.domain
    new_domain = Orange.data.Domain(descriptors, source_domain.class_var,
                                    class_vars=source_domain.class_vars)
    new_domain.add_metas(source_domain.get_metas())
    return Orange.data.Table(new_domain, data)
27
28
def top_rated(scores, n, highest_best=True):
    """Return the `n` best features from a list of scores.

    :param list scores:
        A list of ``(feature, score)`` pairs such as the one returned
        by :func:`.score_all`
    :param int n: Number of features to select.
    :param bool highest_best:
        If true, the features that are scored higher are preferred;
        otherwise the lowest scored features come first.
    :rtype: :obj:`list`

    """
    # Order by the score (second item of each pair), best first.
    ranked = sorted(scores, key=itemgetter(1), reverse=highest_best)
    return [feature for feature, _ in ranked[:n]]

bestNAtts = top_rated
44
45
def above_threshold(scores, threshold=0.0):
    """Return the features (without scores) whose score is greater
    than or equal to the given threshold.

    The order of the features in `scores` is preserved.

    :param list scores:
        A list of ``(feature, score)`` pairs such as one returned
        by :func:`.score_all`
    :param float threshold: Threshold for selection.
    :rtype: :obj:`list`

    """
    selected = []
    for feature, score in scores:
        if score >= threshold:
            selected.append(feature)
    return selected


attsAboveThreshold = above_threshold
60
61
def select(data, scores, n):
    """Construct and return a new data table that keeps only the `n`
    highest scored features from the list `scores`.

    The features are picked with :func:`top_rated` (higher score is
    better) and the table is built from `data` with just those features.

    :param data: a data table
    :type data: :obj:`Orange.data.Table`
    :param scores: a list such as the one returned by
      :obj:`~Orange.feature.scoring.score_all`
    :type scores: list
    :param n: number of features to select
    :type n: int
    :rtype: :obj:`Orange.data.Table`

    """
    return _select_features_subset(data, top_rated(scores, n))

selectBestNAtts = select
select_best_n = select
81
82
def select_above_threshold(data, scores, threshold=0.0):
    """Construct and return a new data table that keeps only the
    features from `scores` (a list such as the one returned by
    :obj:`~Orange.feature.scoring.score_all`) whose score is higher
    than or equal to the given threshold.

    The features are picked with :func:`above_threshold`.

    :param data: a data table
    :type data: :obj:`Orange.data.Table`
    :param scores: a list such as the one returned by
      :obj:`~Orange.feature.scoring.score_all`
    :type scores: list
    :param threshold: threshold for selection
    :type threshold: float
    :rtype: :obj:`Orange.data.Table`
    """
    return _select_features_subset(data, above_threshold(scores, threshold))

selectAttsAboveThresh = select_above_threshold
102
103
def select_relief(data, measure=orange.MeasureAttribute_relief(k=20, m=50), margin=0):
    """Iteratively remove the worst scored feature until no feature
    has a score below the margin. The filter procedure was originally
    designed for measures such as Relief, which are context dependent,
    i.e., removal of features may change the scores of other remaining
    features. The score is thus recomputed in each iteration.

    The last entry of the list returned by
    :obj:`~Orange.feature.scoring.score_all` is treated as the worst
    scored feature.

    :param data: a data table
    :type data: :obj:`Orange.data.Table`
    :param measure: a feature scorer
    :type measure: :obj:`Orange.feature.scoring.Score`
    :param margin: margin for removal
    :type margin: float
    :rtype: :obj:`Orange.data.Table`

    """
    while True:
        # Scores are recomputed on every pass because dropping a feature
        # may change the scores of the ones that remain.
        ranking = score_all(data, measure)
        remaining = len(data.domain.attributes)
        # Stop when nothing is left or the worst score reaches the margin.
        if remaining == 0 or not ranking[-1][1] < margin:
            return data
        # Keep all but one feature, i.e. drop the worst scored one.
        data = select(data, ranking, remaining - 1)

filterRelieff = select_relief
126
127
class FilterAboveThreshold(object):
    """A wrapper around :obj:`select_above_threshold`; the
    constructor stores the parameters of the feature selection
    procedure that are then applied when the selection
    is called with the actual data.

    If `data` is passed to the constructor, the filter is applied to it
    right away and the resulting table is returned instead of the
    filter object.

    :param measure: a feature scorer
    :type measure: :obj:`Orange.feature.scoring.Score`
    :param threshold: threshold for selection. Defaults to 0.
    :type threshold: float
    """

    def __new__(cls, data=None,
                measure=orange.MeasureAttribute_relief(k=20, m=50),
                threshold=0.0):
        if data is not None:
            # Build a configured filter and hand back its result directly.
            selector = cls(measure=measure, threshold=threshold)
            return selector(data)
        return object.__new__(cls)

    def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50),
                 threshold=0.0):
        self.threshold = threshold
        self.measure = measure

    def __call__(self, data):
        """Score all features of `data` with the stored measure and
        return a data table with the features whose score is above or
        equal to the stored threshold.

        :param data: data table
        :type data: Orange.data.Table

        """
        return select_above_threshold(
            data, score_all(data, self.measure), self.threshold)

FilterAttsAboveThresh = FilterAboveThreshold
FilterAttsAboveThresh_Class = FilterAboveThreshold
168
169
class FilterBestN(object):
    """A wrapper around :obj:`select`; the
    constructor stores the filter parameters that are applied when the
    function is called.

    If `data` is passed to the constructor, the filter is applied to it
    right away and the resulting table is returned instead of the
    filter object.

    :param measure: a feature scorer
    :type measure: :obj:`Orange.feature.scoring.Score`
    :param n: number of features to select
    :type n: int

    """
    def __new__(cls, data=None,
                measure=orange.MeasureAttribute_relief(k=20, m=50),
                n=5):

        if data is None:
            self = object.__new__(cls)
            return self
        else:
            # Build a configured filter and hand back its result directly.
            self = cls(measure=measure, n=n)
            return self(data)

    def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50),
                 n=5):
        self.measure = measure
        self.n = n

    def __call__(self, data):
        """Score all features of `data` with the stored measure and
        return a data table with at most `n` best scored features.

        :param data: data table
        :type data: Orange.data.Table
        :rtype: :obj:`Orange.data.Table`

        """
        ma = score_all(data, self.measure)
        # Clamp into a local instead of overwriting ``self.n``: the filter
        # may be reused on other tables, and a call on a table with few
        # features must not shrink the configured `n` for later calls.
        n = min(self.n, len(data.domain.attributes))
        return select(data, ma, n)

FilterBestNAtts = FilterBestN
FilterBestNAtts_Class = FilterBestN
204
205
class FilterRelief(object):
    """A class wrapper around :obj:`select_relief`; the
    constructor stores the filter parameters that are applied when the
    function is called.

    If `data` is passed to the constructor, the filter is applied to it
    right away and the resulting table is returned instead of the
    filter object.

    :param measure: a feature scorer
    :type measure: :obj:`Orange.feature.scoring.Score`
    :param margin: margin for Relief scoring
    :type margin: float

    """
    def __new__(cls, data=None,
                measure=orange.MeasureAttribute_relief(k=20, m=50),
                margin=0):
        if data is not None:
            # Build a configured filter and hand back its result directly.
            selector = cls(measure=measure, margin=margin)
            return selector(data)
        return object.__new__(cls)

    def __init__(self, measure=orange.MeasureAttribute_relief(k=20, m=50),
                 margin=0):
        self.margin = margin
        self.measure = measure

    def __call__(self, data):
        """Apply :obj:`select_relief` to `data` with the stored measure
        and margin and return its result.
        """
        return select_relief(data, measure=self.measure, margin=self.margin)

FilterRelief_Class = FilterRelief
237
238##############################################################################
239# wrapped learner
240
241
class FilteredLearner(object):
    """A feature selection wrapper around base learner. When provided data,
    this learner applies a given feature selection method and then calls
    the base learner.

    If `data` is passed to the constructor, the learner is applied to it
    right away and the resulting classifier is returned instead of the
    learner object.

    Here is an example of how to build a wrapper around naive Bayesian learner
    and use it on a data set::

        nb = Orange.classification.bayes.NaiveBayesLearner()
        learner = Orange.feature.selection.FilteredLearner(nb,
            filter=Orange.feature.selection.FilterBestN(n=5), name='filtered')
        classifier = learner(data)

    """
    def __new__(cls, baseLearner, data=None, weight=0,
                filter=FilterAboveThreshold(), name='filtered'):
        if data is not None:
            # Build a configured learner and hand back the trained model.
            learner = cls(baseLearner, filter=filter, name=name)
            return learner(data, weight)
        return object.__new__(cls)

    def __init__(self, baseLearner, filter=FilterAboveThreshold(),
                 name='filtered'):
        self.name = name
        self.filter = filter
        self.baseLearner = baseLearner

    def __call__(self, data, weight=0):
        """Filter `data` with the stored filter, train the base learner
        on the filtered data and return the model wrapped in a
        :obj:`FilteredClassifier` together with the model's domain.
        """
        filtered = self.filter(data)
        base_model = self.baseLearner(filtered, weight)
        return FilteredClassifier(classifier=base_model,
                                  domain=base_model.domain)

FilteredLearner_Class = FilteredLearner
279
280
class FilteredClassifier:
    """A classifier returned by FilteredLearner.

    All keyword arguments given to the constructor are stored as
    instance attributes; the methods below read ``classifier`` and
    ``domain``.
    """
    def __init__(self, **kwds):
        vars(self).update(kwds)

    def __call__(self, example, resultType=orange.GetValue):
        """Forward the call to the wrapped classifier and return its
        result.
        """
        wrapped = self.classifier
        return wrapped(example, resultType)

    def atts(self):
        """Return the attributes of the stored domain."""
        domain = self.domain
        return domain.attributes
Note: See TracBrowser for help on using the repository browser.