source: orange/orange/Orange/preprocess/__init__.py @ 9167:c05cdc189733

Revision 9167:c05cdc189733, 11.2 KB checked in by markotoplak, 2 years ago (diff)

Preprocessing: featureSelection fix (before it just selected random feature).

Line 
1"""
2.. autoclass:: Preprocessor_discretizeEntropy
3
4.. autoclass:: Preprocessor_removeContinuous
5
6.. autoclass:: Preprocessor_continuize
7
8.. autoclass:: Preprocessor_removeDiscrete
9
10.. autoclass:: Preprocessor_impute
11
12.. autoclass:: Preprocessor_featureSelection
13
14.. autofunction:: bestP
15
16.. autofunction:: bestN
17
18.. autofunction:: selectNRandom
19
20.. autofunction:: selectPRandom
21
22.. autoclass:: Preprocessor_RFE
23
24.. autoclass:: Preprocessor_sample
25
26.. autoclass:: Preprocessor_preprocessorList
27
28"""
29
30from orange import \
31     DomainContinuizer, \
32    VariableFilterMap, \
33    ValueFilter, \
34         ValueFilter_continuous, \
35         ValueFilter_discrete, \
36         ValueFilter_string, \
37         ValueFilter_stringList, \
38    ValueFilterList, \
39    TransformValue, \
40         Discrete2Continuous, \
41         Discretizer, \
42              BiModalDiscretizer, \
43              EquiDistDiscretizer, \
44              IntervalDiscretizer, \
45              ThresholdDiscretizer, \
46         MapIntValue, \
47         NormalizeContinuous, \
48         Ordinal2Continuous, \
49         TransformValue_IsDefined, \
50    TableAverager, \
51    Preprocessor, \
52         Preprocessor_addCensorWeight, \
53         Preprocessor_addClassNoise, \
54         Preprocessor_addClassWeight, \
55         Preprocessor_addGaussianClassNoise, \
56         Preprocessor_addGaussianNoise, \
57         Preprocessor_addMissing, \
58         Preprocessor_addMissingClasses, \
59         Preprocessor_addNoise, \
60         Preprocessor_discretize, \
61         Preprocessor_drop, \
62         Preprocessor_dropMissing, \
63         Preprocessor_dropMissingClasses, \
64         Preprocessor_filter, \
65         Preprocessor_ignore, \
66         Preprocessor_imputeByLearner, \
67         Preprocessor_removeDuplicates, \
68         Preprocessor_select, \
69         Preprocessor_shuffle, \
70         Preprocessor_take, \
71         Preprocessor_takeMissing, \
72         Preprocessor_takeMissingClasses, \
73    Imputer, \
74         Imputer_asValue, \
75         Imputer_defaults, \
76         Imputer_model, \
77         Imputer_random, \
78    ImputerConstructor, \
79         ImputerConstructor_asValue, \
80         ImputerConstructor_average, \
81         ImputerConstructor_maximal, \
82         ImputerConstructor_minimal, \
83         ImputerConstructor_model, \
84         ImputerConstructor_random, \
85    FilterList, \
86    Filter, \
87         Filter_conjunction, \
88         Filter_disjunction, \
89         Filter_hasClassValue, \
90         Filter_hasMeta, \
91         Filter_hasSpecial, \
92         Filter_isDefined, \
93         Filter_random, \
94         Filter_sameValue, \
95         Filter_values, \
96    Discretization, \
97         BiModalDiscretization, \
98         EntropyDiscretization, \
99         EquiDistDiscretization, \
100         EquiNDiscretization, \
101    DomainTransformerConstructor, \
102    RemoveRedundant, \
103         RemoveRedundantByInduction, \
104         RemoveRedundantByQuality, \
105         RemoveRedundantOneValue, \
106    RemoveUnusedValues
107
108import outliers
109
110
111import math
112
113import orange
114from Orange.misc import _orange__new__, _orange__reduce__
115
class Preprocessor_discretizeEntropy(Preprocessor_discretize):
    """ A discretizer that uses the orange.EntropyDiscretization method but,
    unlike the Preprocessor_discretize class, also removes continuous
    attributes for which the discretization found no cut-off points.

    :param method: an :obj:`orange.EntropyDiscretization` instance; when
                   omitted, a new one is created for this preprocessor.

    """

    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __init__(self, method=None):
        # Create the default lazily: a default built in the signature would
        # be one object shared (and reconfigurable) by every preprocessor.
        if method is None:
            method = orange.EntropyDiscretization()
        # Validate before storing; an explicit raise is not stripped by -O.
        if not isinstance(method, orange.EntropyDiscretization):
            raise TypeError("method must be an instance of "
                            "orange.EntropyDiscretization")
        self.method = method

    def __call__(self, data, wightId=0):
        """ Return a new table in which continuous attributes of `data` are
        replaced by their discretized versions. Non-continuous attributes,
        the class variable and the meta attributes are kept as they are.

        NOTE: the parameter name `wightId` (sic) is kept unchanged so that
        existing keyword callers keep working; its value is not used.
        """
        newattr_list = []
        for attr in data.domain.attributes:
            if attr.varType == orange.VarTypes.Continuous:
                newattr = self.method(attr, data)
                # An empty list of cut-off points means the attribute could
                # not be split, so it is dropped from the new domain.
                if newattr.getValueFrom.transformer.points:
                    newattr_list.append(newattr)
            else:
                newattr_list.append(attr)
        newdomain = orange.Domain(newattr_list, data.domain.classVar)
        newdomain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(newdomain, data)
142   
class Preprocessor_removeContinuous(Preprocessor_discretize):
    """ A preprocessor that keeps only the discrete features of the data
    (the class variable and meta attributes are retained).
    """
    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its discrete attributes.
        `weightId` is accepted but not used.
        """
        source = data.domain
        kept = []
        for var in source.attributes:
            if var.varType == orange.VarTypes.Discrete:
                kept.append(var)
        reduced = orange.Domain(kept, source.classVar)
        reduced.addmetas(source.getmetas())
        return orange.ExampleTable(reduced, data)
154               
class Preprocessor_continuize(orange.Preprocessor):
    """ A preprocessor that continuizes a discrete domain (and optionally
    normalizes it). The stored settings are handed to
    :obj:`Orange.feature.continuization.DomainContinuizer`; see that class
    for the accepted values.

    Extra keyword arguments are accepted by the constructor but ignored.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, zeroBased=True, multinomialTreatment=orange.DomainContinuizer.NValues,
                 continuousTreatment=orange.DomainContinuizer.Leave,
                 classTreatment=orange.DomainContinuizer.Ignore,
                 **kwargs):
        self.zeroBased = zeroBased
        self.multinomialTreatment = multinomialTreatment
        self.continuousTreatment = continuousTreatment
        self.classTreatment = classTreatment

    def __call__(self, data, weightId=0):
        """ Build a continuized domain for `data` and return `data`
        translated into that domain.
        """
        settings = dict(zeroBased=self.zeroBased,
                        multinomialTreatment=self.multinomialTreatment,
                        continuousTreatment=self.continuousTreatment,
                        classTreatment=self.classTreatment)
        make_domain = orange.DomainContinuizer(**settings)
        return data.translate(make_domain(data, weightId))
179   
class Preprocessor_removeDiscrete(Preprocessor_continuize):
    """ A preprocessor that keeps only the continuous features of the data
    (the class variable and meta attributes are retained).
    """
    __new__ = _orange__new__(Preprocessor_continuize)

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its continuous attributes.
        `weightId` is accepted but not used.
        """
        source = data.domain
        kept = []
        for var in source.attributes:
            if var.varType == orange.VarTypes.Continuous:
                kept.append(var)
        reduced = orange.Domain(kept, source.classVar)
        reduced.addmetas(source.getmetas())
        return orange.ExampleTable(reduced, data)
190         
class Preprocessor_impute(orange.Preprocessor):
    """ A preprocessor that imputes unknown values using a learner.

    :param model: the learner used for imputation (default: a new
                  ``orange.MajorityLearner()``). Extra keyword arguments
                  are accepted but ignored.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, model=None, **kwargs):
        if model is None:
            model = orange.MajorityLearner()
        self.model = model

    def __call__(self, data, weightId=0):
        """ Run ``orange.Preprocessor_imputeByLearner`` on `data` with the
        stored learner and return its result. `weightId` is not used.
        """
        learner = self.model
        return orange.Preprocessor_imputeByLearner(data, learner=learner)
205
def bestN(attrMeasures, N=10):
    """ Return the best N attributes, i.e. the last N items of
    `attrMeasures` (which is expected to be sorted from worst to best).

    If N exceeds the number of items, all of them are returned; if N is
    zero or negative, an empty sequence is returned.
    """
    # A plain `attrMeasures[-N:]` would return *everything* for N == 0
    # (since -0 == 0), so non-positive limits are handled explicitly.
    if N <= 0:
        return attrMeasures[:0]
    return attrMeasures[-N:]
210
def bestP(attrMeasures, P=10):
    """ Return the best P percent of attributes: the trailing items of
    `attrMeasures`, rounded up to a whole number and never fewer than one.
    """
    total = len(attrMeasures)
    wanted = int(math.ceil(total * P / 100.0))
    if wanted < 1:
        # Always ask for at least one item, even for tiny percentages.
        wanted = 1
    return attrMeasures[-wanted:]
216
class Preprocessor_featureSelection(orange.Preprocessor):
    """ A preprocessor that runs feature selection using a feature scoring function.

    :param measure: a scoring function (default: a new
                    orange.MeasureAttribute_relief instance)
    :param filter: a filter function to use for selection (default Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    bestN = staticmethod(bestN)
    bestP = staticmethod(bestP)

    def __init__(self, measure=None, filter=None, limit=10):
        # Create the default measure per instance; a default built in the
        # signature would be a single object shared by all preprocessors.
        if measure is None:
            measure = orange.MeasureAttribute_relief()
        self.measure = measure
        self.filter = filter if filter is not None else self.bestN
        self.limit = limit

    def attrScores(self, data):
        """ Return a list of ``(score, attribute)`` pairs for all attributes
        in `data`, sorted in ascending order.
        """
        measures = sorted([(self.measure(attr, data), attr) for attr in data.domain.attributes])
        return measures

    def __call__(self, data, weightId=None):
        """ Score the attributes of `data`, let the filter choose among the
        sorted scores and return `data` restricted to the chosen attributes
        (class variable and meta attributes are kept).
        """
        measures = self.attrScores(data)
        attrs = [attr for _, attr in self.filter(measures, self.limit)]
        domain = orange.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(domain, data)
248   
class Preprocessor_RFE(Preprocessor_featureSelection):
    """ A preprocessor that runs RFE(Recursive Feature Elimination) using
    linear SVM derived attribute weights.

    :param filter: a filter function to use for selection (default
                   Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor_featureSelection)
    __reduce__ = _orange__reduce__
    def __init__(self, filter=None, limit=10):
        # The parent constructor is not called, so no `measure` is set here.
        self.limit = limit
        self.filter = filter if filter is not None else self.bestN

    def __call__(self, data, weightId=None):
        """ Return the result of running RFE on `data`, asking it to keep as
        many features as the filter selects from the attribute indices.
        """
        from Orange.classification.svm import RFE
        rfe = RFE()
        # The filter decides how many *features* to keep, so it must be fed
        # the attribute count, not the number of examples in the table.
        n_attributes = len(data.domain.attributes)
        filtered = self.filter(range(n_attributes), self.limit)
        return rfe(data, len(filtered))
269   
def selectNRandom(examples, N=10):
    """ Select N random examples.

    If `examples` holds fewer than N items, all of them are returned (in
    random order) instead of raising ``ValueError``.
    """
    import random
    # random.sample refuses a sample larger than the population.
    return random.sample(examples, min(N, len(examples)))
275
def selectPRandom(examples, P=10):
    """ Select P percent random examples.

    The sample size is rounded up and is at least one, but never larger
    than the number of available examples (so empty input gives an empty
    result and P above 100 returns everything).
    """
    import random
    count = len(examples)
    wanted = max(int(math.ceil(count * P / 100.0)), 1)
    # random.sample refuses a sample larger than the population.
    return random.sample(examples, min(wanted, count))
282
class Preprocessor_sample(orange.Preprocessor):
    """ A preprocessor that returns a sampled subset of the data.

    :param filter: the function that picks the examples (default
                   Preprocessor_sample.selectNRandom)
    :param limit: the limit passed to the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    selectNRandom = staticmethod(selectNRandom)
    selectPRandom = staticmethod(selectPRandom)

    def __init__(self, filter=None, limit=10):
        if filter is None:
            filter = self.selectNRandom
        self.filter = filter
        self.limit = limit

    def __call__(self, data, weightId=None):
        """ Apply the filter to `data` and return the chosen examples as a
        new table over the same domain. `weightId` is not used.
        """
        chosen = self.filter(data, self.limit)
        return orange.ExampleTable(data.domain, chosen)
303   
304
class Preprocessor_preprocessorList(orange.Preprocessor):
    """ A preprocessor wrapping a sequence of other preprocessors.

    :param preprocessors: a list of :obj:`Preprocessor` instances (default:
                          a new empty list)

    """

    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, preprocessors=None):
        # A `[]` default in the signature would be one list shared by every
        # instance; create a fresh list per instance instead.
        self.preprocessors = [] if preprocessors is None else preprocessors

    def __call__(self, data, weightId=None):
        """ Apply the wrapped preprocessors to `data` one after another.

        A preprocessor is called with ``(data, weightId)`` once a weight id
        is known (given by the caller or returned by an earlier
        preprocessor), otherwise with ``data`` only. A tuple result is
        unpacked into the new data and weight id.

        Returns ``(data, weightId)`` if the caller supplied a `weightId`,
        otherwise just the data.
        """
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            t = preprocessor(data, weightId) if hasWeight else preprocessor(data)
            if isinstance(t, tuple):
                data, weightId = t
                hasWeight = True
            else:
                data = t
        # Return in the same shape the caller used: a pair only when a
        # weight id was passed in.
        if hadWeight:
            return data, weightId
        else:
            return data
332       
Note: See TracBrowser for help on using the repository browser.