source: orange/Orange/preprocess/__init__.py @ 10053:25e7c56b6964

Revision 10053:25e7c56b6964, 13.1 KB checked in by gregorr, 2 years ago (diff)

Outliers moved from preprocess to Orange.data.

Line 
1"""
2.. autoclass:: Preprocessor_discretizeEntropy
3
4.. autoclass:: Preprocessor_removeContinuous
5
6.. autoclass:: Preprocessor_continuize
7
8.. autoclass:: Preprocessor_removeDiscrete
9
10.. autoclass:: Preprocessor_impute
11
12.. autoclass:: Preprocessor_featureSelection
13
14.. autofunction:: bestP
15
16.. autofunction:: bestN
17
18.. autofunction:: selectNRandom
19
20.. autofunction:: selectPRandom
21
22.. autoclass:: Preprocessor_RFE
23
24.. autoclass:: Preprocessor_sample
25
26.. autoclass:: Preprocessor_preprocessorList
27
28.. class:: RemoveUnusedValues(variable, data, remove_one_valued=False)
29
30    Removes unused values and reduces the variable, if a variable
31    declares values that do not appear in the data.
32
33    :param variable: :class:`Orange.feature.Descriptor`
34    :param data: :class:`Orange.data.Table`
35    :param remove_one_valued: Decides whether to remove or to retain
36        the attributes with only one value defined (default: False).
37   
38    Example:
39   
40    .. literalinclude:: code/unusedValues.py   
41
42    There are four possible outcomes:
43   
44    1. The variable does not have any used values in the data - value
45    of this variable is undefined for all examples. The variable is
46    thus useless and the class returns None.
47
48    2. The variable has only one used value (or, possibly, only one
49    value at all). Such a variable is in fact useless, and can
50    probably be removed without harm. Nevertheless, its fate is
51    decided by the flag remove_one_valued which is False by default,
52    so such variables are retained unless explicitly specified
53    otherwise.
54
55    3. All variable's values occur in the data (and the variable has more
56    than one value; otherwise the above case applies). The original variable
57    is returned.
58
59    4. There are some unused values. A new variable is constructed and the
60    unused values are omitted. The value of the new variable is computed
61    automatically from the value of the original variable;
62    :class:`Orange.classification.lookup.ClassifierByLookupTable` is used
63    for the mapping.
64   
65    Results of example:
66   
67    .. literalinclude:: code/unusedValues.res
68   
69    Variables a and y are OK and are left alone. In b, value 1 is not used
70    and is removed (not in the original variable, of course; a new variable
71    is created). c is useless and is removed altogether. d is retained since
72    remove_one_valued was left at False; if we set it to True, this variable
73    would be removed as well.
74
75"""
76
77from orange import \
78     DomainContinuizer, \
79    VariableFilterMap, \
80    ValueFilter, \
81         ValueFilter_continuous, \
82         ValueFilter_discrete, \
83         ValueFilter_string, \
84         ValueFilter_stringList, \
85    ValueFilterList, \
86    TransformValue, \
87         Discrete2Continuous, \
88         Discretizer, \
89              BiModalDiscretizer, \
90              EquiDistDiscretizer, \
91              IntervalDiscretizer, \
92              ThresholdDiscretizer, \
93         MapIntValue, \
94         NormalizeContinuous, \
95         Ordinal2Continuous, \
96         TransformValue_IsDefined, \
97    TableAverager, \
98    Preprocessor, \
99         Preprocessor_addCensorWeight, \
100         Preprocessor_addClassNoise, \
101         Preprocessor_addClassWeight, \
102         Preprocessor_addGaussianClassNoise, \
103         Preprocessor_addGaussianNoise, \
104         Preprocessor_addMissing, \
105         Preprocessor_addMissingClasses, \
106         Preprocessor_addNoise, \
107         Preprocessor_discretize, \
108         Preprocessor_drop, \
109         Preprocessor_dropMissing, \
110         Preprocessor_dropMissingClasses, \
111         Preprocessor_filter, \
112         Preprocessor_ignore, \
113         Preprocessor_imputeByLearner, \
114         Preprocessor_removeDuplicates, \
115         Preprocessor_select, \
116         Preprocessor_shuffle, \
117         Preprocessor_take, \
118         Preprocessor_takeMissing, \
119         Preprocessor_takeMissingClasses, \
120    Imputer, \
121         Imputer_asValue, \
122         Imputer_defaults, \
123         Imputer_model, \
124         Imputer_random, \
125    ImputerConstructor, \
126         ImputerConstructor_asValue, \
127         ImputerConstructor_average, \
128         ImputerConstructor_maximal, \
129         ImputerConstructor_minimal, \
130         ImputerConstructor_model, \
131         ImputerConstructor_random, \
132    FilterList, \
133    Filter, \
134         Filter_conjunction, \
135         Filter_disjunction, \
136         Filter_hasClassValue, \
137         Filter_hasMeta, \
138         Filter_hasSpecial, \
139         Filter_isDefined, \
140         Filter_random, \
141         Filter_sameValue, \
142         Filter_values, \
143    Discretization, \
144         BiModalDiscretization, \
145         EntropyDiscretization, \
146         EquiDistDiscretization, \
147         EquiNDiscretization, \
148    DomainTransformerConstructor, \
149    RemoveRedundant, \
150         RemoveRedundantByInduction, \
151         RemoveRedundantByQuality, \
152         RemoveRedundantOneValue, \
153    RemoveUnusedValues
154
155import math
156
157import orange
158from Orange.misc import _orange__new__, _orange__reduce__
159
class Preprocessor_discretizeEntropy(Preprocessor_discretize):
    """ A discretizer that uses the orange.EntropyDiscretization method but,
    unlike the Preprocessor_discretize class, also removes unused attributes
    from the domain.

    :param method: the discretization applied to every continuous attribute;
        must be an instance of orange.EntropyDiscretization. When omitted,
        a new orange.EntropyDiscretization() is created for this
        preprocessor.

    """

    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __init__(self, method=None):
        # Build the default per instance: a default created in the signature
        # would be evaluated once and shared by every preprocessor.
        if method is None:
            method = orange.EntropyDiscretization()
        # Explicit raise instead of ``assert`` so the check survives
        # ``python -O``; validate before storing the value.
        if not isinstance(method, orange.EntropyDiscretization):
            raise TypeError("method must be an orange.EntropyDiscretization "
                            "instance, got %r" % (method,))
        self.method = method

    def __call__(self, data, wightId=0):
        """ Return a new table in which continuous attributes are replaced
        by their discretized versions.

        A discretized attribute is kept only when its transformer has a
        non-empty ``points`` list; the others are dropped from the domain.
        Non-continuous attributes, the class variable and the meta
        attributes are carried over unchanged.

        :param data: the data table to discretize
        :param wightId: accepted for interface compatibility (the
            misspelled name is kept so existing callers keep working);
            it is not used by this method.

        """
        newattr_list = []
        for attr in data.domain.attributes:
            if attr.varType != orange.VarTypes.Continuous:
                newattr_list.append(attr)
                continue
            newattr = self.method(attr, data)
            # An empty ``points`` list presumably means no cut-off was
            # found, i.e. the attribute would have a single interval.
            if newattr.getValueFrom.transformer.points:
                newattr_list.append(newattr)
        newdomain = orange.Domain(newattr_list, data.domain.classVar)
        newdomain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(newdomain, data)
186   
class Preprocessor_removeContinuous(Preprocessor_discretize):
    """ A preprocessor that removes all continuous features: only the
    attributes whose type is discrete are kept in the resulting domain.
    """
    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its discrete attributes.

        The class variable and the meta attributes are preserved;
        `weightId` is accepted but not used.
        """
        kept = []
        for feature in data.domain.attributes:
            if feature.varType == orange.VarTypes.Discrete:
                kept.append(feature)
        reduced = orange.Domain(kept, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return orange.ExampleTable(reduced, data)
198               
class Preprocessor_continuize(orange.Preprocessor):
    """ A preprocessor that continuizes a discrete domain (and optionally
    normalizes it). The four settings are stored on the instance and handed
    to a new orange.DomainContinuizer on every call.
    See :obj:`Orange.feature.continuization.DomainContinuizer` for list of
    accepted arguments.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, zeroBased=True, multinomialTreatment=orange.DomainContinuizer.NValues,
                 continuousTreatment=orange.DomainContinuizer.Leave,
                 classTreatment=orange.DomainContinuizer.Ignore,
                 **kwargs):
        # Extra keyword arguments are accepted but not stored.
        self.zeroBased = zeroBased
        self.multinomialTreatment = multinomialTreatment
        self.continuousTreatment = continuousTreatment
        self.classTreatment = classTreatment

    def __call__(self, data, weightId=0):
        """ Build the continuized domain for `data` and return `data`
        translated into that domain.
        """
        settings = dict(zeroBased=self.zeroBased,
                        multinomialTreatment=self.multinomialTreatment,
                        continuousTreatment=self.continuousTreatment,
                        classTreatment=self.classTreatment)
        make_domain = orange.DomainContinuizer(**settings)
        return data.translate(make_domain(data, weightId))
223   
class Preprocessor_removeDiscrete(Preprocessor_continuize):
    """ A preprocessor that removes all discrete attributes from the domain:
    only the attributes whose type is continuous are kept.
    """
    __new__ = _orange__new__(Preprocessor_continuize)

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its continuous attributes.

        The class variable and the meta attributes are preserved;
        `weightId` is accepted but not used.
        """
        kept = []
        for feature in data.domain.attributes:
            if feature.varType == orange.VarTypes.Continuous:
                kept.append(feature)
        reduced = orange.Domain(kept, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return orange.ExampleTable(reduced, data)
234         
class Preprocessor_impute(orange.Preprocessor):
    """ A preprocessor that imputes unknown values using a learner.

    :param model: the learner passed to orange.Preprocessor_imputeByLearner
        (default: a new orange.MajorityLearner instance).

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, model=None, **kwargs):
        # Extra keyword arguments are accepted but not stored.
        if model is None:
            model = orange.MajorityLearner()
        self.model = model

    def __call__(self, data, weightId=0):
        """ Run orange.Preprocessor_imputeByLearner on `data` with the
        stored learner and return its result; `weightId` is not used.
        """
        learner = self.model
        return orange.Preprocessor_imputeByLearner(data, learner=learner)
249
def bestN(attrMeasures, N=10):
    """ Return best N attributes

    `attrMeasures` is taken to be ordered from worst to best (this is how
    Preprocessor_featureSelection.attrScores sorts it), so the best
    entries are the last `N` items. If fewer than `N` items are available
    all of them are returned; a non-positive `N` selects nothing.
    """
    if N <= 0:
        # ``seq[-0:]`` is ``seq[0:]`` (the whole sequence), so zero has to
        # be handled explicitly; an empty slice keeps the input's type.
        return attrMeasures[:0]
    return attrMeasures[-N:]
254
def bestP(attrMeasures, P=10):
    """ Return best P percent of attributes

    The number of items is rounded up and is never smaller than one; the
    items are taken from the end of `attrMeasures`.
    """
    share = len(attrMeasures) * P / 100.0
    keep = int(math.ceil(share))
    if keep < 1:
        keep = 1
    return attrMeasures[-keep:]
260
class Preprocessor_featureSelection(orange.Preprocessor):
    """ A preprocessor that runs feature selection using a feature scoring function.

    :param measure: a scoring function (default: orange.MeasureAttribute_relief).
        The default instance is created once, when the class is defined,
        and is shared by every preprocessor that does not pass its own.
    :param filter: a filter function to use for selection (default Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    bestN = staticmethod(bestN)
    bestP = staticmethod(bestP)

    def __init__(self, measure=orange.MeasureAttribute_relief(), filter=None, limit=10):
        self.measure = measure
        self.filter = filter if filter is not None else self.bestN
        self.limit = limit

    def attrScores(self, data):
        """ Return a list of ``(score, attribute)`` pairs for all attributes
        in `data`, ordered by increasing score.
        """
        measures = [(self.measure(attr, data), attr) for attr in data.domain.attributes]
        # Order by the score alone. Sorting the bare tuples would fall back
        # to comparing the attribute descriptors whenever two scores tie;
        # with a key the (stable) sort keeps tied attributes in domain order.
        measures.sort(key=lambda pair: pair[0])
        return measures

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` that keeps only the attributes chosen by
        ``self.filter(scores, self.limit)``.

        The class variable and the meta attributes are preserved;
        `weightId` is accepted but not used.
        """
        measures = self.attrScores(data)
        attrs = [attr for _, attr in self.filter(measures, self.limit)]
        domain = orange.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(domain, data)
292   
class Preprocessor_RFE(Preprocessor_featureSelection):
    """ A preprocessor that runs RFE(Recursive Feature Elimination) using
    linear SVM derived attribute weights.

    :param filter: a filter function to use for selection (default
                   Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor_featureSelection)
    __reduce__ = _orange__reduce__
    def __init__(self, filter=None, limit=10):
        # No scoring measure is stored: the selection is done by RFE.
        self.limit = limit
        self.filter = filter if filter is not None else self.bestN

    def __call__(self, data, weightId=None):
        """ Run RFE on `data` and return its result.

        The filter function is used only to work out how many features to
        keep: it is applied to one placeholder per attribute and the
        length of what it returns is passed to RFE. `weightId` is accepted
        but not used.
        """
        from Orange.classification.svm import RFE
        rfe = RFE()
        # Count attributes, not examples: the filter and its limit describe
        # how many *features* should survive the elimination.
        candidates = list(range(len(data.domain.attributes)))
        filtered = self.filter(candidates, self.limit)
        return rfe(data, len(filtered))
313   
def selectNRandom(examples, N=10):
    """ Select N random examples.

    When `examples` holds fewer than `N` items, all of them are returned
    (in random order) instead of raising ValueError.
    """
    import random
    # random.sample refuses a sample larger than the population.
    return random.sample(examples, min(N, len(examples)))
319
def selectPRandom(examples, P=10):
    """ Select P percent random examples.

    The sample size is rounded up and is at least one, but never larger
    than the number of available examples, so empty input yields an empty
    selection and `P` above 100 yields every example.
    """
    import random
    count = len(examples)
    wanted = max(int(math.ceil(count * P / 100.0)), 1)
    # random.sample refuses a sample larger than the population.
    return random.sample(examples, min(wanted, count))
326
class Preprocessor_sample(orange.Preprocessor):
    """ A preprocessor that samples a subset of the data.

    :param filter: a filter function to use for selection (default
                   Preprocessor_sample.selectNRandom)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    selectNRandom = staticmethod(selectNRandom)
    selectPRandom = staticmethod(selectPRandom)

    def __init__(self, filter=None, limit=10):
        if filter is None:
            filter = self.selectNRandom
        self.filter = filter
        self.limit = limit

    def __call__(self, data, weightId=None):
        """ Return a new table, over the domain of `data`, holding the
        examples picked by ``self.filter(data, self.limit)``; `weightId`
        is accepted but not used.
        """
        chosen = self.filter(data, self.limit)
        return orange.ExampleTable(data.domain, chosen)
347   
348
class Preprocessor_preprocessorList(orange.Preprocessor):
    """ A preprocessor wrapping a sequence of other preprocessors.

    :param preprocessors: a list of :obj:`Preprocessor` instances
        (default: a new empty list). A list passed in is stored as is,
        not copied.

    """

    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, preprocessors=None):
        # A ``[]`` default in the signature would be one list shared by
        # every instance; create a fresh one per instance instead.
        self.preprocessors = [] if preprocessors is None else preprocessors

    def __call__(self, data, weightId=None):
        """ Apply the wrapped preprocessors to `data` one after another.

        A preprocessor may return either the new data or a
        ``(data, weightId)`` tuple. Once a weight id is known (given by
        the caller or returned by an earlier preprocessor) it is passed on
        to the following preprocessors.

        :return: ``(data, weightId)`` if the caller supplied `weightId`,
            otherwise only the data.
        """
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            t = preprocessor(data, weightId) if hasWeight else preprocessor(data)
            if isinstance(t, tuple):
                data, weightId = t
                hasWeight = True
            else:
                data = t
        # The shape of the result depends on the caller's arguments only,
        # not on whether some preprocessor produced a weight on the way.
        if hadWeight:
            return data, weightId
        return data
376       
Note: See TracBrowser for help on using the repository browser.