source: orange/Orange/preprocess/__init__.py @ 9755:0885bc601c1a

Revision 9755:0885bc601c1a, 13.2 KB checked in by gregorr, 2 years ago (diff)

Additional documentation change (Orange.preprocess).

Line 
1"""
2.. autoclass:: Preprocessor_discretizeEntropy
3
4.. autoclass:: Preprocessor_removeContinuous
5
6.. autoclass:: Preprocessor_continuize
7
8.. autoclass:: Preprocessor_removeDiscrete
9
10.. autoclass:: Preprocessor_impute
11
12.. autoclass:: Preprocessor_featureSelection
13
14.. autofunction:: bestP
15
16.. autofunction:: bestN
17
18.. autofunction:: selectNRandom
19
20.. autofunction:: selectPRandom
21
22.. autoclass:: Preprocessor_RFE
23
24.. autoclass:: Preprocessor_sample
25
26.. autoclass:: Preprocessor_preprocessorList
27
28.. class:: RemoveUnusedValues(variable, data, remove_one_valued=False)
29
30    Removes unused values and reduces the variable, if a variable
31    declares values that do not appear in the data.
32
33    :param variable: :class:`Orange.data.variable.Variable`
34    :param data: :class:`Orange.data.Table`
35    :param remove_one_valued: Decides whether to remove or to retain
36        the attributes with only one value defined (default: False).
37   
38    Example:
39   
40    .. literalinclude:: code/unusedValues.py   
41
42    There are four possible outcomes:
43   
44    1. The variable does not have any used values in the data - value
45    of this variable is undefined for all examples. The variable is
46    thus useless and the class returns None.
47
48    2. The variable has only one used value (or, possibly, only one
49    value at all). Such a variable is in fact useless, and can
50    probably be removed without harm. Nevertheless, its fate is
51    decided by the flag remove_one_valued which is False by default,
52    so such variables are retained unless explicitly specified
53    otherwise.
54
55    3. All variable's values occur in the data (and the variable has more
56    than one value; otherwise the above case applies). The original variable
57    is returned.
58
59    4. There are some unused values. A new variable is constructed and the
60    unused values are omitted. The value of the new variable is computed
61    automatically from the value of the original variable;
62    :class:`Orange.classification.lookup.ClassifierByLookupTable` is used
63    for mapping.
64   
65    Results of example:
66   
67    .. literalinclude:: code/unusedValues.res
68   
69    Variables a and y are OK and are left alone. In b, value 1 is not used
70    and is removed (not in the original variable, of course; a new variable
71    is created). c is useless and is removed altogether. d is retained since
72    remove_one_valued was left at False; if we set it to True, this variable
73    would be removed as well.
74
75"""
76
77from orange import \
78     DomainContinuizer, \
79    VariableFilterMap, \
80    ValueFilter, \
81         ValueFilter_continuous, \
82         ValueFilter_discrete, \
83         ValueFilter_string, \
84         ValueFilter_stringList, \
85    ValueFilterList, \
86    TransformValue, \
87         Discrete2Continuous, \
88         Discretizer, \
89              BiModalDiscretizer, \
90              EquiDistDiscretizer, \
91              IntervalDiscretizer, \
92              ThresholdDiscretizer, \
93         MapIntValue, \
94         NormalizeContinuous, \
95         Ordinal2Continuous, \
96         TransformValue_IsDefined, \
97    TableAverager, \
98    Preprocessor, \
99         Preprocessor_addCensorWeight, \
100         Preprocessor_addClassNoise, \
101         Preprocessor_addClassWeight, \
102         Preprocessor_addGaussianClassNoise, \
103         Preprocessor_addGaussianNoise, \
104         Preprocessor_addMissing, \
105         Preprocessor_addMissingClasses, \
106         Preprocessor_addNoise, \
107         Preprocessor_discretize, \
108         Preprocessor_drop, \
109         Preprocessor_dropMissing, \
110         Preprocessor_dropMissingClasses, \
111         Preprocessor_filter, \
112         Preprocessor_ignore, \
113         Preprocessor_imputeByLearner, \
114         Preprocessor_removeDuplicates, \
115         Preprocessor_select, \
116         Preprocessor_shuffle, \
117         Preprocessor_take, \
118         Preprocessor_takeMissing, \
119         Preprocessor_takeMissingClasses, \
120    Imputer, \
121         Imputer_asValue, \
122         Imputer_defaults, \
123         Imputer_model, \
124         Imputer_random, \
125    ImputerConstructor, \
126         ImputerConstructor_asValue, \
127         ImputerConstructor_average, \
128         ImputerConstructor_maximal, \
129         ImputerConstructor_minimal, \
130         ImputerConstructor_model, \
131         ImputerConstructor_random, \
132    FilterList, \
133    Filter, \
134         Filter_conjunction, \
135         Filter_disjunction, \
136         Filter_hasClassValue, \
137         Filter_hasMeta, \
138         Filter_hasSpecial, \
139         Filter_isDefined, \
140         Filter_random, \
141         Filter_sameValue, \
142         Filter_values, \
143    Discretization, \
144         BiModalDiscretization, \
145         EntropyDiscretization, \
146         EquiDistDiscretization, \
147         EquiNDiscretization, \
148    DomainTransformerConstructor, \
149    RemoveRedundant, \
150         RemoveRedundantByInduction, \
151         RemoveRedundantByQuality, \
152         RemoveRedundantOneValue, \
153    RemoveUnusedValues
154
155import outliers
156
157
158import math
159
160import orange
161from Orange.misc import _orange__new__, _orange__reduce__
162
class Preprocessor_discretizeEntropy(Preprocessor_discretize):
    """ A discretizer that uses the orange.EntropyDiscretization method but,
    unlike the Preprocessor_discretize class, also removes unused attributes
    from the domain.

    :param method: an :obj:`orange.EntropyDiscretization` instance; a new
        one is created for every preprocessor when omitted.

    """

    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __init__(self, method=None):
        # Create the default lazily: a default built in the signature would
        # be a single object shared (and reconfigurable) by all instances.
        if method is None:
            method = orange.EntropyDiscretization()
        # Explicit check rather than ``assert`` so it is not stripped by -O,
        # performed before the attribute is assigned.
        if not isinstance(method, orange.EntropyDiscretization):
            raise TypeError("method must be an orange.EntropyDiscretization "
                            "instance, got %r" % (method,))
        self.method = method

    def __call__(self, data, wightId=0):
        """ Return a new table in which continuous attributes are replaced
        by their discretized versions.

        Continuous attributes whose discretization yields no cut-off points
        are dropped; all other attributes, the class variable and the meta
        attributes are retained.

        NOTE: the second parameter is spelled ``wightId`` (sic); the name is
        kept for backward compatibility and the value is not used.
        """
        newattr_list = []
        for attr in data.domain.attributes:
            if attr.varType != orange.VarTypes.Continuous:
                newattr_list.append(attr)
                continue
            newattr = self.method(attr, data)
            # No cut-off points means the attribute could not be split.
            if newattr.getValueFrom.transformer.points:
                newattr_list.append(newattr)
        newdomain = orange.Domain(newattr_list, data.domain.classVar)
        newdomain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(newdomain, data)
189   
class Preprocessor_removeContinuous(Preprocessor_discretize):
    """ A preprocessor that drops every continuous feature, keeping only
    the discrete ones (the class variable and metas are retained).
    """
    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its discrete attributes.
        `weightId` is accepted but not used.
        """
        kept = []
        for variable in data.domain.attributes:
            if variable.varType == orange.VarTypes.Discrete:
                kept.append(variable)
        reduced = orange.Domain(kept, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return orange.ExampleTable(reduced, data)
201               
class Preprocessor_continuize(orange.Preprocessor):
    """ A preprocessor that turns a discrete domain into a continuous one
    (optionally normalizing it).
    See :obj:`Orange.feature.continuization.DomainContinuizer` for the list
    of accepted arguments.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, zeroBased=True, multinomialTreatment=orange.DomainContinuizer.NValues,
                 continuousTreatment=orange.DomainContinuizer.Leave,
                 classTreatment=orange.DomainContinuizer.Ignore,
                 **kwargs):
        # Any additional keyword arguments are accepted and ignored.
        self.zeroBased = zeroBased
        self.multinomialTreatment = multinomialTreatment
        self.continuousTreatment = continuousTreatment
        self.classTreatment = classTreatment

    def __call__(self, data, weightId=0):
        """ Build a continuized domain for `data` and return `data`
        translated into that domain.
        """
        settings = dict(
            zeroBased=self.zeroBased,
            multinomialTreatment=self.multinomialTreatment,
            continuousTreatment=self.continuousTreatment,
            classTreatment=self.classTreatment,
        )
        make_domain = orange.DomainContinuizer(**settings)
        return data.translate(make_domain(data, weightId))
226   
class Preprocessor_removeDiscrete(Preprocessor_continuize):
    """ A preprocessor that drops every discrete attribute, keeping only
    the continuous ones (the class variable and metas are retained).
    """
    __new__ = _orange__new__(Preprocessor_continuize)

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its continuous attributes.
        `weightId` is accepted but not used.
        """
        source = data.domain
        continuous = []
        for variable in source.attributes:
            if variable.varType == orange.VarTypes.Continuous:
                continuous.append(variable)
        target = orange.Domain(continuous, source.classVar)
        target.addmetas(source.getmetas())
        return orange.ExampleTable(target, data)
237         
class Preprocessor_impute(orange.Preprocessor):
    """ A preprocessor that imputes unknown values using a learner.

    :param model: the learner used for imputation; when omitted, a new
        ``orange.MajorityLearner()`` is used.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, model=None, **kwargs):
        # Additional keyword arguments are accepted and ignored.
        if model is None:
            model = orange.MajorityLearner()
        self.model = model

    def __call__(self, data, weightId=0):
        """ Delegate to ``orange.Preprocessor_imputeByLearner`` with the
        stored learner. `weightId` is accepted but not passed on.
        """
        learner = self.model
        return orange.Preprocessor_imputeByLearner(data, learner=learner)
252
def bestN(attrMeasures, N=10):
    """ Return the best N attributes, i.e. the last N items of
    `attrMeasures` (a sequence sorted by ascending score).

    If N exceeds the length of the sequence, all items are returned; if N
    is zero or negative, an empty slice is returned.
    """
    # A plain ``attrMeasures[-N:]`` returns the whole sequence for N == 0
    # (because -0 == 0), so compute the start index explicitly.
    start = max(len(attrMeasures) - N, 0)
    return attrMeasures[start:]
257
def bestP(attrMeasures, P=10):
    """ Return the best P percent of attributes: the tail of
    `attrMeasures` whose length is P percent of the total, rounded up,
    and never fewer than one item.
    """
    total = len(attrMeasures)
    keep = int(math.ceil(total * P / 100.0))
    if keep < 1:
        keep = 1
    return attrMeasures[-keep:]
263
class Preprocessor_featureSelection(orange.Preprocessor):
    """ A preprocessor that runs feature selection using a feature scoring
    function.

    :param measure: a scoring function (default: orange.MeasureAttribute_relief)
    :param filter: a filter function to use for selection (default
        Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    bestN = staticmethod(bestN)
    bestP = staticmethod(bestP)

    def __init__(self, measure=orange.MeasureAttribute_relief(), filter=None, limit=10):
        self.measure = measure
        if filter is None:
            filter = self.bestN
        self.filter = filter
        self.limit = limit

    def attrScores(self, data):
        """ Return a sorted list of ``(score, attribute)`` pairs, one for
        each attribute in the domain of `data`.
        """
        scored = []
        for attr in data.domain.attributes:
            scored.append((self.measure(attr, data), attr))
        scored.sort()
        return scored

    def __call__(self, data, weightId=None):
        """ Score the attributes, let the filter pick among them and return
        `data` converted to a domain containing only the picked attributes
        (plus the class variable and metas). `weightId` is not used.
        """
        ranked = self.attrScores(data)
        selected = self.filter(ranked, self.limit)
        kept = [pair[1] for pair in selected]
        reduced = orange.Domain(kept, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return orange.ExampleTable(reduced, data)
295   
class Preprocessor_RFE(Preprocessor_featureSelection):
    """ A preprocessor that runs RFE (Recursive Feature Elimination) using
    linear SVM derived attribute weights.

    :param filter: a filter function to use for selection (default
                   Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor_featureSelection)
    __reduce__ = _orange__reduce__

    def __init__(self, filter=None, limit=10):
        self.limit = limit
        self.filter = filter if filter is not None else self.bestN

    def __call__(self, data, weightId=None):
        """ Run RFE on `data`, keeping as many features as the filter
        function selects. `weightId` is not used.
        """
        from Orange.classification.svm import RFE
        rfe = RFE()
        # The filter is used only to work out HOW MANY features to keep, so
        # it must be applied to the attribute indices; using the number of
        # examples (len(data)) would tie the count to the data set size.
        n_attributes = len(data.domain.attributes)
        filtered = self.filter(range(n_attributes), self.limit)
        return rfe(data, len(filtered))
316   
def selectNRandom(examples, N=10):
    """ Select N random examples (without replacement).

    If `examples` holds fewer than N items, all of them are returned in
    random order instead of raising a ValueError.
    """
    import random
    # random.sample refuses a sample larger than the population, which made
    # the default N=10 fail on small data sets; clamp to what is available.
    return random.sample(examples, min(N, len(examples)))
322
def selectPRandom(examples, P=10):
    """ Select P percent random examples (without replacement).

    The number of examples is P percent of the total, rounded up, at least
    one and never more than the number of available examples; an empty
    input therefore yields an empty list.
    """
    import random
    count = len(examples)
    wanted = max(int(math.ceil(count * P / 100.0)), 1)
    # Clamp to the population size: the minimum of one would otherwise
    # raise on empty input, as would any P above 100.
    return random.sample(examples, min(wanted, count))
329
class Preprocessor_sample(orange.Preprocessor):
    """ A preprocessor that draws a subset of the examples.

    :param filter: a filter function to use for selection (default
                   Preprocessor_sample.selectNRandom)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    selectNRandom = staticmethod(selectNRandom)
    selectPRandom = staticmethod(selectPRandom)

    def __init__(self, filter=None, limit=10):
        if filter is None:
            filter = self.selectNRandom
        self.filter = filter
        self.limit = limit

    def __call__(self, data, weightId=None):
        """ Return a new table, in the domain of `data`, holding the
        examples chosen by the filter function. `weightId` is not used.
        """
        chosen = self.filter(data, self.limit)
        return orange.ExampleTable(data.domain, chosen)
350   
351
class Preprocessor_preprocessorList(orange.Preprocessor):
    """ A preprocessor wrapping a sequence of other preprocessors.

    :param preprocessors: a list of :obj:`Preprocessor` instances
        (default: a new empty list)

    """

    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, preprocessors=None):
        # A ``[]`` default in the signature would be one list object shared
        # by every instance created without arguments; create it per
        # instance instead. An explicitly passed list is stored as is.
        self.preprocessors = [] if preprocessors is None else preprocessors

    def __call__(self, data, weightId=None):
        """ Apply the wrapped preprocessors to `data` in order.

        Each preprocessor is called with the current weight id once one is
        known (passed in, or returned by an earlier preprocessor as part of
        a ``(data, weightId)`` tuple), and with the data alone otherwise.

        Returns ``(data, weightId)`` if `weightId` was given, else just the
        resulting data.
        """
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            t = preprocessor(data, weightId) if hasWeight else preprocessor(data)
            if isinstance(t, tuple):
                # The preprocessor produced a weight id; pass it to the
                # following preprocessors from now on.
                data, weightId = t
                hasWeight = True
            else:
                data = t
        if hadWeight:
            return data, weightId
        else:
            return data
379       
Note: See TracBrowser for help on using the repository browser.