source: orange/Orange/preprocess/__init__.py @ 9754:060889552ecf

Revision 9754:060889552ecf, 13.4 KB checked in by gregorr, 2 years ago (diff)

Added new documentation: Orange.classification.random and Orange.preprocess.RemoveUnusedValues.

Line 
1"""
2.. autoclass:: Preprocessor_discretizeEntropy
3
4.. autoclass:: Preprocessor_removeContinuous
5
6.. autoclass:: Preprocessor_continuize
7
8.. autoclass:: Preprocessor_removeDiscrete
9
10.. autoclass:: Preprocessor_impute
11
12.. autoclass:: Preprocessor_featureSelection
13
14.. autofunction:: bestP
15
16.. autofunction:: bestN
17
18.. autofunction:: selectNRandom
19
20.. autofunction:: selectPRandom
21
22.. autoclass:: Preprocessor_RFE
23
24.. autoclass:: Preprocessor_sample
25
26.. autoclass:: Preprocessor_preprocessorList
27
28.. class:: RemoveUnusedValues(variable, data, remove_one_valued=False)
29
30    Often the definition of a discrete attribute declares values that
31    do not actually appear in the data, either originally or as a
32    consequence of some preprocessing. Such anomalies are taken
33    care of by class RemoveUnusedValues that, given an attribute and the
34    data, determines whether there are any unused values and reduces the
35    attribute if needed.
36
37    :param variable: :class:`Orange.data.variable.Variable`
38    :param data: :class:`Orange.data.Table`
39    :param remove_one_valued: Decides whether to remove or to retain
40        the attributes with only one value defined (default: False).
41   
42    Example:
43   
44    .. literalinclude:: code/unusedValues.py   
45
46    There are four possible outcomes:
47   
48    1. The variable does not have any used values in the data - value
49    of this variable is undefined for all examples. The variable is
50    thus useless and the class returns None.
51
52    2. The variable has only one used value (or, possibly, only one
53    value at all). Such a variable is in fact useless, and can
54    probably be removed without harm. Nevertheless, its fate is
55    decided by the flag remove_one_valued which is False by default,
56    so such variables are retained unless explicitly specified
57    otherwise.
58
59    3. All variable's values occur in the data (and the variable has more
60    than one value; otherwise the above case applies). The original variable
61    is returned.
62
63    4. There are some unused values. A new variable is constructed and the
64    unused values are omitted. The value of the new variable is computed
65    automatically from the value of the original variable;
66    :class:`Orange.classification.lookup.ClassifierByLookupTable` is used
67    for the mapping.
68   
69    Results of example:
70   
71    .. literalinclude:: code/unusedValues.res
72   
73    Variables a and y are OK and are left alone. In b, value 1 is not used
74    and is removed (not in the original variable, of course; a new variable
75    is created). c is useless and is removed altogether. d is retained since
76    remove_one_valued was left at False; if we set it to True, this variable
77    would be removed as well.
78
79"""
80
81from orange import \
82     DomainContinuizer, \
83    VariableFilterMap, \
84    ValueFilter, \
85         ValueFilter_continuous, \
86         ValueFilter_discrete, \
87         ValueFilter_string, \
88         ValueFilter_stringList, \
89    ValueFilterList, \
90    TransformValue, \
91         Discrete2Continuous, \
92         Discretizer, \
93              BiModalDiscretizer, \
94              EquiDistDiscretizer, \
95              IntervalDiscretizer, \
96              ThresholdDiscretizer, \
97         MapIntValue, \
98         NormalizeContinuous, \
99         Ordinal2Continuous, \
100         TransformValue_IsDefined, \
101    TableAverager, \
102    Preprocessor, \
103         Preprocessor_addCensorWeight, \
104         Preprocessor_addClassNoise, \
105         Preprocessor_addClassWeight, \
106         Preprocessor_addGaussianClassNoise, \
107         Preprocessor_addGaussianNoise, \
108         Preprocessor_addMissing, \
109         Preprocessor_addMissingClasses, \
110         Preprocessor_addNoise, \
111         Preprocessor_discretize, \
112         Preprocessor_drop, \
113         Preprocessor_dropMissing, \
114         Preprocessor_dropMissingClasses, \
115         Preprocessor_filter, \
116         Preprocessor_ignore, \
117         Preprocessor_imputeByLearner, \
118         Preprocessor_removeDuplicates, \
119         Preprocessor_select, \
120         Preprocessor_shuffle, \
121         Preprocessor_take, \
122         Preprocessor_takeMissing, \
123         Preprocessor_takeMissingClasses, \
124    Imputer, \
125         Imputer_asValue, \
126         Imputer_defaults, \
127         Imputer_model, \
128         Imputer_random, \
129    ImputerConstructor, \
130         ImputerConstructor_asValue, \
131         ImputerConstructor_average, \
132         ImputerConstructor_maximal, \
133         ImputerConstructor_minimal, \
134         ImputerConstructor_model, \
135         ImputerConstructor_random, \
136    FilterList, \
137    Filter, \
138         Filter_conjunction, \
139         Filter_disjunction, \
140         Filter_hasClassValue, \
141         Filter_hasMeta, \
142         Filter_hasSpecial, \
143         Filter_isDefined, \
144         Filter_random, \
145         Filter_sameValue, \
146         Filter_values, \
147    Discretization, \
148         BiModalDiscretization, \
149         EntropyDiscretization, \
150         EquiDistDiscretization, \
151         EquiNDiscretization, \
152    DomainTransformerConstructor, \
153    RemoveRedundant, \
154         RemoveRedundantByInduction, \
155         RemoveRedundantByQuality, \
156         RemoveRedundantOneValue, \
157    RemoveUnusedValues
158
159import outliers
160
161
162import math
163
164import orange
165from Orange.misc import _orange__new__, _orange__reduce__
166
class Preprocessor_discretizeEntropy(Preprocessor_discretize):
    """ A discretizer that uses the orange.EntropyDiscretization method but,
    unlike the Preprocessor_discretize class, also removes unused attributes
    from the domain.

    :param method: the discretization to apply to every continuous
        attribute; must be an instance of orange.EntropyDiscretization
        (default: a new orange.EntropyDiscretization instance)

    """

    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __init__(self, method=None):
        # The default is created here, per instance. A default built in the
        # signature is evaluated once and would be shared by all instances.
        if method is None:
            method = orange.EntropyDiscretization()
        assert(isinstance(method, orange.EntropyDiscretization))
        self.method = method

    def __call__(self, data, wightId=0):
        """ Return a new orange.ExampleTable built from `data` in which every
        continuous attribute is replaced by its discretized version.

        A continuous attribute is dropped when the transformer of its
        discretized version has no `points`. Attributes of any other type
        are kept unchanged. The class variable and the meta attributes of
        the original domain are carried over.

        :param data: the data to discretize
        :param wightId: not used by this method (the misspelled name is kept
            so that existing keyword callers keep working)

        """
        newattr_list = []
        for attr in data.domain.attributes:
            if attr.varType == orange.VarTypes.Continuous:
                newattr = self.method(attr, data)
                # An empty list of points means the attribute is left out.
                if newattr.getValueFrom.transformer.points:
                    newattr_list.append(newattr)
            else:
                newattr_list.append(attr)
        newdomain = orange.Domain(newattr_list, data.domain.classVar)
        newdomain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(newdomain, data)
193   
class Preprocessor_removeContinuous(Preprocessor_discretize):
    """ A preprocessor that removes all continuous features; only the
    attributes whose type is discrete are kept.
    """
    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __call__(self, data, weightId=None):
        """ Return a new orange.ExampleTable built from `data` whose domain
        holds only the discrete attributes, together with the original class
        variable and meta attributes. `weightId` is not used.
        """
        kept = []
        for feature in data.domain.attributes:
            if feature.varType == orange.VarTypes.Discrete:
                kept.append(feature)
        reduced = orange.Domain(kept, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return orange.ExampleTable(reduced, data)
205               
class Preprocessor_continuize(orange.Preprocessor):
    """ A preprocessor that turns a discrete domain into a continuous one
    (and optionally normalizes it). The accepted arguments are those of
    :obj:`Orange.feature.continuization.DomainContinuizer`.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, zeroBased=True, multinomialTreatment=orange.DomainContinuizer.NValues,
                 continuousTreatment=orange.DomainContinuizer.Leave,
                 classTreatment=orange.DomainContinuizer.Ignore,
                 **kwargs):
        # Any additional keyword arguments are accepted but not stored.
        self.zeroBased = zeroBased
        self.multinomialTreatment = multinomialTreatment
        self.continuousTreatment = continuousTreatment
        self.classTreatment = classTreatment

    def __call__(self, data, weightId=0):
        """ Build a DomainContinuizer from the stored settings, apply it to
        `data` (with `weightId`) to obtain the continuized domain, and return
        `data` translated into that domain.
        """
        settings = dict(zeroBased=self.zeroBased,
                        multinomialTreatment=self.multinomialTreatment,
                        continuousTreatment=self.continuousTreatment,
                        classTreatment=self.classTreatment)
        make_domain = orange.DomainContinuizer(**settings)
        return data.translate(make_domain(data, weightId))
230   
class Preprocessor_removeDiscrete(Preprocessor_continuize):
    """ A preprocessor that removes all discrete attributes from the domain;
    only the attributes whose type is continuous are kept.
    """
    __new__ = _orange__new__(Preprocessor_continuize)

    def __call__(self, data, weightId=None):
        """ Return a new orange.ExampleTable built from `data` whose domain
        holds only the continuous attributes, together with the original
        class variable and meta attributes. `weightId` is not used.
        """
        kept = []
        for feature in data.domain.attributes:
            if feature.varType == orange.VarTypes.Continuous:
                kept.append(feature)
        reduced = orange.Domain(kept, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return orange.ExampleTable(reduced, data)
241         
class Preprocessor_impute(orange.Preprocessor):
    """ A preprocessor that fills in unknown values using a learner.

    :param model: the learner to impute with (default: a new
        orange.MajorityLearner instance)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, model=None, **kwargs):
        # Any additional keyword arguments are accepted but not stored.
        if model is None:
            model = orange.MajorityLearner()
        self.model = model

    def __call__(self, data, weightId=0):
        """ Run orange.Preprocessor_imputeByLearner on `data` with the stored
        learner and return its result. `weightId` is not used.
        """
        learner = self.model
        return orange.Preprocessor_imputeByLearner(data, learner=learner)
256
def bestN(attrMeasures, N=10):
    """ Return the best N attributes, i.e. the last N items of
    `attrMeasures` (assumed to be ordered from worst to best).

    If N exceeds the number of items, all of them are returned. If N is
    zero or negative, an empty slice is returned.
    """
    # attrMeasures[-0:] equals attrMeasures[0:], i.e. the whole sequence, so
    # non-positive N needs explicit handling to yield nothing.
    if N <= 0:
        return attrMeasures[:0]
    return attrMeasures[-N:]
261
def bestP(attrMeasures, P=10):
    """ Return the best P percent of attributes, taken from the end of
    `attrMeasures`. The percentage is rounded up and at least one item is
    always requested.
    """
    total = len(attrMeasures)
    wanted = int(math.ceil(total * P / 100.0))
    if wanted < 1:
        wanted = 1
    return attrMeasures[-wanted:]
267
class Preprocessor_featureSelection(orange.Preprocessor):
    """ A preprocessor that runs feature selection using a feature scoring function.

    :param measure: a scoring function (default: a new
        orange.MeasureAttribute_relief instance)
    :param filter: a filter function to use for selection (default Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    bestN = staticmethod(bestN)
    bestP = staticmethod(bestP)

    def __init__(self, measure=None, filter=None, limit=10):
        # The default measure is created here, per instance. A default built
        # in the signature is evaluated once and would be shared by all
        # instances.
        self.measure = orange.MeasureAttribute_relief() if measure is None else measure
        self.filter = filter if filter is not None else self.bestN
        self.limit = limit

    def attrScores(self, data):
        """ Return a sorted list of (score, attribute) pairs, one for each
        attribute in the domain of `data`, in ascending order.
        """
        measures = sorted([(self.measure(attr, data), attr) for attr in data.domain.attributes])
        return measures

    def __call__(self, data, weightId=None):
        """ Score the attributes of `data`, pass the sorted scores and the
        limit to the filter function, and return a new orange.ExampleTable
        restricted to the attributes it selected. The class variable and the
        meta attributes are carried over. `weightId` is not used.
        """
        measures = self.attrScores(data)
        attrs = [attr for _, attr in self.filter(measures, self.limit)]
        domain = orange.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(domain, data)
299   
class Preprocessor_RFE(Preprocessor_featureSelection):
    """ A preprocessor that runs RFE(Recursive Feature Elimination) using
    linear SVM derived attribute weights.

    :param filter: a filter function to use for selection (default
                   Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor_featureSelection)
    __reduce__ = _orange__reduce__
    def __init__(self, filter=None, limit=10):
        self.limit = limit
        self.filter = filter if filter is not None else self.bestN

    def __call__(self, data, weightId=None):
        """ Run RFE on `data` and return its result. `weightId` is not used.

        The filter function is applied to a sequence of attribute indices;
        only the length of what it returns is used, as the number of
        features passed to RFE.
        """
        from Orange.classification.svm import RFE
        rfe = RFE()
        # Count attributes, not examples: the filter decides how many
        # features to keep, so it must see one item per attribute.
        n_attributes = len(data.domain.attributes)
        filtered = self.filter(range(n_attributes), self.limit)
        return rfe(data, len(filtered))
320   
def selectNRandom(examples, N=10):
    """ Select N random examples (without replacement).

    If `examples` holds fewer than N items, all of them are returned in
    random order rather than raising an error.
    """
    import random
    # random.sample raises ValueError when asked for more items than exist.
    return random.sample(examples, min(N, len(examples)))
326
def selectPRandom(examples, P=10):
    """ Select P percent random examples (without replacement).

    The percentage is rounded up and at least one example is selected when
    any exist. The sample size never exceeds the number of examples, so
    P > 100 returns all examples and empty input returns an empty list.
    """
    import random
    count = len(examples)
    wanted = max(int(math.ceil(count * P / 100.0)), 1)
    # random.sample raises ValueError when asked for more items than exist.
    return random.sample(examples, min(wanted, count))
333
class Preprocessor_sample(orange.Preprocessor):
    """ A preprocessor that keeps only a sampled subset of the data.

    :param filter: a filter function to use for selection (default
                   Preprocessor_sample.selectNRandom)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    selectNRandom = staticmethod(selectNRandom)
    selectPRandom = staticmethod(selectPRandom)

    def __init__(self, filter=None, limit=10):
        if filter is None:
            filter = self.selectNRandom
        self.filter = filter
        self.limit = limit

    def __call__(self, data, weightId=None):
        """ Apply the filter function to `data` with the stored limit and
        return the chosen examples as a new orange.ExampleTable over the
        original domain. `weightId` is not used.
        """
        chosen = self.filter(data, self.limit)
        return orange.ExampleTable(data.domain, chosen)
354   
355
class Preprocessor_preprocessorList(orange.Preprocessor):
    """ A preprocessor wrapping a sequence of other preprocessors.

    :param preprocessors: a list of :obj:`Preprocessor` instances
        (default: a new empty list)

    """

    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, preprocessors=None):
        # A fresh list per instance. A `[]` default in the signature is a
        # single object shared (and mutated) across all instances.
        self.preprocessors = [] if preprocessors is None else preprocessors

    def __call__(self, data, weightId=None):
        """ Apply the wrapped preprocessors to `data` one after another.

        Each preprocessor is called with the current weight id once one is
        available, and with the data alone otherwise. A preprocessor that
        returns a tuple is taken to return (data, weightId), and that weight
        id is passed on to the preprocessors that follow.

        :param data: the data to preprocess
        :param weightId: an optional weight id for the preprocessors

        Returns (data, weightId) if a `weightId` was given by the caller,
        otherwise the preprocessed data alone.
        """
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            t = preprocessor(data, weightId) if hasWeight else preprocessor(data)
            if isinstance(t, tuple):
                data, weightId = t
                hasWeight = True
            else:
                data = t
        # The return shape depends only on whether the caller passed a weight.
        if hadWeight:
            return data, weightId
        else:
            return data
383       
Note: See TracBrowser for help on using the repository browser.