source: orange/Orange/preprocess/__init__.py @ 10393:4dbd54af3ac8

Revision 10393:4dbd54af3ac8, 13.3 KB checked in by Lan Zagar <lan.zagar@…>, 2 years ago (diff)

Fixed object addresses appearing in documentation (#1103).

Line 
1"""
2.. autoclass:: Preprocessor_discretizeEntropy(method=Orange.feature.discretization.Entropy())
3
4.. autoclass:: Preprocessor_removeContinuous
5
6.. autoclass:: Preprocessor_continuize
7
8.. autoclass:: Preprocessor_removeDiscrete
9
10.. autoclass:: Preprocessor_impute
11
12.. autoclass:: Preprocessor_featureSelection(measure=Orange.feature.scoring.Relief(), filter=None, limit=10)
13
14.. autofunction:: bestP
15
16.. autofunction:: bestN
17
18.. autofunction:: selectNRandom
19
20.. autofunction:: selectPRandom
21
22.. autoclass:: Preprocessor_RFE
23
24.. autoclass:: Preprocessor_sample
25
26.. autoclass:: Preprocessor_preprocessorList
27
28.. class:: RemoveUnusedValues(variable, data, remove_one_valued=False)
29
30    Removes unused values and reduces the variable, if a variable
31    declares values that do not appear in the data.
32
33    :param variable: :class:`Orange.feature.Descriptor`
34    :param data: :class:`Orange.data.Table`
35    :param remove_one_valued: Decides whether to remove or to retain
36        the attributes with only one value defined (default: False).
37   
38    Example:
39   
40    .. literalinclude:: code/unusedValues.py   
41
42    There are four possible outcomes:
43   
44    1. The variable does not have any used values in the data - value
45    of this variable is undefined for all examples. The variable is
46    thus useless and the class returns None.
47
48    2. The variable has only one used value (or, possibly, only one
49    value at all). Such a variable is in fact useless, and can
50    probably be removed without harm. Nevertheless, its fate is
51    decided by the flag remove_one_valued which is False by default,
52    so such variables are retained unless explicitly specified
53    otherwise.
54
55    3. All of the variable's values occur in the data (and the variable has more
56    than one value; otherwise the above case applies). The original variable
57    is returned.
58
59    4. There are some unused values. A new variable is constructed and the
60    unused values are omitted. The value of the new variable is computed
61    automatically from the value of the original variable;
62    :class:`Orange.classification.lookup.ClassifierByLookupTable` is used
63    for mapping.
64   
65    Results of example:
66   
67    .. literalinclude:: code/unusedValues.res
68   
69    Variables a and y are OK and are left alone. In b, value 1 is not used
70    and is removed (not in the original variable, of course; a new variable
71    is created). c is useless and is removed altogether. d is retained since
72    remove_one_valued was left at False; if we set it to True, this variable
73    would be removed as well.
74
75"""
76
77from orange import \
78     DomainContinuizer, \
79    VariableFilterMap, \
80    ValueFilter, \
81         ValueFilter_continuous, \
82         ValueFilter_discrete, \
83         ValueFilter_string, \
84         ValueFilter_stringList, \
85    ValueFilterList, \
86    TransformValue, \
87         Discrete2Continuous, \
88         Discretizer, \
89              BiModalDiscretizer, \
90              EquiDistDiscretizer, \
91              IntervalDiscretizer, \
92              ThresholdDiscretizer, \
93         MapIntValue, \
94         NormalizeContinuous, \
95         Ordinal2Continuous, \
96         TransformValue_IsDefined, \
97    TableAverager, \
98    Preprocessor, \
99         Preprocessor_addCensorWeight, \
100         Preprocessor_addClassNoise, \
101         Preprocessor_addClassWeight, \
102         Preprocessor_addGaussianClassNoise, \
103         Preprocessor_addGaussianNoise, \
104         Preprocessor_addMissing, \
105         Preprocessor_addMissingClasses, \
106         Preprocessor_addNoise, \
107         Preprocessor_discretize, \
108         Preprocessor_drop, \
109         Preprocessor_dropMissing, \
110         Preprocessor_dropMissingClasses, \
111         Preprocessor_filter, \
112         Preprocessor_ignore, \
113         Preprocessor_imputeByLearner, \
114         Preprocessor_removeDuplicates, \
115         Preprocessor_select, \
116         Preprocessor_shuffle, \
117         Preprocessor_take, \
118         Preprocessor_takeMissing, \
119         Preprocessor_takeMissingClasses, \
120    Imputer, \
121         Imputer_asValue, \
122         Imputer_defaults, \
123         Imputer_model, \
124         Imputer_random, \
125    ImputerConstructor, \
126         ImputerConstructor_asValue, \
127         ImputerConstructor_average, \
128         ImputerConstructor_maximal, \
129         ImputerConstructor_minimal, \
130         ImputerConstructor_model, \
131         ImputerConstructor_random, \
132    FilterList, \
133    Filter, \
134         Filter_conjunction, \
135         Filter_disjunction, \
136         Filter_hasClassValue, \
137         Filter_hasMeta, \
138         Filter_hasSpecial, \
139         Filter_isDefined, \
140         Filter_random, \
141         Filter_sameValue, \
142         Filter_values, \
143    Discretization, \
144         BiModalDiscretization, \
145         EntropyDiscretization, \
146         EquiDistDiscretization, \
147         EquiNDiscretization, \
148    DomainTransformerConstructor, \
149    RemoveRedundant, \
150         RemoveRedundantByInduction, \
151         RemoveRedundantByQuality, \
152         RemoveRedundantOneValue, \
153    RemoveUnusedValues
154
155import math
156
157import orange
158import Orange
159from Orange.misc import _orange__new__, _orange__reduce__
160
class Preprocessor_discretizeEntropy(Preprocessor_discretize):
    """ A discretizer that uses the entropy discretization method but,
    unlike the Preprocessor_discretize class, also removes unused
    attributes from the domain.

    :param method: the discretization applied to continuous attributes;
        must be an instance of
        :class:`Orange.feature.discretization.Entropy`. When omitted, a
        new ``Entropy()`` instance is created for this preprocessor.

    """

    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __init__(self, method=None):
        # Create the default per instance. A default built in the signature
        # is evaluated once and would be shared by every preprocessor.
        if method is None:
            method = Orange.feature.discretization.Entropy()
        assert(isinstance(method, Orange.feature.discretization.Entropy))
        self.method = method

    # NOTE: the second parameter is spelled ``wightId`` (sic). It is not used
    # by this method; the name is kept so existing callers keep working.
    def __call__(self, data, wightId=0):
        """ Return a new table built from `data` with its continuous
        attributes discretized.

        Continuous attributes whose discretization has no cut-off points
        are left out of the new domain; all other attributes, the class
        variable and the meta attributes are carried over.
        """
        newattr_list = []
        for attr in data.domain.attributes:
            if attr.varType != orange.VarTypes.Continuous:
                # Non-continuous attributes are kept unchanged.
                newattr_list.append(attr)
                continue
            newattr = self.method(attr, data)
            # No cut-off points: the attribute is dropped from the domain.
            if newattr.getValueFrom.transformer.points:
                newattr_list.append(newattr)
        newdomain = orange.Domain(newattr_list, data.domain.classVar)
        newdomain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(newdomain, data)
187   
class Preprocessor_removeContinuous(Preprocessor_discretize):
    """ A preprocessor that removes all continuous features: only the
    attributes whose type is discrete are kept.
    """
    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its discrete attributes.

        The class variable and the meta attributes are carried over;
        `weightId` is accepted but not used.
        """
        source = data.domain
        kept = []
        for feature in source.attributes:
            if feature.varType == orange.VarTypes.Discrete:
                kept.append(feature)
        reduced = orange.Domain(kept, source.classVar)
        reduced.addmetas(source.getmetas())
        return orange.ExampleTable(reduced, data)
199               
class Preprocessor_continuize(orange.Preprocessor):
    """ A preprocessor that continuizes a discrete domain (and optionally
    normalizes it). See :obj:`Orange.data.continuization.DomainContinuizer`
    for the list of accepted arguments.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, zeroBased=True, multinomialTreatment=orange.DomainContinuizer.NValues,
                 continuousTreatment=orange.DomainContinuizer.Leave,
                 classTreatment=orange.DomainContinuizer.Ignore,
                 **kwargs):
        # Any additional keyword arguments are accepted but not stored.
        self.zeroBased = zeroBased
        self.multinomialTreatment = multinomialTreatment
        self.continuousTreatment = continuousTreatment
        self.classTreatment = classTreatment

    def __call__(self, data, weightId=0):
        """ Build a continuized domain for `data` with the stored settings
        and return `data` translated into that domain.
        """
        settings = dict(
            zeroBased=self.zeroBased,
            multinomialTreatment=self.multinomialTreatment,
            continuousTreatment=self.continuousTreatment,
            classTreatment=self.classTreatment,
        )
        make_domain = orange.DomainContinuizer(**settings)
        return data.translate(make_domain(data, weightId))
225   
class Preprocessor_removeDiscrete(Preprocessor_continuize):
    """ A preprocessor that removes all discrete attributes from the
    domain: only the attributes whose type is continuous are kept.
    """
    __new__ = _orange__new__(Preprocessor_continuize)

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its continuous attributes.

        The class variable and the meta attributes are carried over;
        `weightId` is accepted but not used.
        """
        old_domain = data.domain
        continuous_only = [var for var in old_domain.attributes
                           if var.varType == orange.VarTypes.Continuous]
        new_domain = orange.Domain(continuous_only, old_domain.classVar)
        new_domain.addmetas(old_domain.getmetas())
        return orange.ExampleTable(new_domain, data)
236         
class Preprocessor_impute(orange.Preprocessor):
    """ A preprocessor that imputes unknown values using a learner.

    :param model: the learner used for imputation; when omitted, a new
        ``orange.MajorityLearner()`` is used.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, model=None, **kwargs):
        # Any additional keyword arguments are accepted but not stored.
        if model is None:
            model = orange.MajorityLearner()
        self.model = model

    def __call__(self, data, weightId=0):
        """ Run ``orange.Preprocessor_imputeByLearner`` on `data` with the
        stored learner and return its result. `weightId` is not used.
        """
        learner = self.model
        return orange.Preprocessor_imputeByLearner(data, learner=learner)
251
def bestN(attrMeasures, N=10):
    """ Return the best N attributes, i.e. the last N items of `attrMeasures`.

    :param attrMeasures: a sliceable sequence with the best items at its end
    :param N: the number of items to return (default 10); if it exceeds
        the length of the sequence, the whole sequence is returned, and
        if it is zero or negative, an empty slice is returned
    """
    if N <= 0:
        # ``attrMeasures[-0:]`` is the whole sequence, not an empty one.
        return attrMeasures[:0]
    return attrMeasures[-N:]
256
def bestP(attrMeasures, P=10):
    """ Return the best P percent of attributes.

    The percentage is rounded up to a whole number of items and at least
    one item is requested; the items are taken from the end of
    `attrMeasures`.
    """
    total = len(attrMeasures)
    share = int(math.ceil(total * P / 100.0))
    keep = max(share, 1)
    return attrMeasures[-keep:]
262
class Preprocessor_featureSelection(orange.Preprocessor):
    """ A preprocessor that runs feature selection using a feature scoring function.

    :param measure: a scoring function; when omitted, a new
        ``Orange.feature.scoring.Relief()`` is created for this preprocessor
    :param filter: a filter function to use for selection (default Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    bestN = staticmethod(bestN)
    bestP = staticmethod(bestP)

    def __init__(self, measure=None, filter=None, limit=10):
        # Create the default scorer per instance. A default built in the
        # signature is evaluated once and shared by every preprocessor.
        if measure is None:
            measure = Orange.feature.scoring.Relief()
        self.measure = measure
        self.filter = filter if filter is not None else self.bestN
        self.limit = limit

    def attrScores(self, data):
        """ Return a list of computed scores for all attributes in `data`.

        The result is a list of ``(score, attribute)`` pairs sorted in
        ascending order, so the highest scores come last.
        """
        return sorted((self.measure(attr, data), attr)
                      for attr in data.domain.attributes)

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to the attributes selected by
        the filter function from the scored attributes.

        The class variable and the meta attributes are carried over;
        `weightId` is accepted but not used.
        """
        measures = self.attrScores(data)
        attrs = [attr for _, attr in self.filter(measures, self.limit)]
        domain = orange.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(domain, data)
294   
class Preprocessor_RFE(Preprocessor_featureSelection):
    """ A preprocessor that runs RFE (Recursive Feature Elimination) using
    linear SVM derived attribute weights.

    :param filter: a filter function to use for selection (default
                   Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor_featureSelection)
    __reduce__ = _orange__reduce__

    def __init__(self, filter=None, limit=10):
        self.limit = limit
        self.filter = filter if filter is not None else self.bestN

    def __call__(self, data, weightId=None):
        """ Run RFE on `data` and return its result.

        The filter function is only used to work out how many features to
        keep: it is applied to the feature indices, and the length of its
        result is passed to RFE. `weightId` is accepted but not used.
        """
        from Orange.classification.svm import RFE
        rfe = RFE()
        # Count features, not examples: the filter decides how many of the
        # attributes survive (e.g. bestP keeps a percentage of them).
        n_features = len(data.domain.attributes)
        filtered = self.filter(range(n_features), self.limit)
        return rfe(data, len(filtered))
315   
def selectNRandom(examples, N=10):
    """ Select N random examples.

    :param examples: a sequence to sample from (without replacement)
    :param N: the number of examples to select (default 10); if fewer
        than N examples are available, all of them are returned in
        random order
    """
    import random
    # random.sample raises ValueError when asked for more items than exist.
    return random.sample(examples, min(N, len(examples)))
321
def selectPRandom(examples, P=10):
    """ Select P percent random examples.

    The percentage is rounded up to a whole number of examples and at
    least one example is selected when any are available. The sample
    size never exceeds the number of examples, so an empty input yields
    an empty list and P above 100 yields all examples in random order.
    """
    import random
    count = len(examples)
    wanted = max(int(math.ceil(count * P / 100.0)), 1)
    # Clamp to the population size: random.sample raises ValueError when
    # asked for more items than exist (empty input, or P > 100).
    return random.sample(examples, min(wanted, count))
328
class Preprocessor_sample(orange.Preprocessor):
    """ A preprocessor that samples a subset of the data.

    :param filter: a filter function to use for selection (default
                   Preprocessor_sample.selectNRandom)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    selectNRandom = staticmethod(selectNRandom)
    selectPRandom = staticmethod(selectPRandom)

    def __init__(self, filter=None, limit=10):
        if filter is None:
            filter = self.selectNRandom
        self.filter = filter
        self.limit = limit

    def __call__(self, data, weightId=None):
        """ Return a table with the domain of `data` that holds the examples
        picked by the filter function. `weightId` is accepted but not used.
        """
        domain = data.domain
        chosen = self.filter(data, self.limit)
        return orange.ExampleTable(domain, chosen)
349   
350
class Preprocessor_preprocessorList(orange.Preprocessor):
    """ A preprocessor wrapping a sequence of other preprocessors.

    :param preprocessors: a list of :obj:`Preprocessor` instances; when
        omitted, a new empty list is used

    """

    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, preprocessors=None):
        # A ``[]`` default in the signature would be a single list shared
        # (and mutated) by every instance created without arguments.
        self.preprocessors = [] if preprocessors is None else preprocessors

    def __call__(self, data, weightId=None):
        """ Apply the wrapped preprocessors to `data` one after another.

        A preprocessor may return either the new data or a
        ``(data, weightId)`` tuple; once a weight id is known it is passed
        on to the preprocessors that follow.

        Returns ``(data, weightId)`` if `weightId` was given by the caller,
        and the data alone otherwise.
        """
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            t = preprocessor(data, weightId) if hasWeight else preprocessor(data)
            if isinstance(t, tuple):
                data, weightId = t
                hasWeight = True
            else:
                data = t
        # The shape of the result follows the original call, even if a
        # preprocessor introduced a weight id along the way.
        if hadWeight:
            return data, weightId
        else:
            return data
378       
Note: See TracBrowser for help on using the repository browser.