source: orange/Orange/preprocess/__init__.py @ 10238:d58af84b3ee5

Revision 10238:d58af84b3ee5, 13.1 KB checked in by anzeh <anze.staric@…>, 2 years ago (diff)

Changed references to feature.continuization to data.continuization.

Line 
1"""
2.. autoclass:: Preprocessor_discretizeEntropy
3
4.. autoclass:: Preprocessor_removeContinuous
5
6.. autoclass:: Preprocessor_continuize
7
8.. autoclass:: Preprocessor_removeDiscrete
9
10.. autoclass:: Preprocessor_impute
11
12.. autoclass:: Preprocessor_featureSelection
13
14.. autofunction:: bestP
15
16.. autofunction:: bestN
17
18.. autofunction:: selectNRandom
19
20.. autofunction:: selectPRandom
21
22.. autoclass:: Preprocessor_RFE
23
24.. autoclass:: Preprocessor_sample
25
26.. autoclass:: Preprocessor_preprocessorList
27
28.. class:: RemoveUnusedValues(variable, data, remove_one_valued=False)
29
30    Removes unused values and reduces the variable, if a variable
31    declares values that do not appear in the data.
32
33    :param variable: :class:`Orange.feature.Descriptor`
34    :param data: :class:`Orange.data.Table`
35    :param remove_one_valued: Decides whether to remove or to retain
36        the attributes with only one value defined (default: False).
37   
38    Example:
39   
40    .. literalinclude:: code/unusedValues.py   
41
42    There are four possible outcomes:
43   
44    1. The variable does not have any used values in the data - value
45    of this variable is undefined for all examples. The variable is
46    thus useless and the class returns None.
47
48    2. The variable has only one used value (or, possibly, only one
49    value at all). Such a variable is in fact useless, and can
50    probably be removed without harm. Nevertheless, its fate is
51    decided by the flag remove_one_valued which is False by default,
52    so such variables are retained unless explicitly specified
53    otherwise.
54
    3. All of the variable's values occur in the data (and the variable has
    more than one value; otherwise the above case applies). The original
    variable is returned.
58
    4. There are some unused values. A new variable is constructed and the
    unused values are omitted. The value of the new variable is computed
    automatically from the value of the original variable;
    :class:`Orange.classification.lookup.ClassifierByLookupTable` is used
    for the mapping.
64   
65    Results of example:
66   
67    .. literalinclude:: code/unusedValues.res
68   
69    Variables a and y are OK and are left alone. In b, value 1 is not used
70    and is removed (not in the original variable, of course; a new variable
71    is created). c is useless and is removed altogether. d is retained since
72    remove_one_valued was left at False; if we set it to True, this variable
73    would be removed as well.
74
75"""
76
77from orange import \
78     DomainContinuizer, \
79    VariableFilterMap, \
80    ValueFilter, \
81         ValueFilter_continuous, \
82         ValueFilter_discrete, \
83         ValueFilter_string, \
84         ValueFilter_stringList, \
85    ValueFilterList, \
86    TransformValue, \
87         Discrete2Continuous, \
88         Discretizer, \
89              BiModalDiscretizer, \
90              EquiDistDiscretizer, \
91              IntervalDiscretizer, \
92              ThresholdDiscretizer, \
93         MapIntValue, \
94         NormalizeContinuous, \
95         Ordinal2Continuous, \
96         TransformValue_IsDefined, \
97    TableAverager, \
98    Preprocessor, \
99         Preprocessor_addCensorWeight, \
100         Preprocessor_addClassNoise, \
101         Preprocessor_addClassWeight, \
102         Preprocessor_addGaussianClassNoise, \
103         Preprocessor_addGaussianNoise, \
104         Preprocessor_addMissing, \
105         Preprocessor_addMissingClasses, \
106         Preprocessor_addNoise, \
107         Preprocessor_discretize, \
108         Preprocessor_drop, \
109         Preprocessor_dropMissing, \
110         Preprocessor_dropMissingClasses, \
111         Preprocessor_filter, \
112         Preprocessor_ignore, \
113         Preprocessor_imputeByLearner, \
114         Preprocessor_removeDuplicates, \
115         Preprocessor_select, \
116         Preprocessor_shuffle, \
117         Preprocessor_take, \
118         Preprocessor_takeMissing, \
119         Preprocessor_takeMissingClasses, \
120    Imputer, \
121         Imputer_asValue, \
122         Imputer_defaults, \
123         Imputer_model, \
124         Imputer_random, \
125    ImputerConstructor, \
126         ImputerConstructor_asValue, \
127         ImputerConstructor_average, \
128         ImputerConstructor_maximal, \
129         ImputerConstructor_minimal, \
130         ImputerConstructor_model, \
131         ImputerConstructor_random, \
132    FilterList, \
133    Filter, \
134         Filter_conjunction, \
135         Filter_disjunction, \
136         Filter_hasClassValue, \
137         Filter_hasMeta, \
138         Filter_hasSpecial, \
139         Filter_isDefined, \
140         Filter_random, \
141         Filter_sameValue, \
142         Filter_values, \
143    Discretization, \
144         BiModalDiscretization, \
145         EntropyDiscretization, \
146         EquiDistDiscretization, \
147         EquiNDiscretization, \
148    DomainTransformerConstructor, \
149    RemoveRedundant, \
150         RemoveRedundantByInduction, \
151         RemoveRedundantByQuality, \
152         RemoveRedundantOneValue, \
153    RemoveUnusedValues
154
155import math
156
157import orange
158from Orange.misc import _orange__new__, _orange__reduce__
159
class Preprocessor_discretizeEntropy(Preprocessor_discretize):
    """ A discretizer that uses the orange.EntropyDiscretization method but,
    unlike the Preprocessor_discretize class, also removes from the domain
    the continuous attributes whose discretization produced no points.

    :param method: an :obj:`orange.EntropyDiscretization` instance; when
        omitted, a new one is created for this preprocessor.

    """

    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __init__(self, method=None):
        # Create the default lazily: a default built in the signature is
        # evaluated once at import time and would be shared (together with
        # any settings changed on it) by every preprocessor instance.
        if method is None:
            method = orange.EntropyDiscretization()
        # Validate explicitly, and before storing; an ``assert`` is skipped
        # when Python runs with -O.
        if not isinstance(method, orange.EntropyDiscretization):
            raise TypeError("method must be an orange.EntropyDiscretization "
                            "instance, got %r" % (method,))
        self.method = method

    def __call__(self, data, wightId=0):
        """ Return a new table in which the continuous attributes of `data`
        are replaced by their discretized versions.

        :param data: the data table to discretize.
        :param wightId: not used by this method. The spelling (sic) is kept
            so that existing keyword callers keep working.

        """
        kept = []
        for attr in data.domain.attributes:
            if attr.varType != orange.VarTypes.Continuous:
                # Non-continuous attributes are passed through untouched.
                kept.append(attr)
                continue
            discretized = self.method(attr, data)
            # Keep the discretized attribute only when its transformer has at
            # least one point; otherwise the attribute is dropped.
            if discretized.getValueFrom.transformer.points:
                kept.append(discretized)
        newdomain = orange.Domain(kept, data.domain.classVar)
        newdomain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(newdomain, data)
186   
class Preprocessor_removeContinuous(Preprocessor_discretize):
    """ A preprocessor that keeps only the discrete features of a table
    (the class variable and the meta attributes are retained).
    """
    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its discrete attributes.

        :param data: the data table to filter.
        :param weightId: accepted for interface compatibility; not used.

        """
        source_domain = data.domain
        discrete_only = []
        for feature in source_domain.attributes:
            if feature.varType == orange.VarTypes.Discrete:
                discrete_only.append(feature)
        reduced = orange.Domain(discrete_only, source_domain.classVar)
        reduced.addmetas(source_domain.getmetas())
        return orange.ExampleTable(reduced, data)
198               
class Preprocessor_continuize(orange.Preprocessor):
    """ A preprocessor that translates a table into the domain built by
    :obj:`Orange.data.continuization.DomainContinuizer`; see that class for
    the meaning of the accepted arguments.

    :param zeroBased: passed to the continuizer (default: True).
    :param multinomialTreatment: passed to the continuizer
        (default: DomainContinuizer.NValues).
    :param continuousTreatment: passed to the continuizer
        (default: DomainContinuizer.Leave).
    :param classTreatment: passed to the continuizer
        (default: DomainContinuizer.Ignore).

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, zeroBased=True, multinomialTreatment=orange.DomainContinuizer.NValues,
                 continuousTreatment=orange.DomainContinuizer.Leave,
                 classTreatment=orange.DomainContinuizer.Ignore,
                 **kwargs):
        # Any additional keyword arguments are accepted but not stored.
        self.zeroBased = zeroBased
        self.multinomialTreatment = multinomialTreatment
        self.continuousTreatment = continuousTreatment
        self.classTreatment = classTreatment

    def __call__(self, data, weightId=0):
        """ Return `data` translated into the continuized domain.

        :param data: the data table to continuize.
        :param weightId: forwarded to the domain continuizer.

        """
        settings = dict(
            zeroBased=self.zeroBased,
            multinomialTreatment=self.multinomialTreatment,
            continuousTreatment=self.continuousTreatment,
            classTreatment=self.classTreatment,
        )
        build_domain = orange.DomainContinuizer(**settings)
        return data.translate(build_domain(data, weightId))
224   
class Preprocessor_removeDiscrete(Preprocessor_continuize):
    """ A preprocessor that keeps only the continuous attributes of a table
    (the class variable and the meta attributes are retained).
    """
    __new__ = _orange__new__(Preprocessor_continuize)

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its continuous attributes.

        :param data: the data table to filter.
        :param weightId: accepted for interface compatibility; not used.

        """
        def is_continuous(feature):
            return feature.varType == orange.VarTypes.Continuous

        numeric = list(filter(is_continuous, data.domain.attributes))
        reduced = orange.Domain(numeric, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return orange.ExampleTable(reduced, data)
235         
class Preprocessor_impute(orange.Preprocessor):
    """ A preprocessor that hands the data to
    orange.Preprocessor_imputeByLearner together with a learner.

    :param model: the learner to use (default: a new
        orange.MajorityLearner).

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, model=None, **kwargs):
        # Any additional keyword arguments are accepted but not stored.
        if model is None:
            model = orange.MajorityLearner()
        self.model = model

    def __call__(self, data, weightId=0):
        """ Return the result of imputing `data` with the stored learner.

        :param data: the data table to impute.
        :param weightId: accepted for interface compatibility; not used.

        """
        learner = self.model
        return orange.Preprocessor_imputeByLearner(data, learner=learner)
250
def bestN(attrMeasures, N=10):
    """ Return the best N attributes.

    The best items are taken from the end of `attrMeasures`, so the
    sequence is expected to be sorted in ascending order of score.

    :param attrMeasures: a sliceable sequence of scored attributes.
    :param N: the number of items to return (default 10); if it is zero or
        negative, an empty sequence is returned.

    """
    if N <= 0:
        # ``seq[-0:]`` is ``seq[0:]`` and would return everything, while a
        # negative N would drop items from the front instead.
        return attrMeasures[:0]
    return attrMeasures[-N:]
255
def bestP(attrMeasures, P=10):
    """ Return the best P percent of attributes.

    The share is rounded up and at least one item is requested; the items
    are taken from the end of `attrMeasures`.

    :param attrMeasures: a sliceable sequence of scored attributes.
    :param P: the percentage of items to return (default 10).

    """
    total = len(attrMeasures)
    keep = int(math.ceil(total * P / 100.0))
    if keep < 1:
        keep = 1
    return attrMeasures[-keep:]
261
class Preprocessor_featureSelection(orange.Preprocessor):
    """ A preprocessor that runs feature selection using a feature scoring
    function.

    :param measure: a scoring function (default: a new
        orange.MeasureAttribute_relief instance)
    :param filter: a filter function to use for selection (default
        Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    bestN = staticmethod(bestN)
    bestP = staticmethod(bestP)

    def __init__(self, measure=None, filter=None, limit=10):
        # Create the default measure lazily: a default built in the signature
        # is evaluated once at import time and would be shared (together with
        # any settings changed on it) by every preprocessor instance.
        if measure is None:
            measure = orange.MeasureAttribute_relief()
        self.measure = measure
        self.filter = filter if filter is not None else self.bestN
        self.limit = limit

    def attrScores(self, data):
        """ Return a list of ``(score, attribute)`` pairs for all attributes
        in `data`, sorted by ascending score.
        """
        scored = [(self.measure(attr, data), attr)
                  for attr in data.domain.attributes]
        # Sort on the score alone: equally scored attributes keep their
        # domain order and the descriptors themselves are never compared.
        scored.sort(key=lambda pair: pair[0])
        return scored

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to the attributes chosen by
        the filter function (class variable and metas are retained).

        :param data: the data table to reduce.
        :param weightId: accepted for interface compatibility; not used.

        """
        measures = self.attrScores(data)
        attrs = [attr for _, attr in self.filter(measures, self.limit)]
        domain = orange.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(domain, data)
293   
class Preprocessor_RFE(Preprocessor_featureSelection):
    """ A preprocessor that runs RFE (Recursive Feature Elimination) using
    linear SVM derived attribute weights.

    :param filter: a filter function to use for selection (default
                   Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor_featureSelection)
    __reduce__ = _orange__reduce__

    def __init__(self, filter=None, limit=10):
        self.limit = limit
        self.filter = filter if filter is not None else self.bestN

    def __call__(self, data, weightId=None):
        """ Return the result of running RFE on `data`.

        :param data: the data table to reduce.
        :param weightId: accepted for interface compatibility; not used.

        """
        from Orange.classification.svm import RFE
        rfe = RFE()
        # The filter is only used to work out how many features to keep, so
        # it must be applied to the number of attributes; the number of
        # examples (``len(data)``) is unrelated to the feature count.
        n_features = len(data.domain.attributes)
        filtered = self.filter(range(n_features), self.limit)
        return rfe(data, len(filtered))
314   
def selectNRandom(examples, N=10):
    """ Select N random examples.

    :param examples: a sequence to sample from (without replacement).
    :param N: the number of examples to select (default 10); when fewer
        than N examples are available, all of them are returned in random
        order.

    """
    import random
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the population size.
    return random.sample(examples, min(N, len(examples)))
320
def selectPRandom(examples, P=10):
    """ Select P percent random examples.

    The share is rounded up and at least one example is selected whenever
    `examples` is not empty; the selection never exceeds the number of
    available examples.

    :param examples: a sequence to sample from (without replacement).
    :param P: the percentage of examples to select (default 10).

    """
    import random
    count = len(examples)
    wanted = max(int(math.ceil(count * P / 100.0)), 1)
    # random.sample raises ValueError when asked for more items than the
    # population holds (empty input, or P above 100), so cap the request.
    return random.sample(examples, min(wanted, count))
327
class Preprocessor_sample(orange.Preprocessor):
    """ A preprocessor that builds a new table from a sampled subset of the
    data.

    :param filter: the function that picks the examples; it is called with
                   the data and the limit (default
                   Preprocessor_sample.selectNRandom)
    :param limit: the limit passed to the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    selectNRandom = staticmethod(selectNRandom)
    selectPRandom = staticmethod(selectPRandom)

    def __init__(self, filter=None, limit=10):
        if filter is None:
            filter = self.selectNRandom
        self.filter = filter
        self.limit = limit

    def __call__(self, data, weightId=None):
        """ Return a table over the domain of `data` that holds the examples
        picked by the filter function.

        :param data: the data table to sample from.
        :param weightId: accepted for interface compatibility; not used.

        """
        chosen = self.filter(data, self.limit)
        return orange.ExampleTable(data.domain, chosen)
348   
349
class Preprocessor_preprocessorList(orange.Preprocessor):
    """ A preprocessor wrapping a sequence of other preprocessors.

    :param preprocessors: a list of :obj:`Preprocessor` instances; when
        omitted, a new empty list is created for this instance.

    """

    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, preprocessors=None):
        # A ``[]`` default in the signature is created once and would be
        # shared by every instance constructed without arguments. A list
        # supplied by the caller is stored as is (not copied).
        self.preprocessors = [] if preprocessors is None else preprocessors

    def __call__(self, data, weightId=None):
        """ Apply the wrapped preprocessors to `data` in order.

        :param data: the data table to preprocess.
        :param weightId: optional weight id. While no weight id is known,
            the preprocessors are called with the data only.

        Returns ``(data, weightId)`` when `weightId` was given by the caller
        and only the resulting data otherwise.

        """
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            t = preprocessor(data, weightId) if hasWeight else preprocessor(data)
            if isinstance(t, tuple):
                # The preprocessor returned a (data, weight id) pair; pass
                # that weight id on to the preprocessors that follow.
                data, weightId = t
                hasWeight = True
            else:
                data = t
        if hadWeight:
            return data, weightId
        else:
            return data
377       
Note: See TracBrowser for help on using the repository browser.