source: orange/Orange/data/preprocess/__init__.py @ 10542:7dde0640e266

Revision 10542:7dde0640e266, 11.6 KB checked in by anzeh <anze.staric@…>, 2 years ago (diff)

Moved preprocess from Orange to Orange.data.

Line 
1from Orange import core
2
# Public aliases for the preprocessing classes implemented in ``Orange.core``.

DomainContinuizer = core.DomainContinuizer
VariableFilterMap = core.VariableFilterMap

# Value filters
ValueFilter = core.ValueFilter
ValueFilter_continuous = core.ValueFilter_continuous
ValueFilter_discrete = core.ValueFilter_discrete
ValueFilter_string = core.ValueFilter_string
ValueFilter_stringList = core.ValueFilter_stringList
ValueFilterList = core.ValueFilterList

# Value transformers
TransformValue = core.TransformValue
Discrete2Continuous = core.Discrete2Continuous
Discretizer = core.Discretizer
BiModalDiscretizer = core.BiModalDiscretizer
EquiDistDiscretizer = core.EquiDistDiscretizer
IntervalDiscretizer = core.IntervalDiscretizer
ThresholdDiscretizer = core.ThresholdDiscretizer
MapIntValue = core.MapIntValue
NormalizeContinuous = core.NormalizeContinuous
Ordinal2Continuous = core.Ordinal2Continuous
TransformValue_IsDefined = core.TransformValue_IsDefined

TableAverager = core.TableAverager

# Preprocessors (exported without the ``Preprocessor_`` prefix)
Preprocessor = core.Preprocessor
AddCensorWeight = core.Preprocessor_addCensorWeight
AddClassNoise = core.Preprocessor_addClassNoise
AddClassWeight = core.Preprocessor_addClassWeight
AddGaussianClassNoise = core.Preprocessor_addGaussianClassNoise
AddGaussianNoise = core.Preprocessor_addGaussianNoise
AddMissing = core.Preprocessor_addMissing
AddMissingClasses = core.Preprocessor_addMissingClasses
AddNoise = core.Preprocessor_addNoise

Discretize = core.Preprocessor_discretize
Drop = core.Preprocessor_drop
DropMissing = core.Preprocessor_dropMissing
DropMissingClasses = core.Preprocessor_dropMissingClasses
# NOTE: this preprocessor used to be bound to ``Filter`` here, but that name
# is rebound to ``core.Filter`` further down, which made the first binding
# dead and the preprocessor unreachable from this module.  ``Filter`` keeps
# its effective value (``core.Filter``); the preprocessor is exposed under
# its unambiguous core name instead.
Preprocessor_filter = core.Preprocessor_filter
Ignore = core.Preprocessor_ignore
ImputeByLearner = core.Preprocessor_imputeByLearner
RemoveDuplicates = core.Preprocessor_removeDuplicates
Select = core.Preprocessor_select
Shuffle = core.Preprocessor_shuffle
Take = core.Preprocessor_take
TakeMissing = core.Preprocessor_takeMissing
TakeMissingClasses = core.Preprocessor_takeMissingClasses

# Imputers
Imputer = core.Imputer
Imputer_asValue = core.Imputer_asValue
Imputer_defaults = core.Imputer_defaults
Imputer_model = core.Imputer_model
Imputer_random = core.Imputer_random

# Imputer constructors
ImputerConstructor = core.ImputerConstructor
ImputerConstructor_asValue = core.ImputerConstructor_asValue
ImputerConstructor_average = core.ImputerConstructor_average
ImputerConstructor_maximal = core.ImputerConstructor_maximal
ImputerConstructor_minimal = core.ImputerConstructor_minimal
ImputerConstructor_model = core.ImputerConstructor_model
ImputerConstructor_random = core.ImputerConstructor_random

# Example filters
FilterList = core.FilterList
Filter = core.Filter
Filter_conjunction = core.Filter_conjunction
Filter_disjunction = core.Filter_disjunction
Filter_hasClassValue = core.Filter_hasClassValue
Filter_hasMeta = core.Filter_hasMeta
Filter_hasSpecial = core.Filter_hasSpecial
Filter_isDefined = core.Filter_isDefined
Filter_random = core.Filter_random
Filter_sameValue = core.Filter_sameValue
Filter_values = core.Filter_values

# Discretization methods
Discretization = core.Discretization
BiModalDiscretization = core.BiModalDiscretization
EntropyDiscretization = core.EntropyDiscretization
EquiDistDiscretization = core.EquiDistDiscretization
EquiNDiscretization = core.EquiNDiscretization

DomainTransformerConstructor = core.DomainTransformerConstructor

# Redundancy removal
RemoveRedundant = core.RemoveRedundant
RemoveRedundantByInduction = core.RemoveRedundantByInduction
RemoveRedundantByQuality = core.RemoveRedundantByQuality
RemoveRedundantOneValue = core.RemoveRedundantOneValue
RemoveUnusedValues = core.RemoveUnusedValues
90
91import math
92
93import Orange
94import Orange.classification.majority
95import Orange.data
96import Orange.feature
97import Orange.feature.discretization
98import Orange.feature.scoring
99from Orange.misc import _orange__new__, _orange__reduce__
100
class DiscretizeEntropy(Discretize):
    """ A discretizer that uses the entropy discretization method and,
    unlike the plain :obj:`Discretize` preprocessor, also removes from the
    domain the continuous features for which no cut-off points were found.

    :param method: an :obj:`Orange.feature.discretization.Entropy` instance
        (default: a fresh ``Entropy()`` created for every preprocessor)

    """

    __new__ = _orange__new__(Discretize)
    __reduce__ = _orange__reduce__

    def __init__(self, method=None):
        # Build the default lazily: a default evaluated in the signature is
        # created once at import time and shared by every instance.
        if method is None:
            method = Orange.feature.discretization.Entropy()
        elif not isinstance(method, Orange.feature.discretization.Entropy):
            # Explicit raise instead of ``assert`` so the check is not
            # stripped when Python runs with -O.
            raise TypeError("method must be an instance of "
                            "Orange.feature.discretization.Entropy")
        self.method = method

    def __call__(self, data, weightId=0):
        """ Return a new table with the continuous features of `data`
        discretized by ``self.method``; discrete features are kept as is.
        """
        newattr_list = []
        for attr in data.domain.attributes:
            if attr.varType == Orange.feature.Type.Continuous:
                newattr = self.method(attr, data)
                # Keep the discretized feature only if the discretization
                # produced at least one cut-off point.
                if newattr.getValueFrom.transformer.points:
                    newattr_list.append(newattr)
            else:
                newattr_list.append(attr)
        newdomain = Orange.data.Domain(newattr_list, data.domain.classVar)
        newdomain.addmetas(data.domain.getmetas())
        return Orange.data.Table(newdomain, data)
126   
class RemoveContinuous(Discretize):
    """ A preprocessor that drops every feature that is not discrete,
    keeping the class variable and the meta attributes.
    """
    __new__ = _orange__new__(Discretize)
    __reduce__ = _orange__reduce__

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its discrete features.
        """
        kept = []
        for feature in data.domain.attributes:
            if feature.varType == Orange.feature.Type.Discrete:
                kept.append(feature)
        reduced = Orange.data.Domain(kept, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return Orange.data.Table(reduced, data)
138               
class Continuize(Preprocessor):
    """ A preprocessor that continuizes a discrete domain (and optionally
    normalizes it). The constructor arguments are forwarded to
    :obj:`Orange.data.continuization.DomainContinuizer`; see that class
    for the accepted values.

    """
    __new__ = _orange__new__(Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, zeroBased=True,
                 multinomialTreatment=DomainContinuizer.NValues,
                 continuousTreatment=DomainContinuizer.Leave,
                 classTreatment=DomainContinuizer.Ignore,
                 **kwargs):
        # Extra keyword arguments are accepted but not used here.
        self.zeroBased = zeroBased
        self.multinomialTreatment = multinomialTreatment
        self.continuousTreatment = continuousTreatment
        self.classTreatment = classTreatment

    def __call__(self, data, weightId=0):
        """ Build a continuized domain for `data` and return `data`
        translated into it.
        """
        settings = dict(zeroBased=self.zeroBased,
                        multinomialTreatment=self.multinomialTreatment,
                        continuousTreatment=self.continuousTreatment,
                        classTreatment=self.classTreatment)
        continuizer = DomainContinuizer(**settings)
        return data.translate(continuizer(data, weightId))
164   
class RemoveDiscrete(Continuize):
    """ A preprocessor that drops every feature that is not continuous,
    keeping the class variable and the meta attributes.
    """
    __new__ = _orange__new__(Continuize)

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its continuous features.
        """
        kept = []
        for feature in data.domain.attributes:
            if feature.varType == Orange.feature.Type.Continuous:
                kept.append(feature)
        reduced = Orange.data.Domain(kept, data.domain.classVar)
        reduced.addmetas(data.domain.getmetas())
        return Orange.data.Table(reduced, data)
175         
class Impute(Preprocessor):
    """ A preprocessor that imputes unknown values using a learner.

    :param model: the learner passed to :obj:`ImputeByLearner`
        (default: ``Orange.classification.majority.MajorityLearner()``)

    """
    __new__ = _orange__new__(Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, model=None, **kwargs):
        # Extra keyword arguments are accepted but not used here.
        if model is None:
            model = Orange.classification.majority.MajorityLearner()
        self.model = model

    def __call__(self, data, weightId=0):
        """ Run :obj:`ImputeByLearner` on `data` with the stored learner.
        """
        learner = self.model
        return ImputeByLearner(data, learner=learner)
190
191def bestN(attrMeasures, N=10):
192    """ Return best N attributes
193    """
194    return attrMeasures[-N:]
195_bestN = bestN
196
197def bestP(attrMeasures, P=10):
198    """ Return best P percent of attributes
199    """
200    count = len(attrMeasures)
201    return  attrMeasures[-max(int(math.ceil(count * P / 100.0)), 1):]
202_bestP = bestP
203
class FeatureSelection(Preprocessor):
    """ A preprocessor that runs feature selection using a feature scoring
    function.

    :param measure: a scoring function, called as ``measure(attr, data)``
        (default: a fresh ``Orange.feature.scoring.Relief()``)
    :param filter: a filter function, called as ``filter(scores, limit)``
        with the ascending list of ``(score, attr)`` pairs
        (default: :obj:`FeatureSelection.bestN`)
    :param limit: the limit passed to the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor)
    __reduce__ = _orange__reduce__

    bestN = staticmethod(_bestN)
    bestP = staticmethod(_bestP)

    def __init__(self, measure=None, filter=None, limit=10):
        # Build the default scorer lazily: a default evaluated in the
        # signature is created once at import time and shared by every
        # instance.
        if measure is None:
            measure = Orange.feature.scoring.Relief()
        self.measure = measure
        self.filter = filter if filter is not None else self.bestN
        self.limit = limit

    def attrScores(self, data):
        """ Return a list of ``(score, attr)`` pairs for all attributes in
        `data`, sorted by ascending score.
        """
        scored = [(self.measure(attr, data), attr)
                  for attr in data.domain.attributes]
        # Sort on the score only: comparing whole tuples would fall back to
        # comparing the feature objects whenever two scores are equal.
        scored.sort(key=lambda pair: pair[0])
        return scored

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to the features selected by
        ``self.filter``; the class variable and metas are kept.
        """
        measures = self.attrScores(data)
        attrs = [attr for _, attr in self.filter(measures, self.limit)]
        domain = Orange.data.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        return Orange.data.Table(domain, data)
235   
class RFE(FeatureSelection):
    """ A preprocessor that runs RFE (Recursive Feature Elimination) using
    linear SVM derived attribute weights.

    :param filter: a filter function used to decide how many features to
        keep (default: :obj:`FeatureSelection.bestN`)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(FeatureSelection)
    __reduce__ = _orange__reduce__

    def __init__(self, filter=None, limit=10):
        super(RFE, self).__init__(filter=filter, limit=limit)

    def __call__(self, data, weightId=None):
        """ Return the result of ``Orange.classification.svm.RFE`` run on
        `data`, asked to keep as many features as the filter function
        would select.
        """
        # Imported lazily and under an alias so it does not shadow this class.
        from Orange.classification.svm import RFE as SvmRFE
        rfe = SvmRFE()
        # The filter is only used to compute *how many* features to keep,
        # so it must be fed one placeholder per feature (not per example).
        n_features = len(data.domain.attributes)
        filtered = self.filter(range(n_features), self.limit)
        return rfe(data, len(filtered))
256   
257def selectNRandom(examples, N=10):
258    """ Select N random examples.
259    """
260    import random
261    return random.sample(examples, N)
262_selectNRandom = selectNRandom
263
264def selectPRandom(examples, P=10):
265    """ Select P percent random examples.
266    """
267    import random
268    count = len(examples)
269    return random.sample(examples, max(int(math.ceil(count * P / 100.0)), 1))
270_selectPRandom = selectPRandom
271
class Sample(Preprocessor):
    """ A preprocessor that returns a sampled subset of the data.

    :param filter: a function called as ``filter(data, limit)`` that
        returns the chosen examples (default: :obj:`Sample.selectNRandom`)
    :param limit: the limit passed to the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor)
    __reduce__ = _orange__reduce__

    selectNRandom = staticmethod(_selectNRandom)
    selectPRandom = staticmethod(_selectPRandom)

    def __init__(self, filter=None, limit=10):
        if filter is None:
            filter = self.selectNRandom
        self.filter = filter
        self.limit = limit

    def __call__(self, data, weightId=None):
        """ Return a new table, in the domain of `data`, holding the
        examples chosen by ``self.filter``.
        """
        chosen = self.filter(data, self.limit)
        return Orange.data.Table(data.domain, chosen)
292   
293
class PreprocessorList(Preprocessor):
    """ A preprocessor that applies a sequence of other preprocessors, one
    after another.

    :param preprocessors: a sequence of :obj:`Preprocessor` instances

    """

    __new__ = _orange__new__(Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, preprocessors=()):
        self.preprocessors = preprocessors

    def __call__(self, data, weightId=None):
        """ Feed `data` through every wrapped preprocessor in order.

        Returns ``(data, weightId)`` if a `weightId` was given by the
        caller, otherwise only the resulting data.
        """
        weight_requested = weightId is not None
        # Becomes true as soon as a weight id is known, either from the
        # caller or because a preprocessor returned one.
        pass_weight = weight_requested
        for step in self.preprocessors:
            if pass_weight:
                result = step(data, weightId)
            else:
                result = step(data)
            if isinstance(result, tuple):
                data, weightId = result
                pass_weight = True
            else:
                data = result
        if not weight_requested:
            return data
        return data, weightId
320       
Note: See TracBrowser for help on using the repository browser.