source: orange/Orange/tuning/__init__.py @ 11459:fc07a5c346be

Revision 11459:fc07a5c346be, 20.6 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago

Fixed checks for passed dataset table argument in new methods.

Use 'instances is not None' idiom and not a boolean test to guard against cases
where the passed dataset length is 0.
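For illustration (a minimal sketch, not code from this module; `data` and `weight_id` are stand-in names): an empty table is falsy, so a plain truth test would wrongly treat a zero-length table the same as no table at all:

    if data:                  # skips an empty (zero-length) table as well
        return self.__call__(data, weight_id)

    if data is not None:      # only skips when no table was passed
        return self.__call__(data, weight_id)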

import Orange.core
import Orange.classification
import Orange.evaluation.scoring
import Orange.evaluation.testing
import Orange.misc

from Orange.utils import deprecated_class_attribute, deprecated_keywords, \
                         deprecated_members

class TuneParameters(Orange.classification.Learner):

    """.. attribute:: data

        Data table with either discrete or continuous features

    .. attribute:: weight_id

        The id of the weight meta attribute

    .. attribute:: learner

        The learning algorithm whose parameters are to be tuned. This can be,
        for instance, :obj:`Orange.classification.tree.TreeLearner`.

    .. attribute:: evaluate

        The statistics to evaluate. The default is
        :obj:`Orange.evaluation.scoring.CA`, so the learner will be fit for the
        optimal classification accuracy. You can replace it with, for instance,
        :obj:`Orange.evaluation.scoring.AUC` to optimize the AUC. Statistics
        can return either a single value (classification accuracy), a list with
        a single value (this is what :obj:`Orange.evaluation.scoring.CA`
        actually does), or arbitrary objects which the compare function below
        must be able to compare.

    .. attribute:: folds

        The number of folds used in internal cross-validation. Default is 5.

    .. attribute:: compare

        The function used to compare the results. The function should accept
        two arguments (e.g. two classification accuracies, AUCs or whatever the
        result of ``evaluate`` is) and return a positive value if the first
        argument is better, 0 if they are equal and a negative value if the
        first is worse than the second. The default compare function is
        ``cmp``. You don't need to change this if evaluate is such that higher
        values mean a better classifier.
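
        For instance, a minimal sketch of a compare function for scores where
        *lower* values are better (the function name is ours, not part of
        Orange)::

            def lower_is_better(a, b):
                # positive when a is better (i.e. smaller), negative when worse
                return -cmp(a, b)

            tuner.compare = lower_is_better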

    .. attribute:: return_what

        Decides what should be the result of tuning. Possible values are:

        * ``TuneParameters.RETURN_NONE`` (or 0): tuning will return nothing,
        * ``TuneParameters.RETURN_PARAMETERS`` (or 1): return the optimal value(s) of parameter(s),
        * ``TuneParameters.RETURN_LEARNER`` (or 2): return the learner set to optimal parameters,
        * ``TuneParameters.RETURN_CLASSIFIER`` (or 3): return a classifier trained with the optimal parameters on the entire data set. This is the default setting.

        Regardless of this, the learner (given as parameter ``learner``) is
        left set to the optimal parameters.
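
        For illustration, a sketch of how the setting changes what the call
        returns (``tuner`` stands for an instance of one of the subclasses
        defined below, e.g. :obj:`Tune1Parameter`, and ``data`` for a data
        table)::

            tuner.return_what = TuneParameters.RETURN_PARAMETERS
            best_value = tuner(data)      # just the optimal parameter value

            tuner.return_what = TuneParameters.RETURN_CLASSIFIER
            classifier = tuner(data)      # classifier fitted on the whole data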

    .. attribute:: verbose

        If 0 (default), the class doesn't print anything. If set to 1, it will
        print out the optimal value found; if set to 2, it will print out all
        tried values and the related scores.

    If the tuner returns a classifier, it behaves as a learning algorithm. As
    the examples below will demonstrate, it can be called with data, and the
    result is a "trained" classifier. It can, for instance, be used in
    cross-validation.

    Out of these attributes, the only necessary argument is ``learner``. The
    real tuning classes (subclasses of this class) add two more attributes:
    the name(s) of the parameter(s) to optimize and the values to try.

    """

    RETURN_NONE = 0
    RETURN_PARAMETERS = 1
    RETURN_LEARNER = 2
    RETURN_CLASSIFIER = 3

    returnNone = \
        deprecated_class_attribute("returnNone", "RETURN_NONE")
    returnParameters = \
        deprecated_class_attribute("returnParameters", "RETURN_PARAMETERS")
    returnLearner = \
        deprecated_class_attribute("returnLearner", "RETURN_LEARNER")
    returnClassifier = \
        deprecated_class_attribute("returnClassifier", "RETURN_CLASSIFIER")

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **argkw):
        self = Orange.classification.Learner.__new__(cls, **argkw)
        if data is not None:
            for name, value in argkw.items():
                setattr(self, name, value)
            self.__init__(**argkw)
            return self.__call__(data, weight_id)
        else:
            return self

    def findobj(self, name):
        # Resolve a (possibly dotted) parameter name into the object that
        # holds it and the name of the final attribute to set.
        names = name.split(".")
        lastobj = self.learner
        for i in names[:-1]:
            lastobj = getattr(lastobj, i)
        return lastobj, names[-1]

TuneParameters = deprecated_members(
    {"returnWhat": "return_what",
     "object": "learner"},
    )(TuneParameters)


class Tune1Parameter(TuneParameters):

    """Class :obj:`Orange.optimization.Tune1Parameter` tunes a single parameter.

    .. attribute:: parameter

        The name of the parameter (or a list of names, if the same parameter is
        stored at multiple places - see the examples) to be tuned.

    .. attribute:: values

        A list of the parameter's values to be tried.

    To show how it works, we shall tune the minimal number of examples in a
    leaf for a tree classifier.

    part of :download:`optimization-tuning1.py <code/optimization-tuning1.py>`

    .. literalinclude:: code/optimization-tuning1.py
        :lines: 3-11

    Set up like this, the tuner, when called, will set ``learner.min_subset``
    to 1, 2, 3, 4, 5, 10, 15 and 20, and measure the AUC in 5-fold cross
    validation. It will then reset ``learner.min_subset`` to the optimal value
    found and, since we left ``return_what`` at the default
    (``RETURN_CLASSIFIER``), construct and return the classifier from the
    entire data set. So, what we get is a classifier, but if we'd also like
    to know what the optimal value was, we can get it from
    ``learner.min_subset``.
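
    A sketch along these lines (not the downloadable script itself; it assumes
    the bundled ``voting`` dataset)::

        import Orange

        voting = Orange.data.Table("voting")
        learner = Orange.classification.tree.TreeLearner()
        tuner = Orange.optimization.Tune1Parameter(learner=learner,
                    parameter="min_subset",
                    values=[1, 2, 3, 4, 5, 10, 15, 20],
                    evaluate=Orange.evaluation.scoring.AUC)
        classifier = tuner(voting)
        print learner.min_subset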

    Tuning is of course not limited to setting numeric parameters. You can, for
    instance, try to find the optimal criteria for assessing the quality of
    attributes by tuning ``parameter="measure"``, trying settings like
    ``values=[Orange.feature.scoring.GainRatio(), Orange.feature.scoring.Gini()]``.
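
    Put together, such a tuner might be set up as follows (a sketch only;
    ``learner`` and ``voting`` are the objects from the sketch above)::

        tuner = Orange.optimization.Tune1Parameter(learner=learner,
                    parameter="measure",
                    values=[Orange.feature.scoring.GainRatio(),
                            Orange.feature.scoring.Gini()])
        classifier = tuner(voting)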

    Since the tuner returns a classifier and thus behaves like a learner, it
    can be used in cross-validation. Let us see whether tuning the tree indeed
    enhances the AUC or not. We shall reuse the tuner from above, add another
    tree learner, and test them both.

    part of :download:`optimization-tuning1.py <code/optimization-tuning1.py>`

    .. literalinclude:: code/optimization-tuning1.py
        :lines: 13-18
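
    In outline, the comparison could look like this (a sketch, not the
    downloadable script; ``tuner`` and ``voting`` are the objects from the
    sketch above)::

        untuned = Orange.classification.tree.TreeLearner()
        results = Orange.evaluation.testing.cross_validation([untuned, tuner],
                                                             voting)
        aucs = Orange.evaluation.scoring.AUC(results)
        print "Untuned tree: %5.3f" % aucs[0]
        print "Tuned tree: %5.3f" % aucs[1]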

    This can be time-consuming: for each of the 8 values of ``min_subset`` it
    will perform 5-fold cross validation inside a 10-fold cross validation -
    altogether 400 trees. Plus, it will learn the optimal tree afterwards for
    each fold. Adding the tree without tuning, that makes 420 trees built in
    total.

    Nevertheless, the results are good::

        Untuned tree: 0.930
        Tuned tree: 0.986

    """

    def __call__(self, data, weight=None, verbose=0):
        verbose = verbose or getattr(self, "verbose", 0)
        evaluate = getattr(self, "evaluate", Orange.evaluation.scoring.CA)
        folds = getattr(self, "folds", 5)
        compare = getattr(self, "compare", cmp)
        return_what = getattr(self, "return_what",
                             Tune1Parameter.RETURN_CLASSIFIER)

        # Resolve the parameter name(s) into (object, attribute name) pairs.
        if (type(self.parameter) == list) or (type(self.parameter) == tuple):
            to_set = [self.findobj(ld) for ld in self.parameter]
        else:
            to_set = [self.findobj(self.parameter)]

        # Evaluate every candidate value with internal cross-validation and
        # keep track of the best one.
        cvind = Orange.core.MakeRandomIndicesCV(data, folds)
        findBest = Orange.utils.selection.BestOnTheFly(seed=data.checksum(),
                                         call_compare_on_1st=True)
        tableAndWeight = weight and (data, weight) or data
        for par in self.values:
            for i in to_set:
                setattr(i[0], i[1], par)
            res = evaluate(Orange.evaluation.testing.test_with_indices(
                                        [self.learner], tableAndWeight, cvind))
            findBest.candidate((res, par))
            if verbose == 2:
                print '*** optimization %s: %s' % (par, ", ".join("%.8f" % r for r in res))

        # Leave the learner set to the winning value.
        bestpar = findBest.winner()[1]
        for i in to_set:
            setattr(i[0], i[1], bestpar)

        if verbose:
            print "*** Optimal parameter: %s = %s" % (self.parameter, bestpar)

        if return_what == Tune1Parameter.RETURN_NONE:
            return None
        elif return_what == Tune1Parameter.RETURN_PARAMETERS:
            return bestpar
        elif return_what == Tune1Parameter.RETURN_LEARNER:
            return self.learner
        else:
            classifier = self.learner(data)
            if not Orange.utils.environ.orange_no_deprecated_members:
                classifier.setattr("fittedParameter", bestpar)
            classifier.setattr("fitted_parameter", bestpar)
            return classifier

class TuneMParameters(TuneParameters):

    """The use of :obj:`Orange.optimization.TuneMParameters` differs from
    :obj:`Orange.optimization.Tune1Parameter` only in the specification of
    tuning parameters.

    .. attribute:: parameters

        A list of two-element tuples, each containing the name of a parameter
        and its possible values.

    For example, we can try to tune both the minimal number of instances in
    leaves and the splitting criteria by setting the tuner as follows:

    :download:`optimization-tuningm.py <code/optimization-tuningm.py>`

    .. literalinclude:: code/optimization-tuningm.py

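    A sketch of such a setup (not the downloadable script itself; the
    candidate values are illustrative)::

        learner = Orange.classification.tree.TreeLearner()
        tuner = Orange.optimization.TuneMParameters(learner=learner,
                    parameters=[("min_subset", [2, 5, 10, 20]),
                                ("measure", [Orange.feature.scoring.GainRatio(),
                                             Orange.feature.scoring.Gini()])],
                    evaluate=Orange.evaluation.scoring.AUC)
        classifier = tuner(Orange.data.Table("voting"))
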
    """

    def __call__(self, data, weight=None, verbose=0):
        evaluate = getattr(self, "evaluate", Orange.evaluation.scoring.CA)
        folds = getattr(self, "folds", 5)
        compare = getattr(self, "compare", cmp)
        verbose = verbose or getattr(self, "verbose", 0)
        return_what = getattr(self, "return_what", Tune1Parameter.RETURN_CLASSIFIER)
        progress_callback = getattr(self, "progress_callback", lambda i: None)

        # Resolve every tuned parameter into a list of (object, attribute
        # name) pairs; a single parameter may be stored at several places.
        to_set = []
        parnames = []
        for par in self.parameters:
            if (type(par[0]) == list) or (type(par[0]) == tuple):
                to_set.append([self.findobj(ld) for ld in par[0]])
                parnames.append(par[0])
            else:
                to_set.append([self.findobj(par[0])])
                parnames.append([par[0]])

        cvind = Orange.core.MakeRandomIndicesCV(data, folds)
        findBest = Orange.utils.selection.BestOnTheFly(seed=data.checksum(),
                                         call_compare_on_1st=True)
        tableAndWeight = weight and (data, weight) or data
        # The grid has one test per combination of candidate values.
        numOfTests = reduce(lambda a, b: a * b,
                            [len(x[1]) for x in self.parameters], 1)
        milestones = set(range(0, numOfTests, max(numOfTests / 100, 1)))
        # Iterate over the full grid of value combinations; each combination
        # is evaluated with internal cross-validation.
        for itercount, valueindices in enumerate(Orange.utils.counters.LimitedCounter(\
                                        [len(x[1]) for x in self.parameters])):
            values = [self.parameters[i][1][x] for i, x \
                      in enumerate(valueindices)]
            for pi, value in enumerate(values):
                for i, par in enumerate(to_set[pi]):
                    setattr(par[0], par[1], value)
                    if verbose == 2:
                        print "%s: %s" % (parnames[pi][i], value)

            res = evaluate(Orange.evaluation.testing.test_with_indices(
                                        [self.learner], tableAndWeight, cvind))
            if itercount in milestones:
                progress_callback(100.0 * itercount / numOfTests)

            findBest.candidate((res, values))
            if verbose == 2:
                print "===> Result: %s\n" % res

        # Leave the learner set to the winning combination of values.
        bestpar = findBest.winner()[1]
        if verbose:
            print "*** Optimal set of parameters: ",
        for pi, value in enumerate(bestpar):
            for i, par in enumerate(to_set[pi]):
                setattr(par[0], par[1], value)
                if verbose:
                    print "%s: %s" % (parnames[pi][i], value),
        if verbose:
            print

        if return_what == Tune1Parameter.RETURN_NONE:
            return None
        elif return_what == Tune1Parameter.RETURN_PARAMETERS:
            return bestpar
        elif return_what == Tune1Parameter.RETURN_LEARNER:
            return self.learner
        else:
            classifier = self.learner(data)
            if not Orange.utils.environ.orange_no_deprecated_members:
                classifier.fittedParameters = bestpar
            classifier.fitted_parameters = bestpar
            return classifier

TuneMParameters = deprecated_members(
    {"progressCallback": "progress_callback"},
    )(TuneMParameters)

class ThresholdLearner(Orange.classification.Learner):

    """:obj:`Orange.optimization.ThresholdLearner` is a class that wraps
    another learner. When given the data, it calls the wrapped learner to build
    a classifier, then it uses the classifier to predict the class
    probabilities on the training instances. From these probabilities it
    computes the threshold that gives the optimal classification accuracy.
    It then wraps the classifier and the threshold into an instance of
    :obj:`Orange.optimization.ThresholdClassifier`.

    Note that the learner doesn't perform internal cross-validation. Also, the
    learner doesn't work for multi-valued classes.

    :obj:`Orange.optimization.ThresholdLearner` has the same interface as any
    learner: if the constructor is given data, it returns a classifier,
    else it returns a learner. It has two attributes.

    .. attribute:: learner

        The wrapped learner, for example an instance of
        :obj:`Orange.classification.bayes.NaiveLearner`.

    .. attribute:: store_curve

        If `True`, the resulting classifier will contain an attribute ``curve``,
        with a list of tuples containing thresholds and classification
        accuracies at those thresholds (default `False`).

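    A minimal sketch of its use (``heart_disease`` is just an example of a
    data set with a binary class)::

        data = Orange.data.Table("heart_disease")
        learner = Orange.optimization.ThresholdLearner(
            learner=Orange.classification.bayes.NaiveLearner(),
            store_curve=True)
        classifier = learner(data)
        print classifier.threshold
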
    """

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **kwds):
        self = Orange.classification.Learner.__new__(cls, **kwds)
        if data is not None:
            self.__init__(**kwds)
            return self.__call__(data, weight_id)
        else:
            return self

    @deprecated_keywords({"storeCurve": "store_curve"})
    def __init__(self, learner=None, store_curve=False, **kwds):
        self.learner = learner
        self.store_curve = store_curve
        for name, value in kwds.items():
            setattr(self, name, value)

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __call__(self, data, weight_id=0):
        if self.learner is None:
            raise AttributeError("Learner not set.")

        # Build the wrapped classifier and find the threshold that maximizes
        # classification accuracy on the training data.
        classifier = self.learner(data, weight_id)
        threshold, optCA, curve = Orange.wrappers.ThresholdCA(classifier,
                                                          data,
                                                          weight_id)
        if self.store_curve:
            return ThresholdClassifier(classifier, threshold, curve=curve)
        else:
            return ThresholdClassifier(classifier, threshold)

ThresholdLearner = deprecated_members(
    {"storeCurve": "store_curve"},
    wrap_methods=["__init__"]
    )(ThresholdLearner)

class ThresholdClassifier(Orange.classification.Classifier):

    """:obj:`Orange.optimization.ThresholdClassifier`, used by both
    :obj:`Orange.optimization.ThresholdLearner` and
    :obj:`Orange.optimization.ThresholdLearner_fixed`, is therefore another
    wrapper class, containing a classifier and a threshold. When it needs to
    classify an instance, it calls the wrapped classifier to predict the
    probabilities. The instance is classified into the second class only if
    the probability of that class is above the threshold.

    .. attribute:: classifier

        The wrapped classifier, normally the one produced by the
        ThresholdLearner's learner, e.g. by an instance of
        :obj:`Orange.classification.bayes.NaiveLearner`.

    .. attribute:: threshold

        The threshold for classification into the second class.

    The two attributes can be set as attributes or given to the
    constructor as ordinary arguments.
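
    For instance (a sketch; ``data`` stands for a data table with a binary
    class and the threshold value is arbitrary)::

        bayes = Orange.classification.bayes.NaiveLearner(data)
        clf = Orange.optimization.ThresholdClassifier(bayes, 0.65)
        print clf(data[0]), clf(data[0], clf.GetProbabilities)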

    """

    def __init__(self, classifier, threshold, **kwds):
        self.classifier = classifier
        self.threshold = threshold
        for name, value in kwds.items():
            setattr(self, name, value)

    def __call__(self, instance, what=Orange.classification.Classifier.GetValue):
        probs = self.classifier(instance, self.GetProbabilities)
        if what == self.GetProbabilities:
            return probs
        value = Orange.data.Value(self.classifier.classVar, probs[1] > \
                                  self.threshold)
        if what == Orange.classification.Classifier.GetValue:
            return value
        else:
            return (value, probs)


class ThresholdLearner_fixed(Orange.classification.Learner):
    """ This is a convenience variant of
    :obj:`Orange.optimization.ThresholdLearner`. Instead of finding the
    optimal threshold it uses a prescribed one. It has the following two
    attributes.

    .. attribute:: learner

        The wrapped learner, for example an instance of
        :obj:`~Orange.classification.bayes.NaiveLearner`.

    .. attribute:: threshold

        The threshold to use in classification.

    This class calls its base learner and puts the resulting classifier
    together with the threshold into an instance of :obj:`ThresholdClassifier`.
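
    A sketch of its use (the threshold value and the dataset are arbitrary)::

        learner = Orange.optimization.ThresholdLearner_fixed(
            learner=Orange.classification.bayes.NaiveLearner(),
            threshold=0.8)
        classifier = learner(Orange.data.Table("heart_disease"))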

    """
    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **kwds):
        self = Orange.classification.Learner.__new__(cls, **kwds)
        if data is not None:
            self.__init__(**kwds)
            return self.__call__(data, weight_id)
        else:
            return self

    def __init__(self, learner=None, threshold=None, **kwds):
        self.learner = learner
        self.threshold = threshold
        for name, value in kwds.items():
            setattr(self, name, value)

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __call__(self, data, weight_id=0):
        if self.learner is None:
            raise AttributeError("Learner not set.")
        if self.threshold is None:
            raise AttributeError("Threshold not set.")
        if len(data.domain.classVar.values) != 2:
            raise ValueError("ThresholdLearner_fixed handles binary classes only.")

        return ThresholdClassifier(self.learner(data, weight_id),
                                   self.threshold)

class PreprocessedLearner(object):
    # Wraps a learner so that the given preprocessor(s) are applied to the
    # data before the wrapped learner is called.
    def __new__(cls, preprocessor=None, learner=None):
        self = object.__new__(cls)
        if learner is not None:
            self.__init__(preprocessor)
            return self.wrapLearner(learner)
        else:
            return self

    def __init__(self, preprocessor=None, learner=None):
        if isinstance(preprocessor, list):
            self.preprocessors = preprocessor
        elif preprocessor is not None:
            self.preprocessors = [preprocessor]
        else:
            self.preprocessors = []
        #self.preprocessors = [Orange.core.Preprocessor_addClassNoise(proportion=0.8)]
        if learner:
            self.wrapLearner(learner)

    def processData(self, data, weightId=None):
        # Run the data (and, optionally, the weight id) through all
        # preprocessors in turn.
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            if hasWeight:
                t = preprocessor(data, weightId)
            else:
                t = preprocessor(data)

            if isinstance(t, tuple):
                data, weightId = t
                hasWeight = True
            else:
                data = t
        if hadWeight:
            return data, weightId
        else:
            return data

    def wrapLearner(self, learner):
        class WrappedLearner(learner.__class__):
            preprocessor = self
            wrappedLearner = learner
            name = getattr(learner, "name", "")
            def __call__(self, data, weightId=0, getData=False):
                t = self.preprocessor.processData(data, weightId or 0)
                processed, procW = t if isinstance(t, tuple) else (t, 0)
                classifier = self.wrappedLearner(processed, procW)
                if getData:
                    return classifier, processed
                else:
                    return classifier # super(WrappedLearner, self).__call__(processed, procW)

            def __reduce__(self):
                return PreprocessedLearner, (self.preprocessor.preprocessors, \
                                             self.wrappedLearner)

            def __getattr__(self, name):
                return getattr(learner, name)

        return WrappedLearner()