source: orange/Orange/tuning/__init__.py @ 10711:38a9de5ed525

Revision 10711:38a9de5ed525, 20.6 KB checked in by anze <anze.staric@…>, 2 years ago (diff)

Removed optimization to tuning.

Line 
1"""
2
3   
4"""
5
6import Orange.core
7import Orange.classification
8import Orange.evaluation.scoring
9import Orange.evaluation.testing
10import Orange.misc
11
12from Orange.utils import deprecated_class_attribute, deprecated_keywords, \
13                         deprecated_members
14
class TuneParameters(Orange.classification.Learner):

    """Abstract base class for learners that tune parameters of a wrapped
    learner by internal cross-validation.

    .. attribute:: data
   
        Data table with either discrete or continuous features
   
    .. attribute:: weight_id
   
        The id of the weight meta attribute
   
    .. attribute:: learner
   
        The learning algorithm whose parameters are to be tuned. This can be,
        for instance, :obj:`Orange.classification.tree.TreeLearner`.
   
    .. attribute:: evaluate
   
        The statistics to evaluate. The default is
        :obj:`Orange.evaluation.scoring.CA`, so the learner will be fit for the
        optimal classification accuracy. You can replace it with, for instance,
        :obj:`Orange.evaluation.scoring.AUC` to optimize the AUC. Statistics
        can return either a single value (classification accuracy), a list with
        a single value (this is what :obj:`Orange.evaluation.scoring.CA`
        actually does), or arbitrary objects which the compare function below
        must be able to compare.
   
    .. attribute:: folds
   
        The number of folds used in internal cross-validation. Default is 5.
   
    .. attribute:: compare
   
        The function used to compare the results. The function should accept
        two arguments (e.g. two classification accuracies, AUCs or whatever the
        result of ``evaluate`` is) and return a positive value if the first
        argument is better, 0 if they are equal and a negative value if the
        first is worse than the second. The default compare function is
        ``cmp``. You don't need to change this if evaluate is such that higher
        values mean a better classifier.
   
    .. attribute:: return_what
   
        Decides what should be result of tuning. Possible values are:
   
        * ``TuneParameters.RETURN_NONE`` (or 0): tuning will return nothing,
        * ``TuneParameters.RETURN_PARAMETERS`` (or 1): return the optimal value(s) of parameter(s),
        * ``TuneParameters.RETURN_LEARNER`` (or 2): return the learner set to optimal parameters,
        * ``TuneParameters.RETURN_CLASSIFIER`` (or 3): return a classifier trained with the optimal parameters on the entire data set. This is the default setting.
       
        Regardless of this, the learner (given as parameter ``learner``) is
        left set to the optimal parameters.
   
    .. attribute:: verbose
   
        If 0 (default), the class doesn't print anything. If set to 1, it will
        print out the optimal value found, if set to 2, it will print out all
        tried values and the related scores.
   
    If tuner returns the classifier, it behaves as a learning algorithm. As the
    examples below will demonstrate, it can be called, given the data and
    the result is a "trained" classifier. It can, for instance, be used in
    cross-validation.

    Out of these attributes, the only necessary argument is ``learner``. The
    real tuning classes (subclasses of this class) add two additional -
    the attributes that tell what parameter(s) to optimize and which values
    to use.
   
    """

    RETURN_NONE = 0
    RETURN_PARAMETERS = 1
    RETURN_LEARNER = 2
    RETURN_CLASSIFIER = 3

    # Backward-compatible camelCase aliases for the constants above.
    returnNone = \
        deprecated_class_attribute("returnNone", "RETURN_NONE")
    returnParameters = \
        deprecated_class_attribute("returnParameters", "RETURN_PARAMETERS")
    returnLearner = \
        deprecated_class_attribute("returnLearner", "RETURN_LEARNER")
    returnClassifier = \
        deprecated_class_attribute("returnClassifier", "RETURN_CLASSIFIER")

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **argkw):
        """If `data` is given, construct and configure the tuner and run it
        immediately (learner-style shortcut); otherwise return a plain
        instance for later use.
        """
        self = Orange.classification.Learner.__new__(cls, **argkw)
        if data:
            for name, value in argkw.items():
                setattr(self, name, value)
            self.__init__(**argkw)
            return self.__call__(data, weight_id)
        else:
            return self

    def findobj(self, name):
        """Resolve a dotted attribute path *name* relative to ``self.learner``.

        Returns a ``(holder, attribute_name)`` pair so the caller can do
        ``setattr(holder, attribute_name, value)``.
        """
        # Use the str method instead of the deprecated module-level
        # string.split(); behavior is identical for str inputs.
        names = name.split(".")
        lastobj = self.learner
        for i in names[:-1]:
            lastobj = getattr(lastobj, i)
        return lastobj, names[-1]

TuneParameters = deprecated_members(
    {"returnWhat": "return_what",
     "object": "learner"},
    )(TuneParameters)
122
123
class Tune1Parameter(TuneParameters):

    """Class :obj:`Orange.tuning.Tune1Parameter` tunes a single parameter.
   
    .. attribute:: parameter
   
        The name of the parameter (or a list of names, if the same parameter is
        stored at multiple places - see the examples) to be tuned.
   
    .. attribute:: values
   
        A list of parameter's values to be tried.
   
    To show how it works, we shall fit the minimal number of examples in a leaf
    for a tree classifier.
   
    part of :download:`optimization-tuning1.py <code/optimization-tuning1.py>`

    .. literalinclude:: code/optimization-tuning1.py
        :lines: 3-11

    Set up like this, when the tuner is called, set ``learner.min_subset`` to
    1, 2, 3, 4, 5, 10, 15 and 20, and measure the AUC in 5-fold cross
    validation. It will then reset the learner.minSubset to the optimal value
    found and, since we left ``return_what`` at the default
    (``RETURN_CLASSIFIER``), construct and return the classifier from the
    entire data set. So, what we get is a  classifier, but if we'd also like
    to know what the optimal value was, we can get it from
    ``learner.min_subset``.

    Tuning is of course not limited to setting numeric parameters. You can, for
    instance, try to find the optimal criteria for assessing the quality of
    attributes by tuning ``parameter="measure"``, trying settings like
    ``values=[Orange.feature.scoring.GainRatio(), Orange.feature.scoring.Gini()]``
   
    Since the tuner returns a classifier and thus behaves like a learner, it
    can be used in a cross-validation. Let us see whether a tuning tree indeed
    enhances the AUC or not. We shall reuse the tuner from above, add another
    tree learner, and test them both.
   
    part of :download:`optimization-tuning1.py <code/optimization-tuning1.py>`

    .. literalinclude:: code/optimization-tuning1.py
        :lines: 13-18
   
    This can be time consuming: for each of 8 values for ``min_subset`` it will
    perform 5-fold cross validation inside a 10-fold cross validation -
    altogether 400 trees. Plus, it will learn the optimal tree afterwards for
    each fold. Adding a tree without tuning, that makes 420 trees build in
    total.
   
    Nevertheless, results are good::
   
        Untuned tree: 0.930
        Tuned tree: 0.986
   
    """

    def __call__(self, data, weight=None, verbose=0):
        """Tune ``self.parameter`` over ``self.values`` on `data` and return
        whatever ``return_what`` selects (default: a classifier trained on the
        whole data with the optimal value).
        """
        verbose = verbose or getattr(self, "verbose", 0)
        evaluate = getattr(self, "evaluate", Orange.evaluation.scoring.CA)
        folds = getattr(self, "folds", 5)
        # NOTE(review): 'compare' is read here but never handed to
        # BestOnTheFly below; presumably BestOnTheFly's default comparison
        # is used instead -- confirm whether this is intentional.
        compare = getattr(self, "compare", cmp)
        return_what = getattr(self, "return_what",
                             Tune1Parameter.RETURN_CLASSIFIER)

        # The parameter may live at several places (a list of dotted names);
        # resolve each to a (holder, attribute) pair via findobj().
        if (type(self.parameter) == list) or (type(self.parameter) == tuple):
            to_set = [self.findobj(ld) for ld in self.parameter]
        else:
            to_set = [self.findobj(self.parameter)]

        # Fixed CV indices so every candidate value is scored on the same
        # folds; the winner selection is seeded by the data checksum.
        cvind = Orange.core.MakeRandomIndicesCV(data, folds)
        findBest = Orange.utils.selection.BestOnTheFly(seed=data.checksum(),
                                         call_compare_on_1st=True)
        tableAndWeight = weight and (data, weight) or data
        for par in self.values:
            # Push the candidate value into every aliased location.
            for i in to_set:
                setattr(i[0], i[1], par)
            res = evaluate(Orange.evaluation.testing.test_with_indices(
                                        [self.learner], tableAndWeight, cvind))
            findBest.candidate((res, par))
            if verbose == 2:
                print '*** optimization  %s: %s:' % (par, ", ".join("%.8f" % r for r in res))

        # Leave the learner configured with the winning value (documented
        # contract: the learner is left set to the optimal parameters).
        bestpar = findBest.winner()[1]
        for i in to_set:
            setattr(i[0], i[1], bestpar)

        if verbose:
            print "*** Optimal parameter: %s = %s" % (self.parameter, bestpar)

        if return_what == Tune1Parameter.RETURN_NONE:
            return None
        elif return_what == Tune1Parameter.RETURN_PARAMETERS:
            return bestpar
        elif return_what == Tune1Parameter.RETURN_LEARNER:
            return self.learner
        else:
            # RETURN_CLASSIFIER: retrain on the full data set and tag the
            # classifier with the fitted value (plus a deprecated alias
            # unless deprecated members are disabled).
            classifier = self.learner(data)
            if not Orange.utils.environ.orange_no_deprecated_members:
                classifier.setattr("fittedParameter", bestpar)
            classifier.setattr("fitted_parameter", bestpar)
            return classifier
227
228class TuneMParameters(TuneParameters):
229
230    """The use of :obj:`Orange.optimization.TuneMParameters` differs from
231    :obj:`Orange.optimization.Tune1Parameter` only in specification of tuning
232    parameters.
233   
234    .. attribute:: parameters
235   
236        A list of two-element tuples, each containing the name of a parameter
237        and its possible values.
238   
239    For example we can try to tune both the minimal number of instances in
240    leaves and the splitting criteria by setting the tuner as follows:
241   
242    :download:`optimization-tuningm.py <code/optimization-tuningm.py>`
243
244    .. literalinclude:: code/optimization-tuningm.py
245   
246    """
247
248    def __call__(self, data, weight=None, verbose=0):
249        evaluate = getattr(self, "evaluate", Orange.evaluation.scoring.CA)
250        folds = getattr(self, "folds", 5)
251        compare = getattr(self, "compare", cmp)
252        verbose = verbose or getattr(self, "verbose", 0)
253        return_what = getattr(self, "return_what", Tune1Parameter.RETURN_CLASSIFIER)
254        progress_callback = getattr(self, "progress_callback", lambda i: None)
255
256        to_set = []
257        parnames = []
258        for par in self.parameters:
259            if (type(par[0]) == list) or (type(par[0]) == tuple):
260                to_set.append([self.findobj(ld) for ld in par[0]])
261                parnames.append(par[0])
262            else:
263                to_set.append([self.findobj(par[0])])
264                parnames.append([par[0]])
265
266
267        cvind = Orange.core.MakeRandomIndicesCV(data, folds)
268        findBest = Orange.utils.selection.BestOnTheFly(seed=data.checksum(),
269                                         call_compare_on_1st=True)
270        tableAndWeight = weight and (data, weight) or data
271        numOfTests = sum([len(x[1]) for x in self.parameters])
272        milestones = set(range(0, numOfTests, max(numOfTests / 100, 1)))
273        for itercount, valueindices in enumerate(Orange.utils.counters.LimitedCounter(\
274                                        [len(x[1]) for x in self.parameters])):
275            values = [self.parameters[i][1][x] for i, x \
276                      in enumerate(valueindices)]
277            for pi, value in enumerate(values):
278                for i, par in enumerate(to_set[pi]):
279                    setattr(par[0], par[1], value)
280                    if verbose == 2:
281                        print "%s: %s" % (parnames[pi][i], value)
282
283            res = evaluate(Orange.evaluation.testing.test_with_indices(
284                                        [self.learner], tableAndWeight, cvind))
285            if itercount in milestones:
286                progress_callback(100.0 * itercount / numOfTests)
287
288            findBest.candidate((res, values))
289            if verbose == 2:
290                print "===> Result: %s\n" % res
291
292        bestpar = findBest.winner()[1]
293        if verbose:
294            print "*** Optimal set of parameters: ",
295        for pi, value in enumerate(bestpar):
296            for i, par in enumerate(to_set[pi]):
297                setattr(par[0], par[1], value)
298                if verbose:
299                    print "%s: %s" % (parnames[pi][i], value),
300        if verbose:
301            print
302
303        if return_what == Tune1Parameter.RETURN_NONE:
304            return None
305        elif return_what == Tune1Parameter.RETURN_PARAMETERS:
306            return bestpar
307        elif return_what == Tune1Parameter.RETURN_LEARNER:
308            return self.learner
309        else:
310            classifier = self.learner(data)
311            if Orange.utils.environ.orange_no_deprecated_members:
312                classifier.fittedParameters = bestpar
313            classifier.fitted_parameters = bestpar
314            return classifier
315
316TuneMParameters = deprecated_members(
317    {"progressCallback": "progress_callback"},
318    )(TuneMParameters)
319
class ThresholdLearner(Orange.classification.Learner):

    """:obj:`Orange.tuning.ThresholdLearner` is a class that wraps
    another learner. When given the data, it calls the wrapped learner to build
    a classifier, then it uses the classifier to predict the class
    probabilities on the training instances. Storing the probabilities, it
    computes the threshold that would give the optimal classification accuracy.
    Then it wraps the classifier and the threshold into an instance of
    :obj:`Orange.tuning.ThresholdClassifier`.

    Note that the learner doesn't perform internal cross-validation. Also, the
    learner doesn't work for multivalued classes.

    :obj:`Orange.tuning.ThresholdLearner` has the same interface as any
    learner: if the constructor is given data, it returns a classifier,
    else it returns a learner. It has two attributes.
   
    .. attribute:: learner
   
        The wrapped learner, for example an instance of
        :obj:`Orange.classification.bayes.NaiveLearner`.
   
    .. attribute:: store_curve
   
        If `True`, the resulting classifier will contain an attribute curve, with
        a list of tuples containing thresholds and classification accuracies at
        that threshold (default `False`).
   
    """

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **kwds):
        """If `data` is given, build and immediately apply the learner
        (learner-style shortcut); otherwise return a plain instance.
        """
        self = Orange.classification.Learner.__new__(cls, **kwds)
        if data:
            # BUG FIX: was 'self.__init__(**kwargs)' -- 'kwargs' is not
            # defined in this scope; the keyword dict is named 'kwds'.
            self.__init__(**kwds)
            return self.__call__(data, weight_id)
        else:
            return self

    @deprecated_keywords({"storeCurve": "store_curve"})
    def __init__(self, learner=None, store_curve=False, **kwds):
        self.learner = learner
        self.store_curve = store_curve
        for name, value in kwds.items():
            setattr(self, name, value)

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __call__(self, data, weight_id=0):
        """Train the wrapped learner on `data`, find the CA-optimal threshold
        and return a :obj:`ThresholdClassifier` (with the accuracy curve
        attached when ``store_curve`` is set).

        :raises AttributeError: if no wrapped learner has been set.
        """
        if self.learner is None:
            raise AttributeError("Learner not set.")

        classifier = self.learner(data, weight_id)
        threshold, optCA, curve = Orange.wrappers.ThresholdCA(classifier,
                                                          data,
                                                          weight_id)
        if self.store_curve:
            return ThresholdClassifier(classifier, threshold, curve=curve)
        else:
            return ThresholdClassifier(classifier, threshold)

ThresholdLearner = deprecated_members(
    {"storeCurve": "store_curve"},
    wrap_methods=["__init__"]
    )(ThresholdLearner)
384
class ThresholdClassifier(Orange.classification.Classifier):

    """:obj:`Orange.tuning.ThresholdClassifier`, used by both
    :obj:`Orange.tuning.ThresholdLearner` and
    :obj:`Orange.tuning.ThresholdLearner_fixed`, is a wrapper class holding a
    classifier and a threshold. To classify an instance it asks the wrapped
    classifier for class probabilities and predicts the second class exactly
    when that class's probability exceeds the threshold.

    .. attribute:: classifier
   
        The wrapped classifier, normally the one related to the ThresholdLearner's
        learner, e.g. an instance of
        :obj:`Orange.classification.bayes.NaiveLearner`.
   
    .. attribute:: threshold
   
        The threshold for classification into the second class.
   
    The two attributes can be specified set as attributes or given to the
    constructor as ordinary arguments.
   
    """

    def __init__(self, classifier, threshold, **kwds):
        """Store the wrapped classifier and threshold; any extra keyword
        arguments are attached to the instance as attributes.
        """
        self.classifier = classifier
        self.threshold = threshold
        for attr_name, attr_value in kwds.items():
            setattr(self, attr_name, attr_value)

    def __call__(self, instance, what=Orange.classification.Classifier.GetValue):
        """Classify `instance`, returning a value, a distribution, or both,
        depending on `what` (same protocol as any Orange classifier).
        """
        distribution = self.classifier(instance, self.GetProbabilities)
        if what == self.GetProbabilities:
            return distribution
        # Predict the second class iff its probability exceeds the threshold.
        predicted = Orange.data.Value(self.classifier.classVar,
                                      distribution[1] > self.threshold)
        if what == Orange.classification.Classifier.GetValue:
            return predicted
        return (predicted, distribution)
426
427
class ThresholdLearner_fixed(Orange.classification.Learner):
    """ This is a convenience variant of
    :obj:`Orange.tuning.ThresholdLearner`. Instead of finding the
    optimal threshold it uses a prescribed one. It has the following two
    attributes.
   
    .. attribute:: learner
   
        The wrapped learner, for example an instance of
        :obj:`~Orange.classification.bayes.NaiveLearner`.
   
    .. attribute:: threshold
   
        Threshold to use in classification.
   
    This class calls its base learner and puts the resulting classifier
    together with the threshold into an instance of :obj:`ThresholdClassifier`.
   
    """
    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **kwds):
        """If `data` is given, build and immediately apply the learner
        (learner-style shortcut); otherwise return a plain instance.
        """
        self = Orange.classification.Learner.__new__(cls, **kwds)
        if data:
            self.__init__(**kwds)
            return self.__call__(data, weight_id)
        else:
            return self

    def __init__(self, learner=None, threshold=None, **kwds):
        self.learner = learner
        self.threshold = threshold
        for name, value in kwds.items():
            # BUG FIX: was 'setattr(name, value)' -- a two-argument call that
            # raises TypeError and never sets anything on the instance.
            setattr(self, name, value)

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __call__(self, data, weight_id=0):
        """Train the wrapped learner and wrap the result together with the
        fixed threshold into a :obj:`ThresholdClassifier`.

        :raises AttributeError: if the learner or threshold is not set.
        :raises ValueError: if the class variable is not binary.
        """
        if self.learner is None:
            raise AttributeError("Learner not set.")
        if self.threshold is None:
            raise AttributeError("Threshold not set.")
        if len(data.domain.classVar.values) != 2:
            raise ValueError("ThresholdLearner handles binary classes only.")

        return ThresholdClassifier(self.learner(data, weight_id),
                                   self.threshold)
473
class PreprocessedLearner(object):
    """Wraps a learner so that a chain of preprocessors is applied to the
    training data before the learner sees it.

    May be used as ``PreprocessedLearner(preproc)(learner)``-style factory:
    constructed with only a preprocessor it returns itself; constructed with
    both a preprocessor and a learner it immediately returns the wrapped
    learner (see :meth:`wrapLearner`).
    """
    def __new__(cls, preprocessor=None, learner=None):
        # When a learner is supplied, short-circuit: initialize with the
        # preprocessor only and return the wrapped learner instead of a
        # PreprocessedLearner instance.  Because the returned object is not
        # an instance of this class, Python will not call __init__ again.
        self = object.__new__(cls)
        if learner is not None:
            self.__init__(preprocessor)
            return self.wrapLearner(learner)
        else:
            return self

    def __init__(self, preprocessor=None, learner=None):
        # Normalize 'preprocessor' into a list of preprocessors.
        if isinstance(preprocessor, list):
            self.preprocessors = preprocessor
        elif preprocessor is not None:
            self.preprocessors = [preprocessor]
        else:
            self.preprocessors = []
        if learner:
            self.wrapLearner(learner)

    def processData(self, data, weightId=None):
        """Run `data` through the preprocessor chain.

        Returns ``(data, weightId)`` if a weight id was passed in, otherwise
        just the processed data.  A preprocessor may itself return a
        ``(data, weightId)`` tuple, in which case the new weight id is
        forwarded to subsequent preprocessors.
        """
        # hadWeight remembers whether the *caller* supplied a weight (and thus
        # expects a tuple back); hasWeight tracks whether we currently have one.
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            if hasWeight:
                t = preprocessor(data, weightId)
            else:
                t = preprocessor(data)

            if isinstance(t, tuple):
                data, weightId = t
                hasWeight = True
            else:
                data = t
        if hadWeight:
            return data, weightId
        else:
            return data

    def wrapLearner(self, learner):
        """Return an instance of a dynamically created learner subclass whose
        __call__ preprocesses the data before delegating to `learner`.
        """
        # Class attributes close over 'self' and 'learner' from this scope.
        class WrappedLearner(learner.__class__):
            preprocessor = self
            wrappedLearner = learner
            name = getattr(learner, "name", "")
            def __call__(self, data, weightId=0, getData=False):
                # Preprocess, then train the wrapped learner on the result.
                # If getData is set, also return the processed data.
                t = self.preprocessor.processData(data, weightId or 0)
                processed, procW = t if isinstance(t, tuple) else (t, 0)
                classifier = self.wrappedLearner(processed, procW)
                if getData:
                    return classifier, processed
                else:
                    return classifier

            def __reduce__(self):
                # Pickle as a fresh PreprocessedLearner(preprocessors, learner)
                # call, since the dynamically created class itself cannot be
                # pickled by reference.
                return PreprocessedLearner, (self.preprocessor.preprocessors, \
                                             self.wrappedLearner)

            def __getattr__(self, name):
                # Fall back to the wrapped learner for any unknown attribute.
                return getattr(learner, name)

        return WrappedLearner()
Note: See TracBrowser for help on using the repository browser.