source: orange/Orange/optimization/__init__.py @ 10633:fb05a6f3a235

"""
.. index:: optimization

Wrappers for Tuning Parameters and Thresholds

Classes for two very useful purposes: tuning a learning algorithm's parameters
using internal validation, and tuning the threshold for classification into
the positive class.

*****************
Tuning parameters
*****************

Two classes support tuning parameters:
:obj:`Orange.optimization.Tune1Parameter` fits a single parameter and
:obj:`Orange.optimization.TuneMParameters` fits multiple parameters at once,
trying all possible combinations. When called with data and, optionally, the
id of a meta attribute with weights, they find the optimal setting of the
arguments using cross validation. The classes can also be used as ordinary
learning algorithms - they are in fact derived from
:obj:`Orange.classification.Learner`.

Both classes have a common parent, :obj:`Orange.optimization.TuneParameters`,
and a few common attributes.
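
For instance, a tuner can be set up and used like any other learner (a
minimal sketch; the learner, parameter and data set are only illustrative)::

    import Orange

    learner = Orange.classification.tree.TreeLearner()
    tuner = Orange.optimization.Tune1Parameter(learner=learner,
                                               parameter="min_subset",
                                               values=[1, 2, 5, 10, 20])
    data = Orange.data.Table("voting")
    classifier = tuner(data)  # tunes min_subset, then trains on all the data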

.. autoclass:: Orange.optimization.TuneParameters
   :members:

.. autoclass:: Orange.optimization.Tune1Parameter
   :members:

.. autoclass:: Orange.optimization.TuneMParameters
   :members:

**************************
Setting Optimal Thresholds
**************************

Some models may perform well in terms of AUC, which measures the ability to
distinguish between instances of two classes, but have low classification
accuracy. The reason may be in the threshold: in binary problems, classifiers
usually classify into the more probable class, but when class distributions
are highly skewed, a modified threshold can give better accuracy. The two
classes below can help.
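
For instance, wrapping a learner finds the accuracy-optimal threshold on the
training data (a minimal sketch; the data set name is only illustrative)::

    import Orange

    data = Orange.data.Table("bupa")
    learner = Orange.optimization.ThresholdLearner(
        learner=Orange.classification.bayes.NaiveLearner())
    classifier = learner(data)  # a ThresholdClassifier
    print classifier.threshold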

.. autoclass:: Orange.optimization.ThresholdLearner
   :members:

.. autoclass:: Orange.optimization.ThresholdClassifier
   :members:

Examples
========

This is how you use the learner.

part of :download:`optimization-thresholding1.py <code/optimization-thresholding1.py>`

.. literalinclude:: code/optimization-thresholding1.py

The output::

    W/out threshold adjustment: 0.633
    With adjusted threshold: 0.659
    With threshold at 0.80: 0.449

part of :download:`optimization-thresholding2.py <code/optimization-thresholding2.py>`

.. literalinclude:: code/optimization-thresholding2.py

The script first divides the data into training and testing subsets. It trains
a naive Bayesian classifier and then wraps it into
:obj:`Orange.optimization.ThresholdClassifier` instances with thresholds of
0.2, 0.5 and 0.8. The three models are tested on the left-out data, and we
compute the confusion matrices from the results. The printout::

    0.20: TP 60.000, TN 1.000
    0.50: TP 42.000, TN 24.000
    0.80: TP 2.000, TN 43.000

shows how varying the threshold changes the balance between the number of
true positives and negatives.

.. autoclass:: Orange.optimization.PreprocessedLearner
   :members:

"""

import Orange.core
import Orange.classification
import Orange.evaluation.scoring
import Orange.evaluation.testing
import Orange.misc

from Orange.utils import deprecated_class_attribute, deprecated_keywords, \
                         deprecated_members

class TuneParameters(Orange.classification.Learner):

    """.. attribute:: data

        Data table with either discrete or continuous features

    .. attribute:: weight_id

        The id of the weight meta attribute

    .. attribute:: learner

        The learning algorithm whose parameters are to be tuned. This can be,
        for instance, :obj:`Orange.classification.tree.TreeLearner`.

    .. attribute:: evaluate

        The statistics to evaluate. The default is
        :obj:`Orange.evaluation.scoring.CA`, so the learner will be fitted
        for optimal classification accuracy. You can replace it with, for
        instance, :obj:`Orange.evaluation.scoring.AUC` to optimize the AUC.
        Statistics can return either a single value (classification
        accuracy), a list with a single value (this is what
        :obj:`Orange.evaluation.scoring.CA` actually does), or arbitrary
        objects which the ``compare`` function below must be able to compare.

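        For example, to optimize the AUC instead (a minimal sketch; the
        learner and values are only illustrative)::

            tuner = Orange.optimization.Tune1Parameter(
                learner=Orange.classification.tree.TreeLearner(),
                parameter="min_subset",
                values=[1, 5, 10],
                evaluate=Orange.evaluation.scoring.AUC)
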
    .. attribute:: folds

        The number of folds used in internal cross-validation. Default is 5.

    .. attribute:: compare

        The function used to compare the results. The function should accept
        two arguments (e.g. two classification accuracies, AUCs or whatever
        the result of ``evaluate`` is) and return a positive value if the
        first argument is better, 0 if they are equal and a negative value if
        the first is worse than the second. The default compare function is
        ``cmp``. You don't need to change this if ``evaluate`` is such that
        higher values mean a better classifier; otherwise, see the sketch
        below.

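        For example, if ``evaluate`` returned an error rate, lower values
        would be better and the default comparison could be inverted (a
        minimal sketch)::

            def lower_is_better(x, y):
                # positive when x < y, i.e. when the first error rate is lower
                return -cmp(x, y)

            tuner.compare = lower_is_better
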
    .. attribute:: return_what

        Decides what the result of tuning should be. Possible values are:

        * ``TuneParameters.RETURN_NONE`` (or 0): tuning returns nothing,
        * ``TuneParameters.RETURN_PARAMETERS`` (or 1): return the optimal value(s) of parameter(s),
        * ``TuneParameters.RETURN_LEARNER`` (or 2): return the learner set to optimal parameters,
        * ``TuneParameters.RETURN_CLASSIFIER`` (or 3): return a classifier trained with the optimal parameters on the entire data set. This is the default setting.

        Regardless of this, the learner (given as parameter ``learner``) is
        left set to the optimal parameters; see the sketch below.

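        For example, to obtain only the optimal parameter value (a minimal
        sketch; the learner, values and data are only illustrative)::

            tuner = Orange.optimization.Tune1Parameter(
                learner=Orange.classification.tree.TreeLearner(),
                parameter="min_subset",
                values=[1, 5, 10],
                return_what=TuneParameters.RETURN_PARAMETERS)
            best_value = tuner(data)
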
    .. attribute:: verbose

        If 0 (default), the class doesn't print anything. If set to 1, it
        will print out the optimal value found; if set to 2, it will print
        out all tried values and the related evaluation results.

    If the tuner is set to return a classifier, it behaves as a learning
    algorithm. As the examples below will demonstrate, it can be called with
    data, and the result is a "trained" classifier. It can, for instance, be
    used in cross-validation.

    Out of these attributes, the only necessary argument is ``learner``. The
    real tuning classes (subclasses of this class) add two more - the
    attributes that specify which parameter(s) to optimize and which values
    to try.

    """

    RETURN_NONE = 0
    RETURN_PARAMETERS = 1
    RETURN_LEARNER = 2
    RETURN_CLASSIFIER = 3

    returnNone = \
        deprecated_class_attribute("returnNone", "RETURN_NONE")
    returnParameters = \
        deprecated_class_attribute("returnParameters", "RETURN_PARAMETERS")
    returnLearner = \
        deprecated_class_attribute("returnLearner", "RETURN_LEARNER")
    returnClassifier = \
        deprecated_class_attribute("returnClassifier", "RETURN_CLASSIFIER")

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **argkw):
        self = Orange.classification.Learner.__new__(cls, **argkw)
        if data:
            # when data is passed to the constructor, tune immediately and
            # return the result instead of a learner instance
            for name, value in argkw.items():
                setattr(self, name, value)
            self.__init__(**argkw)
            return self.__call__(data, weight_id)
        else:
            return self

    def findobj(self, name):
        # resolve a dotted parameter name to (object, attribute name),
        # starting the lookup at the wrapped learner
        names = name.split(".")
        lastobj = self.learner
        for i in names[:-1]:
            lastobj = getattr(lastobj, i)
        return lastobj, names[-1]

TuneParameters = deprecated_members(
    {"returnWhat": "return_what",
     "object": "learner"},
    )(TuneParameters)


class Tune1Parameter(TuneParameters):

    """Class :obj:`Orange.optimization.Tune1Parameter` tunes a single parameter.

    .. attribute:: parameter

        The name of the parameter (or a list of names, if the same parameter
        is stored at multiple places - see the examples) to be tuned.

    .. attribute:: values

        A list of the parameter's values to try.

    To show how it works, we shall tune the minimal number of examples in a
    leaf for a tree classifier.

    part of :download:`optimization-tuning1.py <code/optimization-tuning1.py>`

    .. literalinclude:: code/optimization-tuning1.py
        :lines: 3-11

    Set up like this, the tuner, when called, sets ``learner.min_subset`` to
    1, 2, 3, 4, 5, 10, 15 and 20, and measures the AUC in 5-fold cross
    validation. It then resets ``learner.min_subset`` to the optimal value
    found and, since we left ``return_what`` at the default
    (``RETURN_CLASSIFIER``), constructs and returns the classifier from the
    entire data set. So, what we get is a classifier, but if we'd also like
    to know what the optimal value was, we can read it from
    ``learner.min_subset``.

    Tuning is of course not limited to setting numeric parameters. You can,
    for instance, try to find the optimal criteria for assessing the quality
    of attributes by tuning ``parameter="measure"``, trying settings like
    ``values=[Orange.feature.scoring.GainRatio(), Orange.feature.scoring.Gini()]``,
    as in the sketch below.

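    A minimal sketch of such a tuner (the learner is only illustrative)::

        tuner = Orange.optimization.Tune1Parameter(
            learner=Orange.classification.tree.TreeLearner(),
            parameter="measure",
            values=[Orange.feature.scoring.GainRatio(),
                    Orange.feature.scoring.Gini()])
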
    Since the tuner returns a classifier and thus behaves like a learner, it
    can be used in cross-validation. Let us see whether tuning the tree
    indeed improves the AUC. We shall reuse the tuner from above, add another
    tree learner, and test them both.

    part of :download:`optimization-tuning1.py <code/optimization-tuning1.py>`

    .. literalinclude:: code/optimization-tuning1.py
        :lines: 13-18

    This can be time consuming: for each of the 8 values of ``min_subset``
    it performs 5-fold cross validation inside a 10-fold cross validation -
    altogether 400 trees. On top of that, it builds the optimal tree for
    each of the 10 folds, and the untuned tree adds another 10, which makes
    420 trees built in total.

    Nevertheless, results are good::

        Untuned tree: 0.930
        Tuned tree: 0.986

    """

    def __call__(self, data, weight=None, verbose=0):
        verbose = verbose or getattr(self, "verbose", 0)
        evaluate = getattr(self, "evaluate", Orange.evaluation.scoring.CA)
        folds = getattr(self, "folds", 5)
        compare = getattr(self, "compare", cmp)
        return_what = getattr(self, "return_what",
                              Tune1Parameter.RETURN_CLASSIFIER)

        # resolve the parameter name(s) to (object, attribute name) pairs
        if isinstance(self.parameter, (list, tuple)):
            to_set = [self.findobj(ld) for ld in self.parameter]
        else:
            to_set = [self.findobj(self.parameter)]

        cvind = Orange.core.MakeRandomIndicesCV(data, folds)
        # keep the best (result, parameter) pair; comparison is done with the
        # user-supplied compare function on the first tuple element
        findBest = Orange.misc.selection.BestOnTheFly(compare=compare,
                                         seed=data.checksum(),
                                         call_compare_on_1st=True)
        tableAndWeight = (data, weight) if weight else data
        for par in self.values:
            for i in to_set:
                setattr(i[0], i[1], par)
            res = evaluate(Orange.evaluation.testing.test_with_indices(
                                        [self.learner], tableAndWeight, cvind))
            findBest.candidate((res, par))
            if verbose == 2:
                print '*** optimization %s: %s' % (par, ", ".join("%.8f" % r for r in res))

        # leave the learner set to the optimal parameter value
        bestpar = findBest.winner()[1]
        for i in to_set:
            setattr(i[0], i[1], bestpar)

        if verbose:
            print "*** Optimal parameter: %s = %s" % (self.parameter, bestpar)

        if return_what == Tune1Parameter.RETURN_NONE:
            return None
        elif return_what == Tune1Parameter.RETURN_PARAMETERS:
            return bestpar
        elif return_what == Tune1Parameter.RETURN_LEARNER:
            return self.learner
        else:
            classifier = self.learner(data)
            if not Orange.utils.environ.orange_no_deprecated_members:
                classifier.setattr("fittedParameter", bestpar)
            classifier.setattr("fitted_parameter", bestpar)
            return classifier

class TuneMParameters(TuneParameters):

    """The use of :obj:`Orange.optimization.TuneMParameters` differs from
    :obj:`Orange.optimization.Tune1Parameter` only in the specification of
    tuning parameters.

    .. attribute:: parameters

        A list of two-element tuples, each containing the name of a parameter
        and its possible values, as sketched below.

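        For example (a minimal sketch; the values are only illustrative)::

            parameters=[("min_subset", [2, 5, 10]),
                        ("measure", [Orange.feature.scoring.GainRatio(),
                                     Orange.feature.scoring.Gini()])]
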
    For example, we can tune both the minimal number of instances in leaves
    and the splitting criterion by setting up the tuner as follows:

    :download:`optimization-tuningm.py <code/optimization-tuningm.py>`

    .. literalinclude:: code/optimization-tuningm.py

    """

    def __call__(self, data, weight=None, verbose=0):
        evaluate = getattr(self, "evaluate", Orange.evaluation.scoring.CA)
        folds = getattr(self, "folds", 5)
        compare = getattr(self, "compare", cmp)
        verbose = verbose or getattr(self, "verbose", 0)
        return_what = getattr(self, "return_what", Tune1Parameter.RETURN_CLASSIFIER)
        progress_callback = getattr(self, "progress_callback", lambda i: None)

        # resolve every parameter name to (object, attribute name) pairs
        to_set = []
        parnames = []
        for par in self.parameters:
            if isinstance(par[0], (list, tuple)):
                to_set.append([self.findobj(ld) for ld in par[0]])
                parnames.append(par[0])
            else:
                to_set.append([self.findobj(par[0])])
                parnames.append([par[0]])

        cvind = Orange.core.MakeRandomIndicesCV(data, folds)
        findBest = Orange.misc.selection.BestOnTheFly(compare=compare,
                                         seed=data.checksum(),
                                         call_compare_on_1st=True)
        tableAndWeight = (data, weight) if weight else data
        # the number of combinations tried is the product (not the sum) of
        # the numbers of candidate values, one factor per parameter
        numOfTests = reduce(lambda x, y: x * y,
                            [len(x[1]) for x in self.parameters], 1)
        milestones = set(range(0, numOfTests, max(numOfTests / 100, 1)))
        for itercount, valueindices in enumerate(Orange.utils.counters.LimitedCounter(\
                                        [len(x[1]) for x in self.parameters])):
            values = [self.parameters[i][1][x] for i, x \
                      in enumerate(valueindices)]
            for pi, value in enumerate(values):
                for i, par in enumerate(to_set[pi]):
                    setattr(par[0], par[1], value)
                    if verbose == 2:
                        print "%s: %s" % (parnames[pi][i], value)

            res = evaluate(Orange.evaluation.testing.test_with_indices(
                                        [self.learner], tableAndWeight, cvind))
            if itercount in milestones:
                progress_callback(100.0 * itercount / numOfTests)

            findBest.candidate((res, values))
            if verbose == 2:
                print "===> Result: %s\n" % res

        # leave the learner set to the optimal combination of values
        bestpar = findBest.winner()[1]
        if verbose:
            print "*** Optimal set of parameters: ",
        for pi, value in enumerate(bestpar):
            for i, par in enumerate(to_set[pi]):
                setattr(par[0], par[1], value)
                if verbose:
                    print "%s: %s" % (parnames[pi][i], value),
        if verbose:
            print

        if return_what == Tune1Parameter.RETURN_NONE:
            return None
        elif return_what == Tune1Parameter.RETURN_PARAMETERS:
            return bestpar
        elif return_what == Tune1Parameter.RETURN_LEARNER:
            return self.learner
        else:
            classifier = self.learner(data)
            if not Orange.utils.environ.orange_no_deprecated_members:
                classifier.fittedParameters = bestpar
            classifier.fitted_parameters = bestpar
            return classifier

TuneMParameters = deprecated_members(
    {"progressCallback": "progress_callback"},
    )(TuneMParameters)

class ThresholdLearner(Orange.classification.Learner):

    """:obj:`Orange.optimization.ThresholdLearner` is a class that wraps
    another learner. When given the data, it calls the wrapped learner to
    build a classifier, then it uses the classifier to predict the class
    probabilities on the training instances. From these probabilities it
    computes the threshold that would give the optimal classification
    accuracy, and wraps the classifier and the threshold into an instance of
    :obj:`Orange.optimization.ThresholdClassifier`.

    Note that the learner doesn't perform internal cross-validation. Also,
    the learner doesn't work for multivalued classes.

    :obj:`Orange.optimization.ThresholdLearner` has the same interface as any
    learner: if the constructor is given data, it returns a classifier,
    else it returns a learner. It has two attributes.

    .. attribute:: learner

        The wrapped learner, for example an instance of
        :obj:`Orange.classification.bayes.NaiveLearner`.

    .. attribute:: store_curve

        If `True`, the resulting classifier will contain an attribute
        ``curve``, with a list of tuples containing thresholds and
        classification accuracies at those thresholds (default `False`).

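    For example, to inspect the accuracy curve (a minimal sketch; the data
    set name is only illustrative)::

        learner = Orange.optimization.ThresholdLearner(
            learner=Orange.classification.bayes.NaiveLearner(),
            store_curve=True)
        classifier = learner(Orange.data.Table("bupa"))
        for threshold, ca in classifier.curve:
            print "%.2f: %.3f" % (threshold, ca)
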
    """

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **kwds):
        self = Orange.classification.Learner.__new__(cls, **kwds)
        if data:
            self.__init__(**kwds)
            return self.__call__(data, weight_id)
        else:
            return self

    @deprecated_keywords({"storeCurve": "store_curve"})
    def __init__(self, learner=None, store_curve=False, **kwds):
        self.learner = learner
        self.store_curve = store_curve
        for name, value in kwds.items():
            setattr(self, name, value)

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __call__(self, data, weight_id=0):
        if self.learner is None:
            raise AttributeError("Learner not set.")

        classifier = self.learner(data, weight_id)
        # find the threshold that maximizes classification accuracy on the
        # training data
        threshold, optCA, curve = Orange.wrappers.ThresholdCA(classifier,
                                                          data,
                                                          weight_id)
        if self.store_curve:
            return ThresholdClassifier(classifier, threshold, curve=curve)
        else:
            return ThresholdClassifier(classifier, threshold)

ThresholdLearner = deprecated_members(
    {"storeCurve": "store_curve"},
    wrap_methods=["__init__"]
    )(ThresholdLearner)

class ThresholdClassifier(Orange.classification.Classifier):

    """:obj:`Orange.optimization.ThresholdClassifier`, used by both
    :obj:`Orange.optimization.ThresholdLearner` and
    :obj:`Orange.optimization.ThresholdLearner_fixed`, is another wrapper
    class, containing a classifier and a threshold. When it needs to classify
    an instance, it calls the wrapped classifier to predict probabilities.
    The instance is classified into the second class only if the probability
    of that class is above the threshold.

    .. attribute:: classifier

        The wrapped classifier, normally the one related to the
        ThresholdLearner's learner, e.g. an instance of
        :obj:`Orange.classification.bayes.NaiveLearner`.

    .. attribute:: threshold

        The threshold for classification into the second class.

    The two attributes can be set as attributes or given to the constructor
    as ordinary arguments.

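    For example (a minimal sketch; ``classifier`` stands for any binary
    classifier that returns probabilities, ``instance`` for a data
    instance)::

        thresh = Orange.optimization.ThresholdClassifier(classifier, 0.8)
        value, probs = thresh(instance,
                              Orange.classification.Classifier.GetBoth)
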
    """

    def __init__(self, classifier, threshold, **kwds):
        self.classifier = classifier
        self.threshold = threshold
        for name, value in kwds.items():
            setattr(self, name, value)

    def __call__(self, instance, what=Orange.classification.Classifier.GetValue):
        probs = self.classifier(instance, self.GetProbabilities)
        if what == self.GetProbabilities:
            return probs
        # classify into the second class only if its probability exceeds
        # the threshold
        value = Orange.data.Value(self.classifier.classVar, probs[1] > \
                                  self.threshold)
        if what == Orange.classification.Classifier.GetValue:
            return value
        else:
            return (value, probs)


class ThresholdLearner_fixed(Orange.classification.Learner):
    """This is a convenience variant of
    :obj:`Orange.optimization.ThresholdLearner`. Instead of finding the
    optimal threshold it uses a prescribed one. It has the following two
    attributes.

    .. attribute:: learner

        The wrapped learner, for example an instance of
        :obj:`~Orange.classification.bayes.NaiveLearner`.

    .. attribute:: threshold

        Threshold to use in classification.

    This class calls its base learner and puts the resulting classifier
    together with the threshold into an instance of :obj:`ThresholdClassifier`.

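    For example (a minimal sketch; the learner and data are only
    illustrative)::

        learner = Orange.optimization.ThresholdLearner_fixed(
            learner=Orange.classification.bayes.NaiveLearner(),
            threshold=0.8)
        classifier = learner(data)
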
    """
    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **kwds):
        self = Orange.classification.Learner.__new__(cls, **kwds)
        if data:
            self.__init__(**kwds)
            return self.__call__(data, weight_id)
        else:
            return self

    def __init__(self, learner=None, threshold=None, **kwds):
        self.learner = learner
        self.threshold = threshold
        for name, value in kwds.items():
            setattr(self, name, value)

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __call__(self, data, weight_id=0):
        if self.learner is None:
            raise AttributeError("Learner not set.")
        if self.threshold is None:
            raise AttributeError("Threshold not set.")
        if len(data.domain.classVar.values) != 2:
            raise ValueError("ThresholdLearner handles binary classes only.")

        return ThresholdClassifier(self.learner(data, weight_id),
                                   self.threshold)

class PreprocessedLearner(object):
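    """Wraps a learner so that the given preprocessor (or list of
    preprocessors) is applied to the data before it is passed on to the
    wrapped learner.
    """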
    def __new__(cls, preprocessor=None, learner=None):
        self = object.__new__(cls)
        if learner is not None:
            # when a learner is given, return the wrapped learner directly
            self.__init__(preprocessor)
            return self.wrapLearner(learner)
        else:
            return self

    def __init__(self, preprocessor=None, learner=None):
        if isinstance(preprocessor, list):
            self.preprocessors = preprocessor
        elif preprocessor is not None:
            self.preprocessors = [preprocessor]
        else:
            self.preprocessors = []
        if learner:
            self.wrapLearner(learner)

    def processData(self, data, weightId=None):
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            if hasWeight:
                t = preprocessor(data, weightId)
            else:
                t = preprocessor(data)

            # a preprocessor may return either a table or a
            # (table, weight id) tuple
            if isinstance(t, tuple):
                data, weightId = t
                hasWeight = True
            else:
                data = t
        if hadWeight:
            return data, weightId
        else:
            return data

    def wrapLearner(self, learner):
        class WrappedLearner(learner.__class__):
            preprocessor = self
            wrappedLearner = learner
            name = getattr(learner, "name", "")
            def __call__(self, data, weightId=0, getData=False):
                t = self.preprocessor.processData(data, weightId or 0)
                processed, procW = t if isinstance(t, tuple) else (t, 0)
                classifier = self.wrappedLearner(processed, procW)
                if getData:
                    return classifier, processed
                else:
                    return classifier

            def __reduce__(self):
                # support pickling by reconstructing the wrapper
                return PreprocessedLearner, (self.preprocessor.preprocessors, \
                                             self.wrappedLearner)

            def __getattr__(self, name):
                return getattr(learner, name)

        return WrappedLearner()