source: orange/Orange/tuning/__init__.py @ 11459:fc07a5c346be

Revision 11459:fc07a5c346be, 20.6 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago (diff)

Fixed checks for passed dataset table argument in new methods.

Use 'instances is not None' idiom and not a boolean test to guard against cases
where the passed dataset length is 0.

RevLine 
[8042]1import Orange.core
2import Orange.classification
3import Orange.evaluation.scoring
4import Orange.evaluation.testing
5import Orange.misc
6
[10580]7from Orange.utils import deprecated_class_attribute, deprecated_keywords, \
8                         deprecated_members
[10077]9
class TuneParameters(Orange.classification.Learner):

    """.. attribute:: data
   
        Data table with either discrete or continuous features
   
    .. attribute:: weight_id
   
        The id of the weight meta attribute
   
    .. attribute:: learner
   
        The learning algorithm whose parameters are to be tuned. This can be,
        for instance, :obj:`Orange.classification.tree.TreeLearner`.
   
    .. attribute:: evaluate
   
        The statistics to evaluate. The default is
        :obj:`Orange.evaluation.scoring.CA`, so the learner will be fit for the
        optimal classification accuracy. You can replace it with, for instance,
        :obj:`Orange.evaluation.scoring.AUC` to optimize the AUC. Statistics
        can return either a single value (classification accuracy), a list with
        a single value (this is what :obj:`Orange.evaluation.scoring.CA`
        actually does), or arbitrary objects which the compare function below
        must be able to compare.
   
    .. attribute:: folds
   
        The number of folds used in internal cross-validation. Default is 5.
   
    .. attribute:: compare
   
        The function used to compare the results. The function should accept
        two arguments (e.g. two classification accuracies, AUCs or whatever the
        result of ``evaluate`` is) and return a positive value if the first
        argument is better, 0 if they are equal and a negative value if the
        first is worse than the second. The default compare function is
        ``cmp``. You don't need to change this if evaluate is such that higher
        values mean a better classifier.
   
    .. attribute:: return_what
   
        Decides what should be result of tuning. Possible values are:
   
        * ``TuneParameters.RETURN_NONE`` (or 0): tuning will return nothing,
        * ``TuneParameters.RETURN_PARAMETERS`` (or 1): return the optimal value(s) of parameter(s),
        * ``TuneParameters.RETURN_LEARNER`` (or 2): return the learner set to optimal parameters,
        * ``TuneParameters.RETURN_CLASSIFIER`` (or 3): return a classifier trained with the optimal parameters on the entire data set. This is the default setting.
       
        Regardless of this, the learner (given as parameter ``learner``) is
        left set to the optimal parameters.
   
    .. attribute:: verbose
   
        If 0 (default), the class doesn't print anything. If set to 1, it will
        print out the optimal value found, if set to 2, it will print out all
        tried values and the related
   
    If tuner returns the classifier, it behaves as a learning algorithm. As the
    examples below will demonstrate, it can be called, given the data and
    the result is a "trained" classifier. It can, for instance, be used in
    cross-validation.

    Out of these attributes, the only necessary argument is ``learner``. The
    real tuning classes (subclasses of this class) add two additional -
    the attributes that tell what parameter(s) to optimize and which values
    to use.
   
    """

    # Symbolic values for the `return_what` attribute.
    RETURN_NONE = 0
    RETURN_PARAMETERS = 1
    RETURN_LEARNER = 2
    RETURN_CLASSIFIER = 3

    # Deprecated camelCase aliases kept for backward compatibility.
    returnNone = \
        deprecated_class_attribute("returnNone", "RETURN_NONE")
    returnParameters = \
        deprecated_class_attribute("returnParameters", "RETURN_PARAMETERS")
    returnLearner = \
        deprecated_class_attribute("returnLearner", "RETURN_LEARNER")
    returnClassifier = \
        deprecated_class_attribute("returnClassifier", "RETURN_CLASSIFIER")

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **argkw):
        """When `data` is given, construct, configure and immediately call
        the tuner, returning its result (usually a classifier); otherwise
        return the unconfigured learner instance as usual.
        """
        self = Orange.classification.Learner.__new__(cls, **argkw)
        # `is not None` (not a truth test) so that an empty data table of
        # length 0 still triggers the immediate-call path.
        if data is not None:
            for name, value in argkw.items():
                setattr(self, name, value)
            self.__init__(**argkw)
            return self.__call__(data, weight_id)
        else:
            return self

    def findobj(self, name):
        """Resolve the dotted parameter path `name` relative to
        ``self.learner``.

        Returns a ``(holder, attribute_name)`` pair such that
        ``setattr(holder, attribute_name, value)`` sets the tuned parameter.
        """
        # str.split replaces the deprecated `string.split` function
        # (the `string` module's function aliases were removed in Python 3).
        names = name.split(".")
        lastobj = self.learner
        for component in names[:-1]:
            lastobj = getattr(lastobj, component)
        return lastobj, names[-1]

TuneParameters = deprecated_members(
    {"returnWhat": "return_what",
     "object": "learner"},
    )(TuneParameters)
[10302]117
118
class Tune1Parameter(TuneParameters):

    """Class :obj:`Orange.optimization.Tune1Parameter` tunes a single parameter.
   
    .. attribute:: parameter
   
        The name of the parameter (or a list of names, if the same parameter is
        stored at multiple places - see the examples) to be tuned.
   
    .. attribute:: values
   
        A list of parameter's values to be tried.
   
    To show how it works, we shall fit the minimal number of examples in a leaf
    for a tree classifier.
   
    part of :download:`optimization-tuning1.py <code/optimization-tuning1.py>`

    .. literalinclude:: code/optimization-tuning1.py
        :lines: 3-11

    Set up like this, when the tuner is called, set ``learner.min_subset`` to
    1, 2, 3, 4, 5, 10, 15 and 20, and measure the AUC in 5-fold cross
    validation. It will then reset the learner.minSubset to the optimal value
    found and, since we left ``return_what`` at the default
    (``RETURN_CLASSIFIER``), construct and return the classifier from the
    entire data set. So, what we get is a  classifier, but if we'd also like
    to know what the optimal value was, we can get it from
    ``learner.min_subset``.

    Tuning is of course not limited to setting numeric parameters. You can, for
    instance, try to find the optimal criteria for assessing the quality of
    attributes by tuning ``parameter="measure"``, trying settings like
    ``values=[Orange.feature.scoring.GainRatio(), Orange.feature.scoring.Gini()]``
   
    Since the tuner returns a classifier and thus behaves like a learner, it
    can be used in a cross-validation. Let us see whether a tuning tree indeed
    enhances the AUC or not. We shall reuse the tuner from above, add another
    tree learner, and test them both.
   
    part of :download:`optimization-tuning1.py <code/optimization-tuning1.py>`

    .. literalinclude:: code/optimization-tuning1.py
        :lines: 13-18
   
    This can be time consuming: for each of 8 values for ``min_subset`` it will
    perform 5-fold cross validation inside a 10-fold cross validation -
    altogether 400 trees. Plus, it will learn the optimal tree afterwards for
    each fold. Adding a tree without tuning, that makes 420 trees build in
    total.
   
    Nevertheless, results are good::
   
        Untuned tree: 0.930
        Tuned tree: 0.986
   
    """

    def __call__(self, data, weight=None, verbose=0):
        # Tune `self.parameter` over `self.values` by internal
        # cross-validation on `data`, then return whatever `return_what`
        # requests (None / best value / tuned learner / trained classifier).
        # Instance attributes override the defaults fetched here.
        verbose = verbose or getattr(self, "verbose", 0)
        evaluate = getattr(self, "evaluate", Orange.evaluation.scoring.CA)
        folds = getattr(self, "folds", 5)
        compare = getattr(self, "compare", cmp)
        # NOTE(review): `compare` is read here but never forwarded to
        # BestOnTheFly below, so a custom compare function is silently
        # ignored -- TODO confirm whether it should be passed through.
        return_what = getattr(self, "return_what",
                             Tune1Parameter.RETURN_CLASSIFIER)

        # `parameter` may name one location or several; either way, build a
        # list of (holder, attribute_name) pairs to set on every trial.
        if (type(self.parameter) == list) or (type(self.parameter) == tuple):
            to_set = [self.findobj(ld) for ld in self.parameter]
        else:
            to_set = [self.findobj(self.parameter)]

        # One fixed CV split is reused for every candidate value so that
        # scores are comparable across trials.
        cvind = Orange.core.MakeRandomIndicesCV(data, folds)
        findBest = Orange.utils.selection.BestOnTheFly(seed=data.checksum(),
                                         call_compare_on_1st=True)
        # Truthiness idiom: with weight == None/0 the plain table is passed,
        # otherwise a (table, weight) pair.
        tableAndWeight = weight and (data, weight) or data
        for par in self.values:
            # Mutate the learner in place, then score it on the shared folds.
            for i in to_set:
                setattr(i[0], i[1], par)
            res = evaluate(Orange.evaluation.testing.test_with_indices(
                                        [self.learner], tableAndWeight, cvind))
            findBest.candidate((res, par))
            if verbose == 2:
                print '*** optimization  %s: %s:' % (par, ", ".join("%.8f" % r for r in res))

        # Leave the learner set to the winning value regardless of
        # `return_what` (documented contract of TuneParameters).
        bestpar = findBest.winner()[1]
        for i in to_set:
            setattr(i[0], i[1], bestpar)

        if verbose:
            print "*** Optimal parameter: %s = %s" % (self.parameter, bestpar)

        if return_what == Tune1Parameter.RETURN_NONE:
            return None
        elif return_what == Tune1Parameter.RETURN_PARAMETERS:
            return bestpar
        elif return_what == Tune1Parameter.RETURN_LEARNER:
            return self.learner
        else:
            # RETURN_CLASSIFIER: retrain on the full data set with the
            # optimal parameter and record it on the classifier.
            classifier = self.learner(data)
            if not Orange.utils.environ.orange_no_deprecated_members:
                classifier.setattr("fittedParameter", bestpar)
            classifier.setattr("fitted_parameter", bestpar)
            return classifier
222
223class TuneMParameters(TuneParameters):
[10302]224
[8042]225    """The use of :obj:`Orange.optimization.TuneMParameters` differs from
226    :obj:`Orange.optimization.Tune1Parameter` only in specification of tuning
227    parameters.
228   
229    .. attribute:: parameters
230   
231        A list of two-element tuples, each containing the name of a parameter
232        and its possible values.
233   
[10077]234    For example we can try to tune both the minimal number of instances in
235    leaves and the splitting criteria by setting the tuner as follows:
[8042]236   
[9349]237    :download:`optimization-tuningm.py <code/optimization-tuningm.py>`
[8042]238
239    .. literalinclude:: code/optimization-tuningm.py
240   
241    """
[10302]242
[10077]243    def __call__(self, data, weight=None, verbose=0):
[8042]244        evaluate = getattr(self, "evaluate", Orange.evaluation.scoring.CA)
245        folds = getattr(self, "folds", 5)
246        compare = getattr(self, "compare", cmp)
247        verbose = verbose or getattr(self, "verbose", 0)
[10633]248        return_what = getattr(self, "return_what", Tune1Parameter.RETURN_CLASSIFIER)
[10077]249        progress_callback = getattr(self, "progress_callback", lambda i: None)
[10302]250
[8042]251        to_set = []
252        parnames = []
253        for par in self.parameters:
[10302]254            if (type(par[0]) == list) or (type(par[0]) == tuple):
[8042]255                to_set.append([self.findobj(ld) for ld in par[0]])
256                parnames.append(par[0])
257            else:
258                to_set.append([self.findobj(par[0])])
259                parnames.append([par[0]])
260
261
[10077]262        cvind = Orange.core.MakeRandomIndicesCV(data, folds)
[10654]263        findBest = Orange.utils.selection.BestOnTheFly(seed=data.checksum(),
[10633]264                                         call_compare_on_1st=True)
[10077]265        tableAndWeight = weight and (data, weight) or data
[8042]266        numOfTests = sum([len(x[1]) for x in self.parameters])
267        milestones = set(range(0, numOfTests, max(numOfTests / 100, 1)))
[10582]268        for itercount, valueindices in enumerate(Orange.utils.counters.LimitedCounter(\
[8042]269                                        [len(x[1]) for x in self.parameters])):
[10302]270            values = [self.parameters[i][1][x] for i, x \
[8042]271                      in enumerate(valueindices)]
272            for pi, value in enumerate(values):
273                for i, par in enumerate(to_set[pi]):
274                    setattr(par[0], par[1], value)
[10302]275                    if verbose == 2:
[8042]276                        print "%s: %s" % (parnames[pi][i], value)
[10302]277
[8042]278            res = evaluate(Orange.evaluation.testing.test_with_indices(
[10633]279                                        [self.learner], tableAndWeight, cvind))
[8042]280            if itercount in milestones:
[10077]281                progress_callback(100.0 * itercount / numOfTests)
[10302]282
[8042]283            findBest.candidate((res, values))
[10302]284            if verbose == 2:
[8042]285                print "===> Result: %s\n" % res
286
287        bestpar = findBest.winner()[1]
288        if verbose:
289            print "*** Optimal set of parameters: ",
290        for pi, value in enumerate(bestpar):
291            for i, par in enumerate(to_set[pi]):
292                setattr(par[0], par[1], value)
293                if verbose:
294                    print "%s: %s" % (parnames[pi][i], value),
295        if verbose:
296            print
297
[10633]298        if return_what == Tune1Parameter.RETURN_NONE:
[8042]299            return None
[10633]300        elif return_what == Tune1Parameter.RETURN_PARAMETERS:
[8042]301            return bestpar
[10633]302        elif return_what == Tune1Parameter.RETURN_LEARNER:
303            return self.learner
[8042]304        else:
[10633]305            classifier = self.learner(data)
[10580]306            if Orange.utils.environ.orange_no_deprecated_members:
[10077]307                classifier.fittedParameters = bestpar
308            classifier.fitted_parameters = bestpar
[8042]309            return classifier
[10302]310
[10077]311TuneMParameters = deprecated_members(
312    {"progressCallback": "progress_callback"},
313    )(TuneMParameters)
[8042]314
class ThresholdLearner(Orange.classification.Learner):

    """:obj:`Orange.optimization.ThresholdLearner` is a class that wraps
    another learner. When given the data, it calls the wrapped learner to build
    a classifier, than it uses the classifier to predict the class
    probabilities on the training instances. Storing the probabilities, it
    computes the threshold that would give the optimal classification accuracy.
    Then it wraps the classifier and the threshold into an instance of
    :obj:`Orange.optimization.ThresholdClassifier`.

    Note that the learner doesn't perform internal cross-validation. Also, the
    learner doesn't work for multivalued classes.

    :obj:`Orange.optimization.ThresholdLearner` has the same interface as any
    learner: if the constructor is given data, it returns a classifier,
    else it returns a learner. It has two attributes.
   
    .. attribute:: learner
   
        The wrapped learner, for example an instance of
        :obj:`Orange.classification.bayes.NaiveLearner`.
   
    .. attribute:: store_curve
   
        If `True`, the resulting classifier will contain an attribute curve, with
        a list of tuples containing thresholds and classification accuracies at
        that threshold (default `False`).
   
    """

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **kwds):
        """When `data` is given, build and immediately apply the learner,
        returning the resulting classifier; otherwise return the learner.
        """
        self = Orange.classification.Learner.__new__(cls, **kwds)
        if data is not None:
            # Fixed: this called self.__init__(**kwargs), but the keyword
            # dict is named `kwds` -- the old code raised NameError whenever
            # data was passed to the constructor.
            self.__init__(**kwds)
            return self.__call__(data, weight_id)
        else:
            return self

    @deprecated_keywords({"storeCurve": "store_curve"})
    def __init__(self, learner=None, store_curve=False, **kwds):
        self.learner = learner
        self.store_curve = store_curve
        # Any extra keyword arguments become instance attributes.
        for name, value in kwds.items():
            setattr(self, name, value)

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __call__(self, data, weight_id=0):
        """Train the wrapped learner on `data`, find the accuracy-optimal
        probability threshold on the training data and return a
        :obj:`ThresholdClassifier` wrapping both.

        :raises AttributeError: if no wrapped learner has been set.
        """
        if self.learner is None:
            raise AttributeError("Learner not set.")

        classifier = self.learner(data, weight_id)
        threshold, optCA, curve = Orange.wrappers.ThresholdCA(classifier,
                                                          data,
                                                          weight_id)
        # Attach the full (threshold, CA) curve only on request.
        if self.store_curve:
            return ThresholdClassifier(classifier, threshold, curve=curve)
        else:
            return ThresholdClassifier(classifier, threshold)

ThresholdLearner = deprecated_members(
    {"storeCurve": "store_curve"},
    wrap_methods=["__init__"]
    )(ThresholdLearner)
[10302]379
class ThresholdClassifier(Orange.classification.Classifier):

    """A classifier wrapper that applies a fixed probability threshold.

    Used by both :obj:`Orange.optimization.ThresholdLearner` and
    :obj:`Orange.optimization.ThresholdLearner_fixed`: it holds a wrapped
    classifier together with a threshold, asks the wrapped classifier for
    class probabilities, and predicts the second class exactly when its
    probability exceeds the threshold.

    .. attribute:: classifier
   
        The wrapped classifier, normally the one related to the ThresholdLearner's
        learner, e.g. an instance of
        :obj:`Orange.classification.bayes.NaiveLearner`.
   
    .. attribute:: threshold
   
        The threshold for classification into the second class.
   
    The two attributes can be specified set as attributes or given to the
    constructor as ordinary arguments.
   
    """

    def __init__(self, classifier, threshold, **kwds):
        self.classifier = classifier
        self.threshold = threshold
        # Extra keyword arguments (e.g. `curve`) become instance attributes.
        for attr, val in kwds.items():
            setattr(self, attr, val)

    def __call__(self, instance, what=Orange.classification.Classifier.GetValue):
        # Always ask the wrapped classifier for the probability distribution;
        # the predicted value is derived from it via the threshold.
        distribution = self.classifier(instance, self.GetProbabilities)
        if what == self.GetProbabilities:
            return distribution
        # Second class wins exactly when its probability exceeds the threshold.
        predicted = Orange.data.Value(self.classifier.classVar,
                                      distribution[1] > self.threshold)
        if what == Orange.classification.Classifier.GetValue:
            return predicted
        return (predicted, distribution)
[10302]421
422
class ThresholdLearner_fixed(Orange.classification.Learner):
    """ This is a convinience  variant of
    :obj:`Orange.optimization.ThresholdLearner`. Instead of finding the
    optimal threshold it uses a prescribed one. It has the following two
    attributes.
   
    .. attribute:: learner
   
        The wrapped learner, for example an instance of
        :obj:`~Orange.classification.bayes.NaiveLearner`.
   
    .. attribute:: threshold
   
        Threshold to use in classification.
   
    This class calls its base learner and puts the resulting classifier
    together with the threshold into an instance of :obj:`ThresholdClassifier`.
   
    """
    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __new__(cls, data=None, weight_id=0, **kwds):
        """When `data` is given, build and immediately apply the learner,
        returning the resulting classifier; otherwise return the learner.
        """
        self = Orange.classification.Learner.__new__(cls, **kwds)
        if data is not None:
            self.__init__(**kwds)
            return self.__call__(data, weight_id)
        else:
            return self

    def __init__(self, learner=None, threshold=None, **kwds):
        self.learner = learner
        self.threshold = threshold
        # Fixed: this called setattr(name, value) -- missing the `self`
        # target -- which raised TypeError for any extra keyword argument.
        for name, value in kwds.items():
            setattr(self, name, value)

    @deprecated_keywords({"examples": "data", "weightID": "weight_id"})
    def __call__(self, data, weight_id=0):
        """Train the wrapped learner and wrap the result together with the
        prescribed threshold into a :obj:`ThresholdClassifier`.

        :raises AttributeError: if the learner or the threshold is not set.
        :raises ValueError: if the class attribute of `data` is not binary.
        """
        if self.learner is None:
            raise AttributeError("Learner not set.")
        if self.threshold is None:
            raise AttributeError("Threshold not set.")
        if len(data.domain.classVar.values) != 2:
            raise ValueError("ThresholdLearner handles binary classes only.")

        return ThresholdClassifier(self.learner(data, weight_id),
                                   self.threshold)
468
class PreprocessedLearner(object):
    """Chain one or more data preprocessors in front of a learner.

    Constructed with a preprocessor (or a list of them) alone, it acts as a
    factory whose :obj:`wrapLearner` wraps any learner so that training data
    is run through the preprocessors first. Constructed with both a
    preprocessor and a learner, it immediately returns the wrapped learner.
    """

    def __new__(cls, preprocessor=None, learner=None):
        instance = object.__new__(cls)
        if learner is None:
            # Normal construction: __init__ will run afterwards as usual.
            return instance
        # Shortcut: configure and hand back the wrapped learner directly.
        instance.__init__(preprocessor)
        return instance.wrapLearner(learner)

    def __init__(self, preprocessor=None, learner=None):
        # Normalize the preprocessor argument to a list (possibly empty).
        if isinstance(preprocessor, list):
            self.preprocessors = preprocessor
        elif preprocessor is not None:
            self.preprocessors = [preprocessor]
        else:
            self.preprocessors = []
        if learner:
            self.wrapLearner(learner)

    def processData(self, data, weightId=None):
        """Run `data` through every preprocessor in order.

        Returns the processed data, or a ``(data, weightId)`` pair when a
        weight id was supplied. A preprocessor may return either form; once
        one returns a pair, subsequent preprocessors receive the weight too.
        """
        hadWeight = hasWeight = weightId is not None
        for step in self.preprocessors:
            outcome = step(data, weightId) if hasWeight else step(data)
            if isinstance(outcome, tuple):
                data, weightId = outcome
                hasWeight = True
            else:
                data = outcome
        # Mirror the caller's calling convention: pair in, pair out.
        return (data, weightId) if hadWeight else data

    def wrapLearner(self, learner):
        """Return a learner that preprocesses its training data before
        delegating to `learner`; unknown attributes fall through to it.
        """
        class WrappedLearner(learner.__class__):
            preprocessor = self
            wrappedLearner = learner
            name = getattr(learner, "name", "")

            def __call__(self, data, weightId=0, getData=False):
                outcome = self.preprocessor.processData(data, weightId or 0)
                if isinstance(outcome, tuple):
                    processed, procW = outcome
                else:
                    processed, procW = outcome, 0
                classifier = self.wrappedLearner(processed, procW)
                return (classifier, processed) if getData else classifier

            def __reduce__(self):
                # Pickle as a fresh PreprocessedLearner over the same parts.
                return PreprocessedLearner, (self.preprocessor.preprocessors,
                                             self.wrappedLearner)

            def __getattr__(self, attr):
                return getattr(learner, attr)

        return WrappedLearner()
Note: See TracBrowser for help on using the repository browser.