source: orange/Orange/regression/earth.py @ 10896:9e5b3b2fa757

Revision 10896:9e5b3b2fa757, 49.3 KB checked in by Ales Erjavec <ales.erjavec@…>, 23 months ago (diff)

Speed up the earth basis term domain translation.

Line 
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`: http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
16.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
17
18Example ::
19
20    >>> import Orange
21    >>> data = Orange.data.Table("housing")
22    >>> c = Orange.regression.earth.EarthLearner(data, degree=2, terms=10)
23    >>> print c
24    MEDV =
25       23.587
26       +11.896 * max(0, RM - 6.431)
27       +1.142 * max(0, 6.431 - RM)
28       -0.612 * max(0, LSTAT - 6.120)
29       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
30       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
31       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
32
33
34.. autoclass:: EarthLearner
35    :members:
36
37.. autoclass:: EarthClassifier
38    :members:
39
40
41Utility functions
42-----------------
43
44.. autofunction:: gcv
45
46.. autofunction:: plot_evimp
47
48.. autofunction:: bagged_evimp
49
50.. autoclass:: ScoreEarthImportance
51
52"""
53
54import Orange
55from Orange.feature import Discrete, Continuous
56from Orange.data import Table, Domain
57from Orange.data.preprocess import Continuize as Preprocessor_continuize, \
58                              Impute as Preprocessor_impute, \
59                              PreprocessorList as Preprocessor_preprocessorList, \
60                              DomainContinuizer
61
62import numpy
63
def is_discrete(var):
    """Return True if ``var`` is an :class:`Orange.feature.Discrete` variable."""
    return isinstance(var, Discrete)
66
def is_continuous(var):
    """Return True if ``var`` is an :class:`Orange.feature.Continuous` variable."""
    return isinstance(var, Continuous)
69
def expand_discrete(var):
    """ Expand a discrete variable ``var`` returning one continuous indicator
    variable for each value of ``var`` (if the number of values is greater
    than 2, else return only one indicator variable).

    """
    if len(var.values) > 2:
        values = var.values
    elif len(var.values) == 2:
        # A binary variable needs only one indicator (for the last value).
        values = var.values[-1:]
    else:
        # Degenerate single-valued (or empty) variable.
        values = var.values[:1]
    new_vars = []
    for value in values:
        new = Continuous("{0}={1}".format(var.name, value))
        # Each indicator is derived from `var` through a 0/1 transformer
        # that fires when `var` takes `value`.
        new.get_value_from = cls = Orange.core.ClassifierFromVar(whichVar=var)
        cls.transformer = Orange.core.Discrete2Continuous()
        cls.transformer.value = int(Orange.core.Value(var, value))
        new.source_variable = var
        new_vars.append(new)
    return new_vars
91
def select_attrs(table, features, class_var=None,
                 class_vars=None, metas=None):
    """ Return a new :class:`Orange.data.Table` containing only ``features``
    (plus the optional ``class_var``/``class_vars``/``metas``) selected
    from ``table``.
    """
    if class_vars is None:
        domain = Domain(features, class_var)
    else:
        domain = Domain(features, class_var, class_vars=class_vars)
    if metas:
        domain.add_metas(metas)
    return Table(domain, table)
103   
class EarthLearner(Orange.regression.base.BaseRegressionLearner):
    """Earth learner class. Supports both regression and classification
    problems. For classification, class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is fit to these
    new columns. The resulting classifier then computes response
    values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        # Orange learner convention: calling the class with data both
        # constructs the learner and immediately fits a classifier.
        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
        if instances is not None:
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty= None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                **kwds):
        """Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model (default: 1).
        :type degree: int
        :param terms: Maximum number of terms in the forward pass
                (default: 21).  If set to ``None``, ``min(200, max(20, 2
                * n_attributes)) + 1`` will be used, like the default
                setting in earth R package.
        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). Default is 3.0 if ``degree`` is above 1,
            and 2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default: 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default: 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default: ``None``, no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default:
            ``True``); ignored for models with multiple responses.
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default: ``True``).
        :type store_instances: bool

        .. todo:: min_span, prunning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """

        super(EarthLearner, self).__init__()

        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Defaults match the earth R package: 3 for interaction
            # models, 2 for additive (degree 1) models.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.__dict__.update(kwds)

        # The class column is handled separately (expanded by
        # expand_discrete below), so the continuizer must leave it alone.
        self.continuizer.class_treatment = DomainContinuizer.Ignore

    def __call__(self, instances, weight_id=None):
        """Fit an Earth model to ``instances`` and return an
        :class:`EarthClassifier`.  ``weight_id`` is accepted for API
        compatibility but instance weights are not used by the C
        forward pass wrapper (it passes unit weights).
        """
        expanded_class = None
        multitarget = False

        if instances.domain.class_var:
            instances = self.impute_table(instances)
            instances = self.continuize_table(instances)

            if is_discrete(instances.domain.class_var):
                # Expand a discrete class with indicator columns
                expanded_class = expand_discrete(instances.domain.class_var)
                y_table = select_attrs(instances, expanded_class)
                (y, ) = y_table.to_numpy_MA("A")
                (x, ) = instances.to_numpy_MA("A")
            elif is_continuous(instances.domain.class_var):
                x, y, _ = instances.to_numpy_MA()
                # The solver expects a 2D response matrix (n_cases x 1).
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")
        elif instances.domain.class_vars:
            # Multi-target domain
            if not all(map(is_continuous, instances.domain.class_vars)):
                raise TypeError("Only continuous multi-target classes are supported.")
            x_table = select_attrs(instances, instances.domain.attributes)
            y_table = select_attrs(instances, instances.domain.class_vars)

            # Impute and continuize only the x_table
            x_table = self.impute_table(x_table)
            x_table = self.continuize_table(x_table)
            # NOTE(review): `domain` is built but never used afterwards;
            # looks like leftover code — confirm before removing.
            domain = Domain(x_table.domain.attributes,
                            class_vars=instances.domain.class_vars)

            (x, ) = x_table.to_numpy_MA("A")
            (y, ) = y_table.to_numpy_MA("A")

            multitarget = True
        else:
            raise ValueError("Class variable expected.")

        # Standardize a single response if requested (multi-response
        # models are fit on the raw responses).
        if self.scale_resp and y.shape[1] == 1:
            sy = y - numpy.mean(y, axis=0)
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               multitarget=multitarget,
                               expanded_class=expanded_class
                               )
258
259
def soft_max(values):
    """Return the softmax of ``values`` as a numpy array.

    Subtracting the maximum before exponentiating makes the computation
    numerically stable: ``exp`` of a large value would otherwise overflow
    to ``inf`` and yield NaN results.  The subtraction does not change the
    mathematical result since softmax is shift-invariant.
    """
    values = numpy.asarray(values, dtype=float)
    exps = numpy.exp(values - numpy.max(values))
    return exps / numpy.sum(exps)
263
264
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.

    Holds the fitted MARS model: the hinge-term definitions (``dirs``,
    ``cuts``), the pruning mask (``best_set``) and the fitted
    coefficients (``betas``, one row per response).
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 multitarget=False, expanded_class=None,
                 original_domain=None, **kwargs):
        self.multitarget = multitarget
        self.domain = domain
        self.class_var = domain.class_var
        if self.multitarget:
            self.class_vars = domain.class_vars

        # best_set: boolean mask over terms kept after the pruning pass.
        self.best_set = best_set
        # dirs/cuts: per-term hinge directions (-1, 0, 1 or 2) and cut
        # points, one column per attribute.
        self.dirs = dirs
        self.cuts = cuts
        # betas: fitted coefficients, shape (n_responses, n_used_terms).
        self.betas = betas
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        # expanded_class: indicator variables for a discrete class
        # (see expand_discrete), or None for a continuous class.
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """Return the predicted value and/or class distribution for
        ``instance``, depending on ``result_type``.
        """
        if self.multitarget and self.domain.class_vars:
            resp_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            # A discrete class was fit as a set of indicator responses.
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multitarget and is_discrete(self.class_var):
            # Map the indicator responses back to a distribution over
            # the original discrete class values; predict the mode.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                # A binary class has a single indicator for the last value.
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous response(s): a point distribution per response.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        if not self.multitarget:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied, the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        instances = select_attrs(instances, self.domain.attributes)
        (data,) = instances.to_numpy_MA("A")
        # Delegates to the module-level base_matrix function.
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def base_features(self):
        """Return a list of features for the included Earth terms.
        The attributes can be used in Orange's domain translation
        (i.e. they define the proper ``get_value_from`` functions).

        """
        terms = []
        dirs = self.dirs[self.best_set]
        cuts = self.cuts[self.best_set]

        for dir, cut in zip(dirs[1:], cuts[1:]):  # Drop the intercept (first column)
            # Human-readable name, e.g. "max(0, RM - 6.431)".
            hinge = [_format_knot(self, attr.name, dir1, cut1) \
                     for (attr, dir1, cut1) in \
                     zip(self.domain.attributes, dir, cut) \
                     if dir1 != 0.0]
            term_name = " * ".join(hinge)
            term = Orange.feature.Continuous(term_name)
            # term_computer evaluates the term value from raw instances.
            term.get_value_from = term_computer(term, self.domain, dir, cut)
            terms.append(term)
        return terms

    def predict(self, instance):
        """ Predict the response value(s)

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """Return the used terms for term (index). If no term is given,
        return all attributes in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of attributes over all terms.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())

        attrs = self.domain.attributes

        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """ 
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle protocol: only the required constructor arguments are
        # passed positionally; everything else is restored from __dict__.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
421
422"""
423Utility functions
424-----------------
425"""
426   
def base_matrix(data, best_set, dirs, cuts):
    """ Return the base matrix for the earth model.

    :param data: Input data
    :type data: :class:`numpy.ndarray`

    :param best_set: A array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs members
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts members
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_cases = data.shape[0]
    n_terms = best_set.shape[0]

    bx = numpy.zeros((n_cases, n_terms))
    bx[:, 0] = 1.0  # The intercept term is constant 1.

    for term in range(1, n_terms):
        term_dirs = dirs[term]
        term_cuts = cuts[term]

        # Collect one factor column per attribute entering the term:
        # dir == 1  -> max(0, x - cut)
        # dir == -1 -> max(0, cut - x)
        # dir == 2  -> the attribute enters linearly
        factors = []
        for pred in numpy.where(term_dirs == 1)[0]:
            h = data[:, pred] - term_cuts[pred]
            factors.append(numpy.where(h > 0.0, h, 0.0))
        for pred in numpy.where(term_dirs == -1)[0]:
            h = term_cuts[pred] - data[:, pred]
            factors.append(numpy.where(h > 0.0, h, 0.0))
        for pred in numpy.where(term_dirs == 2)[0]:
            factors.append(data[:, pred])

        if factors:
            # The term value is the product of its factors.
            bx[:, term] = numpy.prod(numpy.column_stack(factors), axis=1)
        else:
            bx[:, term] = 0.0

    return bx
470
471   
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective parameters.

    """
    # Divide by float(n) so integer arguments do not silently truncate
    # under Python 2 integer division (n_effective_params may be either
    # a scalar or a numpy array).
    return rss / (n * (1.0 - n_effective_params / float(n)) ** 2)
483
484
class term_computer(Orange.core.ClassifierFD):
    """A utility class for computing basis terms. Can be used as
    a :obj:`~Orange.feature.Descriptor.get_value_from` member.

    """
    def __init__(self, term_var=None, domain=None, dir=None, cut=None):
        # term_var: the Continuous feature this computer produces.
        # domain: domain used to translate incoming instances.
        # dir/cut: the term's hinge directions and cut points (one
        #          entry per attribute in `domain`).
        self.class_var = term_var
        self.domain = domain
        self.dir = dir
        self.cut = cut

    def __call__(self, instance, return_what=Orange.core.GetValue):
        # Translate the instance into the model domain and extract its
        # attribute values as a (masked) numpy row.
        instance = Orange.data.Table(self.domain, [instance])
        (instance,) = instance.to_numpy_MA("A")
        instance = instance[0]

        # Only attributes actually participating in the term matter.
        mask = self.dir != 0
        dir = self.dir[mask]
        cut = self.cut[mask]

        # Signed distance from the cut; dir flips the hinge orientation.
        values = instance[mask] - cut
        values *= dir

        # Hinge: negative parts are clipped to zero; the term value is
        # the product of the hinge factors (masked entries count as 0).
        values = numpy.where(values > 0, values, 0)
        value = numpy.prod(values.filled(0))

        # Return unknown ("?") if the computation is masked out entirely.
        return self.class_var(value if value is not numpy.ma.masked else "?")
512
513
514"""
515Multi-label utility functions
516"""
517
518
519"""
520ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
521"""
522
import ctypes
from numpy import ctypeslib
import orange

# The Earth C routines are compiled into the orange extension module
# itself, so load that shared library and look up the symbols directly.
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# Argument specification for EarthForwardPass (the C forward pass from
# the Earth R package).  All matrices must be Fortran-contiguous because
# the C code uses column-major indexing.
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  #pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  #FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    #Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nPred
     ctypes.c_int,  #nMaxDegree
     ctypes.c_int,  #nMaxTerms
     ctypes.c_double,   #Penalty
     ctypes.c_double,   #Thresh
     ctypes.c_int,  #nFastK
     ctypes.c_double,   #FastBeta
     ctypes.c_double,   #NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #LinPreds
     ctypes.c_bool, #UseBetaCache
     ctypes.c_char_p    #sPredNames
     ]
555   
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Do earth forward pass.

    Wraps the C ``EarthForwardPass`` routine.  Returns a tuple
    ``(n_terms, full_set, bx, dirs, cuts)`` where ``full_set`` is a
    boolean mask of the terms actually selected, ``bx`` is the basis
    matrix and ``dirs``/``cuts`` describe the hinge terms.

    :param x: Predictor matrix (n_cases x n_preds).
    :param y: Response matrix (n_cases x n_resp).
    :raises ValueError: if the inputs violate the C routine's
        preconditions (see the checks below).
    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the
    # C function calls exit(), which would take the whole process down.
    # So we must check here and raise an exception instead.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds, 
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
623
624
# Binding for the C pruning routine EarthEvalSubsetsUsingXtx.
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   #PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   #RssVec
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  #WeightsArg
     ]

# Non-zero return value signals linearly dependent terms in bx
# (see subset_selection_xtx below).
_c_eval_subsets_xtx.restype = ctypes.c_int
639
def subset_selection_xtx(X, Y):
    """ Subsets selection using EvalSubsetsUsingXtx in the Earth package.

    Returns ``(subsets_ind, rss_vec)`` where row ``i`` of ``subsets_ind``
    lists the ``i + 1`` term indices of the best subset of that size and
    ``rss_vec[i]`` is its residual sum of squares.

    :raises numpy.linalg.LinAlgError: if the C routine reports linearly
        dependent terms in ``X``.
    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    var_count = X.shape[1]
    resp_count = Y.shape[1]
    cases = X.shape[0]
    # Output buffers for the C routine (column-major as it expects).
    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
                              order="F")
    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")

    rval = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count, var_count,
                        X, Y, weights)
    if rval != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert the boolean subset masks (one column per subset size)
    # into index rows: row i holds the i + 1 used term indices.
    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
    for i, used in enumerate(subsets.T):
        subsets_ind[i, :i + 1] = numpy.where(used)[0]

    return subsets_ind, rss_vec
669   
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package.
    Using numpy.linalg.lstsq

    Performs backward elimination: starting from all terms, repeatedly
    drops the term whose removal increases the residual sum of squares
    the least (the intercept, term 0, is always kept).

    Returns ``(subsets, rss_vec)`` where row ``i`` of ``subsets`` lists
    the ``i + 1`` term indices of the selected subset of that size and
    ``rss_vec[i]`` is its residual sum of squares.
    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # Materialize as a list: `del working_set[i]` below requires a
    # mutable sequence (a bare range object breaks under Python 3).
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            rss_vec[subset_size] = numpy.sum(res)
        else:
            # lstsq returns empty residuals for rank-deficient systems;
            # compute the RSS explicitly in that case.
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # delta_rss[j] estimates the RSS increase from dropping term j.
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
704   
def subset_selection_xtx2(X, Y):
    """ Another implementation (this uses qr decomp).

    Backward elimination like :func:`subset_selection_xtx_numpy`, but
    fits each subset via the QR decomposition helpers in
    :mod:`Orange.misc.linalg`.  Rank-deficient columns detected by the
    pivoted QR are dropped up front and reported with infinite RSS.
    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    # Materialize as a list: `del working_set[i]` below requires a
    # mutable sequence (a bare range object breaks under Python 3).
    working_set = list(range(col_count))
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # delta_rss[j] estimates the RSS increase from dropping term j.
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
741   
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Do pruning pass

    .. todo:: pruned_terms, Leaps

    """
    # Prefer the fast C subset selection; fall back to the numpy
    # implementation when it reports linearly dependent terms.
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    n_cases, n_terms = bx.shape

    # Effective parameter count per subset size, with the GCV penalty
    # applied to each added (non-intercept) term.
    effective_params = numpy.arange(n_terms) + 1.0
    effective_params += penalty * (effective_params - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, n_cases, effective_params)

    # Keep the subset with the smallest GCV score.
    best_size = numpy.argmin(gcv_vec)
    used = numpy.zeros((n_terms), dtype=bool)
    used[subsets[best_size, :best_size + 1]] = True

    return used, subsets, rss_vec, gcv_vec
765
766"""
767Printing functions.
768"""
769   
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model.

    :param model: A fitted :class:`EarthClassifier`.
    :param percision: Number of decimal places in printed coefficients.
    :param indent: Indentation (spaces) for each term line.
    """
    if model.multitarget:
        r_vars = list(model.domain.class_vars)
    elif is_discrete(model.class_var):
        r_vars = model.expanded_class
    else:
        r_vars = [model.class_var]

    r_names = [v.name for v in r_vars]

    # One formatted section per response row of model.betas.
    # (Note: the loop variable is deliberately not named `betas` to
    # avoid shadowing the full coefficient matrix.)
    resp = []
    for name, resp_betas in zip(r_names, model.betas):
        resp.append(_format_response(model, name, resp_betas,
                                     percision, indent))
    return "\n\n".join(resp)
788
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """Format the model terms for a single response as a multi-line
    string: a ``"name ="`` header followed by one indented line per
    term (intercept first, then terms sorted by the attributes used).
    """
    header = "%s =" % resp_name
    indent = " " * indent
    fmt = "%." + str(percision) + "f"
    # Each entry is (attributes used by the term, formatted term text);
    # the first entry is the intercept.
    terms = [([], fmt % betas[0])]
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if used:
            beta_i += 1
            beta = fmt % abs(betas[beta_i])
            # Human-readable hinge factors, e.g. "max(0, RM - 6.431)".
            knots = [_format_knot(model, attr.name, d, c, percision) \
                     for d, c, attr in \
                     zip(model.dirs[i], model.cuts[i], model.domain.attributes) \
                     if d != 0]
            term_attrs = [a for a, d in zip(model.domain.attributes, model.dirs[i]) \
                          if d != 0]
            term_attrs = sorted(term_attrs)
            # The coefficient's sign is shown explicitly; its magnitude
            # was formatted above.
            sign = "-" if betas[beta_i] < 0 else "+"
            if knots:
                terms.append((term_attrs,
                              sign + " * ".join([beta] + knots)))
            else:
                terms.append((term_attrs, sign + beta))
    # Sort by len(term_attrs), then by term_attrs
    terms = sorted(terms, key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [indent + t for _, t in terms])
815       
816def _format_knot(model, name, dir, cut, percision=3):
817    fmt = "%%.%if" % percision
818    if dir == 1:
819        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
820    elif dir == -1:
821        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
822    elif dir == 2:
823        txt = name
824    return txt
825
826   
827"""\
828Variable importance estimation
829------------------------------
830"""
831
832from collections import defaultdict
833
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        # Removed dead local (`svar = None` was never read).
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known transformation chain; the variable is its own source.
            source[var].append(var)
    return dict(source)
853
def map_to_source_var(var, sources):
    """ Map ``var`` to the variable in ``sources`` it was derived from,
    following ``source_variable``/``get_value_from`` links recursively.
    Return ``None`` when no source variable can be found.

    """
    if var in sources:
        return var
    if var.source_variable in sources:
        return var.source_variable
    value_from = var.get_value_from
    if isinstance(value_from, Orange.core.ClassifierFromVar):
        return map_to_source_var(value_from.variable, sources)
    if isinstance(value_from, Orange.core.ImputeClassifier):
        return map_to_source_var(value_from.classifier_from_var.variable,
                                 sources)
    return None
868   
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    :param model: Earth model.
    :type model: `EarthClassifier`

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Per-step decrease of the rss/gcv criteria over the subset sequence.
    rss_drop = -numpy.diff(model.rss_per_subset)
    gcv_drop = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    index_of = dict((attr, i) for i, attr in enumerate(attributes))
    # Columns: attribute index, nsubsets count, gcv score, rss score.
    scores = numpy.zeros((len(attributes), 4))
    scores[:, 0] = numpy.arange(len(attributes))

    for i in range(1, n_subsets):
        in_subset = set()
        for term in subsets[i, :i + 1]:
            in_subset.update(model.used_attributes(term))
        for attr in in_subset:
            row = scores[index_of[attr]]
            row[1] += 1.0
            row[2] += gcv_drop[i - 1]
            row[3] += rss_drop[i - 1]

    lo = numpy.min(scores[:, [2, 3]], axis=0)
    hi = numpy.max(scores[:, [2, 3]], axis=0)

    # Normalize the gcv/rss columns into the [0, 100] range.
    scores[:, [2, 3]] = 100.0 * (scores[:, [2, 3]] - [lo]) / ([hi - lo])

    # Sort rows by nsubsets, then gcv, both descending.
    rows = sorted(list(scores), key=lambda r: (r[1], r[2]), reverse=True)
    scores = numpy.array(rows)

    if used_only:
        scores = scores[scores[:, 1] > 0.0]

    return [(attributes[int(r[0])], tuple(r[1:])) for r in scores]
916
917
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

    ::

        import Orange
        data = Orange.data.Table("housing")
        c = Orange.regression.earth.EarthLearner(data, degree=3)
        Orange.regression.earth.plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the normalized
    RSS and GCV.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Accept a classifier directly and extract the importances from it.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    fig = pylab.figure()
    axes1 = fig.add_subplot(111)
    attrs = [a for a, _ in evimp]
    imp = numpy.array([s for _, s in evimp])
    X = range(len(attrs))
    # Axes.plot returns a *list* of Line2D artists; unpack the single line
    # so legend() below receives proper artist handles (previously lists
    # of lists were passed as legend handles).
    l1, = axes1.plot(X, imp[:, 0], "b-")
    axes2 = axes1.twinx()

    l2, = axes2.plot(X, imp[:, 1], "g-")
    l3, = axes2.plot(X, imp[:, 2], "r-")

    x_axis = axes1.xaxis
    x_axis.set_ticks(X)
    x_axis.set_ticklabels([a.name for a in attrs], rotation=90)

    axes1.yaxis.set_label_text("nsubsets")
    axes2.yaxis.set_label_text("normalized gcv or rss")

    axes1.legend([l1, l2, l3], ["nsubsets", "gcv", "rss"])
    axes1.set_title("Variable importance")
    fig.show()
964
965   
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier

    Example::

        from Orange.ensemble.bagging import BaggedLearner
        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        bagged_evimp(bc)

    """
    def _check_type(obj, cls):
        # Type guard with the same error as before.
        if not isinstance(obj, cls):
            raise TypeError("Instance of %r expected." % (cls))

    from Orange.ensemble.bagging import BaggedClassifier

    _check_type(classifier, BaggedClassifier)
    # Group the per-member scores (and the variable objects) by name.
    scores_by_name = defaultdict(list)
    attrs_by_name = defaultdict(list)
    for member in classifier.classifiers:
        _check_type(member, EarthClassifier)
        for attr, score in evimp(member, used_only=used_only):
            scores_by_name[attr.name].append(score)
            attrs_by_name[attr.name].append(attr)

    # Average the collected score tuples element-wise per variable name.
    averaged = {}
    for name, scores in scores_by_name.items():
        averaged[name] = tuple(numpy.average(scores, axis=0))

    # Rank by (nsubsets, gcv), descending.
    ranked = sorted(averaged.items(),
                    key=lambda item: (item[1][0], item[1][1]),
                    reverse=True)

    result = [(attrs_by_name[name][0], score) for name, score in ranked]

    if used_only:
        result = [(a, s) for a, s in result if s[0] > 0]
    return result
1006
1007"""
1008High level interface for measuring variable importance
1009(compatible with Orange.feature.scoring module).
1010
1011"""
1012from Orange.feature import scoring
1013           
class ScoreEarthImportance(scoring.Score):
    """ A subclass of :class:`Orange.feature.scoring.Score` that
    scores features based on their importance in the Earth
    model using ``bagged_evimp``.

    """
    # Return-type constants: index into an evimp score tuple.
    NSUBSETS = 0
    RSS = 1
    GCV = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scoring convention: constructing with (attr, data)
        # immediately computes and returns the score; constructing with
        # neither returns a reusable scorer instance.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets", cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).

        :param degree: ``degree`` passed to each underlying EarthLearner.

        :param terms: ``terms`` passed to each underlying EarthLearner.

        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.

        :param cached: Stored on the instance.
            NOTE(review): ``self.cached`` is never consulted in
            ``__call__`` — caching is effectively always on; confirm
            intent.

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        if isinstance(score_what, basestring):
            # Map string aliases onto the class constants; an unknown
            # string maps to None and is rejected below.
            score_what = {"nsubsets":self.NSUBSETS, "rss":self.RSS,
                          "gcv":self.GCV}.get(score_what, None)

        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        # Cache of the last (data, evimp) pair; matched by object identity.
        self._cache_ref = None
        self._cached_evimp = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse the bagged importances when called again with the very
        # same data table object.
        ref = self._cache_ref
        if ref is not None and ref is data:
            evimp = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            bc = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            evimp = bagged_evimp(bc, used_only=False)
            self._cache_ref = data
            self._cached_evimp = evimp

        evimp = dict(evimp)
        score = evimp.get(attr, None)

        if score is None:
            # `attr` may be the source variable of one or more
            # preprocessed (continuized/imputed) variables that the
            # model actually used.
            source = collect_source(evimp.keys())
            if attr in source:
                # Return average of source var scores
                return numpy.average([evimp[v][self.score_what] \
                                      for v in source[attr]])
            else:
                raise ValueError("Attribute %r not in the domain." % attr)
        else:
            return score[self.score_what]
1091   
class ScoreRSS(scoring.Score):
    """ Score continuous features by the residual sum of squares drop
    they contribute in a subset selection on the data (see
    ``subset_selection_xtx2``).

    """
    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scoring convention: construct-and-call in one step when
        # both `attr` and `data` are supplied.
        instance = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            instance.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return instance.__call__(attr, data, weight_id)
        elif not attr and not data:
            return instance
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self):
        # Identity-keyed cache of the last scored data table.
        self._cache_data = None
        self._cache_rss = None

    def __call__(self, attr, data, weight_id=None):
        cached = self._cache_data
        if cached is not None and cached is data:
            rss = self._cache_rss
        else:
            x, y = data.to_numpy_MA("1A/c")
            try:
                subsets, rss_per_size = subset_selection_xtx2(x, y)
            except numpy.linalg.LinAlgError:
                # Fall back to the numpy-based implementation.
                subsets, rss_per_size = subset_selection_xtx_numpy(x, y)
            drop = -numpy.diff(rss_per_size)
            # Accumulate each subset step's rss decrease onto the
            # columns participating in that subset.
            rss = numpy.zeros_like(rss_per_size)
            for size in range(1, subsets.shape[0]):
                chosen = subsets[size, :size + 1]
                rss[chosen] += drop[size - 1]
            rss = rss[1:]  # Drop the intercept column.
            self._cache_data = data
            self._cache_rss = rss

        index = list(data.domain.attributes).index(attr)
        return rss[index]
1134   
1135
1136#from Orange.core import EarthLearner as BaseEarthLearner, \
1137#                        EarthClassifier as BaseEarthClassifier
1138#from Orange.misc import member_set
1139#
1140#class _EarthLearner(BaseEarthLearner):
1141#    """ An earth learner.
1142#    """
1143#    def __new__(cls, data=None, weightId=None, **kwargs):
1144#        self = BaseEarthLearner.__new__(cls, **kwargs)
1145#        if data is not None:
1146#            self.__init__(**kwargs)
1147#            return self.__call__(data, weightId)
1148#        return self
1149#   
1150#    def __init__(self, max_degree=1, max_terms=21, new_var_penalty=0.0,
1151#                 threshold=0.001, prune=True, penalty=None, fast_k=20,
1152#                 fast_beta=0.0, store_examples=True, **kwargs):
1153#        """ Initialize the learner instance.
1154#       
1155#        :param max_degree:
1156#        """
1157#        self.max_degree = max_degree
1158#        self.max_terms = max_terms
1159#        self.new_var_penalty = new_var_penalty
1160#        self.threshold = threshold
1161#        self.prune = prunes
1162#        if penaty is None:
1163#            penalty = 2.0 if degree > 1 else 3.0
1164#        self.penalty = penalty
1165#        self.fast_k = fast_k
1166#        self.fast_beta = fast_beta
1167#        self.store_examples = store_examples
1168#       
1169#        for key, val in kwargs.items():
1170#            setattr(self, key, val)
1171#   
1172#    def __call__(self, data, weightId=None):
1173#        if not data.domain.class_var:
1174#            raise ValueError("No class var in the domain.")
1175#       
1176#        with member_set(self, "prune", False):
1177#            # We overwrite the prune argument (will do the pruning in python).
1178#            base_clsf =  BaseEarthLearner.__call__(self, data, weightId)
1179#       
1180#        if self.prune:
1181#            (best_set, betas, rss, subsets, rss_per_subset,
1182#             gcv_per_subset) = self.pruning_pass(base_clsf, data)
1183#           
1184#            return _EarthClassifier(base_clsf, data if self.store_examples else None,
1185#                                   best_set=best_set, dirs=base_clsf.dirs,
1186#                                   cuts=base_clsf.cuts,
1187#                                   betas=betas,
1188#                                   subsets=subsets,
1189#                                   rss_per_subset=rss_per_subset,
1190#                                   gcv_per_subset=gcv_per_subset)
1191#        else:
1192#            return _EarthClassifier(base_clsf, data if self.store_examples else None)
1193#   
1194#   
1195#    def pruning_pass(self, base_clsf, examples):
1196#        """ Prune the terms constructed in the forward pass.
1197#        (Pure numpy reimplementation)
1198#        """
1199#        n_terms = numpy.sum(base_clsf.best_set)
1200#        n_eff_params = n_terms + self.penalty * (n_terms - 1) / 2.0
1201#        data, y, _ = examples.to_numpy_MA()
1202#        data = data.filled(0.0)
1203#        best_set = numpy.asarray(base_clsf.best_set, dtype=bool)
1204#       
1205#        bx = base_matrix(data, base_clsf.best_set,
1206#                         base_clsf.dirs, base_clsf.cuts,
1207#                         )
1208#       
1209#        bx_used = bx[:, best_set]
1210#        subsets, rss_per_subset = subsets_selection_xtx(bx_used, y) # TODO: Use leaps like library
1211#        gcv_per_subset = [gcv(rss, bx.shape[0], i + self.penalty * (i - 1) / 2.0) \
1212#                              for i, rss in enumerate(rss_per_subset, 1)]
1213#        gcv_per_subset = numpy.array(gcv_per_subset)
1214#       
1215#        best_i = numpy.argmin(gcv_per_subset[1:]) + 1 # Ignore the intercept
1216#        best_ind = subsets[best_i, :best_i + 1]
1217#        bs_i = 0
1218#        for i, b in enumerate(best_set):
1219#            if b:
1220#                best_set[i] = bs_i in best_ind
1221#                bs_i += 1
1222#               
1223#        bx_subset = bx[:, best_set]
1224#        betas, rss, rank, s = numpy.linalg.lstsq(bx_subset, y)
1225#        return best_set, betas, rss, subsets, rss_per_subset, gcv_per_subset
1226#   
1227#       
1228#class _EarthClassifier(Orange.core.ClassifierFD):
1229#    def __init__(self, base_classifier=None, examples=None, best_set=None,
1230#                 dirs=None, cuts=None, betas=None, subsets=None,
1231#                 rss_per_subset=None,
1232#                 gcv_per_subset=None):
1233#        self._base_classifier = base_classifier
1234#        self.examples = examples
1235#        self.domain = base_classifier.domain
1236#        self.class_var = base_classifier.class_var
1237#       
1238#        best_set = base_classifier.best_set if best_set is None else best_set
1239#        dirs = base_classifier.dirs if dirs is None else dirs
1240#        cuts = base_classifier.cuts if cuts is None else cuts
1241#        betas = base_classifier.betas if betas is None else betas
1242#       
1243#        self.best_set = numpy.asarray(best_set, dtype=bool)
1244#        self.dirs = numpy.array(dirs, dtype=int)
1245#        self.cuts = numpy.array(cuts)
1246#        self.betas = numpy.array(betas)
1247#       
1248#        self.subsets = subsets
1249#        self.rss_per_subset = rss_per_subset
1250#        self.gcv_per_subset = gcv_per_subset
1251#       
1252#    @property
1253#    def num_terms(self):
1254#        """ Number of terms in the model (including the intercept).
1255#        """
1256#        return numpy.sum(numpy.asarray(self.best_set, dtype=int))
1257#   
1258#    @property
1259#    def max_terms(self):
1260#        """ Maximum number of terms (as specified in the learning step).
1261#        """
1262#        return self.best_set.size
1263#   
1264#    @property
1265#    def num_preds(self):
1266#        """ Number of predictors (variables) included in the model.
1267#        """
1268#        return len(self.used_attributes(term))
1269#   
1270#    def __call__(self, example, what=Orange.core.GetValue):
1271#        value = self.predict(example)
1272#        if isinstance(self.class_var, Orange.feature.Continuous):
1273#            value = self.class_var(value)
1274#        else:
1275#            value = self.class_var(int(round(value)))
1276#           
1277#        dist = Orange.statistics.distribution.Distribution(self.class_var)
1278#        dist[value] = 1.0
1279#        if what == Orange.core.GetValue:
1280#            return value
1281#        elif what == Orange.core.GetProbabilities:
1282#            return dist
1283#        else:
1284#            return (value, dist)
1285#   
1286#    def base_matrix(self, examples=None):
1287#        """ Return the base matrix (bx) of the Earth model for the table.
1288#        If table is not supplied the base matrix of the training examples
1289#        is returned.
1290#       
1291#       
1292#        :param examples: Input examples for the base matrix.
1293#        :type examples: Orange.data.Table
1294#       
1295#        """
1296#        if examples is None:
1297#            examples = self.examples
1298#           
1299#        if examples is None:
1300#            raise ValueError("base matrix is only available if 'store_examples=True'")
1301#       
1302#        if isinstance(examples, Orange.data.Table):
1303#            data, cls, w = examples.to_numpy_MA()
1304#            data = data.filled(0.0)
1305#        else:
1306#            data = numpy.asarray(examples)
1307#           
1308#        return base_matrix(data, self.best_set, self.dirs, self.cuts)
1309#   
1310#    def _anova_order(self):
1311#        """ Return indices that sort the terms into the 'ANOVA' format.
1312#        """
1313#        terms = [([], 0)] # intercept
1314#        for i, used in enumerate(self.best_set[1:], 1):
1315#            attrs = sorted(self.used_attributes(i))
1316#            if used and attrs:
1317#                terms.append((attrs, i))
1318#        terms = sotred(terms, key=lambda t:(len(t[0]), t[0]))
1319#        return [i for _, i in terms]
1320#   
1321#    def format_model(self, percision=3, indent=3):
1322#        return format_model(self, percision, indent)
1323#   
1324#    def print_model(self, percision=3, indent=3):
1325#        print self.format_model(percision, indent)
1326#       
1327#    def predict(self, ex):
1328#        """ Return the predicted value (float) for example.
1329#        """
1330#        x = Orange.data.Table(self.domain, [ex])
1331#        x, c, w = x.to_numpy_MA()
1332#        x = x.filled(0.0)[0]
1333#       
1334#        bx = numpy.ones(self.best_set.shape)
1335#        betas = numpy.zeros_like(self.betas)
1336#        betas[0] = self.betas[0]
1337#        beta_i = 0
1338#        for termi in range(1, len(self.best_set)):
1339#            dirs = self.dirs[termi]
1340#            cuts = self.cuts[termi]
1341#            dir_p1 = numpy.where(dirs == 1)[0]
1342#            dir_m1 = numpy.where(dirs == -1)[0]
1343#            dir_2 = numpy.where(dirs == 2)[0]
1344#           
1345#            x1 = x[dir_p1] - cuts[dir_p1]
1346#            x2 = cuts[dir_m1] - x[dir_m1]
1347#            x3 = x[dir_2]
1348#           
1349#            x1 = numpy.maximum(x1, 0.0)
1350#            x2 = numpy.maximum(x2, 0.0)
1351#
1352#            X = numpy.hstack((x1, x2, x3))
1353#            X = numpy.cumprod(X)
1354#           
1355#            bx[termi] = X[-1] if X.size else 0.0
1356#            if self.best_set[termi] != 0:
1357#                beta_i += 1
1358#                betas[beta_i] = self.betas[beta_i]
1359#
1360#        return numpy.sum(bx[self.best_set] * betas)
1361#           
1362#    def used_attributes(self, term=None):
1363#        """ Return a list of used attributes. If term (index) is given
1364#        return only attributes used in that single term.
1365#       
1366#        """
1367#        if term is None:
1368#            terms = numpy.where(self.best_set)[0]
1369#        else:
1370#            terms = [term]
1371#        attrs = set()
1372#        for ti in terms:
1373#            attri = numpy.where(self.dirs[ti] != 0.0)[0]
1374#            attrs.update([self.domain.attributes[i] for i in attri])
1375#        return attrs
1376#       
1377#    def evimp(self, used_only=True):
1378#        """ Return the estimated variable importance.
1379#        """
1380#        return evimp(self, used_only)
1381#       
1382#    def __reduce__(self):
1383#        return (EarthClassifier, (self._base_classifier, self.examples,
1384#                                  self.best_set, self.dirs, self.cuts,
1385#                                  self.betas, self.subsets,
1386#                                  self.rss_per_subset, self.gcv_per_subset),
1387#                {})
1388                                 
# Note: See TracBrowser for help on using the repository browser.