source: orange/Orange/regression/earth.py @ 10775:c720c17c2f3f

Revision 10775:c720c17c2f3f, 49.2 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Added base_features method to EarthClassifier.

Line 
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`: http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
16.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
17
18Example ::
19
20    >>> import Orange
21    >>> data = Orange.data.Table("housing")
22    >>> c = Orange.regression.earth.EarthLearner(data, degree=2, terms=10)
23    >>> print c
24    MEDV =
25       23.587
26       +11.896 * max(0, RM - 6.431)
27       +1.142 * max(0, 6.431 - RM)
28       -0.612 * max(0, LSTAT - 6.120)
29       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
30       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
31       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
32
33
34.. autoclass:: EarthLearner
35    :members:
36
37.. autoclass:: EarthClassifier
38    :members:
39
40
41Utility functions
42-----------------
43
44.. autofunction:: gcv
45
46.. autofunction:: plot_evimp
47
48.. autofunction:: bagged_evimp
49
50.. autoclass:: ScoreEarthImportance
51
52"""
53
54import Orange
55from Orange.feature import Discrete, Continuous
56from Orange.data import Table, Domain
57from Orange.data.preprocess import Continuize as Preprocessor_continuize, \
58                              Impute as Preprocessor_impute, \
59                              PreprocessorList as Preprocessor_preprocessorList, \
60                              DomainContinuizer
61
62import numpy
63
def is_discrete(var):
    """Return True if ``var`` is a :class:`Orange.feature.Discrete` variable."""
    return isinstance(var, Discrete)
66
def is_continuous(var):
    """Return True if ``var`` is a :class:`Orange.feature.Continuous` variable."""
    return isinstance(var, Continuous)
69
def expand_discrete(var):
    """Expand a discrete variable ``var``, returning one continuous
    indicator variable for each value of ``var`` (if the number of values
    is greater than 2; otherwise return only a single indicator variable).

    """
    if len(var.values) > 2:
        values = var.values
    elif len(var.values) == 2:
        # Binary variable: a single indicator for the second value is enough.
        values = var.values[-1:]
    else:
        values = var.values[:1]
    new_vars = []
    for value in values:
        new = Continuous("{0}={1}".format(var.name, value))
        # Wire up get_value_from so the indicator is computed from ``var``
        # during domain translation (1.0 when var == value, else 0.0).
        new.get_value_from = cls = Orange.core.ClassifierFromVar(whichVar=var)
        cls.transformer = Orange.core.Discrete2Continuous()
        cls.transformer.value = int(Orange.core.Value(var, value))
        new.source_variable = var
        new_vars.append(new)
    return new_vars
91
def select_attrs(table, features, class_var=None,
                 class_vars=None, metas=None):
    """Return a new table over ``table`` restricted to ``features``
    (plus the optional class variable(s) and metas).
    """
    if class_vars is not None:
        domain = Domain(features, class_var, class_vars=class_vars)
    else:
        domain = Domain(features, class_var)
    if metas:
        domain.add_metas(metas)
    return Table(domain, table)
103   
class EarthLearner(Orange.regression.base.BaseRegressionLearner):
    """Earth learner class. Supports both regression and classification
    problems. For classification, class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is fit to these
    new columns. The resulting classifier then computes response
    values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        # Calling the learner with data directly trains and returns an
        # EarthClassifier instead of a learner instance.
        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
        if instances is not None:
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                 **kwds):
        """Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model (default: 1).
        :type degree: int
        :param terms: Maximum number of terms in the forward pass
                (default: 21).  If set to ``None``, ``min(200, max(20, 2
                * n_attributes)) + 1`` will be used, like the default
                setting in earth R package.
        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). Default is 3.0 if ``degree`` is above 1,
            and 2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default: 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default: 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default: ``None``, no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default:
            ``True``); ignored for models with multiple responses.
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default: ``True``).
        :type store_instances: bool

        .. todo:: min_span, prunning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """
        super(EarthLearner, self).__init__()

        self.degree = degree
        self.terms = terms
        if penalty is None:
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.__dict__.update(kwds)

        # Do not continuize the class column here; a discrete class is
        # expanded explicitly in __call__.
        self.continuizer.class_treatment = DomainContinuizer.Ignore

    def __call__(self, instances, weight_id=None):
        """Fit an Earth model on ``instances`` and return an
        :class:`EarthClassifier`.

        :param instances: Training data.
        :type instances: :class:`Orange.data.Table`

        """
        expanded_class = None
        multitarget = False

        if instances.domain.class_var:
            instances = self.impute_table(instances)
            instances = self.continuize_table(instances)

            if is_discrete(instances.domain.class_var):
                # Expand a discrete class with indicator columns
                expanded_class = expand_discrete(instances.domain.class_var)
                y_table = select_attrs(instances, expanded_class)
                (y, ) = y_table.to_numpy_MA("A")
                (x, ) = instances.to_numpy_MA("A")
            elif is_continuous(instances.domain.class_var):
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")
        elif instances.domain.class_vars:
            # Multi-target domain
            if not all(map(is_continuous, instances.domain.class_vars)):
                raise TypeError("Only continuous multi-target classes are supported.")
            x_table = select_attrs(instances, instances.domain.attributes)
            y_table = select_attrs(instances, instances.domain.class_vars)

            # Impute and continuize only the x_table
            x_table = self.impute_table(x_table)
            x_table = self.continuize_table(x_table)
            domain = Domain(x_table.domain.attributes,
                            class_vars=instances.domain.class_vars)

            (x, ) = x_table.to_numpy_MA("A")
            (y, ) = y_table.to_numpy_MA("A")

            multitarget = True
        else:
            raise ValueError("Class variable expected.")

        # Standardize a single response column (zero mean, unit std).
        if self.scale_resp and y.shape[1] == 1:
            sy = y - numpy.mean(y, axis=0)
            # BUG FIX: was ``numpy.std(sy, axis=1)``; for an (n, 1)
            # column that computes per-row stds (all zero) and the
            # (n, 1) / (n,) division broadcasts to an (n, n) matrix of
            # nan/inf. axis=0 is the correct per-column scaling.
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas on the unscaled response.
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               multitarget=multitarget,
                               expanded_class=expanded_class
                               )
258
259
def soft_max(values):
    """Return the softmax of ``values`` as a numpy array.

    The exponentials are computed once (the original evaluated
    ``numpy.exp`` twice) and the inputs are shifted by their maximum,
    which is mathematically a no-op but prevents overflow for large
    values.

    :param values: A sequence of real values.

    """
    values = numpy.asarray(values)
    exps = numpy.exp(values - numpy.max(values))
    return exps / numpy.sum(exps)
263
264
class EarthClassifier(Orange.core.ClassifierFD):
    """Earth classifier.

    Stores the fitted MARS model (term directions ``dirs``, cut points
    ``cuts`` and coefficients ``betas``) and predicts by evaluating the
    basis terms on new instances.
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 multitarget=False, expanded_class=None,
                 original_domain=None, **kwargs):
        # domain: the (continuized) domain prediction instances are
        #     translated into.
        # best_set: boolean mask over terms kept by the pruning pass.
        # dirs, cuts: hinge directions and cut points, one row per term.
        # betas: fitted coefficients, one row per response variable.
        # expanded_class: indicator variables for a discrete class
        #     (see expand_discrete), None for continuous responses.
        self.multitarget = multitarget
        self.domain = domain
        self.class_var = domain.class_var
        if self.multitarget:
            self.class_vars = domain.class_vars

        self.best_set = best_set
        self.dirs = dirs
        self.cuts = cuts
        self.betas = betas
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """Classify ``instance``; return the value, the distribution or
        both, depending on ``result_type``.
        """
        # Determine which variables the raw predictions correspond to.
        if self.multitarget and self.domain.class_vars:
            resp_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            # A discrete class was fit through indicator columns.
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multitarget and is_discrete(self.class_var):
            # Map indicator responses back onto a class distribution and
            # predict the modus.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                # A single indicator column models the second value.
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous response(s): degenerate (point-mass) distributions.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        if not self.multitarget:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied, the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        instances = select_attrs(instances, self.domain.attributes)
        (data,) = instances.to_numpy_MA("A")
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def base_features(self):
        """Return a list of features for the included Earth terms.
        The attributes can be used in Orange's domain translation
        (i.e. they define the proper ``get_value_from`` functions).

        """
        terms = []
        # Restrict to the terms kept by the pruning pass.
        dirs = self.dirs[self.best_set]
        cuts = self.cuts[self.best_set]

        for dir, cut in zip(dirs[1:], cuts[1:]): # Drop the intercept (first)
            # Name the feature after its formatted hinge product.
            hinge = [_format_knot(self, attr.name, dir1, cut1) \
                     for (attr, dir1, cut1) in \
                     zip(self.domain.attributes, dir, cut) \
                     if dir1 != 0.0]
            term_name = " * ".join(hinge)
            term = Orange.feature.Continuous(term_name)
            term.get_value_from = term_computer(term, self.domain, dir, cut)
            terms.append(term)
        return terms

    def predict(self, instance):
        """ Predict the response value(s)

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """Return the used terms for term (index). If no term is given,
        return all attributes in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of the attributes used by any term.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())

        attrs = self.domain.attributes

        # An attribute participates in a term iff its direction is non-zero.
        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """ 
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support: positional args restore the core model; the
        # remaining attributes are restored from the __dict__ state.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
421
422"""
423Utility functions
424-----------------
425"""
426   
def base_matrix(data, best_set, dirs, cuts):
    """ Return the base matrix for the earth model.

    Each row holds the values of the model's terms (hinge products, not
    multiplied by beta) for one data instance; column 0 is the intercept.

    :param data: Input data
    :type data: :class:`numpy.ndarray`

    :param best_set: A array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs members
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts members
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_cases, n_terms = data.shape[0], best_set.shape[0]
    bx = numpy.zeros((n_cases, n_terms))
    bx[:, 0] = 1.0  # The intercept column.
    for term in range(1, n_terms):
        t_dirs, t_cuts = dirs[term], cuts[term]

        # Columns entering the term as +hinge, -hinge or linearly.
        plus = numpy.where(t_dirs == 1)[0]
        minus = numpy.where(t_dirs == -1)[0]
        linear = numpy.where(t_dirs == 2)[0]

        up = data[:, plus] - t_cuts[plus]
        down = t_cuts[minus] - data[:, minus]
        lin = data[:, linear]

        # Clamp the hinge factors at zero.
        up = numpy.where(up > 0.0, up, 0.0)
        down = numpy.where(down > 0.0, down, 0.0)

        factors = numpy.hstack((up, down, lin))
        running = numpy.cumprod(factors, axis=1)
        # The last cumprod column is the product of all factors; a term
        # with no factors at all contributes zeros.
        bx[:, term] = running[:, -1] if running.size else 0.0

    return bx
470
471   
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective paramaters.

    """
    shrink = 1. - n_effective_params / n
    return rss / (n * shrink ** 2)
483
class term_computer(Orange.core.ClassifierFD):
    """A utility class for computing basis terms. Can be used as
    a :obj:`~Orange.feature.Descriptor.get_value_from` member.
    """
    def __init__(self, term_var=None, domain=None, dir=None, cut=None):
        # term_var: the Continuous feature this computer feeds.
        # domain: the domain input instances are converted into.
        # dir/cut: hinge directions and cut points for this term.
        self.class_var = term_var
        self.domain = domain
        self.dir = dir
        self.cut = cut

    def __call__(self, instance, return_what=Orange.core.GetValue):
        """Return the value of the basis term for ``instance``."""
        instance = Orange.data.Instance(self.domain, instance)
        attributes = self.domain.attributes
        # Renamed from ``sum`` to avoid shadowing the builtin.
        # NOTE(review): hinge factors are summed here, while
        # base_matrix() multiplies them (cumprod); the two agree only
        # for degree-1 terms. Confirm the intended semantics before
        # relying on this for models with interaction terms.
        total = 0.0
        for val, dir1, cut1 in zip(instance, self.dir, self.cut):
            # Skip unused (0) and linear (2) entries and missing values.
            if dir1 != 0.0 and dir1 != 2 and not val.isSpecial():
                total += max(dir1 * (float(val) - cut1), 0.0)
        return self.class_var(total)
502
503"""
504Multi-label utility functions
505"""
506
507
508"""
509ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
510"""
511       
512import ctypes
513from numpy import ctypeslib
514import orange
515
# Load the C implementation embedded in the orange extension module and
# describe the EarthForwardPass entry point's signature for ctypes.
# Array arguments are declared F_CONTIGUOUS to match the Fortran-order
# buffers allocated in forward_pass().
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  #pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  #FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    #Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nPred
     ctypes.c_int,  #nMaxDegree
     ctypes.c_int,  #nMaxTerms
     ctypes.c_double,   #Penalty
     ctypes.c_double,   #Thresh
     ctypes.c_int,  #nFastK
     ctypes.c_double,   #FastBeta
     ctypes.c_double,   #NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #LinPreds
     ctypes.c_bool, #UseBetaCache
     ctypes.c_char_p    #sPredNames
     ]
544   
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Do earth forward pass.

    Wraps the C ``EarthForwardPass`` routine. Returns a tuple
    ``(n_terms, full_set, bx, dirs, cuts)`` where ``full_set`` is a
    boolean mask over the candidate terms, ``bx`` the basis matrix and
    ``dirs``/``cuts`` the per-term hinge directions and cut points.

    :param x: Predictor matrix (cases x predictors).
    :param y: Response matrix (cases x responses).

    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the
    # C function calls exit(), killing the whole process. So we must
    # check them here and raise an exception instead.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be in >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds, 
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
612
613
# ctypes binding for the backward subset-selection routine
# (EvalSubsetsUsingXtx) from the embedded Earth C code. A non-zero
# return value signals linearly dependent terms in bx (see
# subset_selection_xtx).
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   #PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   #RssVec
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  #WeightsArg
     ]

_c_eval_subsets_xtx.restype = ctypes.c_int
628
def subset_selection_xtx(X, Y):
    """ Subsets selection using EvalSubsetsUsingXtx in the Earth package.

    Returns ``(subsets_ind, rss_vec)``: row ``i`` of ``subsets_ind``
    holds the term indices of the best subset of size ``i + 1`` and
    ``rss_vec[i]`` its residual sum of squares.

    :raises numpy.linalg.LinAlgError: if the C routine reports linearly
        dependent terms in ``X``.

    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    var_count = X.shape[1]
    resp_count = Y.shape[1]
    cases = X.shape[0]
    # Output buffers filled by the C routine (Fortran order to match
    # the argtypes declaration).
    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
                              order="F")
    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")

    rval = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count, var_count,
                        X, Y, weights)
    if rval != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert the boolean mask columns into rows of term indices
    # (column i of ``subsets`` marks the best subset of size i + 1).
    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
    for i, used in enumerate(subsets.T):
        subsets_ind[i, :i + 1] = numpy.where(used)[0]

    return subsets_ind, rss_vec
658   
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package.
    Using numpy.linalg.lstsq

    Performs backward elimination: starting from all terms, repeatedly
    drops the term whose removal increases the RSS the least (never the
    intercept). Returns ``(subsets, rss_vec)`` where row ``i`` of
    ``subsets`` holds the term indices of the kept subset of size
    ``i + 1`` and ``rss_vec[i]`` its residual sum of squares.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # An explicit list is required: items are deleted below
    # (``range`` alone is not mutable on Python 3; in Python 2 this is
    # identical).
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            # lstsq returned the residuals directly (full rank case).
            rss_vec[subset_size] = numpy.sum(res)
        else:
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # RSS increase caused by dropping each term; drop the cheapest.
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
693   
def subset_selection_xtx2(X, Y):
    """ Another implementation (this uses qr decomp).

    Backward elimination as in :func:`subset_selection_xtx_numpy`, but
    using a pivoting QR decomposition to detect and discard linearly
    dependent columns up front.

    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    working_set = range(col_count)
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    # k is the numerical rank; jpvt the column pivot order.
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # Drop the term whose removal increases the RSS least,
        # never the intercept (index 0).
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
730   
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """Run the Earth pruning pass: pick the subset of terms in ``bx``
    that minimizes the GCV score.

    .. todo:: pruned_terms, Leaps

    """
    # Prefer the C backward selection; fall back to the numpy
    # implementation when the C routine hits linearly dependent terms.
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    n_cases, n_terms = bx.shape

    # Effective parameter count: one per term plus the GCV knot penalty.
    effective_params = numpy.arange(n_terms) + 1.0
    effective_params += penalty * (effective_params - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, n_cases, effective_params)

    best = numpy.argmin(gcv_vec)
    used = numpy.zeros((n_terms), dtype=bool)
    used[subsets[best, :best + 1]] = True

    return used, subsets, rss_vec, gcv_vec
754
755"""
756Printing functions.
757"""
758   
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model,
    one block per response variable.
    """
    # Resolve the response variables the beta rows correspond to.
    if model.multitarget:
        resp_vars = list(model.domain.class_vars)
    elif is_discrete(model.class_var):
        resp_vars = model.expanded_class
    else:
        resp_vars = [model.class_var]

    blocks = [_format_response(model, var.name, beta_row,
                               percision, indent)
              for var, beta_row in zip(resp_vars, model.betas)]
    return "\n\n".join(blocks)
777
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """Format one response of ``model`` as a multi-line string:
    a ``name =`` header followed by one indented line per term.

    :param resp_name: name of the response variable (header line).
    :param betas: coefficients for this response, intercept first.
    :param percision: number of decimal places for coefficients/cuts.
    :param indent: number of spaces each term line is indented by.

    """
    header = "%s =" % resp_name
    indent = " " * indent
    fmt = "%." + str(percision) + "f"
    # Each entry is (attributes in the term, formatted term string);
    # the attribute list is only used for the final sort. The intercept
    # is emitted first, unsigned.
    terms = [([], fmt % betas[0])]
    beta_i = 0
    # best_set[0] is the intercept, handled above.
    for i, used in enumerate(model.best_set[1:], 1):
        if used:
            beta_i += 1
            # The sign is printed separately from the absolute value.
            beta = fmt % abs(betas[beta_i])
            # One hinge ("knot") per attribute with non-zero direction.
            knots = [_format_knot(model, attr.name, d, c, percision) \
                     for d, c, attr in \
                     zip(model.dirs[i], model.cuts[i], model.domain.attributes) \
                     if d != 0]
            term_attrs = [a for a, d in zip(model.domain.attributes, model.dirs[i]) \
                          if d != 0]
            term_attrs = sorted(term_attrs)
            sign = "-" if betas[beta_i] < 0 else "+"
            if knots:
                terms.append((term_attrs,
                              sign + " * ".join([beta] + knots)))
            else:
                terms.append((term_attrs, sign + beta))
    # Sort by len(term_attrs), then by term_attrs
    terms = sorted(terms, key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [indent + t for _, t in terms])
804       
805def _format_knot(model, name, dir, cut, percision=3):
806    fmt = "%%.%if" % percision
807    if dir == 1:
808        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
809    elif dir == -1:
810        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
811    elif dir == 2:
812        txt = name
813    return txt
814
815   
816"""\
817Variable importance estimation
818------------------------------
819"""
820
821from collections import defaultdict
822
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    :param vars: variables (possibly continuized/imputed) to map back.
    :return: dict mapping each source variable to the list of variables
        in ``vars`` derived from it; a variable with no recognizable
        source maps to itself.

    """
    source = defaultdict(list)
    for var in vars:
        # Removed a dead `svar = None` local that was never read.
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            # Continuized variable: follow the transformer to its input.
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            # Imputed variable: unwrap one extra classifier layer.
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known transformation; the variable is its own source.
            source[var].append(var)
    return dict(source)
842
def map_to_source_var(var, sources):
    """ Map `var` onto the variable in `sources` it was derived from,
    following ``source_variable`` / ``get_value_from`` links recursively.
    Return ``None`` when no source can be found.
    """
    if var in sources:
        return var
    if var.source_variable in sources:
        return var.source_variable
    transformer = var.get_value_from
    if isinstance(transformer, Orange.core.ClassifierFromVar):
        # Continuized variable: recurse on the transformer's input.
        return map_to_source_var(transformer.variable, sources)
    if isinstance(transformer, Orange.core.ImputeClassifier):
        # Imputed variable: unwrap the inner classifier first.
        return map_to_source_var(transformer.classifier_from_var.variable,
                                 sources)
    return None
857   
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    Each attribute receives three scores accumulated over the pruning
    subsets: the number of subsets it appears in ("nsubsets") and the
    summed GCV / RSS decreases attributable to subsets using it; the
    latter two are normalized to the [0, 100] range.

    :param model: Earth model.
    :type model: `EarthClassifier`
    :param used_only: if True (default) return only attributes that
        appear in at least one subset.
    :return: list of (attribute, (nsubsets, gcv, rss)) tuples, sorted by
        nsubsets then gcv, descending.

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Per-step decrease of RSS/GCV as subsets grow (diff is negated so
    # larger values mean a larger improvement).
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Columns: [attribute index, nsubsets, gcv score, rss score].
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        # Union of attributes used by any term in this subset.
        # NOTE: `reduce` is the Python 2 builtin (not imported here).
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            # Credit this subset's improvement to every attribute it uses.
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    #Normalize importances (min-max scale each of gcv/rss to [0, 100]).
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        # Keep only attributes that appear in at least one subset.
        importances = importances[importances[:,1] > 0.0]

    # Column 0 still carries the original attribute index.
    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
905
906
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

    ::

        import Orange
        data = Orange.data.Table("housing")
        c = Orange.regression.earth.EarthLearner(data, degree=3)
        Orange.regression.earth.plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the normalized
    RSS and GCV.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Accept a classifier directly and extract its importances.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    figure = pylab.figure()
    left = figure.add_subplot(111)
    variables = [var for var, _ in evimp]
    scores = numpy.array([score for _, score in evimp])
    positions = range(len(variables))

    # nsubsets on the left axis; gcv/rss share a secondary right axis.
    nsub_line = left.plot(positions, scores[:, 0], "b-",)
    right = left.twinx()

    gcv_line = right.plot(positions, scores[:, 1], "g-",)
    rss_line = right.plot(positions, scores[:, 2], "r-",)

    ticks = left.xaxis
    ticks.set_ticks(positions)
    ticks.set_ticklabels([var.name for var in variables], rotation=90)

    left.yaxis.set_label_text("nsubsets")
    right.yaxis.set_label_text("normalized gcv or rss")

    left.legend([nsub_line, gcv_line, rss_line], ["nsubsets", "gcv", "rss"])
    left.set_title("Variable importance")
    figure.show()
953
954   
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier

    Example::

        from Orange.ensemble.bagging import BaggedLearner
        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        bagged_evimp(bc)

    """
    def assert_type(object, class_):
        if not isinstance(object, class_):
            raise TypeError("Instance of %r expected." % (class_))

    from Orange.ensemble.bagging import BaggedClassifier

    assert_type(classifier, BaggedClassifier)
    # Collect per-model scores keyed by attribute *name*, since each
    # member model may hold a distinct variable instance for the same
    # underlying attribute.
    scores_by_name = defaultdict(list)
    vars_by_name = defaultdict(list)
    for member in classifier.classifiers:
        assert_type(member, EarthClassifier)
        for var, score in evimp(member, used_only=used_only):
            scores_by_name[var.name].append(score)
            vars_by_name[var.name].append(var)

    # Average the per-model score tuples element-wise.
    averaged = dict((name, tuple(numpy.average(scores, axis=0)))
                    for name, scores in scores_by_name.items())

    # Rank by nsubsets, then gcv, highest first.
    ranked = sorted(averaged.items(),
                    key=lambda item: (item[1][0], item[1][1]),
                    reverse=True)

    # Represent each attribute by the first variable instance seen.
    combined = [(vars_by_name[name][0], scores) for name, scores in ranked]

    if used_only:
        combined = [(var, scores) for var, scores in combined if scores[0] > 0]
    return combined
995
996"""
997High level interface for measuring variable importance
998(compatible with Orange.feature.scoring module).
999
1000"""
1001from Orange.feature import scoring
1002           
class ScoreEarthImportance(scoring.Score):
    """ A subclass of :class:`Orange.feature.scoring.Score` that
    scores features based on their importance in the Earth
    model using ``bagged_evimp``.

    """
    # Return types
    NSUBSETS = 0
    RSS = 1
    GCV = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            # Convenience form: construct and immediately score `attr`.
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets", cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).

        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        if isinstance(score_what, basestring):
            score_what = {"nsubsets": self.NSUBSETS, "rss": self.RSS,
                          "gcv": self.GCV}.get(score_what)
        if score_what not in (self.NSUBSETS, self.RSS, self.GCV):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        # Cache of the last-scored data table and its importances.
        self._cache_ref = None
        self._cached_evimp = None

    def __call__(self, attr, data, weight_id=None):
        # Recompute the (expensive) bagged importances only when `data`
        # is not the identical table scored last time.
        if self._cache_ref is not None and self._cache_ref is data:
            importance = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            bagged = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            importance = bagged_evimp(bagged, used_only=False)
            self._cache_ref = data
            self._cached_evimp = importance

        scores = dict(importance)
        if attr in scores:
            return scores[attr][self.score_what]

        # `attr` may be a source variable of one or more expanded
        # (continuized/imputed) variables; average their scores.
        sources = collect_source(scores.keys())
        if attr in sources:
            return numpy.average([scores[v][self.score_what] \
                                  for v in sources[attr]])
        raise ValueError("Attribute %r not in the domain." % attr)
1080   
class ScoreRSS(scoring.Score):
    """ Score continuous features by the residual-sum-of-squares decrease
    attributed to them in a linear subset-selection over the data.
    """

    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            # Convenience form: construct and immediately score `attr`.
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self):
        # Cache of the last-scored data table and its per-attribute RSS.
        self._cache_data = None
        self._cache_rss = None

    def __call__(self, attr, data, weight_id=None):
        cached = self._cache_data
        if cached is None or cached is not data:
            x, y = data.to_numpy_MA("1A/c")
            try:
                subsets, rss_vec = subset_selection_xtx2(x, y)
            except numpy.linalg.LinAlgError:
                # Fall back to the slower pure-numpy implementation.
                subsets, rss_vec = subset_selection_xtx_numpy(x, y)
            # Credit each subset step's RSS decrease to the columns
            # participating in that step.
            decrease = -numpy.diff(rss_vec)
            per_column = numpy.zeros_like(rss_vec)
            for size in range(1, subsets.shape[0]):
                members = subsets[size, :size + 1]
                per_column[members] += decrease[size - 1]
            per_column = per_column[1:]  # Drop the intercept
            self._cache_data = data
            self._cache_rss = per_column

        index = list(data.domain.attributes).index(attr)
        return self._cache_rss[index]
1123   
1124
1125#from Orange.core import EarthLearner as BaseEarthLearner, \
1126#                        EarthClassifier as BaseEarthClassifier
1127#from Orange.misc import member_set
1128#
1129#class _EarthLearner(BaseEarthLearner):
1130#    """ An earth learner.
1131#    """
1132#    def __new__(cls, data=None, weightId=None, **kwargs):
1133#        self = BaseEarthLearner.__new__(cls, **kwargs)
1134#        if data is not None:
1135#            self.__init__(**kwargs)
1136#            return self.__call__(data, weightId)
1137#        return self
1138#   
1139#    def __init__(self, max_degree=1, max_terms=21, new_var_penalty=0.0,
1140#                 threshold=0.001, prune=True, penalty=None, fast_k=20,
1141#                 fast_beta=0.0, store_examples=True, **kwargs):
1142#        """ Initialize the learner instance.
1143#       
1144#        :param max_degree:
1145#        """
1146#        self.max_degree = max_degree
1147#        self.max_terms = max_terms
1148#        self.new_var_penalty = new_var_penalty
1149#        self.threshold = threshold
1150#        self.prune = prunes
1151#        if penaty is None:
1152#            penalty = 2.0 if degree > 1 else 3.0
1153#        self.penalty = penalty
1154#        self.fast_k = fast_k
1155#        self.fast_beta = fast_beta
1156#        self.store_examples = store_examples
1157#       
1158#        for key, val in kwargs.items():
1159#            setattr(self, key, val)
1160#   
1161#    def __call__(self, data, weightId=None):
1162#        if not data.domain.class_var:
1163#            raise ValueError("No class var in the domain.")
1164#       
1165#        with member_set(self, "prune", False):
1166#            # We overwrite the prune argument (will do the pruning in python).
1167#            base_clsf =  BaseEarthLearner.__call__(self, data, weightId)
1168#       
1169#        if self.prune:
1170#            (best_set, betas, rss, subsets, rss_per_subset,
1171#             gcv_per_subset) = self.pruning_pass(base_clsf, data)
1172#           
1173#            return _EarthClassifier(base_clsf, data if self.store_examples else None,
1174#                                   best_set=best_set, dirs=base_clsf.dirs,
1175#                                   cuts=base_clsf.cuts,
1176#                                   betas=betas,
1177#                                   subsets=subsets,
1178#                                   rss_per_subset=rss_per_subset,
1179#                                   gcv_per_subset=gcv_per_subset)
1180#        else:
1181#            return _EarthClassifier(base_clsf, data if self.store_examples else None)
1182#   
1183#   
1184#    def pruning_pass(self, base_clsf, examples):
1185#        """ Prune the terms constructed in the forward pass.
1186#        (Pure numpy reimplementation)
1187#        """
1188#        n_terms = numpy.sum(base_clsf.best_set)
1189#        n_eff_params = n_terms + self.penalty * (n_terms - 1) / 2.0
1190#        data, y, _ = examples.to_numpy_MA()
1191#        data = data.filled(0.0)
1192#        best_set = numpy.asarray(base_clsf.best_set, dtype=bool)
1193#       
1194#        bx = base_matrix(data, base_clsf.best_set,
1195#                         base_clsf.dirs, base_clsf.cuts,
1196#                         )
1197#       
1198#        bx_used = bx[:, best_set]
1199#        subsets, rss_per_subset = subsets_selection_xtx(bx_used, y) # TODO: Use leaps like library
1200#        gcv_per_subset = [gcv(rss, bx.shape[0], i + self.penalty * (i - 1) / 2.0) \
1201#                              for i, rss in enumerate(rss_per_subset, 1)]
1202#        gcv_per_subset = numpy.array(gcv_per_subset)
1203#       
1204#        best_i = numpy.argmin(gcv_per_subset[1:]) + 1 # Ignore the intercept
1205#        best_ind = subsets[best_i, :best_i + 1]
1206#        bs_i = 0
1207#        for i, b in enumerate(best_set):
1208#            if b:
1209#                best_set[i] = bs_i in best_ind
1210#                bs_i += 1
1211#               
1212#        bx_subset = bx[:, best_set]
1213#        betas, rss, rank, s = numpy.linalg.lstsq(bx_subset, y)
1214#        return best_set, betas, rss, subsets, rss_per_subset, gcv_per_subset
1215#   
1216#       
1217#class _EarthClassifier(Orange.core.ClassifierFD):
1218#    def __init__(self, base_classifier=None, examples=None, best_set=None,
1219#                 dirs=None, cuts=None, betas=None, subsets=None,
1220#                 rss_per_subset=None,
1221#                 gcv_per_subset=None):
1222#        self._base_classifier = base_classifier
1223#        self.examples = examples
1224#        self.domain = base_classifier.domain
1225#        self.class_var = base_classifier.class_var
1226#       
1227#        best_set = base_classifier.best_set if best_set is None else best_set
1228#        dirs = base_classifier.dirs if dirs is None else dirs
1229#        cuts = base_classifier.cuts if cuts is None else cuts
1230#        betas = base_classifier.betas if betas is None else betas
1231#       
1232#        self.best_set = numpy.asarray(best_set, dtype=bool)
1233#        self.dirs = numpy.array(dirs, dtype=int)
1234#        self.cuts = numpy.array(cuts)
1235#        self.betas = numpy.array(betas)
1236#       
1237#        self.subsets = subsets
1238#        self.rss_per_subset = rss_per_subset
1239#        self.gcv_per_subset = gcv_per_subset
1240#       
1241#    @property
1242#    def num_terms(self):
1243#        """ Number of terms in the model (including the intercept).
1244#        """
1245#        return numpy.sum(numpy.asarray(self.best_set, dtype=int))
1246#   
1247#    @property
1248#    def max_terms(self):
1249#        """ Maximum number of terms (as specified in the learning step).
1250#        """
1251#        return self.best_set.size
1252#   
1253#    @property
1254#    def num_preds(self):
1255#        """ Number of predictors (variables) included in the model.
1256#        """
1257#        return len(self.used_attributes(term))
1258#   
1259#    def __call__(self, example, what=Orange.core.GetValue):
1260#        value = self.predict(example)
1261#        if isinstance(self.class_var, Orange.feature.Continuous):
1262#            value = self.class_var(value)
1263#        else:
1264#            value = self.class_var(int(round(value)))
1265#           
1266#        dist = Orange.statistics.distribution.Distribution(self.class_var)
1267#        dist[value] = 1.0
1268#        if what == Orange.core.GetValue:
1269#            return value
1270#        elif what == Orange.core.GetProbabilities:
1271#            return dist
1272#        else:
1273#            return (value, dist)
1274#   
1275#    def base_matrix(self, examples=None):
1276#        """ Return the base matrix (bx) of the Earth model for the table.
1277#        If table is not supplied the base matrix of the training examples
1278#        is returned.
1279#       
1280#       
1281#        :param examples: Input examples for the base matrix.
1282#        :type examples: Orange.data.Table
1283#       
1284#        """
1285#        if examples is None:
1286#            examples = self.examples
1287#           
1288#        if examples is None:
1289#            raise ValueError("base matrix is only available if 'store_examples=True'")
1290#       
1291#        if isinstance(examples, Orange.data.Table):
1292#            data, cls, w = examples.to_numpy_MA()
1293#            data = data.filled(0.0)
1294#        else:
1295#            data = numpy.asarray(examples)
1296#           
1297#        return base_matrix(data, self.best_set, self.dirs, self.cuts)
1298#   
1299#    def _anova_order(self):
1300#        """ Return indices that sort the terms into the 'ANOVA' format.
1301#        """
1302#        terms = [([], 0)] # intercept
1303#        for i, used in enumerate(self.best_set[1:], 1):
1304#            attrs = sorted(self.used_attributes(i))
1305#            if used and attrs:
1306#                terms.append((attrs, i))
1307#        terms = sotred(terms, key=lambda t:(len(t[0]), t[0]))
1308#        return [i for _, i in terms]
1309#   
1310#    def format_model(self, percision=3, indent=3):
1311#        return format_model(self, percision, indent)
1312#   
1313#    def print_model(self, percision=3, indent=3):
1314#        print self.format_model(percision, indent)
1315#       
1316#    def predict(self, ex):
1317#        """ Return the predicted value (float) for example.
1318#        """
1319#        x = Orange.data.Table(self.domain, [ex])
1320#        x, c, w = x.to_numpy_MA()
1321#        x = x.filled(0.0)[0]
1322#       
1323#        bx = numpy.ones(self.best_set.shape)
1324#        betas = numpy.zeros_like(self.betas)
1325#        betas[0] = self.betas[0]
1326#        beta_i = 0
1327#        for termi in range(1, len(self.best_set)):
1328#            dirs = self.dirs[termi]
1329#            cuts = self.cuts[termi]
1330#            dir_p1 = numpy.where(dirs == 1)[0]
1331#            dir_m1 = numpy.where(dirs == -1)[0]
1332#            dir_2 = numpy.where(dirs == 2)[0]
1333#           
1334#            x1 = x[dir_p1] - cuts[dir_p1]
1335#            x2 = cuts[dir_m1] - x[dir_m1]
1336#            x3 = x[dir_2]
1337#           
1338#            x1 = numpy.maximum(x1, 0.0)
1339#            x2 = numpy.maximum(x2, 0.0)
1340#
1341#            X = numpy.hstack((x1, x2, x3))
1342#            X = numpy.cumprod(X)
1343#           
1344#            bx[termi] = X[-1] if X.size else 0.0
1345#            if self.best_set[termi] != 0:
1346#                beta_i += 1
1347#                betas[beta_i] = self.betas[beta_i]
1348#
1349#        return numpy.sum(bx[self.best_set] * betas)
1350#           
1351#    def used_attributes(self, term=None):
1352#        """ Return a list of used attributes. If term (index) is given
1353#        return only attributes used in that single term.
1354#       
1355#        """
1356#        if term is None:
1357#            terms = numpy.where(self.best_set)[0]
1358#        else:
1359#            terms = [term]
1360#        attrs = set()
1361#        for ti in terms:
1362#            attri = numpy.where(self.dirs[ti] != 0.0)[0]
1363#            attrs.update([self.domain.attributes[i] for i in attri])
1364#        return attrs
1365#       
1366#    def evimp(self, used_only=True):
1367#        """ Return the estimated variable importance.
1368#        """
1369#        return evimp(self, used_only)
1370#       
1371#    def __reduce__(self):
1372#        return (EarthClassifier, (self._base_classifier, self.examples,
1373#                                  self.best_set, self.dirs, self.cuts,
1374#                                  self.betas, self.subsets,
1375#                                  self.rss_per_subset, self.gcv_per_subset),
1376#                {})
1377                                 
# Note: See TracBrowser for help on using the repository browser.