source: orange/Orange/regression/earth.py @ 10420:969093619068

Revision 10420:969093619068, 47.5 KB checked in by janezd <janez.demsar@…>, 2 years ago (diff)

Minor changes to documentation about multitarget

Line 
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`: http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
16.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
17
18Example ::
19
20    >>> import Orange
21    >>> data = Orange.data.Table("housing")
22    >>> c = Orange.regression.earth.EarthLearner(data, degree=2, terms=10)
23    >>> print c
24    MEDV =
25       23.587
26       +11.896 * max(0, RM - 6.431)
27       +1.142 * max(0, 6.431 - RM)
28       -0.612 * max(0, LSTAT - 6.120)
29       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
30       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
31       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
32
33
34.. autoclass:: EarthLearner
35    :members:
36
37.. autoclass:: EarthClassifier
38    :members:
39
40
41Utility functions
42-----------------
43
44.. autofunction:: gcv
45
46.. autofunction:: plot_evimp
47
48.. autofunction:: bagged_evimp
49
50.. autoclass:: ScoreEarthImportance
51
52"""
53
54import Orange
55from Orange.feature import Discrete, Continuous
56from Orange.data import Table, Domain
57from Orange.preprocess import Preprocessor_continuize, \
58                              Preprocessor_impute, \
59                              Preprocessor_preprocessorList, \
60                              DomainContinuizer
61
62import numpy
63
def is_discrete(var):
    """Return True if ``var`` is a discrete (categorical) variable."""
    return isinstance(var, (Discrete,))
66
def is_continuous(var):
    """Return True if ``var`` is a continuous (numeric) variable."""
    return isinstance(var, (Continuous,))
69
def expand_discrete(var):
    """ Expand a discrete variable ``var``, returning one continuous indicator
    variable for each value of ``var`` (if the number of values is greater
    than 2; otherwise return a single indicator variable).

    """
    if len(var.values) > 2:
        values = var.values
    elif len(var.values) == 2:
        # A binary variable is encoded with one indicator for its last value.
        values = var.values[-1:]
    else:
        # Degenerate single-valued (or empty) variable.
        values = var.values[:1]
    new_vars = []
    for value in values:
        new = Continuous("{0}={1}".format(var.name, value))
        # The indicator value is computed from ``var`` via a 0/1 transformer.
        new.get_value_from = cls = Orange.core.ClassifierFromVar(whichVar=var)
        cls.transformer = Orange.core.Discrete2Continuous()
        cls.transformer.value = int(Orange.core.Value(var, value))
        new.source_variable = var
        new_vars.append(new)
    return new_vars
91
def select_attrs(table, features, class_var=None,
                 class_vars=None, metas=None):
    """ Return a new :class:`Orange.data.Table` with the data from ``table``
    projected onto ``features`` (plus the optional class variable(s)
    and metas).

    """
    if class_vars is None:
        domain = Domain(features, class_var)
    else:
        domain = Domain(features, class_var, class_vars=class_vars)
    if metas:
        domain.add_metas(metas)
    return Table(domain, table)
103   
class EarthLearner(Orange.regression.base.BaseRegressionLearner):
    """Earth learner class. Supports both regression and classification
    problems. For classification, class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi-response model is fit to
    these new columns. The resulting classifier then computes the
    response values on new instances to select the final predicted
    class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        # Standard Orange learner convention: calling the learner class
        # with data immediately fits and returns a classifier.
        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
        if instances is not None:
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                 **kwds):
        """Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model (default: 1).
        :type degree: int
        :param terms: Maximum number of terms in the forward pass
            (default: 21).  If set to ``None``, ``min(200, max(20, 2
            * n_attributes)) + 1`` will be used, like the default
            setting in earth R package.
        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). Default is 3.0 if ``degree`` is above 1,
            and 2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default: 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default: 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default: ``None``, no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default:
            ``True``); ignored for models with multiple responses.
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default: ``True``).
        :type store_instances: bool

        .. todo:: min_span, pruning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """
        super(EarthLearner, self).__init__()

        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Defaults match the earth R package.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.__dict__.update(kwds)

        # The class column is handled explicitly below (indicator
        # expansion), so the continuizer must leave it untouched.
        self.continuizer.class_treatment = DomainContinuizer.Ignore

    def __call__(self, instances, weight_id=None):
        """Fit the model on ``instances`` and return an
        :class:`EarthClassifier`.

        """
        expanded_class = None
        multitarget = False

        if instances.domain.class_var:
            instances = self.impute_table(instances)
            instances = self.continuize_table(instances)

            if is_discrete(instances.domain.class_var):
                # Expand a discrete class with indicator columns.
                expanded_class = expand_discrete(instances.domain.class_var)
                y_table = select_attrs(instances, expanded_class)
                (y, ) = y_table.to_numpy_MA("A")
                (x, ) = instances.to_numpy_MA("A")
            elif is_continuous(instances.domain.class_var):
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")
        elif instances.domain.class_vars:
            # Multi-target domain.
            if not all(map(is_continuous, instances.domain.class_vars)):
                raise TypeError("Only continuous multi-target classes are supported.")
            x_table = select_attrs(instances, instances.domain.attributes)
            y_table = select_attrs(instances, instances.domain.class_vars)

            # Impute and continuize only the x_table.
            x_table = self.impute_table(x_table)
            x_table = self.continuize_table(x_table)

            (x, ) = x_table.to_numpy_MA("A")
            (y, ) = y_table.to_numpy_MA("A")

            multitarget = True
        else:
            raise ValueError("Class variable expected.")

        if self.scale_resp and y.shape[1] == 1:
            # Center and scale the single response column.
            sy = y - numpy.mean(y, axis=0)
            # BUGFIX: the std must be taken over instances (axis=0).
            # The previous axis=1 computed per-row deviations (all zero
            # for a single column) and broadcast (n,1)/(n,) to (n,n).
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms.
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # Discard unused terms from bx, dirs, cuts.
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # Pruning pass (backward elimination by GCV).
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit the coefficients by least squares on the retained terms.
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               multitarget=multitarget,
                               expanded_class=expanded_class
                               )
258
259
def soft_max(values):
    """Return the softmax of ``values`` (exponentials normalized to sum 1)."""
    exps = numpy.exp(numpy.asarray(values))
    return exps / exps.sum()
263
264
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.

    Stores the fitted earth model (pruned term set, hinge directions,
    cut points and coefficients) and predicts by evaluating the basis
    matrix for an instance and multiplying it with the coefficients.

    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 multitarget=False, expanded_class=None,
                 original_domain=None, **kwargs):
        # multitarget: True when fitted on a multi-target domain.
        self.multitarget = multitarget
        self.domain = domain
        self.class_var = domain.class_var
        if self.multitarget:
            self.class_vars = domain.class_vars

        # best_set: boolean mask of the terms kept by the pruning pass.
        self.best_set = best_set
        # dirs / cuts: per-term hinge directions and cut points.
        self.dirs = dirs
        self.cuts = cuts
        # betas: coefficient matrix, one row per response column.
        self.betas = betas
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        # expanded_class: indicator variables for a discrete class
        # (None for regression).
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """Return the prediction for ``instance`` — the value, the
        distribution, or both, depending on ``result_type``.

        """
        # Determine which response variables the raw predictions map to.
        if self.multitarget and self.domain.class_vars:
            resp_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multitarget and is_discrete(self.class_var):
            # Map the indicator-column responses back to a single
            # discrete class value via (soft-maxed) pseudo-probabilities.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous (or multi-target) responses: point mass at the
            # predicted value.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        # Single-target predictions are returned unwrapped.
        if not self.multitarget:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied, the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        instances = select_attrs(instances, self.domain.attributes)
        (data,) = instances.to_numpy_MA("A")
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def predict(self, instance):
        """ Predict the response value(s) for ``instance`` as a flat
        numpy array (one entry per response column).

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        # Keep only the terms retained by pruning, then apply the betas.
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """Return the attributes used by term (index). If no term is given,
        return all attributes used in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of the attributes over all terms.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())

        attrs = self.domain.attributes

        # An attribute participates in a term iff its direction is non-zero.
        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support; any extra state travels in the instance __dict__.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
400   
401
402"""
403Utility functions
404-----------------
405"""
406   
def base_matrix(data, best_set, dirs, cuts):
    """ Return the base matrix for the earth model.

    Column 0 is the intercept (all ones); column ``t`` is the product of
    the hinge factors of term ``t`` evaluated on each data row.

    :param data: Input data
    :type data: :class:`numpy.ndarray`

    :param best_set: An array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs member (hinge directions per term).
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts member (hinge cut points per term).
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_cases = data.shape[0]
    n_terms = best_set.shape[0]
    bx = numpy.zeros((n_cases, n_terms))
    bx[:, 0] = 1.0  # the intercept term
    for term in range(1, n_terms):
        term_dirs = dirs[term]
        term_cuts = cuts[term]

        # Columns entering this term, split by hinge direction:
        # +1 -> max(0, x - cut); -1 -> max(0, cut - x); 2 -> linear.
        plus = numpy.where(term_dirs == 1)[0]
        minus = numpy.where(term_dirs == -1)[0]
        linear = numpy.where(term_dirs == 2)[0]

        h_plus = numpy.maximum(data[:, plus] - term_cuts[plus], 0.0)
        h_minus = numpy.maximum(term_cuts[minus] - data[:, minus], 0.0)

        factors = numpy.hstack((h_plus, h_minus, data[:, linear]))
        if factors.size:
            bx[:, term] = numpy.prod(factors, axis=1)
        else:
            # A term with no factors contributes nothing.
            bx[:, term] = 0.0

    return bx
450
451   
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation (GCV) score.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective parameters.

    """
    shrinkage = (1. - n_effective_params / n) ** 2
    return rss / (n * shrinkage)
463
464"""
465Multi-label utility functions
466"""
467
468
469"""
470ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
471"""
472       
473import ctypes
474from numpy import ctypeslib
475import orange
476
# Load the compiled Earth implementation that is linked into the orange
# shared library and bind the EarthForwardPass entry point.
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# Argument types for EarthForwardPass (parameter names follow the C
# implementation from the earth R package).
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  #pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  #FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    #Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nPred
     ctypes.c_int,  #nMaxDegree
     ctypes.c_int,  #nMaxTerms
     ctypes.c_double,   #Penalty
     ctypes.c_double,   #Thresh
     ctypes.c_int,  #nFastK
     ctypes.c_double,   #FastBeta
     ctypes.c_double,   #NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #LinPreds
     ctypes.c_bool, #UseBetaCache
     ctypes.c_char_p    #sPredNames
     ]
505   
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Do the earth forward pass by calling the C implementation
    (``EarthForwardPass``).

    :param x: Predictor matrix (n_cases x n_preds).
    :param y: Response matrix (n_cases x n_resp, or a flat array).
    :return: ``(n_terms, used, bx, dirs, cuts)`` — the number of terms
        built, a boolean mask of used terms, the basis matrix and the
        per-term hinge directions and cut points.

    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    # After the reshape above y.ndim is always 2; the else branch is
    # defensive only.
    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the
    # C function calls exit(). So we must check here and raise an
    # exception instead, to avoid a process shutdown.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds,
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
573
574
# Bind the EvalSubsetsUsingXtx entry point (backward pruning in C).
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

# Argument types for EarthEvalSubsetsUsingXtx.
_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   #PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   #RssVec
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  #WeightsArg
     ]

# A non-zero return value signals linearly dependent terms in bx.
_c_eval_subsets_xtx.restype = ctypes.c_int
589
def subset_selection_xtx(X, Y):
    """ Subset selection using EvalSubsetsUsingXtx from the Earth package.

    Returns ``(subsets_ind, rss_vec)``: row ``k`` of ``subsets_ind``
    lists the term indices of the best subset of size ``k + 1``, and
    ``rss_vec[k]`` is the corresponding residual sum of squares.

    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    n_cases, n_terms = X.shape
    n_resp = Y.shape[1]
    # Outputs filled in by the C routine.
    prune_mask = numpy.zeros((n_terms, n_terms), dtype=ctypes.c_bool,
                             order="F")
    rss_vec = numpy.zeros((n_terms,), dtype=ctypes.c_double, order="F")
    case_weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")

    error = _c_eval_subsets_xtx(prune_mask, rss_vec, n_cases, n_resp,
                                n_terms, X, Y, case_weights)
    if error != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert the boolean mask columns into rows of term indices.
    subsets_ind = numpy.zeros((n_terms, n_terms), dtype=int)
    for size, used in enumerate(prune_mask.T):
        subsets_ind[size, :size + 1] = numpy.where(used)[0]

    return subsets_ind, rss_vec
619   
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package,
    using :func:`numpy.linalg.lstsq`.

    Performs backward elimination: starting from all terms, repeatedly
    drops the term whose removal increases the RSS the least (term 0,
    the intercept, is never dropped).

    :param X: Basis matrix (n_cases x n_terms).
    :param Y: Response matrix (n_cases x n_resp).
    :return: ``(subsets, rss_vec)`` where row ``k`` of ``subsets`` holds
        the term indices of the chosen subset of size ``k + 1`` and
        ``rss_vec[k]`` its residual sum of squares.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # Use an explicit list so that terms can be deleted below
    # (``range`` returned a list on Python 2 only).
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            rss_vec[subset_size] = numpy.sum(res)
        else:
            # lstsq returns empty residuals for rank-deficient systems;
            # compute the RSS explicitly in that case.
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # Estimated RSS increase from dropping each remaining term.
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
654   
def subset_selection_xtx2(X, Y):
    """ Another implementation (this uses qr decomp).

    Backward term elimination, equivalent in intent to
    :func:`subset_selection_xtx`, but built on a pivoted QR
    decomposition so that linearly dependent terms are detected and
    excluded up front.

    NOTE(review): relies on Python 2 ``range`` returning a list
    (``del working_set[i]`` below) — would need ``list(range(...))``
    on Python 3.

    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    working_set = range(col_count)
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # Estimated RSS increase from dropping each remaining term.
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
691   
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Do the model pruning pass: pick the term subset that minimizes
    the GCV score.

    .. todo:: pruned_terms, Leaps

    """
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        # The C routine rejects linearly dependent terms; fall back to
        # the lstsq-based implementation.
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    n_cases, n_terms = bx.shape
    # Effective parameter count per subset size (penalized per-knot).
    effective = numpy.arange(n_terms) + 1.0
    effective += penalty * (effective - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, n_cases, effective)
    best_i = numpy.argmin(gcv_vec)

    used = numpy.zeros((n_terms), dtype=bool)
    used[subsets[best_i, :best_i + 1]] = True

    return used, subsets, rss_vec, gcv_vec
715
716"""
717Printing functions.
718"""
719   
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model,
    one section per response variable.

    """
    # Pick the response variables the beta rows correspond to.
    if model.multitarget:
        response_vars = list(model.domain.class_vars)
    elif is_discrete(model.class_var):
        response_vars = model.expanded_class
    else:
        response_vars = [model.class_var]

    sections = [
        _format_response(model, var.name, beta_row, percision, indent)
        for var, beta_row in zip(response_vars, model.betas)
    ]
    return "\n\n".join(sections)
738
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """Format a single response column of the model as text."""
    header = "%s =" % resp_name
    pad = " " * indent
    fmt = "%." + str(percision) + "f"
    # Each entry: (sorted attributes of the term, formatted term text).
    terms = [([], fmt % betas[0])]
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if not used:
            continue
        beta_i += 1
        beta_txt = fmt % abs(betas[beta_i])
        knots = []
        term_attrs = []
        for d, c, attr in zip(model.dirs[i], model.cuts[i],
                              model.domain.attributes):
            if d != 0:
                knots.append(_format_knot(model, attr.name, d, c, percision))
                term_attrs.append(attr)
        sign = "-" if betas[beta_i] < 0 else "+"
        if knots:
            text = sign + " * ".join([beta_txt] + knots)
        else:
            text = sign + beta_txt
        terms.append((sorted(term_attrs), text))
    # Sort by len(term_attrs), then by term_attrs.
    terms.sort(key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [pad + t for _, t in terms])
765       
766def _format_knot(model, name, dir, cut, percision=3):
767    fmt = "%%.%if" % percision
768    if dir == 1:
769        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
770    elif dir == -1:
771        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
772    elif dir == 2:
773        txt = name
774    return txt
775
776   
777"""\
778Variable importance estimation
779------------------------------
780"""
781
782from collections import defaultdict
783
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        # (removed an unused ``svar = None`` local)
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known origin; the variable is its own source.
            source[var].append(var)
    return dict(source)
803
def map_to_source_var(var, sources):
    """Map ``var`` back to the source variable it was derived from,
    provided that source is in ``sources``; return ``None`` if no
    source can be found.

    """
    if var in sources:
        return var
    if var.source_variable in sources:
        return var.source_variable
    if isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
        return map_to_source_var(var.get_value_from.variable, sources)
    if isinstance(var.get_value_from, Orange.core.ImputeClassifier):
        inner = var.get_value_from.classifier_from_var.variable
        return map_to_source_var(inner, sources)
    return None
818   
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    Each attribute is summarized over the pruning subset sequence by
    three statistics: the number of term subsets it appears in
    (nsubsets), and its accumulated GCV and RSS improvements (both
    normalized to the [0, 100] range).

    :param model: Earth model.
    :type model: `EarthClassifier`
    :param used_only: If True (default) include only attributes that
        appear in at least one subset.
    :return: List of ``(attribute, (nsubsets, gcv, rss))`` tuples,
        sorted by decreasing importance.

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Per-step improvement as the subset size grows; rss/gcv_per_subset
    # decrease with subset size, so the diffs are negated to get
    # positive drops (presumably indexed by subset size — see pruning).
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Row layout: [attribute index, nsubsets, gcv, rss]
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        # Union of attributes used by any term in this subset.
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            # Credit the attribute with one more subset appearance and
            # with the improvement gained at this subset size.
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    # Normalize importances (scale the gcv and rss columns to [0, 100]).
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        # Drop attributes that never appeared in any subset.
        importances = importances[importances[:,1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
866
867
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

    :param evimp: Importances as returned by ``evimp``; an
        :obj:`EarthClassifier` or a ``BaggedClassifier`` instance may
        also be passed, in which case the importances are computed first.

    ::

        import Orange
        data = Orange.data.Table("housing")
        c = Orange.regression.earth.EarthLearner(data, degree=3)
        Orange.regression.earth.plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the normalized
    RSS and GCV.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Accept a (bagged) classifier directly and extract its importances.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    fig = pylab.figure()
    axes1 = fig.add_subplot(111)
    attrs = [a for a, _ in evimp]
    imp = [s for _, s in evimp]
    imp = numpy.array(imp)
    X = range(len(attrs))
    # nsubsets on the left axis (blue).
    l1 = axes1.plot(X, imp[:, 0], "b-",)
    # Second y axis for the normalized gcv (green) and rss (red) scores.
    axes2 = axes1.twinx()

    l2 = axes2.plot(X, imp[:, 1], "g-",)
    l3 = axes2.plot(X, imp[:, 2], "r-",)

    # One tick per attribute, labeled with the attribute name.
    x_axis = axes1.xaxis
    x_axis.set_ticks(X)
    x_axis.set_ticklabels([a.name for a in attrs], rotation=90)

    axes1.yaxis.set_label_text("nsubsets")
    axes2.yaxis.set_label_text("normalized gcv or rss")

    axes1.legend([l1, l2, l3], ["nsubsets", "gcv", "rss"])
    axes1.set_title("Variable importance")
    fig.show()
914
915   
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier

    Example::

        from Orange.ensemble.bagging import BaggedLearner
        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        bagged_evimp(bc)

    """
    def assert_type(object, class_):
        if not isinstance(object, class_):
            raise TypeError("Instance of %r expected." % (class_))

    from Orange.ensemble.bagging import BaggedClassifier

    assert_type(classifier, BaggedClassifier)

    # Collect per-member importance scores, keyed by attribute name
    # (attributes from different member models compare by name).
    scores_by_name = defaultdict(list)
    attrs_by_name = defaultdict(list)
    for member in classifier.classifiers:
        assert_type(member, EarthClassifier)
        for attr, score in evimp(member, used_only=used_only):
            scores_by_name[attr.name].append(score)
            attrs_by_name[attr.name].append(attr)

    # Average the score tuples over all member models.
    averaged = {}
    for name, scores in scores_by_name.items():
        averaged[name] = tuple(numpy.average(scores, axis=0))

    # Order by (nsubsets, gcv), most important first.
    ordered = sorted(averaged.items(),
                     key=lambda item: (item[1][0], item[1][1]),
                     reverse=True)

    # Map each name back to a representative attribute instance.
    combined = [(attrs_by_name[name][0], score) for name, score in ordered]

    if used_only:
        combined = [(a, s) for a, s in combined if s[0] > 0]
    return combined
956
957"""
958High level interface for measuring variable importance
959(compatible with Orange.feature.scoring module).
960
961"""
962from Orange.feature import scoring
963           
class ScoreEarthImportance(scoring.Score):
    """ A subclass of :class:`Orange.feature.scoring.Score` that
    scores features based on their importance in the Earth
    model using ``bagged_evimp``.

    """
    # Return types: indices into the (nsubsets, gcv, rss) evimp tuple.
    NSUBSETS = 0
    RSS = 1
    GCV = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Follow the scoring.Score calling convention: when constructed
        # with (attr, data), score immediately and return the score
        # instead of the instance.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets", cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).
        :param degree: Maximum degree of terms in the Earth models.
        :param terms: Maximum number of terms in the Earth models.
        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.
        :param cached: If True, cache the computed importances for the
            last scored data table.

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        # Normalize a string name to the corresponding class constant.
        if isinstance(score_what, basestring):
            score_what = {"nsubsets":self.NSUBSETS, "rss":self.RSS,
                          "gcv":self.GCV}.get(score_what, None)
                         
        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        self._cache_ref = None
        self._cached_evimp = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse cached importances when called repeatedly on the same
        # data table (compared by identity).
        ref = self._cache_ref
        if ref is not None and ref is data:
            evimp = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            bc = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            evimp = bagged_evimp(bc, used_only=False)
            self._cache_ref = data
            self._cached_evimp = evimp

        evimp = dict(evimp)
        score = evimp.get(attr, None)

        if score is None:
            # attr may be a preprocessed (e.g. continuized) variable;
            # map back to its derived variables and average their scores.
            source = collect_source(evimp.keys())
            if attr in source:
                # Return average of source var scores
                return numpy.average([evimp[v][self.score_what] \
                                      for v in source[attr]])
            else:
                raise ValueError("Attribute %r not in the domain." % attr)
        else:
            return score[self.score_what]
1041   
class ScoreRSS(scoring.Score):
    """ Score continuous features by the residual-sum-of-squares (RSS)
    improvement they contribute during linear subset selection on the
    data (see ``subset_selection_xtx2``).

    """
    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Follow the scoring.Score calling convention: when constructed
        # with (attr, data), score immediately and return the score.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self):
        # Per-instance cache of the last scored data table and its
        # per-attribute RSS contributions.
        self._cache_data = None
        self._cache_rss = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse cached scores when called repeatedly on the same table.
        ref = self._cache_data
        if ref is not None and ref is data:
            rss = self._cache_rss
        else:
            x, y = data.to_numpy_MA("1A/c")
            try:
                subsets, rss = subset_selection_xtx2(x, y)
            except numpy.linalg.LinAlgError:
                # Fall back to the slower numpy-based implementation.
                subsets, rss = subset_selection_xtx_numpy(x, y)
            # Distribute each per-step RSS drop to the columns that are
            # members of the subset at that step.
            rss_diff = -numpy.diff(rss)
            rss = numpy.zeros_like(rss)
            for s_size in range(1, subsets.shape[0]):
                subset = subsets[s_size, :s_size + 1]
                rss[subset] += rss_diff[s_size - 1]
            rss = rss[1:] #Drop the intercept
            self._cache_data = data
            self._cache_rss = rss

        index = list(data.domain.attributes).index(attr)
        return rss[index]
1084   
1085
1086#from Orange.core import EarthLearner as BaseEarthLearner, \
1087#                        EarthClassifier as BaseEarthClassifier
1088#from Orange.misc import member_set
1089#
1090#class _EarthLearner(BaseEarthLearner):
1091#    """ An earth learner.
1092#    """
1093#    def __new__(cls, data=None, weightId=None, **kwargs):
1094#        self = BaseEarthLearner.__new__(cls, **kwargs)
1095#        if data is not None:
1096#            self.__init__(**kwargs)
1097#            return self.__call__(data, weightId)
1098#        return self
1099#   
1100#    def __init__(self, max_degree=1, max_terms=21, new_var_penalty=0.0,
1101#                 threshold=0.001, prune=True, penalty=None, fast_k=20,
1102#                 fast_beta=0.0, store_examples=True, **kwargs):
1103#        """ Initialize the learner instance.
1104#       
1105#        :param max_degree:
1106#        """
1107#        self.max_degree = max_degree
1108#        self.max_terms = max_terms
1109#        self.new_var_penalty = new_var_penalty
1110#        self.threshold = threshold
1111#        self.prune = prunes
1112#        if penaty is None:
1113#            penalty = 2.0 if degree > 1 else 3.0
1114#        self.penalty = penalty
1115#        self.fast_k = fast_k
1116#        self.fast_beta = fast_beta
1117#        self.store_examples = store_examples
1118#       
1119#        for key, val in kwargs.items():
1120#            setattr(self, key, val)
1121#   
1122#    def __call__(self, data, weightId=None):
1123#        if not data.domain.class_var:
1124#            raise ValueError("No class var in the domain.")
1125#       
1126#        with member_set(self, "prune", False):
1127#            # We overwrite the prune argument (will do the pruning in python).
1128#            base_clsf =  BaseEarthLearner.__call__(self, data, weightId)
1129#       
1130#        if self.prune:
1131#            (best_set, betas, rss, subsets, rss_per_subset,
1132#             gcv_per_subset) = self.pruning_pass(base_clsf, data)
1133#           
1134#            return _EarthClassifier(base_clsf, data if self.store_examples else None,
1135#                                   best_set=best_set, dirs=base_clsf.dirs,
1136#                                   cuts=base_clsf.cuts,
1137#                                   betas=betas,
1138#                                   subsets=subsets,
1139#                                   rss_per_subset=rss_per_subset,
1140#                                   gcv_per_subset=gcv_per_subset)
1141#        else:
1142#            return _EarthClassifier(base_clsf, data if self.store_examples else None)
1143#   
1144#   
1145#    def pruning_pass(self, base_clsf, examples):
1146#        """ Prune the terms constructed in the forward pass.
1147#        (Pure numpy reimplementation)
1148#        """
1149#        n_terms = numpy.sum(base_clsf.best_set)
1150#        n_eff_params = n_terms + self.penalty * (n_terms - 1) / 2.0
1151#        data, y, _ = examples.to_numpy_MA()
1152#        data = data.filled(0.0)
1153#        best_set = numpy.asarray(base_clsf.best_set, dtype=bool)
1154#       
1155#        bx = base_matrix(data, base_clsf.best_set,
1156#                         base_clsf.dirs, base_clsf.cuts,
1157#                         )
1158#       
1159#        bx_used = bx[:, best_set]
1160#        subsets, rss_per_subset = subsets_selection_xtx(bx_used, y) # TODO: Use leaps like library
1161#        gcv_per_subset = [gcv(rss, bx.shape[0], i + self.penalty * (i - 1) / 2.0) \
1162#                              for i, rss in enumerate(rss_per_subset, 1)]
1163#        gcv_per_subset = numpy.array(gcv_per_subset)
1164#       
1165#        best_i = numpy.argmin(gcv_per_subset[1:]) + 1 # Ignore the intercept
1166#        best_ind = subsets[best_i, :best_i + 1]
1167#        bs_i = 0
1168#        for i, b in enumerate(best_set):
1169#            if b:
1170#                best_set[i] = bs_i in best_ind
1171#                bs_i += 1
1172#               
1173#        bx_subset = bx[:, best_set]
1174#        betas, rss, rank, s = numpy.linalg.lstsq(bx_subset, y)
1175#        return best_set, betas, rss, subsets, rss_per_subset, gcv_per_subset
1176#   
1177#       
1178#class _EarthClassifier(Orange.core.ClassifierFD):
1179#    def __init__(self, base_classifier=None, examples=None, best_set=None,
1180#                 dirs=None, cuts=None, betas=None, subsets=None,
1181#                 rss_per_subset=None,
1182#                 gcv_per_subset=None):
1183#        self._base_classifier = base_classifier
1184#        self.examples = examples
1185#        self.domain = base_classifier.domain
1186#        self.class_var = base_classifier.class_var
1187#       
1188#        best_set = base_classifier.best_set if best_set is None else best_set
1189#        dirs = base_classifier.dirs if dirs is None else dirs
1190#        cuts = base_classifier.cuts if cuts is None else cuts
1191#        betas = base_classifier.betas if betas is None else betas
1192#       
1193#        self.best_set = numpy.asarray(best_set, dtype=bool)
1194#        self.dirs = numpy.array(dirs, dtype=int)
1195#        self.cuts = numpy.array(cuts)
1196#        self.betas = numpy.array(betas)
1197#       
1198#        self.subsets = subsets
1199#        self.rss_per_subset = rss_per_subset
1200#        self.gcv_per_subset = gcv_per_subset
1201#       
1202#    @property
1203#    def num_terms(self):
1204#        """ Number of terms in the model (including the intercept).
1205#        """
1206#        return numpy.sum(numpy.asarray(self.best_set, dtype=int))
1207#   
1208#    @property
1209#    def max_terms(self):
1210#        """ Maximum number of terms (as specified in the learning step).
1211#        """
1212#        return self.best_set.size
1213#   
1214#    @property
1215#    def num_preds(self):
1216#        """ Number of predictors (variables) included in the model.
1217#        """
1218#        return len(self.used_attributes(term))
1219#   
1220#    def __call__(self, example, what=Orange.core.GetValue):
1221#        value = self.predict(example)
1222#        if isinstance(self.class_var, Orange.feature.Continuous):
1223#            value = self.class_var(value)
1224#        else:
1225#            value = self.class_var(int(round(value)))
1226#           
1227#        dist = Orange.statistics.distribution.Distribution(self.class_var)
1228#        dist[value] = 1.0
1229#        if what == Orange.core.GetValue:
1230#            return value
1231#        elif what == Orange.core.GetProbabilities:
1232#            return dist
1233#        else:
1234#            return (value, dist)
1235#   
1236#    def base_matrix(self, examples=None):
1237#        """ Return the base matrix (bx) of the Earth model for the table.
1238#        If table is not supplied the base matrix of the training examples
1239#        is returned.
1240#       
1241#       
1242#        :param examples: Input examples for the base matrix.
1243#        :type examples: Orange.data.Table
1244#       
1245#        """
1246#        if examples is None:
1247#            examples = self.examples
1248#           
1249#        if examples is None:
1250#            raise ValueError("base matrix is only available if 'store_examples=True'")
1251#       
1252#        if isinstance(examples, Orange.data.Table):
1253#            data, cls, w = examples.to_numpy_MA()
1254#            data = data.filled(0.0)
1255#        else:
1256#            data = numpy.asarray(examples)
1257#           
1258#        return base_matrix(data, self.best_set, self.dirs, self.cuts)
1259#   
1260#    def _anova_order(self):
1261#        """ Return indices that sort the terms into the 'ANOVA' format.
1262#        """
1263#        terms = [([], 0)] # intercept
1264#        for i, used in enumerate(self.best_set[1:], 1):
1265#            attrs = sorted(self.used_attributes(i))
1266#            if used and attrs:
1267#                terms.append((attrs, i))
1268#        terms = sotred(terms, key=lambda t:(len(t[0]), t[0]))
1269#        return [i for _, i in terms]
1270#   
1271#    def format_model(self, percision=3, indent=3):
1272#        return format_model(self, percision, indent)
1273#   
1274#    def print_model(self, percision=3, indent=3):
1275#        print self.format_model(percision, indent)
1276#       
1277#    def predict(self, ex):
1278#        """ Return the predicted value (float) for example.
1279#        """
1280#        x = Orange.data.Table(self.domain, [ex])
1281#        x, c, w = x.to_numpy_MA()
1282#        x = x.filled(0.0)[0]
1283#       
1284#        bx = numpy.ones(self.best_set.shape)
1285#        betas = numpy.zeros_like(self.betas)
1286#        betas[0] = self.betas[0]
1287#        beta_i = 0
1288#        for termi in range(1, len(self.best_set)):
1289#            dirs = self.dirs[termi]
1290#            cuts = self.cuts[termi]
1291#            dir_p1 = numpy.where(dirs == 1)[0]
1292#            dir_m1 = numpy.where(dirs == -1)[0]
1293#            dir_2 = numpy.where(dirs == 2)[0]
1294#           
1295#            x1 = x[dir_p1] - cuts[dir_p1]
1296#            x2 = cuts[dir_m1] - x[dir_m1]
1297#            x3 = x[dir_2]
1298#           
1299#            x1 = numpy.maximum(x1, 0.0)
1300#            x2 = numpy.maximum(x2, 0.0)
1301#
1302#            X = numpy.hstack((x1, x2, x3))
1303#            X = numpy.cumprod(X)
1304#           
1305#            bx[termi] = X[-1] if X.size else 0.0
1306#            if self.best_set[termi] != 0:
1307#                beta_i += 1
1308#                betas[beta_i] = self.betas[beta_i]
1309#
1310#        return numpy.sum(bx[self.best_set] * betas)
1311#           
1312#    def used_attributes(self, term=None):
1313#        """ Return a list of used attributes. If term (index) is given
1314#        return only attributes used in that single term.
1315#       
1316#        """
1317#        if term is None:
1318#            terms = numpy.where(self.best_set)[0]
1319#        else:
1320#            terms = [term]
1321#        attrs = set()
1322#        for ti in terms:
1323#            attri = numpy.where(self.dirs[ti] != 0.0)[0]
1324#            attrs.update([self.domain.attributes[i] for i in attri])
1325#        return attrs
1326#       
1327#    def evimp(self, used_only=True):
1328#        """ Return the estimated variable importance.
1329#        """
1330#        return evimp(self, used_only)
1331#       
1332#    def __reduce__(self):
1333#        return (EarthClassifier, (self._base_classifier, self.examples,
1334#                                  self.best_set, self.dirs, self.cuts,
1335#                                  self.betas, self.subsets,
1336#                                  self.rss_per_subset, self.gcv_per_subset),
1337#                {})
1338                                 
# Note: See TracBrowser for help on using the repository browser.