source: orange/Orange/regression/earth.py @ 10330:616584f3fd52

Revision 10330:616584f3fd52, 47.6 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Removed old unneeded code (for old multi-label domains) from earth module.

Line 
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`: http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
16.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
17
18Example ::
19
20    >>> import Orange
21    >>> data = Orange.data.Table("housing")
22    >>> c = Orange.regression.earth.EarthLearner(data, degree=2, terms=10)
23    >>> print c
24    MEDV =
25       23.587
26       +11.896 * max(0, RM - 6.431)
27       +1.142 * max(0, 6.431 - RM)
28       -0.612 * max(0, LSTAT - 6.120)
29       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
30       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
31       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
32
33
34.. autoclass:: EarthLearner
35    :members:
36
37.. autoclass:: EarthClassifier
38    :members:
39
40
41Utility functions
42-----------------
43
44.. autofunction:: gcv
45
46.. autofunction:: plot_evimp
47
48.. autofunction:: bagged_evimp
49
50.. autoclass:: ScoreEarthImportance
51
52"""
53
54import Orange
55from Orange.feature import Discrete, Continuous
56from Orange.data import Table, Domain
57from Orange.preprocess import Preprocessor_continuize, \
58                              Preprocessor_impute, \
59                              Preprocessor_preprocessorList, \
60                              DomainContinuizer
61
62import numpy
63
def is_discrete(var):
    """Return True if ``var`` is a discrete (:class:`Discrete`) variable."""
    return isinstance(var, Discrete)
66
def is_continuous(var):
    """Return True if ``var`` is a continuous (:class:`Continuous`) variable."""
    return isinstance(var, Continuous)
69
def expand_discrete(var):
    """Expand a discrete variable ``var``, returning one continuous
    indicator variable per value of ``var`` (if ``var`` has two or fewer
    values, only a single indicator variable is returned).

    """
    n_values = len(var.values)
    if n_values > 2:
        expand_values = var.values
    elif n_values == 2:
        # Binary variable: a single indicator for the second value suffices.
        expand_values = var.values[-1:]
    else:
        expand_values = var.values[:1]

    indicators = []
    for value in expand_values:
        indicator = Continuous("{0}={1}".format(var.name, value))
        transform = Orange.core.ClassifierFromVar(whichVar=var)
        transform.transformer = Orange.core.Discrete2Continuous()
        transform.transformer.value = int(Orange.core.Value(var, value))
        indicator.get_value_from = transform
        indicator.source_variable = var
        indicators.append(indicator)
    return indicators
91
def select_attrs(table, features, class_var=None,
                 class_vars=None, metas=None):
    """Return a new :class:`Table` containing only ``features`` (plus the
    given class variable(s) and metas) selected from ``table``.

    """
    if class_vars is None:
        new_domain = Domain(features, class_var)
    else:
        new_domain = Domain(features, class_var, class_vars=class_vars)
    if metas:
        new_domain.add_metas(metas)
    return Table(new_domain, table)
103   
class EarthLearner(Orange.regression.base.BaseRegressionLearner):
    """Earth learner class. Supports both regression and classification
    problems. In case of classification the class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is learned on
    these new columns. The resulting classifier will then use the computed
    response values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
        if instances is not None:
            # Convenience: EarthLearner(data, ...) trains immediately and
            # returns the fitted classifier instead of the learner.
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                **kwds):
        """Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model.
        :type degree: int
        :param terms: Maximum number of terms in the forward pass (default 21).

            .. note:: If this parameter is None then
                ``min(200, max(20, 2 * n_attributes)) + 1`` will be used. This
                is the same as the default setting in earth R package.

        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). By default it is 3.0 if the degree > 1,
            2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default None - no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default
            True - ignored for multi response models).
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default True).
        :type store_instances: bool

        .. todo:: min_span, prunning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """
        super(EarthLearner, self).__init__()

        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Default penalty depends on whether interactions are allowed.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.__dict__.update(kwds)

        # The class column is handled explicitly in __call__, so the
        # continuizer must leave it alone.
        self.continuizer.class_treatment = DomainContinuizer.Ignore

    def __call__(self, instances, weight_id=None):
        """Train an :class:`EarthClassifier` on ``instances`` and return it.

        :param instances: Training data (single-class, discrete-class or
            continuous multi-target domain).
        :type instances: :class:`Orange.data.Table`
        :raises ValueError: if the domain has no class variable or an
            unsupported response type.
        :raises TypeError: if a multi-target domain has non-continuous
            class variables.

        """
        expanded_class = None
        multitarget = False

        if instances.domain.class_var:
            instances = self.impute_table(instances)
            instances = self.continuize_table(instances)

            if is_discrete(instances.domain.class_var):
                # Expand a discrete class with indicator columns
                expanded_class = expand_discrete(instances.domain.class_var)
                y_table = select_attrs(instances, expanded_class)
                (y, ) = y_table.to_numpy_MA("A")
                (x, ) = instances.to_numpy_MA("A")
            elif is_continuous(instances.domain.class_var):
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")
        elif instances.domain.class_vars:
            # Multi-target domain
            if not all(map(is_continuous, instances.domain.class_vars)):
                raise TypeError("Only continuous multi-target classes are supported.")
            x_table = select_attrs(instances, instances.domain.attributes)
            y_table = select_attrs(instances, instances.domain.class_vars)

            # Impute and continuize only the x_table
            x_table = self.impute_table(x_table)
            x_table = self.continuize_table(x_table)
            # NOTE(review): 'domain' is constructed but never used below --
            # TODO confirm it can be removed.
            domain = Domain(x_table.domain.attributes,
                            class_vars=instances.domain.class_vars)

            (x, ) = x_table.to_numpy_MA("A")
            (y, ) = y_table.to_numpy_MA("A")

            multitarget = True
        else:
            raise ValueError("Class variable expected.")

        if self.scale_resp and y.shape[1] == 1:
            sy = y - numpy.mean(y, axis=0)
            # Standardize the single response column over the instances
            # (axis=0). The previous code used axis=1, which computed the
            # std of each one-element row (always 0) and divided by zero.
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas on the unscaled responses using the pruned basis.
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               multitarget=multitarget,
                               expanded_class=expanded_class
                               )
260
261
def soft_max(values):
    """Return the softmax of ``values`` (a 1D sequence of numbers).

    The input is shifted by its maximum before exponentiation so that
    large values cannot overflow ``numpy.exp``; the result is unchanged
    because softmax is invariant under a constant shift.

    """
    values = numpy.asarray(values, dtype=float)
    exps = numpy.exp(values - numpy.max(values))
    return exps / numpy.sum(exps)
265
266
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.

    Holds a fitted Earth (MARS) model: the selected terms (``best_set``),
    per-term hinge directions (``dirs``) and cut points (``cuts``), and the
    fitted coefficients (``betas``). Supports single continuous responses,
    discrete classes (via expanded indicator responses) and continuous
    multi-target domains.
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 multitarget=False, expanded_class=None,
                 original_domain=None, **kwargs):
        """Initialize the classifier.

        :param domain: Training domain (after imputation/continuization).
        :param best_set: Boolean array marking the terms kept by pruning.
        :param dirs: Hinge directions, one row per term.
        :param cuts: Hinge cut points, one row per term.
        :param betas: Coefficient matrix (one row per response).
        :param subsets: Term subsets from the pruning pass (optional).
        :param rss_per_subset: RSS for each subset size (optional).
        :param gcv_per_subset: GCV for each subset size (optional).
        :param instances: Stored training instances (optional).
        :param multitarget: True for multi response (multi-target) models.
        :param expanded_class: Indicator variables for a discrete class.
        :param original_domain: Domain prior to preprocessing (optional).

        """
        self.multitarget = multitarget
        self.domain = domain
        self.class_var = domain.class_var
        if self.multitarget:
            self.class_vars = domain.class_vars

        self.best_set = best_set
        self.dirs = dirs
        self.cuts = cuts
        self.betas = betas
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """Classify ``instance``, returning a value, a distribution, or
        both, depending on ``result_type``. For multi-target models lists
        of values/distributions are returned instead of single ones.
        """
        # Determine which variables the predicted responses correspond to.
        if self.multitarget and self.domain.class_vars:
            resp_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multitarget and is_discrete(self.class_var):
            # Map the indicator-column responses back to a distribution
            # over the original discrete class values.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                # A single indicator column encodes the second value.
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous responses: return degenerate (point) distributions.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        if not self.multitarget:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        instances = select_attrs(instances, self.domain.attributes)
        (data,) = instances.to_numpy_MA("A")
        # Delegate to the module-level base_matrix utility.
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def predict(self, instance):
        """ Predict the response values for the instance

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        # Convert the instance to this model's domain, evaluate the used
        # terms and combine them linearly with the fitted betas.
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """ Return the used terms for term (index). If no term is given
        return all attributes in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of attributes over all terms.
            # NOTE(review): relies on the builtin `reduce` (Python 2).
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())

        attrs = self.domain.attributes

        # An attribute takes part in the term iff its direction is nonzero.
        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """ 
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support: rebuild from the required constructor arguments;
        # all remaining state is restored from __dict__.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        return format_model(self, percision, indent)

    def __str__(self):
        """Return the formatted model (same as :meth:`to_string`)."""
        return self.to_string()
402   
403
404"""
405Utility functions
406-----------------
407"""
408   
def base_matrix(data, best_set, dirs, cuts):
    """ Return the base matrix for the earth model.

    :param data: Input data
    :type data: :class:`numpy.ndarray`

    :param best_set: A array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs members
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts members
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_cases = data.shape[0]
    n_terms = best_set.shape[0]
    bx = numpy.zeros((n_cases, n_terms))
    bx[:, 0] = 1.0  # The intercept term.

    for term in range(1, n_terms):
        term_dirs = dirs[term]
        term_cuts = cuts[term]

        plus = numpy.where(term_dirs == 1)[0]
        minus = numpy.where(term_dirs == -1)[0]
        linear = numpy.where(term_dirs == 2)[0]

        # Hinge factors max(0, x - cut) and max(0, cut - x), plus any
        # attributes entering the term linearly.
        hinge_plus = numpy.maximum(data[:, plus] - term_cuts[plus], 0.0)
        hinge_minus = numpy.maximum(term_cuts[minus] - data[:, minus], 0.0)
        factors = numpy.hstack((hinge_plus, hinge_minus, data[:, linear]))

        if factors.size:
            bx[:, term] = numpy.prod(factors, axis=1)
        else:
            # A term with no factors (unused) contributes nothing.
            bx[:, term] = 0.0

    return bx
452
453   
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective parameters.

    """
    shrinkage = (1. - n_effective_params / n) ** 2
    return rss / (n * shrinkage)
465
466"""
467Multi-label utility functions
468"""
469
470
471"""
472ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
473"""
474       
import ctypes
from numpy import ctypeslib
import orange

# The Earth C routines are compiled into the orange extension module;
# load it as a plain shared library to reach them through ctypes.
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# Argument layout of the C EarthForwardPass routine. The ndpointer
# entries require Fortran-contiguous arrays of the exact dtype; see
# forward_pass() below for how the buffers are allocated.
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  #pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  #FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    #Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nPred
     ctypes.c_int,  #nMaxDegree
     ctypes.c_int,  #nMaxTerms
     ctypes.c_double,   #Penalty
     ctypes.c_double,   #Thresh
     ctypes.c_int,  #nFastK
     ctypes.c_double,   #FastBeta
     ctypes.c_double,   #NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #LinPreds
     ctypes.c_bool, #UseBetaCache
     ctypes.c_char_p    #sPredNames
     ]
507   
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """Do the earth forward pass (delegates to the C ``EarthForwardPass``).

    :param x: Predictor matrix (cases x predictors).
    :param y: Response matrix (cases x responses); a 1D array is treated
        as a single response column.
    :param degree: Maximum degree of interaction terms.
    :param terms: Maximum number of terms to build.
    :param penalty: GCV penalty per knot (default 2 when None).
    :param thresh: Stopping threshold on RSS improvement.
    :param fast_k: Fast MARS k parameter.
    :param fast_beta: Fast MARS ageing coefficient.
    :param new_var_penalty: Penalty for introducing a new variable.
    :return: ``(n_terms, full_set, bx, dirs, cuts)``.
    :raises ValueError: on any argument outside the ranges the C code
        would reject (see the checks below).

    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables (allocated here, filled in by the C routine).
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the C
    # function calls exit(). So we must check here and raise an exception
    # instead, to avoid shutting down the whole process.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds, 
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
575
576
# ctypes binding for the Earth pruning routine from the same shared library.
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

# Argument layout of the C EarthEvalSubsetsUsingXtx routine; arrays must
# be Fortran-contiguous with the exact dtype (see subset_selection_xtx).
_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   #PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   #RssVec
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  #WeightsArg
     ]

# A nonzero return value signals linearly dependent terms in bx.
_c_eval_subsets_xtx.restype = ctypes.c_int
591
def subset_selection_xtx(X, Y):
    """ Subsets selection using EvalSubsetsUsingXtx in the Earth package.

    Delegates to the C routine and returns ``(subsets_ind, rss_vec)``:
    row ``i`` of ``subsets_ind`` holds the column indices of the best
    subset of size ``i + 1``, and ``rss_vec[i]`` is that subset's
    residual sum of squares.

    :raises ValueError: if ``X`` and ``Y`` have mismatched first dimensions.
    :raises numpy.linalg.LinAlgError: if the C routine reports linearly
        dependent terms in ``X``.
    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    var_count = X.shape[1]
    resp_count = Y.shape[1]
    cases = X.shape[0]
    # Output buffers for the C call.
    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
                              order="F")
    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")

    rval = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count, var_count,
                        X, Y, weights)
    if rval != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert each boolean subset mask (one per column) to a row of
    # term indices.
    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
    for i, used in enumerate(subsets.T):
        subsets_ind[i, :i + 1] = numpy.where(used)[0]

    return subsets_ind, rss_vec
621   
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package.
    Using numpy.linalg.lstsq

    Performs backward elimination: starting from all terms (columns of
    ``X``), repeatedly drops the term whose removal is estimated to
    increase the residual sum of squares the least. The intercept
    (column 0) is never dropped.

    :param X: Basis matrix (cases x terms); column 0 is the intercept.
    :param Y: Response matrix (cases x responses).
    :return: ``(subsets, rss_vec)`` where row ``i`` of ``subsets`` holds
        the term indices of the best subset of size ``i + 1`` and
        ``rss_vec[i]`` is that subset's residual sum of squares.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # Materialize as a list so terms can be deleted in place (under
    # Python 3, ``range`` is not a mutable sequence).
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            rss_vec[subset_size] = numpy.sum(res)
        else:
            # lstsq returns an empty residual array for rank-deficient
            # systems; compute the RSS explicitly.
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # Estimated RSS increase from dropping each remaining term.
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
656   
def subset_selection_xtx2(X, Y):
    """ Another implementation (this uses qr decomp).

    Backward elimination like :func:`subset_selection_xtx_numpy`, but
    solving the least-squares problems with Orange's QR decomposition.
    Rank-deficient columns found by the pivoted QR are removed up front
    and their subset sizes get an infinite RSS.
    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    # NOTE(review): relies on Python 2 `range` returning a list (mutated
    # below with `del`).
    working_set = range(col_count)
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # Estimated RSS increase from dropping each remaining term.
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
693   
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Do pruning pass

    .. todo:: pruned_terms, Leaps

    """
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        # Fall back to the numpy implementation when the C routine
        # reports linearly dependent terms.
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    n_cases, n_terms = bx.shape
    # Effective number of parameters for each subset size (GCV penalty).
    effective_params = numpy.arange(n_terms) + 1.0
    effective_params += penalty * (effective_params - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, n_cases, effective_params)

    # Keep the subset with the smallest GCV.
    best_i = numpy.argmin(gcv_vec)
    used = numpy.zeros((n_terms), dtype=bool)
    used[subsets[best_i, :best_i + 1]] = True

    return used, subsets, rss_vec, gcv_vec
717
718"""
719Printing functions.
720"""
721   
def format_model(model, percision=3, indent=3):
    """ Return a formated string representation of the earth model.
    """
    # Collect the response variables the beta rows correspond to.
    if model.multitarget:
        response_vars = list(model.domain.class_vars)
    elif is_discrete(model.class_var):
        response_vars = model.expanded_class
    else:
        response_vars = [model.class_var]

    sections = []
    for var, var_betas in zip(response_vars, model.betas):
        sections.append(_format_response(model, var.name, var_betas,
                                         percision, indent))
    return "\n\n".join(sections)
740
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """Format one response (one row of betas) of ``model`` as text."""
    header = "%s =" % resp_name
    pad = " " * indent
    fmt = "%." + str(percision) + "f"
    # Each entry: (attributes used by the term, formatted term text).
    terms = [([], fmt % betas[0])]  # The intercept.
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if not used:
            continue
        beta_i += 1
        beta_txt = fmt % abs(betas[beta_i])
        knots = []
        term_attrs = []
        for d, c, attr in zip(model.dirs[i], model.cuts[i],
                              model.domain.attributes):
            if d != 0:
                knots.append(_format_knot(model, attr.name, d, c, percision))
                term_attrs.append(attr)
        term_attrs = sorted(term_attrs)
        sign = "-" if betas[beta_i] < 0 else "+"
        if knots:
            terms.append((term_attrs,
                          sign + " * ".join([beta_txt] + knots)))
        else:
            terms.append((term_attrs, sign + beta_txt))
    # Sort by len(term_attrs), then by term_attrs
    terms.sort(key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [pad + text for _, text in terms])
767       
768def _format_knot(model, name, dir, cut, percision=3):
769    fmt = "%%.%if" % percision
770    if dir == 1:
771        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
772    elif dir == -1:
773        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
774    elif dir == 2:
775        txt = name
776    return txt
777
778   
779"""\
780Variable importance estimation
781------------------------------
782"""
783
784from collections import defaultdict
785
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        # (Removed a dead local: 'svar = None' was assigned but never used.)
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known source; the variable maps to itself.
            source[var].append(var)
    return dict(source)
805
def map_to_source_var(var, sources):
    """Map ``var`` to the source variable it was derived from.

    Follows ``source_variable`` and ``get_value_from`` links (as set up
    by the continuization/imputation preprocessors) until a variable in
    ``sources`` is reached; returns None when no source can be resolved.

    :param var: Variable to map.
    :param sources: Collection of candidate source variables (typically
        the keys of the mapping returned by :func:`collect_source`).

    """
    if var in sources:
        return var
    elif var.source_variable in sources:
        return var.source_variable
    elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
        # Recurse through the continuization transform.
        return map_to_source_var(var.get_value_from.variable, sources)
    elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
        # Recurse through the imputation wrapper.
        var = var.get_value_from.classifier_from_var.variable
        return map_to_source_var(var, sources)
    else:
        return None
820   
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    Each attribute is paired with a 3-tuple ``(nsubsets, gcv, rss)``:
    ``nsubsets`` counts the pruning-pass subsets in which the attribute
    is used, while ``gcv`` and ``rss`` accumulate the per-subset score
    decrease attributed to it (normalized to the 0..100 range).

    :param model: Earth model.
    :type model: `EarthClassifier`
    :param used_only: If True (default), include only attributes that
        appear in at least one subset.

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Per-step decrease of the rss/gcv scores between consecutive subsets.
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Columns: [attribute index, nsubsets, gcv decrease, rss decrease].
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        # Union of attributes used by any term in this subset.
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    # Normalize the gcv/rss columns to the 0..100 range.
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        # Drop attributes that never appeared in any subset.
        importances = importances[importances[:,1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
868
869
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

    ::

        import Orange
        data = Orange.data.Table("housing")
        c = Orange.regression.earth.EarthLearner(data, degree=3)
        Orange.regression.earth.plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the normalized
    RSS and GCV.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Also accept a classifier directly and extract importances from it.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    figure = pylab.figure()
    left_axes = figure.add_subplot(111)
    variables = [var for var, _ in evimp]
    scores = numpy.array([score for _, score in evimp])
    positions = range(len(variables))
    # nsubsets on the left axis, normalized gcv/rss on a twinned right axis.
    line_nsubsets = left_axes.plot(positions, scores[:, 0], "b-",)
    right_axes = left_axes.twinx()

    line_gcv = right_axes.plot(positions, scores[:, 1], "g-",)
    line_rss = right_axes.plot(positions, scores[:, 2], "r-",)

    ticks = left_axes.xaxis
    ticks.set_ticks(positions)
    ticks.set_ticklabels([var.name for var in variables], rotation=90)

    left_axes.yaxis.set_label_text("nsubsets")
    right_axes.yaxis.set_label_text("normalized gcv or rss")

    left_axes.legend([line_nsubsets, line_gcv, line_rss],
                     ["nsubsets", "gcv", "rss"])
    left_axes.set_title("Variable importance")
    figure.show()
916
917   
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier

    Example::

        from Orange.ensemble.bagging import BaggedLearner
        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        bagged_evimp(bc)

    """
    def assert_type(object, class_):
        if not isinstance(object, class_):
            raise TypeError("Instance of %r expected." % (class_))

    from Orange.ensemble.bagging import BaggedClassifier

    assert_type(classifier, BaggedClassifier)
    # Collect the per-member scores, keyed by attribute *name* (the
    # individual models may hold distinct variable objects).
    scores_by_name = defaultdict(list)
    attrs_by_name = defaultdict(list)
    for member in classifier.classifiers:
        assert_type(member, EarthClassifier)
        for attr, score in evimp(member, used_only=used_only):
            scores_by_name[attr.name].append(score)
            attrs_by_name[attr.name].append(attr)

    # Average each attribute's scores across the ensemble.
    averaged = dict((name, tuple(numpy.average(scores, axis=0)))
                    for name, scores in scores_by_name.items())

    # Rank by nsubsets, then by gcv, descending.
    ranked = sorted(averaged.items(),
                    key=lambda t: (t[1][0], t[1][1]),
                    reverse=True)

    combined = [(attrs_by_name[name][0], score) for name, score in ranked]

    if used_only:
        combined = [(a, r) for a, r in combined if r[0] > 0]
    return combined
958
959"""
960High level interface for measuring variable importance
961(compatible with Orange.feature.scoring module).
962
963"""
964from Orange.feature import scoring
965           
class ScoreEarthImportance(scoring.Score):
    """ An :class:`Orange.feature.scoring.Score` subclass.
    Scores features based on their importance in the Earth
    model, as estimated by :func:`bagged_evimp`.

    """
    # Return types (indices into the evimp (nsubsets, gcv, rss) tuple).
    NSUBSETS = 0
    RSS = 1
    GCV = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scoring convention: constructing with 'attr' and 'data'
        # computes and returns the score itself rather than the instance.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets", cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).

        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        if isinstance(score_what, basestring):
            # Unknown names map to None and fail the range check below.
            score_what = {"nsubsets":self.NSUBSETS, "rss":self.RSS,
                          "gcv":self.GCV}.get(score_what, None)

        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        # NOTE(review): 'cached' is stored but never consulted in
        # __call__; the evimp result is always cached by data identity.
        self.cached = cached
        self._cache_ref = None
        self._cached_evimp = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse the bagged evimp when called again with the very same
        # data object (identity check, not equality).
        ref = self._cache_ref
        if ref is not None and ref is data:
            evimp = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            bc = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            evimp = bagged_evimp(bc, used_only=False)
            self._cache_ref = data
            self._cached_evimp = evimp

        evimp = dict(evimp)
        score = evimp.get(attr, None)

        if score is None:
            # 'attr' may be a source (pre-preprocessing) variable; score
            # it as the average over its derived variables' scores.
            source = collect_source(evimp.keys())
            if attr in source:
                # Return average of source var scores
                return numpy.average([evimp[v][self.score_what] \
                                      for v in source[attr]])
            else:
                raise ValueError("Attribute %r not in the domain." % attr)
        else:
            return score[self.score_what]
1043   
class ScoreRSS(scoring.Score):
    """ Score features by the residual sum of squares decrease they
    account for in a linear subset-selection pass over the data.

    """

    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scoring convention: constructing with 'attr' and 'data'
        # computes and returns the score itself rather than the instance.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self):
        # Identity-based cache of the last scored data table.
        self._cache_data = None
        self._cache_rss = None

    def __call__(self, attr, data, weight_id=None):
        if self._cache_data is not None and self._cache_data is data:
            rss = self._cache_rss
        else:
            rss = self._rss_per_attribute(data)
            self._cache_data = data
            self._cache_rss = rss

        index = list(data.domain.attributes).index(attr)
        return rss[index]

    def _rss_per_attribute(self, data):
        # Accumulate, for each attribute, the RSS decrease over every
        # selected subset in which the attribute participates.
        x, y = data.to_numpy_MA("1A/c")
        try:
            subsets, rss = subset_selection_xtx2(x, y)
        except numpy.linalg.LinAlgError:
            subsets, rss = subset_selection_xtx_numpy(x, y)
        rss_diff = -numpy.diff(rss)
        scores = numpy.zeros_like(rss)
        for size in range(1, subsets.shape[0]):
            members = subsets[size, :size + 1]
            scores[members] += rss_diff[size - 1]
        return scores[1:] #Drop the intercept
1086   
1087
1088#from Orange.core import EarthLearner as BaseEarthLearner, \
1089#                        EarthClassifier as BaseEarthClassifier
1090#from Orange.misc import member_set
1091#
1092#class _EarthLearner(BaseEarthLearner):
1093#    """ An earth learner.
1094#    """
1095#    def __new__(cls, data=None, weightId=None, **kwargs):
1096#        self = BaseEarthLearner.__new__(cls, **kwargs)
1097#        if data is not None:
1098#            self.__init__(**kwargs)
1099#            return self.__call__(data, weightId)
1100#        return self
1101#   
1102#    def __init__(self, max_degree=1, max_terms=21, new_var_penalty=0.0,
1103#                 threshold=0.001, prune=True, penalty=None, fast_k=20,
1104#                 fast_beta=0.0, store_examples=True, **kwargs):
1105#        """ Initialize the learner instance.
1106#       
1107#        :param max_degree:
1108#        """
1109#        self.max_degree = max_degree
1110#        self.max_terms = max_terms
1111#        self.new_var_penalty = new_var_penalty
1112#        self.threshold = threshold
1113#        self.prune = prunes
1114#        if penaty is None:
1115#            penalty = 2.0 if degree > 1 else 3.0
1116#        self.penalty = penalty
1117#        self.fast_k = fast_k
1118#        self.fast_beta = fast_beta
1119#        self.store_examples = store_examples
1120#       
1121#        for key, val in kwargs.items():
1122#            setattr(self, key, val)
1123#   
1124#    def __call__(self, data, weightId=None):
1125#        if not data.domain.class_var:
1126#            raise ValueError("No class var in the domain.")
1127#       
1128#        with member_set(self, "prune", False):
1129#            # We overwrite the prune argument (will do the pruning in python).
1130#            base_clsf =  BaseEarthLearner.__call__(self, data, weightId)
1131#       
1132#        if self.prune:
1133#            (best_set, betas, rss, subsets, rss_per_subset,
1134#             gcv_per_subset) = self.pruning_pass(base_clsf, data)
1135#           
1136#            return _EarthClassifier(base_clsf, data if self.store_examples else None,
1137#                                   best_set=best_set, dirs=base_clsf.dirs,
1138#                                   cuts=base_clsf.cuts,
1139#                                   betas=betas,
1140#                                   subsets=subsets,
1141#                                   rss_per_subset=rss_per_subset,
1142#                                   gcv_per_subset=gcv_per_subset)
1143#        else:
1144#            return _EarthClassifier(base_clsf, data if self.store_examples else None)
1145#   
1146#   
1147#    def pruning_pass(self, base_clsf, examples):
1148#        """ Prune the terms constructed in the forward pass.
1149#        (Pure numpy reimplementation)
1150#        """
1151#        n_terms = numpy.sum(base_clsf.best_set)
1152#        n_eff_params = n_terms + self.penalty * (n_terms - 1) / 2.0
1153#        data, y, _ = examples.to_numpy_MA()
1154#        data = data.filled(0.0)
1155#        best_set = numpy.asarray(base_clsf.best_set, dtype=bool)
1156#       
1157#        bx = base_matrix(data, base_clsf.best_set,
1158#                         base_clsf.dirs, base_clsf.cuts,
1159#                         )
1160#       
1161#        bx_used = bx[:, best_set]
1162#        subsets, rss_per_subset = subsets_selection_xtx(bx_used, y) # TODO: Use leaps like library
1163#        gcv_per_subset = [gcv(rss, bx.shape[0], i + self.penalty * (i - 1) / 2.0) \
1164#                              for i, rss in enumerate(rss_per_subset, 1)]
1165#        gcv_per_subset = numpy.array(gcv_per_subset)
1166#       
1167#        best_i = numpy.argmin(gcv_per_subset[1:]) + 1 # Ignore the intercept
1168#        best_ind = subsets[best_i, :best_i + 1]
1169#        bs_i = 0
1170#        for i, b in enumerate(best_set):
1171#            if b:
1172#                best_set[i] = bs_i in best_ind
1173#                bs_i += 1
1174#               
1175#        bx_subset = bx[:, best_set]
1176#        betas, rss, rank, s = numpy.linalg.lstsq(bx_subset, y)
1177#        return best_set, betas, rss, subsets, rss_per_subset, gcv_per_subset
1178#   
1179#       
1180#class _EarthClassifier(Orange.core.ClassifierFD):
1181#    def __init__(self, base_classifier=None, examples=None, best_set=None,
1182#                 dirs=None, cuts=None, betas=None, subsets=None,
1183#                 rss_per_subset=None,
1184#                 gcv_per_subset=None):
1185#        self._base_classifier = base_classifier
1186#        self.examples = examples
1187#        self.domain = base_classifier.domain
1188#        self.class_var = base_classifier.class_var
1189#       
1190#        best_set = base_classifier.best_set if best_set is None else best_set
1191#        dirs = base_classifier.dirs if dirs is None else dirs
1192#        cuts = base_classifier.cuts if cuts is None else cuts
1193#        betas = base_classifier.betas if betas is None else betas
1194#       
1195#        self.best_set = numpy.asarray(best_set, dtype=bool)
1196#        self.dirs = numpy.array(dirs, dtype=int)
1197#        self.cuts = numpy.array(cuts)
1198#        self.betas = numpy.array(betas)
1199#       
1200#        self.subsets = subsets
1201#        self.rss_per_subset = rss_per_subset
1202#        self.gcv_per_subset = gcv_per_subset
1203#       
1204#    @property
1205#    def num_terms(self):
1206#        """ Number of terms in the model (including the intercept).
1207#        """
1208#        return numpy.sum(numpy.asarray(self.best_set, dtype=int))
1209#   
1210#    @property
1211#    def max_terms(self):
1212#        """ Maximum number of terms (as specified in the learning step).
1213#        """
1214#        return self.best_set.size
1215#   
1216#    @property
1217#    def num_preds(self):
1218#        """ Number of predictors (variables) included in the model.
1219#        """
1220#        return len(self.used_attributes(term))
1221#   
1222#    def __call__(self, example, what=Orange.core.GetValue):
1223#        value = self.predict(example)
1224#        if isinstance(self.class_var, Orange.feature.Continuous):
1225#            value = self.class_var(value)
1226#        else:
1227#            value = self.class_var(int(round(value)))
1228#           
1229#        dist = Orange.statistics.distribution.Distribution(self.class_var)
1230#        dist[value] = 1.0
1231#        if what == Orange.core.GetValue:
1232#            return value
1233#        elif what == Orange.core.GetProbabilities:
1234#            return dist
1235#        else:
1236#            return (value, dist)
1237#   
1238#    def base_matrix(self, examples=None):
1239#        """ Return the base matrix (bx) of the Earth model for the table.
1240#        If table is not supplied the base matrix of the training examples
1241#        is returned.
1242#       
1243#       
1244#        :param examples: Input examples for the base matrix.
1245#        :type examples: Orange.data.Table
1246#       
1247#        """
1248#        if examples is None:
1249#            examples = self.examples
1250#           
1251#        if examples is None:
1252#            raise ValueError("base matrix is only available if 'store_examples=True'")
1253#       
1254#        if isinstance(examples, Orange.data.Table):
1255#            data, cls, w = examples.to_numpy_MA()
1256#            data = data.filled(0.0)
1257#        else:
1258#            data = numpy.asarray(examples)
1259#           
1260#        return base_matrix(data, self.best_set, self.dirs, self.cuts)
1261#   
1262#    def _anova_order(self):
1263#        """ Return indices that sort the terms into the 'ANOVA' format.
1264#        """
1265#        terms = [([], 0)] # intercept
1266#        for i, used in enumerate(self.best_set[1:], 1):
1267#            attrs = sorted(self.used_attributes(i))
1268#            if used and attrs:
1269#                terms.append((attrs, i))
1270#        terms = sotred(terms, key=lambda t:(len(t[0]), t[0]))
1271#        return [i for _, i in terms]
1272#   
1273#    def format_model(self, percision=3, indent=3):
1274#        return format_model(self, percision, indent)
1275#   
1276#    def print_model(self, percision=3, indent=3):
1277#        print self.format_model(percision, indent)
1278#       
1279#    def predict(self, ex):
1280#        """ Return the predicted value (float) for example.
1281#        """
1282#        x = Orange.data.Table(self.domain, [ex])
1283#        x, c, w = x.to_numpy_MA()
1284#        x = x.filled(0.0)[0]
1285#       
1286#        bx = numpy.ones(self.best_set.shape)
1287#        betas = numpy.zeros_like(self.betas)
1288#        betas[0] = self.betas[0]
1289#        beta_i = 0
1290#        for termi in range(1, len(self.best_set)):
1291#            dirs = self.dirs[termi]
1292#            cuts = self.cuts[termi]
1293#            dir_p1 = numpy.where(dirs == 1)[0]
1294#            dir_m1 = numpy.where(dirs == -1)[0]
1295#            dir_2 = numpy.where(dirs == 2)[0]
1296#           
1297#            x1 = x[dir_p1] - cuts[dir_p1]
1298#            x2 = cuts[dir_m1] - x[dir_m1]
1299#            x3 = x[dir_2]
1300#           
1301#            x1 = numpy.maximum(x1, 0.0)
1302#            x2 = numpy.maximum(x2, 0.0)
1303#
1304#            X = numpy.hstack((x1, x2, x3))
1305#            X = numpy.cumprod(X)
1306#           
1307#            bx[termi] = X[-1] if X.size else 0.0
1308#            if self.best_set[termi] != 0:
1309#                beta_i += 1
1310#                betas[beta_i] = self.betas[beta_i]
1311#
1312#        return numpy.sum(bx[self.best_set] * betas)
1313#           
1314#    def used_attributes(self, term=None):
1315#        """ Return a list of used attributes. If term (index) is given
1316#        return only attributes used in that single term.
1317#       
1318#        """
1319#        if term is None:
1320#            terms = numpy.where(self.best_set)[0]
1321#        else:
1322#            terms = [term]
1323#        attrs = set()
1324#        for ti in terms:
1325#            attri = numpy.where(self.dirs[ti] != 0.0)[0]
1326#            attrs.update([self.domain.attributes[i] for i in attri])
1327#        return attrs
1328#       
1329#    def evimp(self, used_only=True):
1330#        """ Return the estimated variable importance.
1331#        """
1332#        return evimp(self, used_only)
1333#       
1334#    def __reduce__(self):
1335#        return (EarthClassifier, (self._base_classifier, self.examples,
1336#                                  self.best_set, self.dirs, self.cuts,
1337#                                  self.betas, self.subsets,
1338#                                  self.rss_per_subset, self.gcv_per_subset),
1339#                {})
1340                                 
Note: See TracBrowser for help on using the repository browser.