# source:orange/orange/Orange/regression/earth.py@9569:3bd245c51572

Revision 9569:3bd245c51572, 47.9 KB checked in by ales_erjavec, 2 years ago (diff)

Changed 'format' member function to 'to_string'

Line
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`: http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
16.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
17
18Example ::
19
20    >>> from Orange.regression import earth
21    >>> data = Orange.data.Table("housing")
22    >>> c = earth.EarthLearner(data, degree=2)
23    >>> print c
24    MEDV =
25       23.587
26       +11.896 * max(0, RM - 6.431)
27       +1.142 * max(0, 6.431 - RM)
28       -0.612 * max(0, LSTAT - 6.120)
29       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
30       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
31       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
32
33    >>>
34
35
36.. autoclass:: EarthLearner
37    :members:
38
39.. autoclass:: EarthClassifier
40    :members:
41
42
43Utility functions
44-----------------
45
46.. autofunction:: gcv
47
48.. autofunction:: plot_evimp
49
50.. autofunction:: bagged_evimp
51
52.. autoclass:: ScoreEarthImportance
53
54"""
55
56import Orange
57from Orange.data.variable import Discrete, Continuous
58from Orange.data import Table, Domain
59from Orange.preprocess import Preprocessor_continuize, \
60                              Preprocessor_impute, \
61                              Preprocessor_preprocessorList, \
62                              DomainContinuizer
63
64import numpy
65
def is_discrete(var):
    """ Return True if `var` is an Orange discrete variable. """
    return isinstance(var, Discrete)
68
def is_continuous(var):
    """ Return True if `var` is an Orange continuous variable. """
    return isinstance(var, Continuous)
71
def expand_discrete(var):
    """ Expand a discrete variable ``var``, returning one continuous
    indicator variable per value of ``var`` (if the number of values is
    greater than 2; otherwise return a single indicator variable).

    """
    n_values = len(var.values)
    if n_values > 2:
        expand_values = var.values
    elif n_values == 2:
        # Two values: a single indicator for the second value suffices.
        expand_values = var.values[-1:]
    else:
        expand_values = var.values[:1]

    indicators = []
    for value in expand_values:
        indicator = Continuous("{0}={1}".format(var.name, value))
        # The indicator is computed from `var` by a 0/1 transform on the
        # selected value.
        transform = Orange.core.ClassifierFromVar(whichVar=var)
        transform.transformer = Orange.core.Discrete2Continuous()
        transform.transformer.value = int(Orange.core.Value(var, value))
        indicator.get_value_from = transform
        indicator.source_variable = var
        indicators.append(indicator)
    return indicators
93
class EarthLearner(Orange.core.LearnerFD):
    """ Earth learner class. Supports both regression and classification
    problems. In case of classification the class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is learned on
    these new columns. The resulting classifier will then use the computed
    response values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        # Orange learner convention: when constructed with data, train
        # immediately and return the classifier instead of the learner.
        self = Orange.core.LearnerFD.__new__(cls)
        if instances is not None:
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                 multi_label=False, **kwds):
        """ Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model.
        :type degree: int
        :param terms: Maximum number of terms in the forward pass (default 21).

            .. note:: If this parameter is None then
                ``min(200, max(20, 2 * n_attributes)) + 1`` will be used. This
                is the same as the default setting in earth R package.

        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). By default it is 3.0 if the degree > 1,
            2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default None - no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default
            True - ignored for multi response models).
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default True).
        :type store_instances: bool
        :param multi_label: If True build a multi label model (default False).
        :type multi_label: bool

        .. todo:: min_span, pruning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """
        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Same defaults as in the earth R package.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.multi_label = multi_label
        self.__dict__.update(kwds)

        # Default preprocessing: impute missing values, then continuize
        # discrete attributes (treated as ordinal).
        impute = Preprocessor_impute()
        cont = Preprocessor_continuize(multinomialTreatment=
                                       DomainContinuizer.AsOrdinal)
        self.preproc = Preprocessor_preprocessorList(preprocessors=\
                                                     [impute, cont])

    def __call__(self, instances, weight_id=None):
        original_domain = instances.domain # Needed by classifier and evimp
        instances = self.preproc(instances)
        expanded_class = None
        if self.multi_label:
            label_mask = data_label_mask(instances.domain)
            data = instances.to_numpy_MA("Ac")[0]
            y = data[:, label_mask]
            x = data[:, ~ label_mask]
        else:
            # Expand a discrete class with indicator columns
            if is_discrete(instances.domain.class_var):
                expanded_class = expand_discrete(instances.domain.class_var)
                y = Table(Domain(expanded_class, None), instances)
                y = y.to_numpy_MA("A")[0]
                x = instances.to_numpy_MA("A")[0]
                label_mask = [False] * x.shape[1] + [True] * y.shape[1]
                label_mask = numpy.array(label_mask)
            elif is_continuous(instances.domain.class_var):
                label_mask = numpy.zeros(len(instances.domain.variables),
                                         dtype=bool)
                label_mask[-1] = True
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")

        if self.scale_resp and y.shape[1] == 1:
            # Standardize the single response column. The std must be taken
            # along axis=0 (over cases); the previous axis=1 computed the
            # std of each one-element row, which is always zero and caused
            # a division by zero.
            sy = y - numpy.mean(y, axis=0)
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas on the (unscaled) response.
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        # Pass original_domain along (it was previously computed but never
        # forwarded, leaving EarthClassifier.original_domain always None).
        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               label_mask=label_mask, multi_flag=self.multi_label,
                               expanded_class=expanded_class,
                               original_domain=original_domain)
240
241
def soft_max(values):
    """ Return the softmax of `values` (exponentials normalized to sum
    to one).

    The maximum is subtracted before exponentiation for numerical
    stability; this does not change the result mathematically but avoids
    overflow for large inputs.
    """
    values = numpy.asarray(values, dtype=float)
    e = numpy.exp(values - numpy.max(values))
    return e / numpy.sum(e)
245
246
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.

    Holds the fitted MARS model (pruned term set, hinge directions and cut
    points, fitted coefficients) and predicts by evaluating the model's
    basis functions on new instances.
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 label_mask=None, multi_flag=False, expanded_class=None,
                 original_domain=None, **kwargs):
        self.multi_flag = multi_flag
        self.domain = domain
        self.class_var = domain.class_var
        # best_set: boolean array marking terms kept by the pruning pass.
        self.best_set = best_set
        # dirs: per term/attribute hinge direction (1, -1, 2, 0=unused).
        self.dirs = dirs
        # cuts: per term/attribute hinge cut point.
        self.cuts = cuts
        # betas: fitted coefficients, one row per response column.
        self.betas = betas
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        self.label_mask = label_mask
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        # Determine which variables the raw model responses correspond to.
        if self.multi_flag:
            resp_vars = [v for v, m in zip(self.domain.variables,
                                           self.label_mask) if m]
        elif is_discrete(self.class_var):
            # Discrete class was trained as indicator columns.
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multi_flag and is_discrete(self.class_var):
            # Map the continuous indicator responses back to a class
            # distribution and pick its mode as the predicted value.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                # A single indicator column encodes P(second value).
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous (or multi label) responses: degenerate
            # distributions with all mass on the predicted value.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        if not self.multi_flag:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        (data,) = instances.to_numpy_MA("Ac")
        # Drop label columns; the basis functions use predictors only.
        data = data[:, ~ self.label_mask]
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def predict(self, instance):
        """ Predict the response values for the instance.

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """ Return the used attributes for term (index). If no term is given
        return all attributes used in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of the attributes used over all terms.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())
        attrs = [a for a, m in zip(self.domain.variables, self.label_mask)
                 if not m]

        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support: rebuild from the mandatory constructor arguments
        # and restore the remaining state from the instance dict.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
381
382
383"""
384Utility functions
385-----------------
386"""
387
def base_matrix(data, best_set, dirs, cuts):
    """ Return the base matrix for the earth model.

    :param data: Input data
    :type data: :class:`numpy.ndarray`

    :param best_set: An array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs members
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts members
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_cases = data.shape[0]
    n_terms = best_set.shape[0]

    bx = numpy.zeros((n_cases, n_terms))
    bx[:, 0] = 1.0  # The intercept term

    for term in range(1, n_terms):
        term_dirs = dirs[term]
        term_cuts = cuts[term]

        plus_idx, = numpy.where(term_dirs == 1)
        minus_idx, = numpy.where(term_dirs == -1)
        linear_idx, = numpy.where(term_dirs == 2)

        # Hinge factors in the two directions, plus linear factors.
        h_plus = data[:, plus_idx] - term_cuts[plus_idx]
        h_minus = term_cuts[minus_idx] - data[:, minus_idx]
        h_plus = numpy.where(h_plus > 0.0, h_plus, 0.0)
        h_minus = numpy.where(h_minus > 0.0, h_minus, 0.0)
        linear = data[:, linear_idx]

        factors = numpy.hstack((h_plus, h_minus, linear))
        if factors.size:
            # The term value is the product of all of its factors.
            bx[:, term] = numpy.cumprod(factors, axis=1)[:, -1]
        else:
            bx[:, term] = 0.0

    return bx
431
432
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective parameters.

    """
    # Force float arithmetic so integer arguments do not truncate the
    # NumEffectiveParams / n ratio under Python 2 integer division.
    n = float(n)
    return rss / (n * (1.0 - n_effective_params / n) ** 2)
444
445"""
446Multi-label utility functions
447"""
448
def is_label_attr(attr):
    """ Return True if `attr` is tagged as a label (i.e. its ``attributes``
    dict contains a "label" key).
    """
    # Use the `in` operator instead of dict.has_key (the latter was
    # removed in Python 3; behavior is identical).
    return "label" in attr.attributes
453
def data_label_indices(domain):
    """ Return the indices of label attributes in the domain. """
    mask = data_label_mask(domain)
    return numpy.flatnonzero(mask)
458
def data_label_mask(domain):
    """ Return an array of booleans indicating whether a variable in the
    domain is a label.

    The class variable (last position), if present, always counts as a
    label.
    """
    # Materialize the map result: item assignment below requires a list
    # (on Python 3, map returns an iterator).
    is_label = list(map(is_label_attr, domain.variables))
    if domain.class_var:
        is_label[-1] = True
    return numpy.array(is_label, dtype=bool)
467
468"""
469ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
470"""
471
472import ctypes
473from numpy import ctypeslib
474import orange
475
# Load the compiled orange extension module as a shared library; the Earth
# C routines (EarthForwardPass, EarthEvalSubsetsUsingXtx) are linked into it.
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# Argument signature of EarthForwardPass. All 2-D arrays must be Fortran
# (column-major) contiguous, as enforced by the F_CONTIGUOUS flags.
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  #pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  #FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    #Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nPred
     ctypes.c_int,  #nMaxDegree
     ctypes.c_int,  #nMaxTerms
     ctypes.c_double,   #Penalty
     ctypes.c_double,   #Thresh
     ctypes.c_int,  #nFastK
     ctypes.c_double,   #FastBeta
     ctypes.c_double,   #NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #LinPreds
     ctypes.c_bool, #UseBetaCache
     ctypes.c_char_p    #sPredNames
     ]
504
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Do the earth forward pass (calls the C EarthForwardPass routine).

    :param x: Predictor matrix (cases x predictors).
    :param y: Response matrix (cases x responses) or vector.
    :returns: Tuple ``(n_terms, full_set, bx, dirs, cuts)``.
    :raises ValueError: On invalid parameters or degenerate input; these
        checks mirror the ones in the C code, which would otherwise call
        exit() and kill the process.

    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables filled in by the C routine.
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the function
    # calls exit. So we must check them here and raise an exception to avoid
    # a process shutdown.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds,
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
573
574
# EarthEvalSubsetsUsingXtx performs the backward subset evaluation used by
# the pruning pass.
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

# 2-D arrays must be Fortran (column-major) contiguous.
_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   #PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   #RssVec
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  #WeightsArg
     ]

# A non-zero return value signals failure (see subset_selection_xtx, which
# maps it to a LinAlgError).
_c_eval_subsets_xtx.restype = ctypes.c_int
589
def subset_selection_xtx(X, Y):
    """ Subsets selection using EvalSubsetsUsingXtx in the Earth package.

    :param X: Base matrix (cases x terms).
    :param Y: Response matrix (cases x responses) or vector.
    :returns: Tuple ``(subsets_ind, rss_vec)`` where row i of
        ``subsets_ind`` holds the term indices of the best subset of
        size i + 1, and ``rss_vec[i]`` is its residual sum of squares.
    :raises numpy.linalg.LinAlgError: If the C routine reports failure
        (e.g. linearly dependent terms).

    """
    # The C routine requires Fortran-ordered double arrays.
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    var_count = X.shape[1]
    resp_count = Y.shape[1]
    cases = X.shape[0]
    # Output buffers filled in by the C routine.
    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
                              order="F")
    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")

    rval = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count, var_count,
                        X, Y, weights)
    if rval != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert the boolean subset columns into rows of term indices.
    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
    for i, used in enumerate(subsets.T):
        subsets_ind[i, :i + 1] = numpy.where(used)[0]

    return subsets_ind, rss_vec
619
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package,
    using numpy.linalg.lstsq.

    Performs backward elimination: starting from all terms, repeatedly
    drops the term whose removal increases the RSS the least (never the
    intercept at index 0).

    :returns: Tuple ``(subsets, rss_vec)`` where row i of ``subsets``
        holds the term indices of the retained subset of size i + 1 and
        ``rss_vec[i]`` is its residual sum of squares.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # Explicit list so terms can be deleted below (on Python 3, range()
    # returns an immutable object).
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            rss_vec[subset_size] = numpy.sum(res)
        else:
            # lstsq returns empty residuals for rank-deficient systems;
            # compute the RSS explicitly.
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # delta_rss[j] estimates the RSS increase from removing term j.
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
654
def subset_selection_xtx2(X, Y):
    """ Another implementation of the backward subset selection (this one
    uses a QR decomposition from Orange.misc.linalg).
    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    # Explicit list so columns can be deleted below (on Python 3, range()
    # returns an immutable object).
    working_set = list(range(col_count))
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # delta_rss[j] estimates the RSS increase from removing term j;
        # drop the least important term, but never the intercept.
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
692
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Do the pruning pass: evaluate all term subsets and keep the one
    with the smallest GCV.

    .. todo:: pruned_terms, Leaps

    """
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        # Fall back to the pure numpy implementation when the C routine
        # reports linearly dependent terms.
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    n_cases, n_terms = bx.shape
    # Effective parameter count grows with the number of terms plus the
    # per-hinge penalty.
    effective_params = numpy.arange(n_terms) + 1.0
    effective_params += penalty * (effective_params - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, n_cases, effective_params)
    best_i = int(numpy.argmin(gcv_vec))

    used = numpy.zeros((n_terms), dtype=bool)
    used[subsets[best_i, :best_i + 1]] = True

    return used, subsets, rss_vec, gcv_vec
716
717"""
718Printing functions.
719"""
720
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model.

    :param model: A fitted earth model (:class:`EarthClassifier`).
    :param percision: Number of decimal places for the coefficients.
    :param indent: Indentation (number of spaces) for the model terms.

    """
    # Determine the response variables exactly as EarthClassifier.__call__
    # does.
    if model.multi_flag:
        resp_vars = [v for v, m in zip(model.domain.variables,
                                       model.label_mask)
                     if m]
    elif is_discrete(model.class_var):
        resp_vars = model.expanded_class
    else:
        resp_vars = [model.class_var]

    resp_names = [v.name for v in resp_vars]

    # One formatted section per response column.
    sections = []
    for name, resp_betas in zip(resp_names, model.betas):
        sections.append(_format_response(model, name, resp_betas,
                                         percision, indent))
    return "\n\n".join(sections)
742
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """ Format a single response of the model as lines of the form
    ``<sign><beta> * max(0, ...) * ...``, one per used term, preceded by a
    ``<resp_name> =`` header.
    """
    header = "%s =" % resp_name
    indent = " " * indent
    fmt = "%." + str(percision) + "f"
    # The first entry is the intercept (no attributes, no hinge factors).
    terms = [([], fmt % betas[0])]
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if used:
            beta_i += 1
            beta = fmt % abs(betas[beta_i])
            # Hinge/linear factors of term i (dirs == 0 marks unused attrs).
            knots = [_format_knot(model, attr.name, d, c) for d, c, attr in \
                     zip(model.dirs[i], model.cuts[i], model.domain.attributes) \
                     if d != 0]
            term_attrs = [a for a, d in zip(model.domain.attributes, model.dirs[i]) \
                          if d != 0]
            term_attrs = sorted(term_attrs)
            # The sign is printed separately; beta was formatted as abs().
            sign = "-" if betas[beta_i] < 0 else "+"
            if knots:
                terms.append((term_attrs,
                              sign + " * ".join([beta] + knots)))
            else:
                terms.append((term_attrs, sign + beta))
    # Sort by len(term_attrs), then by term_attrs
    terms = sorted(terms, key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [indent + t for _, t in terms])
768
769def _format_knot(model, name, dir, cut):
770    if dir == 1:
771        txt = "max(0, %s - %.3f)" % (name, cut)
772    elif dir == -1:
773        txt = "max(0, %.3f - %s)" % (cut, name)
774    elif dir == 2:
775        txt = name
776    return txt
777
def _format_term(model, i, attr_name):
    """ Format the i-th model term as a product of its hinge factors. """
    factors = zip(model.dirs[i], model.cuts[i], model.domain.attributes)
    knots = [_format_knot(model, attr, d, c)
             for d, c, attr in factors if d != 0]
    return " * ".join(knots)
783
784"""\
785Variable importance estimation
786------------------------------
787"""
788
789from collections import defaultdict
790
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known source; the variable maps to itself.
            source[var].append(var)
    return dict(source)
810
def map_to_source_var(var, sources):
    """ Map `var` back to the variable in `sources` it was derived from,
    following ``source_variable`` / ``get_value_from`` links; return None
    if no source can be found.
    """
    if var in sources:
        return var
    if var.source_variable in sources:
        return var.source_variable
    transformer = var.get_value_from
    if isinstance(transformer, Orange.core.ClassifierFromVar):
        return map_to_source_var(transformer.variable, sources)
    if isinstance(transformer, Orange.core.ImputeClassifier):
        return map_to_source_var(transformer.classifier_from_var.variable,
                                 sources)
    return None
825
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    For every pruning subset, each attribute used by the subset's terms
    is credited with one "nsubsets" count and with that subset's decrease
    in GCV and RSS; the accumulated GCV/RSS sums are then rescaled to the
    [0, 100] range.

    :param model: Earth model.
    :type model: `EarthClassifier`

    :param used_only: If True (default) include only attributes that
        appear in at least one subset.

    :return: A list of ``(attribute, (nsubsets, gcv, rss))`` tuples,
        sorted by decreasing nsubsets (ties broken by gcv).

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    # Number of terms retained in the best (pruned) model.
    n_subsets = numpy.sum(model.best_set)

    # Per-step decrease of RSS/GCV when growing the subset by one term.
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Columns: [attribute index, nsubsets count, gcv sum, rss sum].
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        # Union of the attributes used by any term in this subset
        # (`reduce` is the Python 2 builtin).
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    # Column-wise min/max of the gcv and rss sums, for normalization.
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    #Normalize importances.
    # NOTE(review): divides by (imp_max - imp_min); degenerate when every
    # attribute has an identical score in a column -- confirm this cannot
    # occur in practice.
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        # Keep only attributes that appeared in at least one subset.
        importances = importances[importances[:,1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
873
874
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

        >>> data = Orange.data.Table("housing")
        >>> c = EarthLearner(data, degree=3)
        >>> plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the
    normalized RSS and GCV.

    :param evimp: Either an importance list as returned by :func:`evimp`,
        an :class:`EarthClassifier`, or a bagged ensemble of Earth models.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Accept a (possibly bagged) classifier and extract its importances.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    fig = pylab.figure()
    axes1 = fig.add_subplot(111)
    attrs = [a for a, _ in evimp]
    imp = numpy.array([s for _, s in evimp])
    X = range(len(attrs))
    # Axes.plot returns a *list* of Line2D objects; take the single line
    # from each call so legend() below receives proper Artist handles
    # instead of lists (which would trigger proxy-artist warnings).
    l1 = axes1.plot(X, imp[:, 0], "b-")[0]
    axes2 = axes1.twinx()

    l2 = axes2.plot(X, imp[:, 1], "g-")[0]
    l3 = axes2.plot(X, imp[:, 2], "r-")[0]

    x_axis = axes1.xaxis
    x_axis.set_ticks(X)
    x_axis.set_ticklabels([a.name for a in attrs], rotation=90)

    axes1.yaxis.set_label_text("nsubsets")
    axes2.yaxis.set_label_text("normalized gcv or rss")

    axes1.legend([l1, l2, l3], ["nsubsets", "gcv", "rss"])
    axes1.set_title("Variable importance")
    fig.show()
918
919
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier.

    Example ::

        >>> from Orange.ensemble.bagging import BaggedLearner
        >>> bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        >>> bagged_evimp(bc)

    :param classifier: a bagged ensemble of :class:`EarthClassifier`
        instances.
    :param used_only: if True, drop attributes whose averaged nsubsets
        score is 0 (i.e. never used by any model).
    :return: a list of ``(attribute, scores)`` tuples sorted by
        decreasing importance (same tuple layout as :func:`evimp`).

    """
    def assert_type(obj, class_):
        # Local type guard (named `obj` to avoid shadowing the builtin).
        if not isinstance(obj, class_):
            raise TypeError("Instance of %r expected." % (class_))

    from Orange.ensemble.bagging import BaggedClassifier

    assert_type(classifier, BaggedClassifier)
    # Scores are keyed by attribute *name* because each bagged model may
    # carry its own (distinct) variable instances.
    bagged_imp = defaultdict(list)
    attrs_by_name = defaultdict(list)
    for c in classifier.classifiers:
        assert_type(c, EarthClassifier)
        imp = evimp(c, used_only=used_only)
        for attr, score in imp:
            bagged_imp[attr.name].append(score) # map by name
            attrs_by_name[attr.name].append(attr)

    # Average the per-model score tuples for each attribute.
    for attr, scores in bagged_imp.items():
        scores = numpy.average(scores, axis=0)
        bagged_imp[attr] = tuple(scores)

    # Order by nsubsets, then gcv (descending), mirroring `evimp`.
    bagged_imp = sorted(bagged_imp.items(),
                        key=lambda t: (t[1][0], t[1][1]),
                        reverse=True)

    bagged_imp = [(attrs_by_name[name][0], scores) for name, scores in bagged_imp]

    if used_only:
        bagged_imp = [(a, r) for a, r in bagged_imp if r[0] > 0]
    return bagged_imp
959
960"""
961High level interface for measuring variable importance
962(compatible with Orange.feature.scoring module).
963
964"""
965from Orange.feature import scoring
966
class ScoreEarthImportance(scoring.Score):
    """ An :class:`Orange.feature.scoring.Score` subclass.
    Scores features based on their importance in the Earth
    model using ``bagged_evimp``'s function return value.

    """
    # Return types: indices into the ``(nsubsets, gcv, rss)`` score
    # tuples produced by `evimp`/`bagged_evimp`. Note GCV precedes RSS
    # in those tuples, hence GCV == 1 and RSS == 2 (the previous
    # RSS == 1 / GCV == 2 values returned the wrong score).
    NSUBSETS = 0
    GCV = 1
    RSS = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scoring convention: constructing with both `attr` and
        # `data` computes and returns the score directly instead of the
        # scorer instance.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets", cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).

        :param degree: Maximum degree of terms in the Earth models.

        :param terms: Maximum number of terms in the Earth models.

        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.

        :param cached: Cache the importances computed for the last data
            table (matched by object identity).

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        if isinstance(score_what, basestring):
            score_what = {"nsubsets": self.NSUBSETS, "rss": self.RSS,
                          "gcv": self.GCV}.get(score_what, None)

        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        self._cache_ref = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse the cached importances when scoring several attributes of
        # the same table.
        ref = self._cache_ref
        if ref is not None and ref is data:
            evimp = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            bc = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            evimp = bagged_evimp(bc, used_only=False)
            self._cache_ref = data
            self._cached_evimp = evimp

        evimp = dict(evimp)
        score = evimp.get(attr, None)

        if score is None:
            # `attr` may be a source variable of one or more transformed
            # variables actually present in the model's domain.
            source = collect_source(evimp.keys())
            if attr in source:
                # Return average of source var scores
                return numpy.average([evimp[v][self.score_what] \
                                      for v in source[attr]])
            else:
                raise ValueError("Attribute %r not in the domain." % attr)
        else:
            return score[self.score_what]
1043
class ScoreRSS(scoring.Score):
    """Score continuous attributes by the decrease in residual sum of
    squares (RSS) credited to them during linear subset selection.
    """

    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scoring convention: constructing with both `attr` and
        # `data` computes and returns the score directly.
        obj = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            obj.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return obj.__call__(attr, data, weight_id)
        if not attr and not data:
            return obj
        raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self):
        # Single-entry cache keyed by the identity of the last table.
        self._cache_data = None
        self._cache_rss = None

    def __call__(self, attr, data, weight_id=None):
        cached = self._cache_data
        if cached is not None and cached is data:
            rss = self._cache_rss
        else:
            x, y = data.to_numpy_MA("1A/c")
            try:
                subsets, rss = subset_selection_xtx2(x, y)
            except numpy.linalg.LinAlgError:
                subsets, rss = subset_selection_xtx_numpy(x, y)
            # Spread each selection step's RSS decrease over the members
            # of the subset chosen at that step.
            decreases = -numpy.diff(rss)
            rss = numpy.zeros_like(rss)
            for size in range(1, subsets.shape[0]):
                members = subsets[size, :size + 1]
                rss[members] += decreases[size - 1]
            rss = rss[1:]  # Drop the intercept
            self._cache_data = data
            self._cache_rss = rss
        index = list(data.domain.attributes).index(attr)
        return rss[index]
1086
1087
1088#from Orange.core import EarthLearner as BaseEarthLearner, \
1089#                        EarthClassifier as BaseEarthClassifier
1090#from Orange.misc import member_set
1091#
1092#class _EarthLearner(BaseEarthLearner):
1093#    """ An earth learner.
1094#    """
1095#    def __new__(cls, data=None, weightId=None, **kwargs):
1096#        self = BaseEarthLearner.__new__(cls, **kwargs)
1097#        if data is not None:
1098#            self.__init__(**kwargs)
1099#            return self.__call__(data, weightId)
1100#        return self
1101#
1102#    def __init__(self, max_degree=1, max_terms=21, new_var_penalty=0.0,
1103#                 threshold=0.001, prune=True, penalty=None, fast_k=20,
1104#                 fast_beta=0.0, store_examples=True, **kwargs):
1105#        """ Initialize the learner instance.
1106#
1107#        :param max_degree:
1108#        """
1109#        self.max_degree = max_degree
1110#        self.max_terms = max_terms
1111#        self.new_var_penalty = new_var_penalty
1112#        self.threshold = threshold
#        self.prune = prune
#        if penalty is None:
#            penalty = 2.0 if max_degree > 1 else 3.0
1116#        self.penalty = penalty
1117#        self.fast_k = fast_k
1118#        self.fast_beta = fast_beta
1119#        self.store_examples = store_examples
1120#
1121#        for key, val in kwargs.items():
1122#            setattr(self, key, val)
1123#
1124#    def __call__(self, data, weightId=None):
1125#        if not data.domain.class_var:
1126#            raise ValueError("No class var in the domain.")
1127#
1128#        with member_set(self, "prune", False):
1129#            # We overwrite the prune argument (will do the pruning in python).
1130#            base_clsf =  BaseEarthLearner.__call__(self, data, weightId)
1131#
1132#        if self.prune:
1133#            (best_set, betas, rss, subsets, rss_per_subset,
1134#             gcv_per_subset) = self.pruning_pass(base_clsf, data)
1135#
1136#            return _EarthClassifier(base_clsf, data if self.store_examples else None,
1137#                                   best_set=best_set, dirs=base_clsf.dirs,
1138#                                   cuts=base_clsf.cuts,
1139#                                   betas=betas,
1140#                                   subsets=subsets,
1141#                                   rss_per_subset=rss_per_subset,
1142#                                   gcv_per_subset=gcv_per_subset)
1143#        else:
1144#            return _EarthClassifier(base_clsf, data if self.store_examples else None)
1145#
1146#
1147#    def pruning_pass(self, base_clsf, examples):
1148#        """ Prune the terms constructed in the forward pass.
1149#        (Pure numpy reimplementation)
1150#        """
1151#        n_terms = numpy.sum(base_clsf.best_set)
1152#        n_eff_params = n_terms + self.penalty * (n_terms - 1) / 2.0
1153#        data, y, _ = examples.to_numpy_MA()
1154#        data = data.filled(0.0)
1155#        best_set = numpy.asarray(base_clsf.best_set, dtype=bool)
1156#
1157#        bx = base_matrix(data, base_clsf.best_set,
1158#                         base_clsf.dirs, base_clsf.cuts,
1159#                         )
1160#
1161#        bx_used = bx[:, best_set]
1162#        subsets, rss_per_subset = subsets_selection_xtx(bx_used, y) # TODO: Use leaps like library
1163#        gcv_per_subset = [gcv(rss, bx.shape[0], i + self.penalty * (i - 1) / 2.0) \
1164#                              for i, rss in enumerate(rss_per_subset, 1)]
1165#        gcv_per_subset = numpy.array(gcv_per_subset)
1166#
1167#        best_i = numpy.argmin(gcv_per_subset[1:]) + 1 # Ignore the intercept
1168#        best_ind = subsets[best_i, :best_i + 1]
1169#        bs_i = 0
1170#        for i, b in enumerate(best_set):
1171#            if b:
1172#                best_set[i] = bs_i in best_ind
1173#                bs_i += 1
1174#
1175#        bx_subset = bx[:, best_set]
1176#        betas, rss, rank, s = numpy.linalg.lstsq(bx_subset, y)
1177#        return best_set, betas, rss, subsets, rss_per_subset, gcv_per_subset
1178#
1179#
1180#class _EarthClassifier(Orange.core.ClassifierFD):
1181#    def __init__(self, base_classifier=None, examples=None, best_set=None,
1182#                 dirs=None, cuts=None, betas=None, subsets=None,
1183#                 rss_per_subset=None,
1184#                 gcv_per_subset=None):
1185#        self._base_classifier = base_classifier
1186#        self.examples = examples
1187#        self.domain = base_classifier.domain
1188#        self.class_var = base_classifier.class_var
1189#
1190#        best_set = base_classifier.best_set if best_set is None else best_set
1191#        dirs = base_classifier.dirs if dirs is None else dirs
1192#        cuts = base_classifier.cuts if cuts is None else cuts
1193#        betas = base_classifier.betas if betas is None else betas
1194#
1195#        self.best_set = numpy.asarray(best_set, dtype=bool)
1196#        self.dirs = numpy.array(dirs, dtype=int)
1197#        self.cuts = numpy.array(cuts)
1198#        self.betas = numpy.array(betas)
1199#
1200#        self.subsets = subsets
1201#        self.rss_per_subset = rss_per_subset
1202#        self.gcv_per_subset = gcv_per_subset
1203#
1204#    @property
1205#    def num_terms(self):
1206#        """ Number of terms in the model (including the intercept).
1207#        """
1208#        return numpy.sum(numpy.asarray(self.best_set, dtype=int))
1209#
1210#    @property
1211#    def max_terms(self):
1212#        """ Maximum number of terms (as specified in the learning step).
1213#        """
1214#        return self.best_set.size
1215#
1216#    @property
1217#    def num_preds(self):
1218#        """ Number of predictors (variables) included in the model.
1219#        """
#        return len(self.used_attributes())
1221#
1222#    def __call__(self, example, what=Orange.core.GetValue):
1223#        value = self.predict(example)
1224#        if isinstance(self.class_var, Orange.data.variable.Continuous):
1225#            value = self.class_var(value)
1226#        else:
1227#            value = self.class_var(int(round(value)))
1228#
1229#        dist = Orange.statistics.distribution.Distribution(self.class_var)
1230#        dist[value] = 1.0
1231#        if what == Orange.core.GetValue:
1232#            return value
1233#        elif what == Orange.core.GetProbabilities:
1234#            return dist
1235#        else:
1236#            return (value, dist)
1237#
1238#    def base_matrix(self, examples=None):
1239#        """ Return the base matrix (bx) of the Earth model for the table.
1240#        If table is not supplied the base matrix of the training examples
1241#        is returned.
1242#
1243#
1244#        :param examples: Input examples for the base matrix.
1245#        :type examples: Orange.data.Table
1246#
1247#        """
1248#        if examples is None:
1249#            examples = self.examples
1250#
1251#        if examples is None:
1252#            raise ValueError("base matrix is only available if 'store_examples=True'")
1253#
1254#        if isinstance(examples, Orange.data.Table):
1255#            data, cls, w = examples.to_numpy_MA()
1256#            data = data.filled(0.0)
1257#        else:
1258#            data = numpy.asarray(examples)
1259#
1260#        return base_matrix(data, self.best_set, self.dirs, self.cuts)
1261#
1262#    def _anova_order(self):
1263#        """ Return indices that sort the terms into the 'ANOVA' format.
1264#        """
1265#        terms = [([], 0)] # intercept
1266#        for i, used in enumerate(self.best_set[1:], 1):
1267#            attrs = sorted(self.used_attributes(i))
1268#            if used and attrs:
1269#                terms.append((attrs, i))
#        terms = sorted(terms, key=lambda t:(len(t[0]), t[0]))
1271#        return [i for _, i in terms]
1272#
1273#    def format_model(self, percision=3, indent=3):
1274#        return format_model(self, percision, indent)
1275#
1276#    def print_model(self, percision=3, indent=3):
1277#        print self.format_model(percision, indent)
1278#
1279#    def predict(self, ex):
1280#        """ Return the predicted value (float) for example.
1281#        """
1282#        x = Orange.data.Table(self.domain, [ex])
1283#        x, c, w = x.to_numpy_MA()
1284#        x = x.filled(0.0)[0]
1285#
1286#        bx = numpy.ones(self.best_set.shape)
1287#        betas = numpy.zeros_like(self.betas)
1288#        betas[0] = self.betas[0]
1289#        beta_i = 0
1290#        for termi in range(1, len(self.best_set)):
1291#            dirs = self.dirs[termi]
1292#            cuts = self.cuts[termi]
1293#            dir_p1 = numpy.where(dirs == 1)[0]
1294#            dir_m1 = numpy.where(dirs == -1)[0]
1295#            dir_2 = numpy.where(dirs == 2)[0]
1296#
1297#            x1 = x[dir_p1] - cuts[dir_p1]
1298#            x2 = cuts[dir_m1] - x[dir_m1]
1299#            x3 = x[dir_2]
1300#
1301#            x1 = numpy.maximum(x1, 0.0)
1302#            x2 = numpy.maximum(x2, 0.0)
1303#
1304#            X = numpy.hstack((x1, x2, x3))
1305#            X = numpy.cumprod(X)
1306#
1307#            bx[termi] = X[-1] if X.size else 0.0
1308#            if self.best_set[termi] != 0:
1309#                beta_i += 1
1310#                betas[beta_i] = self.betas[beta_i]
1311#
1312#        return numpy.sum(bx[self.best_set] * betas)
1313#
1314#    def used_attributes(self, term=None):
1315#        """ Return a list of used attributes. If term (index) is given
1316#        return only attributes used in that single term.
1317#
1318#        """
1319#        if term is None:
1320#            terms = numpy.where(self.best_set)[0]
1321#        else:
1322#            terms = [term]
1323#        attrs = set()
1324#        for ti in terms:
1325#            attri = numpy.where(self.dirs[ti] != 0.0)[0]
1326#            attrs.update([self.domain.attributes[i] for i in attri])
1327#        return attrs
1328#
1329#    def evimp(self, used_only=True):
1330#        """ Return the estimated variable importance.
1331#        """
1332#        return evimp(self, used_only)
1333#
1334#    def __reduce__(self):
1335#        return (EarthClassifier, (self._base_classifier, self.examples,
1336#                                  self.best_set, self.dirs, self.cuts,
1337#                                  self.betas, self.subsets,
1338#                                  self.rss_per_subset, self.gcv_per_subset),
1339#                {})
1340
Note: See TracBrowser for help on using the repository browser.