source: orange/Orange/regression/earth.py @ 10328:9bb531f21ac5

Revision 10328:9bb531f21ac5, 48.2 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Using the new multi-target domain definition for multiresponse learning. Changed the base class to Orange.regression.base.BaseRegressionLearner.

Line 
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`: http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
16.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
17
18Example ::
19
20    >>> import Orange
21    >>> data = Orange.data.Table("housing")
22    >>> c = Orange.regression.earth.EarthLearner(data, degree=2, terms=10)
23    >>> print c
24    MEDV =
25       23.587
26       +11.896 * max(0, RM - 6.431)
27       +1.142 * max(0, 6.431 - RM)
28       -0.612 * max(0, LSTAT - 6.120)
29       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
30       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
31       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
32
33
34.. autoclass:: EarthLearner
35    :members:
36
37.. autoclass:: EarthClassifier
38    :members:
39
40
41Utility functions
42-----------------
43
44.. autofunction:: gcv
45
46.. autofunction:: plot_evimp
47
48.. autofunction:: bagged_evimp
49
50.. autoclass:: ScoreEarthImportance
51
52"""
53
54import Orange
55from Orange.feature import Discrete, Continuous
56from Orange.data import Table, Domain
57from Orange.preprocess import Preprocessor_continuize, \
58                              Preprocessor_impute, \
59                              Preprocessor_preprocessorList, \
60                              DomainContinuizer
61
62import numpy
63
def is_discrete(var):
    """Return True when ``var`` is a discrete (categorical) variable."""
    return isinstance(var, Discrete)
66
def is_continuous(var):
    """Return True when ``var`` is a continuous (numeric) variable."""
    return isinstance(var, Continuous)
69
def expand_discrete(var):
    """Expand the discrete variable ``var`` into continuous indicator
    variables.

    A variable with more than two values yields one indicator per value;
    a binary variable yields a single indicator (for its last value);
    otherwise at most one indicator (for the first value) is returned.

    """
    n_values = len(var.values)
    if n_values > 2:
        encoded_values = var.values
    elif n_values == 2:
        encoded_values = var.values[-1:]
    else:
        encoded_values = var.values[:1]

    indicators = []
    for value in encoded_values:
        indicator = Continuous("{0}={1}".format(var.name, value))
        classifier = Orange.core.ClassifierFromVar(whichVar=var)
        classifier.transformer = Orange.core.Discrete2Continuous()
        classifier.transformer.value = int(Orange.core.Value(var, value))
        indicator.get_value_from = classifier
        indicator.source_variable = var
        indicators.append(indicator)
    return indicators
91
def select_attrs(table, features, class_var=None,
                 class_vars=None, metas=None):
    """Return a new table containing only ``features`` (plus the
    optional class variable(s) and metas) selected from ``table``.
    """
    if class_vars is None:
        new_domain = Domain(features, class_var)
    else:
        new_domain = Domain(features, class_var, class_vars=class_vars)
    if metas:
        new_domain.add_metas(metas)
    return Table(new_domain, table)
103   
class EarthLearner(Orange.regression.base.BaseRegressionLearner):
    """Earth learner class. Supports both regression and classification
    problems. In case of classification the class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is learned on
    these new columns. The resulting classifier will then use the computed
    response values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
        if instances is not None:
            # Convenience shortcut: EarthLearner(data, ...) constructs
            # the learner and immediately returns a trained classifier.
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                **kwds):
        """Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model.
        :type degree: int
        :param terms: Maximum number of terms in the forward pass (default 21).

            .. note:: If this parameter is None then
                ``min(200, max(20, 2 * n_attributes)) + 1`` will be used. This
                is the same as the default setting in earth R package.

        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). By default it is 3.0 if the degree > 1,
            2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default None - no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default
            True - ignored for multi response models).
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default True).
        :type store_instances: bool

        .. todo:: min_span, pruning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """
        super(EarthLearner, self).__init__()

        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Defaults as in the earth R package: 3 for interaction
            # models, 2 for additive ones.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.__dict__.update(kwds)

        # Leave the class column(s) untouched by the continuizer; a
        # discrete class is expanded explicitly in __call__.
        self.continuizer.class_treatment = DomainContinuizer.Ignore

    def __call__(self, instances, weight_id=None):
        """Fit an Earth model on ``instances`` and return an
        :class:`EarthClassifier`.
        """
        expanded_class = None
        multitarget = False

        if instances.domain.class_var:
            instances = self.impute_table(instances)
            instances = self.continuize_table(instances)

            if is_discrete(instances.domain.class_var):
                # Expand a discrete class with indicator columns
                expanded_class = expand_discrete(instances.domain.class_var)
                y_table = select_attrs(instances, expanded_class)
                (y, ) = y_table.to_numpy_MA("A")
                (x, ) = instances.to_numpy_MA("A")
            elif is_continuous(instances.domain.class_var):
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")
        elif instances.domain.class_vars:
            # Multi-target domain
            if not all(map(is_continuous, instances.domain.class_vars)):
                raise TypeError("Only continuous multi-target classes are supported.")
            x_table = select_attrs(instances, instances.domain.attributes)
            y_table = select_attrs(instances, instances.domain.class_vars)

            # Impute and continuize only the x_table
            x_table = self.impute_table(x_table)
            x_table = self.continuize_table(x_table)
            # NOTE(review): `domain` is constructed but never used below;
            # the classifier is built with `instances.domain` - verify
            # whether it should receive this continuized domain instead.
            domain = Domain(x_table.domain.attributes,
                            class_vars=instances.domain.class_vars)

            (x, ) = x_table.to_numpy_MA("A")
            (y, ) = y_table.to_numpy_MA("A")

            multitarget = True
        else:
            raise ValueError("Class variable expected.")

        if self.scale_resp and y.shape[1] == 1:
            sy = y - numpy.mean(y, axis=0)
            # BUG FIX: standardize the single response column along
            # axis 0. The previous code used axis=1, i.e. the std of
            # each one-element row (always 0), which divided by zero.
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning (backward elimination of terms by GCV)
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas on the pruned basis; note the fit is against the
        # unscaled responses y.
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               multitarget=multitarget,
                               expanded_class=expanded_class
                               )
260
261
def soft_max(values):
    """Return the softmax of ``values`` (exp(v) normalized to sum 1).

    The values are shifted by their maximum before exponentiation;
    this leaves the result mathematically unchanged but avoids
    overflow in ``exp`` for large inputs.
    """
    values = numpy.asarray(values, dtype=float)
    exps = numpy.exp(values - numpy.max(values))
    return exps / numpy.sum(exps)
265
266
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.

    Holds the fitted Earth model: the retained terms (hinge directions
    and cut points in ``dirs``/``cuts``), their coefficients (``betas``)
    and the pruning-pass diagnostics. Prediction evaluates the basis
    matrix for an instance and multiplies it with the betas; a discrete
    class is handled through the indicator responses in
    ``expanded_class``.
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 multitarget=False, expanded_class=None,
                 original_domain=None, **kwargs):
        # domain: (continuized) training domain
        # best_set: boolean array of terms kept after pruning
        # dirs/cuts: per-term hinge directions and cut points
        # betas: coefficient matrix (one row per response)
        self.multitarget = multitarget
        self.domain = domain
        self.class_var = domain.class_var
        if self.multitarget:
            self.class_vars = domain.class_vars

        self.best_set = best_set
        self.dirs = dirs
        self.cuts = cuts
        self.betas = betas
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        # Determine the response variables of the underlying
        # multi-response model.
        if self.multitarget and self.domain.class_vars:
            resp_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            # A discrete class was fitted through continuous indicator
            # columns (see expand_discrete).
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        # Imported here rather than at module level - presumably to
        # avoid a circular import; TODO confirm.
        from Orange.statistics.distribution import Distribution

        if not self.multitarget and is_discrete(self.class_var):
            # Convert the indicator responses into a class distribution
            # and predict its mode.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                # Binary class: a single indicator column for the second
                # value (see expand_discrete), used directly as p.
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous response(s): return point distributions.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        # Single-target models return scalars, not length-1 lists.
        if not self.multitarget:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        instances = select_attrs(instances, self.domain.attributes)
        (data,) = instances.to_numpy_MA("A")
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def predict(self, instance):
        """ Predict the response values for the instance.

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        # Keep only the terms retained by the pruning pass.
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """ Return the attributes used by ``term`` (index). If no term
        is given return all attributes used in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of the attributes used over all terms.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())

        attrs = self.domain.attributes

        # An attribute takes part in a term iff its direction entry
        # is non-zero.
        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support: rebuild from the core model arrays; the rest
        # of the state travels in __dict__.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
402   
403
404"""
405Utility functions
406-----------------
407"""
408   
def base_matrix(data, best_set, dirs, cuts):
    """Compute the Earth basis matrix for ``data``.

    Column 0 is the intercept (all ones); column ``t`` holds the value
    of term ``t`` for every row - the product of the term's hinge
    functions ``max(0, +/-(x - cut))`` (directions 1 and -1) and of its
    linear components (direction 2).

    :param data: Input data (instances x attributes).
    :type data: :class:`numpy.ndarray`

    :param best_set: An array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs member.
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts member.
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_terms = best_set.shape[0]
    bx = numpy.zeros((data.shape[0], n_terms))
    bx[:, 0] = 1.0  # the intercept column

    for term in range(1, n_terms):
        term_dirs = dirs[term]
        term_cuts = cuts[term]

        (positive,) = numpy.where(term_dirs == 1)
        (negative,) = numpy.where(term_dirs == -1)
        (linear,) = numpy.where(term_dirs == 2)

        pos_diff = data[:, positive] - term_cuts[positive]
        neg_diff = term_cuts[negative] - data[:, negative]

        factors = numpy.hstack((
            numpy.where(pos_diff > 0.0, pos_diff, 0.0),
            numpy.where(neg_diff > 0.0, neg_diff, 0.0),
            data[:, linear],
        ))

        if factors.size:
            bx[:, term] = numpy.prod(factors, axis=1)
        else:
            bx[:, term] = 0.0

    return bx
452
453   
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective parameters.

    """
    # Coerce n to float so the formula stays correct even when both
    # counts are plain ints (guards against Python 2 integer division).
    n = float(n)
    return rss / (n * (1.0 - n_effective_params / n) ** 2)
465
466"""
467Multi-label utility functions
468"""
469
def is_label_attr(attr):
    """ Return True if the attribute is tagged as a label, i.e. its
    ``attributes`` dictionary contains a "label" key.
    """
    # Use the `in` operator instead of dict.has_key, which was removed
    # in Python 3.
    return "label" in attr.attributes
474   
def data_label_indices(domain):
    """ Return the indices of label attributes in the domain.
    """
    mask = data_label_mask(domain)
    (indices,) = numpy.where(mask)
    return indices
479
def data_label_mask(domain):
    """ Return an array of booleans indicating whether a variable in the
    domain is a label. The class variable (last position) is always
    marked as a label when present.
    """
    # A list comprehension instead of map(): the result is mutated by
    # index below, and map() returns a lazy iterator on Python 3.
    is_label = [is_label_attr(v) for v in domain.variables]
    if domain.class_var:
        is_label[-1] = True
    return numpy.array(is_label, dtype=bool)
488
489"""
490ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
491"""
492       
493import ctypes
494from numpy import ctypeslib
495import orange
496
# The Earth C implementation is compiled into the Orange shared
# library itself; load it and bind the ForwardPass entry point.
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# Argument specification for EarthForwardPass. All 2-D arrays must be
# Fortran (column-major) contiguous, as required by the C code.
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  #pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  #FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    #Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nPred
     ctypes.c_int,  #nMaxDegree
     ctypes.c_int,  #nMaxTerms
     ctypes.c_double,   #Penalty
     ctypes.c_double,   #Thresh
     ctypes.c_int,  #nFastK
     ctypes.c_double,   #FastBeta
     ctypes.c_double,   #NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #LinPreds
     ctypes.c_bool, #UseBetaCache
     ctypes.c_char_p    #sPredNames
     ]
525   
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Do the earth forward pass (term construction) by calling the
    C implementation.

    :param x: Predictor matrix (cases x predictors).
    :param y: Response matrix (cases x responses).
    :return: Tuple ``(n_terms, used, bx, dirs, cuts)`` - the number of
        constructed terms, a boolean mask of used terms, the basis
        matrix and the per-term hinge directions and cut points.

    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the C
    # function calls exit(). So we must check here and raise an
    # exception to avoid a process shutdown.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds,
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
593
594
# Bind EvalSubsetsUsingXtx from the Earth C implementation; used by the
# pruning pass for backward elimination of terms.
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

# All 2-D arrays must be Fortran (column-major) contiguous.
_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   #PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   #RssVec
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  #WeightsArg
     ]

# A non-zero return value signals failure (linearly dependent terms;
# see subset_selection_xtx).
_c_eval_subsets_xtx.restype = ctypes.c_int
609
def subset_selection_xtx(X, Y):
    """ Subsets selection using EvalSubsetsUsingXtx in the Earth package.
    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    n_cases, n_terms = X.shape
    n_resp = Y.shape[1]

    prune_terms = numpy.zeros((n_terms, n_terms), dtype=ctypes.c_bool,
                              order="F")
    rss_vec = numpy.zeros((n_terms,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")

    error = _c_eval_subsets_xtx(prune_terms, rss_vec, n_cases, n_resp,
                                n_terms, X, Y, weights)
    if error != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert boolean membership columns into index rows: row i lists
    # the i + 1 term indices of the best subset of size i + 1.
    subsets_ind = numpy.zeros((n_terms, n_terms), dtype=int)
    for size, members in enumerate(prune_terms.T):
        (term_indices,) = numpy.where(members)
        subsets_ind[size, :size + 1] = term_indices

    return subsets_ind, rss_vec
639   
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package,
    using numpy.linalg.lstsq.

    Performs backward elimination: starting from the full term set, it
    repeatedly drops the term whose removal increases the residual sum
    of squares the least (the intercept, term 0, is never dropped).

    :return: Tuple ``(subsets, rss_vec)`` where row ``i`` of ``subsets``
        holds the term indices of the best subset of size ``i + 1`` and
        ``rss_vec[i]`` is that subset's residual sum of squares.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # Materialize as a list: elements are deleted from the working set
    # below, and range() is an immutable lazy sequence on Python 3.
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            rss_vec[subset_size] = numpy.sum(res)
        else:
            # lstsq returns an empty residual array for rank-deficient
            # systems; compute the RSS explicitly in that case.
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        if subset_size == 0:
            break

        # Score each term by the RSS increase its removal would cause
        # (b_j ** 2 / [(X'X)^-1]_jj) and drop the cheapest non-intercept.
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))
        delta_rss = numpy.sum(b ** 2 / diag, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
674   
def subset_selection_xtx2(X, Y):
    """ Another implementation of term subset selection (this one uses
    a QR decomposition from :mod:`Orange.misc.linalg`).
    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    # Materialize as a list: elements are deleted from the working set
    # below, and range() is an immutable lazy sequence on Python 3.
    working_set = list(range(col_count))
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # Remove jpvt[k:] from the work set. These will have zero
        # entries in the subsets matrix, and inf rss.
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # Drop the term whose removal increases the RSS the least,
        # never the intercept (index 0).
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
711   
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Run the Earth pruning pass on the basis matrix ``bx``.

    .. todo:: pruned_terms, Leaps

    """
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        # The C routine rejects linearly dependent terms; fall back to
        # the numpy implementation in that case.
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    cases, terms = bx.shape
    # Effective number of parameters, with each knot penalized.
    n_effective_params = numpy.arange(terms) + 1.0
    n_effective_params += penalty * (n_effective_params - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, cases, n_effective_params)

    # Keep the subset with the smallest GCV.
    best = numpy.argmin(gcv_vec)
    used = numpy.zeros((terms), dtype=bool)
    used[subsets[best, :best + 1]] = True

    return used, subsets, rss_vec, gcv_vec
735
736"""
737Printing functions.
738"""
739   
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model,
    one section per response variable.
    """
    if model.multitarget:
        response_vars = list(model.domain.class_vars)
    elif is_discrete(model.class_var):
        response_vars = model.expanded_class
    else:
        response_vars = [model.class_var]

    sections = [
        _format_response(model, var.name, beta_row, percision, indent)
        for var, beta_row in zip(response_vars, model.betas)
    ]
    return "\n\n".join(sections)
758
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """ Format one response of the model as a multi-line string:
    a "name =" header, the intercept, then one line per used term.
    """
    header = "%s =" % resp_name
    indent = " " * indent
    fmt = "%." + str(percision) + "f"
    # The intercept term comes first; it has no attributes.
    terms = [([], fmt % betas[0])]
    # betas holds coefficients only for the used terms, so it is
    # indexed by a separate counter while i walks all terms.
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if used:
            beta_i += 1
            beta = fmt % abs(betas[beta_i])
            # Format every hinge of the term (non-zero direction).
            knots = [_format_knot(model, attr.name, d, c, percision) \
                     for d, c, attr in \
                     zip(model.dirs[i], model.cuts[i], model.domain.attributes) \
                     if d != 0]
            term_attrs = [a for a, d in zip(model.domain.attributes, model.dirs[i]) \
                          if d != 0]
            term_attrs = sorted(term_attrs)
            # The coefficient's sign is printed separately, as a leading
            # +/- on the term line.
            sign = "-" if betas[beta_i] < 0 else "+"
            if knots:
                terms.append((term_attrs,
                              sign + " * ".join([beta] + knots)))
            else:
                terms.append((term_attrs, sign + beta))
    # Sort by len(term_attrs), then by term_attrs
    terms = sorted(terms, key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [indent + t for _, t in terms])
785       
786def _format_knot(model, name, dir, cut, percision=3):
787    fmt = "%%.%if" % percision
788    if dir == 1:
789        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
790    elif dir == -1:
791        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
792    elif dir == 2:
793        txt = name
794    return txt
795
796   
797"""\
798Variable importance estimation
799------------------------------
800"""
801
802from collections import defaultdict
803
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable``
    members) back to the variables in ``vars`` (assumes the default
    preprocessor in EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        if var.source_variable:
            key = var.source_variable
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            key = var.get_value_from.variable
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            key = var.get_value_from.classifier_from_var.variable
        else:
            # No known link back to a source - map the variable
            # to itself.
            key = var
        source[key].append(var)
    return dict(source)
823
def map_to_source_var(var, sources):
    """ Map ``var`` to a variable in ``sources`` by following its
    ``source_variable``/``get_value_from`` chain (the inverse lookup
    of :func:`collect_source`).

    :param var: The variable to map.
    :param sources: A collection (or mapping) of candidate source variables.
    :return: The matching source variable, or None if none is found.

    """
    if var in sources:
        return var
    elif var.source_variable in sources:
        return var.source_variable
    elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
        return map_to_source_var(var.get_value_from.variable, sources)
    elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
        var = var.get_value_from.classifier_from_var.variable
        return map_to_source_var(var, sources)
    else:
        return None
838   
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    Each attribute is scored by the number of pruning subsets it appears
    in and by the accumulated GCV and RSS improvements of those subsets
    (GCV/RSS scores are normalized to the [0, 100] range).

    :param model: Earth model.
    :type model: `EarthClassifier`
    :param used_only: If True (default) return entries only for
        attributes used in at least one subset.
    :return: A list of ``(attribute, (nsubsets, gcv, rss))`` tuples,
        sorted by decreasing importance.

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Improvement contributed by each successive subset size.
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Columns: [attribute index, nsubsets, gcv, rss]
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        # Union of attributes used by any term in this subset.
        # (Was ``reduce(set.union, ...)`` -- `reduce` is a Python 2 only
        # builtin; ``set().union(*...)`` is equivalent and portable.)
        used_attributes = set().union(*[model.used_attributes(term)
                                        for term in term_subset])
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    # Normalize gcv/rss importances to [0, 100].
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv (descending).
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        importances = importances[importances[:, 1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
886
887
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

    ::

        import Orange
        data = Orange.data.Table("housing")
        c = Orange.regression.earth.EarthLearner(data, degree=3)
        Orange.regression.earth.plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the normalized
    RSS and GCV.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Also accept a (bagged) classifier and extract its importances.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    figure = pylab.figure()
    left_axes = figure.add_subplot(111)
    variables = [var for var, _ in evimp]
    scores = numpy.array([score for _, score in evimp])
    positions = range(len(variables))
    nsubsets_line = left_axes.plot(positions, scores[:, 0], "b-",)
    # Second y axis for the normalized gcv/rss scores.
    right_axes = left_axes.twinx()

    gcv_line = right_axes.plot(positions, scores[:, 1], "g-",)
    rss_line = right_axes.plot(positions, scores[:, 2], "r-",)

    # Label the x axis ticks with the attribute names.
    x_axis = left_axes.xaxis
    x_axis.set_ticks(positions)
    x_axis.set_ticklabels([var.name for var in variables], rotation=90)

    left_axes.yaxis.set_label_text("nsubsets")
    right_axes.yaxis.set_label_text("normalized gcv or rss")

    left_axes.legend([nsubsets_line, gcv_line, rss_line],
                     ["nsubsets", "gcv", "rss"])
    left_axes.set_title("Variable importance")
    figure.show()
935   
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier

    Example::

        from Orange.ensemble.bagging import BaggedLearner
        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        bagged_evimp(bc)

    """
    def check_type(obj, expected):
        # Fail early with a descriptive error on unexpected types.
        if not isinstance(obj, expected):
            raise TypeError("Instance of %r expected." % (expected))

    from Orange.ensemble.bagging import BaggedClassifier

    check_type(classifier, BaggedClassifier)
    scores_by_name = defaultdict(list)
    attrs_by_name = defaultdict(list)
    for member in classifier.classifiers:
        check_type(member, EarthClassifier)
        for attr, score in evimp(member, used_only=used_only):
            # Key by name: the ensemble members may hold distinct
            # (but same-named) variable instances.
            scores_by_name[attr.name].append(score)
            attrs_by_name[attr.name].append(attr)

    # Average the per-member scores.
    averaged = {}
    for name, scores in scores_by_name.items():
        averaged[name] = tuple(numpy.average(scores, axis=0))

    # Order by nsubsets, then gcv (both descending).
    ordered = sorted(averaged.items(),
                     key=lambda t: (t[1][0], t[1][1]),
                     reverse=True)

    combined = [(attrs_by_name[name][0], scores) for name, scores in ordered]

    if used_only:
        combined = [(a, r) for a, r in combined if r[0] > 0]
    return combined
976
977"""
978High level interface for measuring variable importance
979(compatible with Orange.feature.scoring module).
980
981"""
982from Orange.feature import scoring
983           
class ScoreEarthImportance(scoring.Score):
    """ An :class:`Orange.feature.scoring.Score` subclass.
    Scores features based on their importance in the Earth
    model using ``bagged_evimp``'s function return value.

    """
    # Return types
    NSUBSETS = 0
    RSS = 1
    GCV = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Support the immediate-scoring protocol: when both attr and data
        # are given, construct the scorer and return the score directly.
        instance = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            instance.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return instance.__call__(attr, data, weight_id)
        if not attr and not data:
            return instance
        raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets", cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).
        :param degree: Maximum degree of the earth terms.
        :param terms: Maximum number of earth terms.
        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.
        :param cached: Cache the importances for the last seen data table.

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        if isinstance(score_what, basestring):
            # Translate a symbolic name into the class constant.
            named = {"nsubsets": self.NSUBSETS, "rss": self.RSS,
                     "gcv": self.GCV}
            score_what = named.get(score_what, None)

        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        self._cache_ref = None
        self._cached_evimp = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse cached importances when scoring the same data table.
        if self._cache_ref is not None and self._cache_ref is data:
            importances = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            ensemble = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            importances = bagged_evimp(ensemble, used_only=False)
            self._cache_ref = data
            self._cached_evimp = importances

        by_attr = dict(importances)
        score = by_attr.get(attr, None)

        if score is not None:
            return score[self.score_what]
        source = collect_source(by_attr.keys())
        if attr in source:
            # Return average of source var scores
            return numpy.average([by_attr[v][self.score_what]
                                  for v in source[attr]])
        raise ValueError("Attribute %r not in the domain." % attr)
1061   
class ScoreRSS(scoring.Score):
    """ Score continuous attributes by the residual-sum-of-squares
    improvement they account for in a linear subset-selection pass.

    """
    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Support the immediate-scoring protocol: when both attr and data
        # are given, construct the scorer and return the score directly.
        instance = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            instance.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return instance.__call__(attr, data, weight_id)
        if not attr and not data:
            return instance
        raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self):
        # Cache of the last scored data table and its per-attribute RSS.
        self._cache_data = None
        self._cache_rss = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse the cached per-attribute scores for the same data table.
        if self._cache_data is not None and self._cache_data is data:
            rss = self._cache_rss
        else:
            x, y = data.to_numpy_MA("1A/c")
            try:
                subsets, rss_vals = subset_selection_xtx2(x, y)
            except numpy.linalg.LinAlgError:
                subsets, rss_vals = subset_selection_xtx_numpy(x, y)
            improvements = -numpy.diff(rss_vals)
            rss = numpy.zeros_like(rss_vals)
            # Credit every member of a subset with that subset's improvement.
            for size in range(1, subsets.shape[0]):
                members = subsets[size, :size + 1]
                rss[members] += improvements[size - 1]
            rss = rss[1:]  # Drop the intercept
            self._cache_data = data
            self._cache_rss = rss

        index = list(data.domain.attributes).index(attr)
        return rss[index]
1104   
1105
1106#from Orange.core import EarthLearner as BaseEarthLearner, \
1107#                        EarthClassifier as BaseEarthClassifier
1108#from Orange.misc import member_set
1109#
1110#class _EarthLearner(BaseEarthLearner):
1111#    """ An earth learner.
1112#    """
1113#    def __new__(cls, data=None, weightId=None, **kwargs):
1114#        self = BaseEarthLearner.__new__(cls, **kwargs)
1115#        if data is not None:
1116#            self.__init__(**kwargs)
1117#            return self.__call__(data, weightId)
1118#        return self
1119#   
1120#    def __init__(self, max_degree=1, max_terms=21, new_var_penalty=0.0,
1121#                 threshold=0.001, prune=True, penalty=None, fast_k=20,
1122#                 fast_beta=0.0, store_examples=True, **kwargs):
1123#        """ Initialize the learner instance.
1124#       
1125#        :param max_degree:
1126#        """
1127#        self.max_degree = max_degree
1128#        self.max_terms = max_terms
1129#        self.new_var_penalty = new_var_penalty
1130#        self.threshold = threshold
1131#        self.prune = prunes
1132#        if penaty is None:
1133#            penalty = 2.0 if degree > 1 else 3.0
1134#        self.penalty = penalty
1135#        self.fast_k = fast_k
1136#        self.fast_beta = fast_beta
1137#        self.store_examples = store_examples
1138#       
1139#        for key, val in kwargs.items():
1140#            setattr(self, key, val)
1141#   
1142#    def __call__(self, data, weightId=None):
1143#        if not data.domain.class_var:
1144#            raise ValueError("No class var in the domain.")
1145#       
1146#        with member_set(self, "prune", False):
1147#            # We overwrite the prune argument (will do the pruning in python).
1148#            base_clsf =  BaseEarthLearner.__call__(self, data, weightId)
1149#       
1150#        if self.prune:
1151#            (best_set, betas, rss, subsets, rss_per_subset,
1152#             gcv_per_subset) = self.pruning_pass(base_clsf, data)
1153#           
1154#            return _EarthClassifier(base_clsf, data if self.store_examples else None,
1155#                                   best_set=best_set, dirs=base_clsf.dirs,
1156#                                   cuts=base_clsf.cuts,
1157#                                   betas=betas,
1158#                                   subsets=subsets,
1159#                                   rss_per_subset=rss_per_subset,
1160#                                   gcv_per_subset=gcv_per_subset)
1161#        else:
1162#            return _EarthClassifier(base_clsf, data if self.store_examples else None)
1163#   
1164#   
1165#    def pruning_pass(self, base_clsf, examples):
1166#        """ Prune the terms constructed in the forward pass.
1167#        (Pure numpy reimplementation)
1168#        """
1169#        n_terms = numpy.sum(base_clsf.best_set)
1170#        n_eff_params = n_terms + self.penalty * (n_terms - 1) / 2.0
1171#        data, y, _ = examples.to_numpy_MA()
1172#        data = data.filled(0.0)
1173#        best_set = numpy.asarray(base_clsf.best_set, dtype=bool)
1174#       
1175#        bx = base_matrix(data, base_clsf.best_set,
1176#                         base_clsf.dirs, base_clsf.cuts,
1177#                         )
1178#       
1179#        bx_used = bx[:, best_set]
1180#        subsets, rss_per_subset = subsets_selection_xtx(bx_used, y) # TODO: Use leaps like library
1181#        gcv_per_subset = [gcv(rss, bx.shape[0], i + self.penalty * (i - 1) / 2.0) \
1182#                              for i, rss in enumerate(rss_per_subset, 1)]
1183#        gcv_per_subset = numpy.array(gcv_per_subset)
1184#       
1185#        best_i = numpy.argmin(gcv_per_subset[1:]) + 1 # Ignore the intercept
1186#        best_ind = subsets[best_i, :best_i + 1]
1187#        bs_i = 0
1188#        for i, b in enumerate(best_set):
1189#            if b:
1190#                best_set[i] = bs_i in best_ind
1191#                bs_i += 1
1192#               
1193#        bx_subset = bx[:, best_set]
1194#        betas, rss, rank, s = numpy.linalg.lstsq(bx_subset, y)
1195#        return best_set, betas, rss, subsets, rss_per_subset, gcv_per_subset
1196#   
1197#       
1198#class _EarthClassifier(Orange.core.ClassifierFD):
1199#    def __init__(self, base_classifier=None, examples=None, best_set=None,
1200#                 dirs=None, cuts=None, betas=None, subsets=None,
1201#                 rss_per_subset=None,
1202#                 gcv_per_subset=None):
1203#        self._base_classifier = base_classifier
1204#        self.examples = examples
1205#        self.domain = base_classifier.domain
1206#        self.class_var = base_classifier.class_var
1207#       
1208#        best_set = base_classifier.best_set if best_set is None else best_set
1209#        dirs = base_classifier.dirs if dirs is None else dirs
1210#        cuts = base_classifier.cuts if cuts is None else cuts
1211#        betas = base_classifier.betas if betas is None else betas
1212#       
1213#        self.best_set = numpy.asarray(best_set, dtype=bool)
1214#        self.dirs = numpy.array(dirs, dtype=int)
1215#        self.cuts = numpy.array(cuts)
1216#        self.betas = numpy.array(betas)
1217#       
1218#        self.subsets = subsets
1219#        self.rss_per_subset = rss_per_subset
1220#        self.gcv_per_subset = gcv_per_subset
1221#       
1222#    @property
1223#    def num_terms(self):
1224#        """ Number of terms in the model (including the intercept).
1225#        """
1226#        return numpy.sum(numpy.asarray(self.best_set, dtype=int))
1227#   
1228#    @property
1229#    def max_terms(self):
1230#        """ Maximum number of terms (as specified in the learning step).
1231#        """
1232#        return self.best_set.size
1233#   
1234#    @property
1235#    def num_preds(self):
1236#        """ Number of predictors (variables) included in the model.
1237#        """
1238#        return len(self.used_attributes(term))
1239#   
1240#    def __call__(self, example, what=Orange.core.GetValue):
1241#        value = self.predict(example)
1242#        if isinstance(self.class_var, Orange.feature.Continuous):
1243#            value = self.class_var(value)
1244#        else:
1245#            value = self.class_var(int(round(value)))
1246#           
1247#        dist = Orange.statistics.distribution.Distribution(self.class_var)
1248#        dist[value] = 1.0
1249#        if what == Orange.core.GetValue:
1250#            return value
1251#        elif what == Orange.core.GetProbabilities:
1252#            return dist
1253#        else:
1254#            return (value, dist)
1255#   
1256#    def base_matrix(self, examples=None):
1257#        """ Return the base matrix (bx) of the Earth model for the table.
1258#        If table is not supplied the base matrix of the training examples
1259#        is returned.
1260#       
1261#       
1262#        :param examples: Input examples for the base matrix.
1263#        :type examples: Orange.data.Table
1264#       
1265#        """
1266#        if examples is None:
1267#            examples = self.examples
1268#           
1269#        if examples is None:
1270#            raise ValueError("base matrix is only available if 'store_examples=True'")
1271#       
1272#        if isinstance(examples, Orange.data.Table):
1273#            data, cls, w = examples.to_numpy_MA()
1274#            data = data.filled(0.0)
1275#        else:
1276#            data = numpy.asarray(examples)
1277#           
1278#        return base_matrix(data, self.best_set, self.dirs, self.cuts)
1279#   
1280#    def _anova_order(self):
1281#        """ Return indices that sort the terms into the 'ANOVA' format.
1282#        """
1283#        terms = [([], 0)] # intercept
1284#        for i, used in enumerate(self.best_set[1:], 1):
1285#            attrs = sorted(self.used_attributes(i))
1286#            if used and attrs:
1287#                terms.append((attrs, i))
1288#        terms = sotred(terms, key=lambda t:(len(t[0]), t[0]))
1289#        return [i for _, i in terms]
1290#   
1291#    def format_model(self, percision=3, indent=3):
1292#        return format_model(self, percision, indent)
1293#   
1294#    def print_model(self, percision=3, indent=3):
1295#        print self.format_model(percision, indent)
1296#       
1297#    def predict(self, ex):
1298#        """ Return the predicted value (float) for example.
1299#        """
1300#        x = Orange.data.Table(self.domain, [ex])
1301#        x, c, w = x.to_numpy_MA()
1302#        x = x.filled(0.0)[0]
1303#       
1304#        bx = numpy.ones(self.best_set.shape)
1305#        betas = numpy.zeros_like(self.betas)
1306#        betas[0] = self.betas[0]
1307#        beta_i = 0
1308#        for termi in range(1, len(self.best_set)):
1309#            dirs = self.dirs[termi]
1310#            cuts = self.cuts[termi]
1311#            dir_p1 = numpy.where(dirs == 1)[0]
1312#            dir_m1 = numpy.where(dirs == -1)[0]
1313#            dir_2 = numpy.where(dirs == 2)[0]
1314#           
1315#            x1 = x[dir_p1] - cuts[dir_p1]
1316#            x2 = cuts[dir_m1] - x[dir_m1]
1317#            x3 = x[dir_2]
1318#           
1319#            x1 = numpy.maximum(x1, 0.0)
1320#            x2 = numpy.maximum(x2, 0.0)
1321#
1322#            X = numpy.hstack((x1, x2, x3))
1323#            X = numpy.cumprod(X)
1324#           
1325#            bx[termi] = X[-1] if X.size else 0.0
1326#            if self.best_set[termi] != 0:
1327#                beta_i += 1
1328#                betas[beta_i] = self.betas[beta_i]
1329#
1330#        return numpy.sum(bx[self.best_set] * betas)
1331#           
1332#    def used_attributes(self, term=None):
1333#        """ Return a list of used attributes. If term (index) is given
1334#        return only attributes used in that single term.
1335#       
1336#        """
1337#        if term is None:
1338#            terms = numpy.where(self.best_set)[0]
1339#        else:
1340#            terms = [term]
1341#        attrs = set()
1342#        for ti in terms:
1343#            attri = numpy.where(self.dirs[ti] != 0.0)[0]
1344#            attrs.update([self.domain.attributes[i] for i in attri])
1345#        return attrs
1346#       
1347#    def evimp(self, used_only=True):
1348#        """ Return the estimated variable importance.
1349#        """
1350#        return evimp(self, used_only)
1351#       
1352#    def __reduce__(self):
1353#        return (EarthClassifier, (self._base_classifier, self.examples,
1354#                                  self.best_set, self.dirs, self.cuts,
1355#                                  self.betas, self.subsets,
1356#                                  self.rss_per_subset, self.gcv_per_subset),
1357#                {})
1358                                 
# Note: See TracBrowser for help on using the repository browser.