source:orange/Orange/regression/earth.py@10140:5803cc112b09

Revision 10140:5803cc112b09, 47.8 KB checked in by ales_erjavec, 2 years ago (diff)

Some minor code cleanup.

Line
1"""\
====================================================
Multivariate Adaptive Regression Splines (MARS)
====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
.. _`Multivariate adaptive regression splines (MARS)`: http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines

.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
17
18Example ::
19
20    >>> import Orange
21    >>> data = Orange.data.Table("housing")
22    >>> c = Orange.regression.earth.EarthLearner(data, degree=2)
23    >>> print c
24    MEDV =
25       23.587
26       +11.896 * max(0, RM - 6.431)
27       +1.142 * max(0, 6.431 - RM)
28       -0.612 * max(0, LSTAT - 6.120)
29       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
30       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
31       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
32
33
34.. autoclass:: EarthLearner
35    :members:
36
37.. autoclass:: EarthClassifier
38    :members:
39
40
41Utility functions
42-----------------
43
44.. autofunction:: gcv
45
46.. autofunction:: plot_evimp
47
48.. autofunction:: bagged_evimp
49
50.. autoclass:: ScoreEarthImportance
51
52"""
53
54import Orange
55from Orange.feature import Discrete, Continuous
56from Orange.data import Table, Domain
57from Orange.preprocess import Preprocessor_continuize, \
58                              Preprocessor_impute, \
59                              Preprocessor_preprocessorList, \
60                              DomainContinuizer
61
62import numpy
63
def is_discrete(var):
    """ Return True if ``var`` is a :class:`Discrete` feature. """
    return isinstance(var, Discrete)
66
def is_continuous(var):
    """ Return True if ``var`` is a :class:`Continuous` feature. """
    return isinstance(var, Continuous)
69
def expand_discrete(var):
    """ Expand the discrete variable ``var``, returning one continuous
    indicator variable per value (when the variable has two or fewer
    values, only a single indicator variable is returned).

    """
    # Choose which values get an indicator column: all of them for
    # multi-valued variables, only one for binary/unary variables.
    if len(var.values) > 2:
        expand_values = var.values
    elif len(var.values) == 2:
        expand_values = var.values[-1:]
    else:
        expand_values = var.values[:1]

    indicators = []
    for value in expand_values:
        indicator = Continuous("{0}={1}".format(var.name, value))
        # The indicator computes 1.0 when `var` takes `value`, else 0.0.
        classifier = Orange.core.ClassifierFromVar(whichVar=var)
        classifier.transformer = Orange.core.Discrete2Continuous()
        classifier.transformer.value = int(Orange.core.Value(var, value))
        indicator.get_value_from = classifier
        indicator.source_variable = var
        indicators.append(indicator)
    return indicators
91
class EarthLearner(Orange.core.LearnerFD):
    """ Earth learner class. Supports both regression and classification
    problems. In case of classification the class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is learned on these
    new columns. The resulting classifier will then use the computed response
    values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        # Orange learner convention: calling the class with data both
        # constructs the learner and immediately fits it, returning the
        # trained classifier instead of the learner instance.
        self = Orange.core.LearnerFD.__new__(cls)
        if instances is not None:
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty= None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                 multi_label=False, **kwds):
        """ Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model.
        :type degree: int
        :param terms: Maximum number of terms in the forward pass (default 21).

            .. note:: If this parameter is None then
                ``min(200, max(20, 2 * n_attributes)) + 1`` will be used. This
                is the same as the default setting in earth R package.

        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). By default it is 3.0 if the degree > 1,
            2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default None - no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default
            True - ignored for multi response models).
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default True).
        :type store_instances: bool
        :param multi_label: If True build a multi label model (default False).
        :type multi_label: bool

        .. todo:: min_span, pruning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """
        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Defaults follow the earth R package: 3 for models with
            # interactions, 2 for additive models.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.multi_label = multi_label
        self.__dict__.update(kwds)

        # Default preprocessing: impute missing values, then continuize
        # discrete attributes using ordinal encoding.
        impute = Preprocessor_impute()
        cont = Preprocessor_continuize(multinomialTreatment=
                                       DomainContinuizer.AsOrdinal)
        self.preproc = Preprocessor_preprocessorList(preprocessors=\
                                                     [impute, cont])

    def __call__(self, instances, weight_id=None):
        # Fit the model on `instances` and return an EarthClassifier.
        instances = self.preproc(instances)
        expanded_class = None
        if self.multi_label:
            # NOTE(review): one or more lines appear to be missing from
            # this source excerpt here -- `label_mask` (and presumably
            # `y`) are used below but are never assigned in this branch;
            # verify against the repository history.
            data = instances.to_numpy_MA("Ac")[0]
            x = data[:, ~ label_mask]
        else:
            # Expand a discrete class with indicator columns
            if is_discrete(instances.domain.class_var):
                expanded_class = expand_discrete(instances.domain.class_var)
                y = Table(Domain(expanded_class, None), instances)
                y = y.to_numpy_MA("A")[0]
                x = instances.to_numpy_MA("A")[0]
                label_mask = [False] * x.shape[1] + [True] * y.shape[1]
            elif is_continuous(instances.domain.class_var):
                # NOTE(review): the statement this dangling `dtype=bool)`
                # fragment belongs to (presumably the `label_mask`
                # construction) is missing from this source excerpt.
                                         dtype=bool)
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")

        if self.scale_resp and y.shape[1] == 1:
            # Standardize the single response column before the forward
            # pass.
            sy = y - numpy.mean(y, axis=0)
            # NOTE(review): `axis=1` looks suspicious -- the std over a
            # single-column row is 0, so this divides by zero; axis=0
            # seems intended. Flagged for review, left unchanged.
            sy = sy / numpy.std(sy, axis=1)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        # NOTE(review): keyword-argument lines (e.g. subsets,
        # rss_per_subset, gcv_per_subset, label_mask, multi_flag) appear
        # to be missing from this return statement in this excerpt.
        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               instances=instances if self.store_instances else None,
                               expanded_class=expanded_class)
237
238
def soft_max(values):
    """ Return the softmax of `values` (exponentials normalized to sum
    to one).

    The maximum value is subtracted before exponentiation; this is
    mathematically equivalent but avoids overflow (``exp`` of a large
    input returns ``inf``, turning the whole result into NaN).

    :param values: A sequence of numbers.
    """
    values = numpy.asarray(values)
    if values.size:
        # Shift for numerical stability; softmax is shift-invariant.
        values = values - numpy.max(values)
    exps = numpy.exp(values)
    return exps / numpy.sum(exps)
242
243
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 original_domain=None, **kwargs):
        # NOTE(review): several names referenced below (multi_flag,
        # gcv_per_subset, instances, expanded_class) do not appear in the
        # signature as shown in this source excerpt -- signature lines
        # appear to be missing; verify against the repository history.
        self.multi_flag = multi_flag
        self.domain = domain
        self.class_var = domain.class_var
        # Boolean mask of terms kept by the pruning pass.
        self.best_set = best_set
        # Hinge directions and cut points, one row per term.
        self.dirs = dirs
        self.cuts = cuts
        # Fitted coefficients (one row per response).
        self.betas = betas
        self.subsets = subsets
        # NOTE(review): an assignment line (presumably rss_per_subset)
        # appears to be missing from this excerpt here.
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """ Predict the class value and/or distribution for `instance`. """
        if self.multi_flag:
            # NOTE(review): the continuation of this zip (presumably
            # ``self.label_mask) if m]``) is missing from this excerpt.
            resp_vars = [v for v, m in zip(self.domain.variables,
        elif is_discrete(self.class_var):
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multi_flag and is_discrete(self.class_var):
            # Single discrete class: turn the continuous indicator
            # responses into a class distribution.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                # Binary class has a single indicator (for the last value).
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous or multi label response: point predictions with
            # degenerate (single point) distributions.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        if not self.multi_flag:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        (data,) = instances.to_numpy_MA("Ac")
        # Drop label columns; only predictor columns enter the model.
        data = data[:, ~ self.label_mask]
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def predict(self, instance):
        """ Predict the response values for the instance

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """ Return the attributes used by term (index). If no term is given
        return all attributes used in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of the attributes used over all terms.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())
        attrs = [a for a, m in zip(self.domain.variables, self.label_mask)
                 if not m]

        # A non-zero direction entry means the attribute enters the term.
        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support: rebuild via the positional constructor
        # arguments plus the full instance dict.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
378
379
380"""
381Utility functions
382-----------------
383"""
384
385def base_matrix(data, best_set, dirs, cuts):
386    """ Return the base matrix for the earth model.
387
388    :param data: Input data
389    :type data: :class:`numpy.ndarray`
390
391    :param best_set: A array of booleans indicating used terms.
392    :type best_set: :class:`numpy.ndarray`
393
394    :param dirs: Earth model's dirs members
395    :type dirs: :class:`numpy.ndarray`
396
397    :param cuts: Earth model's cuts members
398    :type cuts: :class:`numpy.ndarray`
399
400    """
401    data = numpy.asarray(data)
402    best_set = numpy.asarray(best_set)
403    dirs = numpy.asarray(dirs)
404    cuts = numpy.asarray(cuts)
405
406    bx = numpy.zeros((data.shape[0], best_set.shape[0]))
407    bx[:, 0] = 1.0 # The intercept
408    for termi in range(1, best_set.shape[0]):
409        term_dirs = dirs[termi]
410        term_cuts = cuts[termi]
411
412        dir_p1 = numpy.where(term_dirs == 1)[0]
413        dir_m1 = numpy.where(term_dirs == -1)[0]
414        dir_2 = numpy.where(term_dirs == 2)[0]
415
416        x1 = data[:, dir_p1] - term_cuts[dir_p1]
417        x2 = term_cuts[dir_m1] - data[:, dir_m1]
418        x3 = data[:, dir_2]
419
420        x1 = numpy.where(x1 > 0.0, x1, 0.0)
421        x2 = numpy.where(x2 > 0.0, x2, 0.0)
422
423        X = numpy.hstack((x1, x2, x3))
424        X = numpy.cumprod(X, axis=1)
425        bx[:, termi] = X[:, -1] if X.size else 0.0
426
427    return bx
428
429
431    """ Return the generalized cross validation.
432
433    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)
434
435    :param rss: Residual sum of squares.
436    :param n: Number of training instances.
437    :param n_effective_params: Number of effective paramaters.
438
439    """
440    return  rss / (n * (1. - n_effective_params / n) ** 2)
441
442"""
443Multi-label utility functions
444"""
445
def is_label_attr(attr):
    """ Return True if `attr` is tagged as a label, i.e. has a "label"
    entry in its ``attributes`` dictionary.
    """
    # dict.has_key was removed in Python 3; the `in` operator is the
    # portable (and preferred) equivalent.
    return "label" in attr.attributes
450
def data_label_indices(domain):
    """ Return the indices of label attributes in data.
    """
    # NOTE(review): the function body (a return statement computing the
    # indices from the domain's label mask) is missing from this source
    # excerpt; as shown, the function returns None. Verify against the
    # repository history.
455
457    """ Return an array of booleans indicating whether a variable in the
458    domain is a label.
459    """
460    is_label = map(is_label_attr, domain.variables)
461    if domain.class_var:
462        is_label[-1] = True
463    return numpy.array(is_label, dtype=bool)
464
465"""
466ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
467"""
468
469import ctypes
470from numpy import ctypeslib
471import orange
472
# NOTE(review): the line loading the orange shared library into
# `_c_orange_lib` (via ctypes) is missing from this source excerpt.
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# Argument specification for the C EarthForwardPass routine; the
# parameter names in the trailing comments follow the Earth R package
# sources.
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  #pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  #FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    #Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nPred
     ctypes.c_int,  #nMaxDegree
     ctypes.c_int,  #nMaxTerms
     ctypes.c_double,   #Penalty
     ctypes.c_double,   #Thresh
     ctypes.c_int,  #nFastK
     ctypes.c_double,   #FastBeta
     ctypes.c_double,   #NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #LinPreds
     ctypes.c_bool, #UseBetaCache
     ctypes.c_char_p    #sPredNames
     ]
501
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Run the earth forward pass (delegates to the C EarthForwardPass
    routine in the orange shared library).

    :param x: Predictor matrix (instances x predictors).
    :param y: Response matrix (instances x responses, or a 1-d array).
    :param degree: Maximum degree of the terms.
    :param terms: Maximum number of terms.
    :param penalty: GCV knot penalty (default 2 when None).
    :param thresh: RSS decrease stopping threshold.
    :param fast_k: Fast MARS K.
    :param fast_beta: Fast MARS ageing coefficient.
    :param new_var_penalty: Penalty for introducing a new variable.

    Returns ``(n_terms, used, bx, dirs, cuts)`` as filled in by the C
    routine.
    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables (filled in by the C routine).
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the C
    # function calls exit(). So we must check them here and raise an
    # exception to avoid a process shutdown.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds,
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
569
570
# NOTE(review): `_c_orange_lib` is loaded on a line missing from this
# source excerpt (a ctypes handle on the orange shared library).
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

# NOTE(review): callers pass 8 arguments, but only 7 argtypes are
# visible here -- the RssVec pointer entry (a 1-d double ndpointer after
# PruneTerms) appears to be missing from this excerpt.
_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   #PruneTerms
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  #WeightsArg
     ]

# Non-zero return value signals linearly dependent terms.
_c_eval_subsets_xtx.restype = ctypes.c_int
585
def subset_selection_xtx(X, Y):
    """ Subsets selection using EvalSubsetsUsingXtx in the Earth package.

    Returns ``(subsets_ind, rss_vec)`` where row ``i`` of ``subsets_ind``
    holds the term indices of the best subset of size ``i + 1`` and
    ``rss_vec[i]`` is that subset's residual sum of squares.

    :raises numpy.linalg.LinAlgError: if the C routine reports linearly
        dependent terms in ``X``.
    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    var_count = X.shape[1]
    resp_count = Y.shape[1]
    cases = X.shape[0]
    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
                              order="F")
    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")

    rval = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count, var_count,
                        X, Y, weights)
    if rval != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert the boolean subset columns into index rows.
    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
    for i, used in enumerate(subsets.T):
        subsets_ind[i, :i + 1] = numpy.where(used)[0]

    # NOTE(review): this return statement was missing from the source
    # excerpt; restored to match the callers in pruning_pass.
    return subsets_ind, rss_vec
615
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package.
    Using numpy.linalg.lstsq

    Returns ``(subsets, rss_vec)`` where row ``i`` of ``subsets`` holds
    the column indices of the best subset of size ``i + 1`` and
    ``rss_vec[i]`` is that subset's residual sum of squares.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    # NOTE(review): the `rss_vec` initialization, the lstsq-residual
    # branch, the delta_rss axis reduction and the final return were
    # missing from this source excerpt; restored to match the callers in
    # pruning_pass.
    rss_vec = numpy.zeros((var_count,))
    # list() so `del working_set[i]` also works on Python 3.
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    # Backward elimination: start from the full term set and repeatedly
    # drop the term whose removal increases RSS the least.
    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            # lstsq returns the residual sum of squares directly when
            # X_work has full column rank.
            rss_vec[subset_size] = numpy.sum(res)
        else:
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # RSS increase caused by dropping each term, summed over the
        # response columns.
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]

    return subsets, rss_vec
650
def subset_selection_xtx2(X, Y):
    """ Another implementation (this uses qr decomp).
    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    working_set = range(col_count)
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    # NOTE(review): `rss_vec` is used below but never initialized in this
    # source excerpt -- its initialization line appears to be missing;
    # verify against the repository history.
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
        col_count = len(working_set)

    # Backward elimination over the (rank-reduced) working set.
    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
#        print rsd
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # Drop the term whose removal increases RSS the least.
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    # NOTE(review): the final return statement (presumably
    # ``return subsets, rss_vec``) is missing from this source excerpt.
688
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Perform the model pruning pass.

    Evaluates the best term subset of every size, scores each by GCV and
    returns ``(used, subsets, rss_per_subset, gcv_per_subset)`` where
    ``used`` is a boolean mask of the terms in the GCV-optimal subset.

    .. todo:: pruned_terms, Leaps

    """
    try:
        term_subsets, rss_by_size = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        # Fall back to the pure numpy (lstsq based) implementation.
        term_subsets, rss_by_size = subset_selection_xtx_numpy(bx, y)

    n_cases, n_terms = bx.shape
    # Effective parameter count per subset size, with the knot penalty.
    eff_params = numpy.arange(n_terms) + 1.0
    eff_params = eff_params + penalty * (eff_params - 1.0) / 2.0

    gcv_by_size = gcv(rss_by_size, n_cases, eff_params)

    # Keep the subset with the smallest GCV score.
    best_i = numpy.argmin(gcv_by_size)
    selected = numpy.zeros((n_terms), dtype=bool)
    selected[term_subsets[best_i, :best_i + 1]] = True

    return selected, term_subsets, rss_by_size, gcv_by_size
712
713"""
714Printing functions.
715"""
716
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model,
    one response section per response variable.
    """
    if model.multi_flag:
        # NOTE(review): the continuation of this zip (presumably
        # ``model.label_mask)``) is missing from this source excerpt.
        r_vars = [v for v, m in zip(model.domain.variables,
                  if m]
    elif is_discrete(model.class_var):
        r_vars = model.expanded_class
    else:
        r_vars = [model.class_var]

    r_names = [v.name for v in r_vars]
    betas = model.betas

    # One formatted section per response column.
    resp = []
    for name, betas in zip(r_names, betas):
        resp.append(_format_response(model, name, betas,
                                     percision, indent))
    return "\n\n".join(resp)
737
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """ Format one response column of the model as a readable string:
    a header line followed by one indented line per used term, sorted
    by term complexity (number of attributes, then the attributes).
    """
    header = "%s =" % resp_name
    pad = " " * indent
    fmt = "%." + str(percision) + "f"

    # Each entry pairs the term's attributes with its formatted text;
    # the intercept comes first with no attributes.
    lines = [([], fmt % betas[0])]
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if not used:
            continue
        beta_i += 1
        coef = betas[beta_i]
        coef_str = fmt % abs(coef)

        knots = []
        attrs = []
        for d, c, attr in zip(model.dirs[i], model.cuts[i],
                              model.domain.attributes):
            if d != 0:
                knots.append(_format_knot(model, attr.name, d, c))
                attrs.append(attr)
        attrs = sorted(attrs)

        # The coefficient's sign is shown explicitly; its magnitude is
        # multiplied with the hinge factors.
        sign = "-" if coef < 0 else "+"
        if knots:
            lines.append((attrs, sign + " * ".join([coef_str] + knots)))
        else:
            lines.append((attrs, sign + coef_str))

    # Sort by len(term_attrs), then by term_attrs
    lines = sorted(lines, key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [pad + text for _, text in lines])
763
764def _format_knot(model, name, dir, cut):
765    if dir == 1:
766        txt = "max(0, %s - %.3f)" % (name, cut)
767    elif dir == -1:
768        txt = "max(0, %.3f - %s)" % (cut, name)
769    elif dir == 2:
770        txt = name
771    return txt
772
def _format_term(model, i, attr_name):
    """ Format the i-th model term as a " * "-joined product of its
    hinge/linear factors.
    """
    factors = []
    for d, c, attr in zip(model.dirs[i], model.cuts[i],
                          model.domain.attributes):
        if d != 0:
            factors.append(_format_knot(model, attr, d, c))
    return " * ".join(factors)
778
779"""\
780Variable importance estimation
781------------------------------
782"""
783
784from collections import defaultdict
785
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        # (removed a dead `svar = None` assignment that was never read)
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known source; the variable maps to itself.
            source[var].append(var)
    return dict(source)
805
def map_to_source_var(var, sources):
    """ Map ``var`` back to a variable in ``sources``, following its
    ``source_variable`` / ``get_value_from`` chain; return None when no
    source variable can be found.
    """
    if var in sources:
        return var
    if var.source_variable in sources:
        return var.source_variable
    value_from = var.get_value_from
    if isinstance(value_from, Orange.core.ClassifierFromVar):
        # Recurse through the variable the classifier reads from.
        return map_to_source_var(value_from.variable, sources)
    if isinstance(value_from, Orange.core.ImputeClassifier):
        inner = value_from.classifier_from_var.variable
        return map_to_source_var(inner, sources)
    return None
820
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    :param model: Earth model.
    :type model: `EarthClassifier`
    :param used_only: If True return importances only for the attributes
        actually used by the model.

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # NOTE(review): a line appears to be missing from this source excerpt
    # here -- `rss` is used in the loop below but is never assigned
    # (presumably ``rss = -numpy.diff(model.rss_per_subset)`` mirroring
    # the gcv line); verify against the repository history.
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Columns: [attribute index, n_subsets count, gcv score, rss score].
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    # Credit each attribute with the score improvement of every subset
    # it participates in.
    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    #Normalize importances (to a 0..100 scale).
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        importances = importances[importances[:,1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
868
869
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

    ::

        import Orange
        data = Orange.data.Table("housing")
        c = Orange.regression.earth.EarthLearner(data, degree=3)
        Orange.regression.earth.plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the
    normalized RSS and GCV.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Accept a classifier (single or bagged) directly and extract the
    # importances from it.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    fig = pylab.figure()
    axes1 = fig.add_subplot(111)
    attrs = [a for a, _ in evimp]
    imp = [s for _, s in evimp]
    imp = numpy.array(imp)
    X = range(len(attrs))
    l1 = axes1.plot(X, imp[:, 0], "b-",)
    # Second y axis for the normalized gcv/rss scores.
    axes2 = axes1.twinx()

    l2 = axes2.plot(X, imp[:, 1], "g-",)
    l3 = axes2.plot(X, imp[:, 2], "r-",)

    x_axis = axes1.xaxis
    x_axis.set_ticks(X)
    x_axis.set_ticklabels([a.name for a in attrs], rotation=90)

    axes1.yaxis.set_label_text("nsubsets")
    axes2.yaxis.set_label_text("normalized gcv or rss")

    axes1.legend([l1, l2, l3], ["nsubsets", "gcv", "rss"])
    axes1.set_title("Variable importance")
    fig.show()
916
917
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier

    Example::

        from Orange.ensemble.bagging import BaggedLearner
        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        bagged_evimp(bc)

    """
    def assert_type(obj, class_):
        # Fail fast when the argument is not of the expected type.
        if not isinstance(obj, class_):
            raise TypeError("Instance of %r expected." % (class_))

    from Orange.ensemble.bagging import BaggedClassifier

    assert_type(classifier, BaggedClassifier)
    # Attributes from different ensemble members are matched by name.
    scores_by_name = defaultdict(list)   # name -> list of score tuples
    attrs_by_name = defaultdict(list)    # name -> attribute objects
    for member in classifier.classifiers:
        assert_type(member, EarthClassifier)
        for attr, score in evimp(member, used_only=used_only):
            scores_by_name[attr.name].append(score) # map by name
            attrs_by_name[attr.name].append(attr)

    # Average the collected scores per attribute name.
    averaged = {}
    for name, collected in scores_by_name.items():
        averaged[name] = tuple(numpy.average(collected, axis=0))

    # Order by (n_subsets, gcv), most important first.
    ordered = sorted(averaged.items(),
                     key=lambda t: (t[1][0], t[1][1]),
                     reverse=True)

    result = [(attrs_by_name[name][0], scores) for name, scores in ordered]

    if used_only:
        result = [(a, r) for a, r in result if r[0] > 0]
    return result
958
959"""
960High level interface for measuring variable importance
961(compatible with Orange.feature.scoring module).
962
963"""
964from Orange.feature import scoring
965
class ScoreEarthImportance(scoring.Score):
    """ An :class:`Orange.feature.scoring.Score` subclass.
    Scores features based on their importance in the Earth
    model using ``bagged_evimp``'s function return value.

    """
    # Return types (indices into the evimp score tuple).
    NSUBSETS = 0
    RSS = 1
    GCV = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets", cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).

        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            (:obj:`NSUBSETS`, :obj:`RSS`, :obj:`GCV`).

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        if isinstance(score_what, basestring):
            # Translate a string score selector to its class constant.
            score_what = {"nsubsets":self.NSUBSETS, "rss":self.RSS,
                          "gcv":self.GCV}.get(score_what, None)

        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        self._cache_ref = None
        self._cached_evimp = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse the bagged evimp if scoring the same data table again
        # (identity check, not equality).
        ref = self._cache_ref
        if ref is not None and ref is data:
            evimp = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            bc = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            evimp = bagged_evimp(bc, used_only=False)
            self._cache_ref = data
            self._cached_evimp = evimp

        evimp = dict(evimp)
        score = evimp.get(attr, None)

        if score is None:
            source = collect_source(evimp.keys())
            if attr in source:
                # Return average of source var scores
                return numpy.average([evimp[v][self.score_what] \
                                      for v in source[attr]])
            else:
                raise ValueError("Attribute %r not in the domain." % attr)
        else:
            return score[self.score_what]
1043
1045
    # Capabilities advertised to the scoring framework: only continuous
    # features are handled and no thresholds are computed.
    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False
1049
1050    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
1051        self = scoring.Score.__new__(cls)
1052        if attr is not None and data is not None:
1053            self.__init__(**kwargs)
1054            # TODO: Should raise a warning, about caching
1055            return self.__call__(attr, data, weight_id)
1056        elif not attr and not data:
1057            return self
1058        else:
1059            raise ValueError("Both 'attr' and 'data' arguments expected.")
1060
    def __init__(self):
        # Reference to the last scored data table; lets __call__ skip
        # recomputing the subset selection on repeated calls.
        self._cache_data = None
1064
1065    def __call__(self, attr, data, weight_id=None):
1066        ref = self._cache_data
1067        if ref is not None and ref is data:
1069        else:
1070            x, y = data.to_numpy_MA("1A/c")
1071            try:
1072                subsets, rss = subset_selection_xtx2(x, y)
1073            except numpy.linalg.LinAlgError:
1074                subsets, rss = subset_selection_xtx_numpy(x, y)
1077            for s_size in range(1, subsets.shape[0]):
1078                subset = subsets[s_size, :s_size + 1]
1081            self._cache_data = data
1084        index = list(data.domain.attributes).index(attr)
1086
1087
1088#from Orange.core import EarthLearner as BaseEarthLearner, \
1089#                        EarthClassifier as BaseEarthClassifier
1090#from Orange.misc import member_set
1091#
1092#class _EarthLearner(BaseEarthLearner):
1093#    """ An earth learner.
1094#    """
1095#    def __new__(cls, data=None, weightId=None, **kwargs):
1096#        self = BaseEarthLearner.__new__(cls, **kwargs)
1097#        if data is not None:
1098#            self.__init__(**kwargs)
1099#            return self.__call__(data, weightId)
1100#        return self
1101#
1102#    def __init__(self, max_degree=1, max_terms=21, new_var_penalty=0.0,
1103#                 threshold=0.001, prune=True, penalty=None, fast_k=20,
1104#                 fast_beta=0.0, store_examples=True, **kwargs):
1105#        """ Initialize the learner instance.
1106#
1107#        :param max_degree:
1108#        """
1109#        self.max_degree = max_degree
1110#        self.max_terms = max_terms
1111#        self.new_var_penalty = new_var_penalty
1112#        self.threshold = threshold
#        self.prune = prune
#        if penalty is None:
#            penalty = 2.0 if max_degree > 1 else 3.0
1116#        self.penalty = penalty
1117#        self.fast_k = fast_k
1118#        self.fast_beta = fast_beta
1119#        self.store_examples = store_examples
1120#
1121#        for key, val in kwargs.items():
1122#            setattr(self, key, val)
1123#
1124#    def __call__(self, data, weightId=None):
1125#        if not data.domain.class_var:
1126#            raise ValueError("No class var in the domain.")
1127#
1128#        with member_set(self, "prune", False):
1129#            # We overwrite the prune argument (will do the pruning in python).
1130#            base_clsf =  BaseEarthLearner.__call__(self, data, weightId)
1131#
1132#        if self.prune:
1134#             gcv_per_subset) = self.pruning_pass(base_clsf, data)
1135#
1136#            return _EarthClassifier(base_clsf, data if self.store_examples else None,
1137#                                   best_set=best_set, dirs=base_clsf.dirs,
1138#                                   cuts=base_clsf.cuts,
1139#                                   betas=betas,
1140#                                   subsets=subsets,
1142#                                   gcv_per_subset=gcv_per_subset)
1143#        else:
1144#            return _EarthClassifier(base_clsf, data if self.store_examples else None)
1145#
1146#
1147#    def pruning_pass(self, base_clsf, examples):
1148#        """ Prune the terms constructed in the forward pass.
1149#        (Pure numpy reimplementation)
1150#        """
1151#        n_terms = numpy.sum(base_clsf.best_set)
1152#        n_eff_params = n_terms + self.penalty * (n_terms - 1) / 2.0
1153#        data, y, _ = examples.to_numpy_MA()
1154#        data = data.filled(0.0)
1155#        best_set = numpy.asarray(base_clsf.best_set, dtype=bool)
1156#
1157#        bx = base_matrix(data, base_clsf.best_set,
1158#                         base_clsf.dirs, base_clsf.cuts,
1159#                         )
1160#
1161#        bx_used = bx[:, best_set]
1162#        subsets, rss_per_subset = subsets_selection_xtx(bx_used, y) # TODO: Use leaps like library
1163#        gcv_per_subset = [gcv(rss, bx.shape[0], i + self.penalty * (i - 1) / 2.0) \
1165#        gcv_per_subset = numpy.array(gcv_per_subset)
1166#
1167#        best_i = numpy.argmin(gcv_per_subset[1:]) + 1 # Ignore the intercept
1168#        best_ind = subsets[best_i, :best_i + 1]
1169#        bs_i = 0
1170#        for i, b in enumerate(best_set):
1171#            if b:
1172#                best_set[i] = bs_i in best_ind
1173#                bs_i += 1
1174#
1175#        bx_subset = bx[:, best_set]
1176#        betas, rss, rank, s = numpy.linalg.lstsq(bx_subset, y)
1178#
1179#
1180#class _EarthClassifier(Orange.core.ClassifierFD):
1181#    def __init__(self, base_classifier=None, examples=None, best_set=None,
1182#                 dirs=None, cuts=None, betas=None, subsets=None,
1184#                 gcv_per_subset=None):
1185#        self._base_classifier = base_classifier
1186#        self.examples = examples
1187#        self.domain = base_classifier.domain
1188#        self.class_var = base_classifier.class_var
1189#
1190#        best_set = base_classifier.best_set if best_set is None else best_set
1191#        dirs = base_classifier.dirs if dirs is None else dirs
1192#        cuts = base_classifier.cuts if cuts is None else cuts
1193#        betas = base_classifier.betas if betas is None else betas
1194#
1195#        self.best_set = numpy.asarray(best_set, dtype=bool)
1196#        self.dirs = numpy.array(dirs, dtype=int)
1197#        self.cuts = numpy.array(cuts)
1198#        self.betas = numpy.array(betas)
1199#
1200#        self.subsets = subsets
1202#        self.gcv_per_subset = gcv_per_subset
1203#
1204#    @property
1205#    def num_terms(self):
1206#        """ Number of terms in the model (including the intercept).
1207#        """
1208#        return numpy.sum(numpy.asarray(self.best_set, dtype=int))
1209#
1210#    @property
1211#    def max_terms(self):
1212#        """ Maximum number of terms (as specified in the learning step).
1213#        """
1214#        return self.best_set.size
1215#
1216#    @property
1217#    def num_preds(self):
1218#        """ Number of predictors (variables) included in the model.
1219#        """
1220#        return len(self.used_attributes(term))
1221#
1222#    def __call__(self, example, what=Orange.core.GetValue):
1223#        value = self.predict(example)
1224#        if isinstance(self.class_var, Orange.feature.Continuous):
1225#            value = self.class_var(value)
1226#        else:
1227#            value = self.class_var(int(round(value)))
1228#
1229#        dist = Orange.statistics.distribution.Distribution(self.class_var)
1230#        dist[value] = 1.0
1231#        if what == Orange.core.GetValue:
1232#            return value
1233#        elif what == Orange.core.GetProbabilities:
1234#            return dist
1235#        else:
1236#            return (value, dist)
1237#
1238#    def base_matrix(self, examples=None):
1239#        """ Return the base matrix (bx) of the Earth model for the table.
1240#        If table is not supplied the base matrix of the training examples
1241#        is returned.
1242#
1243#
1244#        :param examples: Input examples for the base matrix.
1245#        :type examples: Orange.data.Table
1246#
1247#        """
1248#        if examples is None:
1249#            examples = self.examples
1250#
1251#        if examples is None:
1252#            raise ValueError("base matrix is only available if 'store_examples=True'")
1253#
1254#        if isinstance(examples, Orange.data.Table):
1255#            data, cls, w = examples.to_numpy_MA()
1256#            data = data.filled(0.0)
1257#        else:
1258#            data = numpy.asarray(examples)
1259#
1260#        return base_matrix(data, self.best_set, self.dirs, self.cuts)
1261#
1262#    def _anova_order(self):
1263#        """ Return indices that sort the terms into the 'ANOVA' format.
1264#        """
1265#        terms = [([], 0)] # intercept
1266#        for i, used in enumerate(self.best_set[1:], 1):
1267#            attrs = sorted(self.used_attributes(i))
1268#            if used and attrs:
1269#                terms.append((attrs, i))
#        terms = sorted(terms, key=lambda t:(len(t[0]), t[0]))
1271#        return [i for _, i in terms]
1272#
1273#    def format_model(self, percision=3, indent=3):
1274#        return format_model(self, percision, indent)
1275#
1276#    def print_model(self, percision=3, indent=3):
1277#        print self.format_model(percision, indent)
1278#
1279#    def predict(self, ex):
1280#        """ Return the predicted value (float) for example.
1281#        """
1282#        x = Orange.data.Table(self.domain, [ex])
1283#        x, c, w = x.to_numpy_MA()
1284#        x = x.filled(0.0)[0]
1285#
1286#        bx = numpy.ones(self.best_set.shape)
1287#        betas = numpy.zeros_like(self.betas)
1288#        betas[0] = self.betas[0]
1289#        beta_i = 0
1290#        for termi in range(1, len(self.best_set)):
1291#            dirs = self.dirs[termi]
1292#            cuts = self.cuts[termi]
1293#            dir_p1 = numpy.where(dirs == 1)[0]
1294#            dir_m1 = numpy.where(dirs == -1)[0]
1295#            dir_2 = numpy.where(dirs == 2)[0]
1296#
1297#            x1 = x[dir_p1] - cuts[dir_p1]
1298#            x2 = cuts[dir_m1] - x[dir_m1]
1299#            x3 = x[dir_2]
1300#
1301#            x1 = numpy.maximum(x1, 0.0)
1302#            x2 = numpy.maximum(x2, 0.0)
1303#
1304#            X = numpy.hstack((x1, x2, x3))
1305#            X = numpy.cumprod(X)
1306#
1307#            bx[termi] = X[-1] if X.size else 0.0
1308#            if self.best_set[termi] != 0:
1309#                beta_i += 1
1310#                betas[beta_i] = self.betas[beta_i]
1311#
1312#        return numpy.sum(bx[self.best_set] * betas)
1313#
1314#    def used_attributes(self, term=None):
1315#        """ Return a list of used attributes. If term (index) is given
1316#        return only attributes used in that single term.
1317#
1318#        """
1319#        if term is None:
1320#            terms = numpy.where(self.best_set)[0]
1321#        else:
1322#            terms = [term]
1323#        attrs = set()
1324#        for ti in terms:
1325#            attri = numpy.where(self.dirs[ti] != 0.0)[0]
1326#            attrs.update([self.domain.attributes[i] for i in attri])
1327#        return attrs
1328#
1329#    def evimp(self, used_only=True):
1330#        """ Return the estimated variable importance.
1331#        """
1332#        return evimp(self, used_only)
1333#
1334#    def __reduce__(self):
1335#        return (EarthClassifier, (self._base_classifier, self.examples,
1336#                                  self.best_set, self.dirs, self.cuts,
1337#                                  self.betas, self.subsets,