source: orange/Orange/regression/earth.py @ 10897:7abac22a882e

Revision 10897:7abac22a882e, 38.6 KB checked in by Ales Erjavec <ales.erjavec@…>, 23 months ago (diff)

Code style fixup.

Line 
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`:
16        http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
17
18.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
19
20Example ::
21
22    >>> import Orange
23    >>> data = Orange.data.Table("housing")
24    >>> c = Orange.regression.earth.EarthLearner(data, degree=2, terms=10)
25    >>> print c
26    MEDV =
27       23.587
28       +11.896 * max(0, RM - 6.431)
29       +1.142 * max(0, 6.431 - RM)
30       -0.612 * max(0, LSTAT - 6.120)
31       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
32       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
33       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
34
35
36.. autoclass:: EarthLearner
37    :members:
38
39.. autoclass:: EarthClassifier
40    :members:
41
42
43Utility functions
44-----------------
45
46.. autofunction:: gcv
47
48.. autofunction:: plot_evimp
49
50.. autofunction:: bagged_evimp
51
52.. autoclass:: ScoreEarthImportance
53
54"""
55
56import Orange
57from Orange.feature import Discrete, Continuous
58from Orange.data import Table, Domain
59from Orange.data.preprocess import DomainContinuizer
60
61import numpy
62
63
def is_discrete(var):
    """Return True if ``var`` is a discrete (:class:`Discrete`) feature."""
    return isinstance(var, Discrete)
66
67
def is_continuous(var):
    """Return True if ``var`` is a continuous (:class:`Continuous`) feature."""
    return isinstance(var, Continuous)
70
71
def expand_discrete(var):
    """ Expand a discrete variable ``var``, returning one continuous
    indicator variable for each value of ``var`` (if the number of values
    is greater than 2; otherwise return a single indicator variable).

    :param var: Discrete variable to expand.
    :type var: :class:`Orange.feature.Discrete`

    """
    if len(var.values) > 2:
        values = var.values
    elif len(var.values) == 2:
        # A binary variable needs only one indicator (for the last value).
        values = var.values[-1:]
    else:
        values = var.values[:1]
    new_vars = []
    for value in values:
        new = Continuous("{0}={1}".format(var.name, value))
        # The indicator's value is computed from `var` through a
        # Discrete2Continuous transformer (1.0 where var == value).
        new.get_value_from = cls = Orange.core.ClassifierFromVar(whichVar=var)
        cls.transformer = Orange.core.Discrete2Continuous()
        cls.transformer.value = int(Orange.core.Value(var, value))
        new.source_variable = var
        new_vars.append(new)
    return new_vars
93
94
def select_attrs(table, features, class_var=None,
                 class_vars=None, metas=None):
    """ Return a new :class:`Orange.data.Table` with a domain containing
    only ``features`` (plus the optional class variable(s) and metas).

    :param table: Source data table.
    :param features: Features to keep in the new domain.
    :param class_var: Optional single class variable.
    :param class_vars: Optional multi-target class variables.
    :param metas: Optional meta attributes to add to the new domain.

    """
    if class_vars is None:
        domain = Domain(features, class_var)
    else:
        domain = Domain(features, class_var, class_vars=class_vars)
    if metas:
        domain.add_metas(metas)
    return Table(domain, table)
106
107
class EarthLearner(Orange.regression.base.BaseRegressionLearner):
    """Earth learner class. Supports both regression and classification
    problems. For classification, class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is fit to these
    new columns. The resulting classifier then computes the response
    values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        # Orange learner convention: when called with data, construct the
        # learner and immediately return a trained classifier instead.
        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
        if instances is not None:
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                **kwds):
        """Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model (default: 1).
        :type degree: int
        :param terms: Maximum number of terms in the forward pass
                (default: 21).  If set to ``None``, ``min(200, max(20, 2
                * n_attributes)) + 1`` will be used, like the default
                setting in earth R package.
        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). Default is 3.0 if ``degree`` is above 1,
            and 2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default: 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default: 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default: ``None``, no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default:
            ``True``); ignored for models with multiple responses.
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default: ``True``).
        :type store_instances: bool

        .. todo:: min_span, pruning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """

        super(EarthLearner, self).__init__()

        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Default penalty as in the earth R package: larger when
            # interaction terms (degree > 1) are allowed.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.__dict__.update(kwds)

        # Leave the class variable alone during continuization; a
        # discrete class is expanded into indicator columns in __call__.
        self.continuizer.class_treatment = DomainContinuizer.Ignore

    def __call__(self, instances, weight_id=None):
        """Fit the model on ``instances`` and return an
        :class:`EarthClassifier`.

        :param instances: Training data.
        :type instances: :class:`Orange.data.Table`
        :param weight_id: Accepted but not used by the fitting code.

        """
        expanded_class = None
        multitarget = False

        if instances.domain.class_var:
            # Single-response domain (classification or regression).
            instances = self.impute_table(instances)
            instances = self.continuize_table(instances)

            if is_discrete(instances.domain.class_var):
                # Expand a discrete class with indicator columns
                expanded_class = expand_discrete(instances.domain.class_var)
                y_table = select_attrs(instances, expanded_class)
                (y, ) = y_table.to_numpy_MA("A")
                (x, ) = instances.to_numpy_MA("A")
            elif is_continuous(instances.domain.class_var):
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")
        elif instances.domain.class_vars:
            # Multi-target domain
            if not all(map(is_continuous, instances.domain.class_vars)):
                raise TypeError("Only continuous multi-target classes are supported.")
            x_table = select_attrs(instances, instances.domain.attributes)
            y_table = select_attrs(instances, instances.domain.class_vars)

            # Impute and continuize only the x_table
            x_table = self.impute_table(x_table)
            x_table = self.continuize_table(x_table)
            # NOTE(review): `domain` is not used below — dead assignment?
            domain = Domain(x_table.domain.attributes,
                            class_vars=instances.domain.class_vars)

            (x, ) = x_table.to_numpy_MA("A")
            (y, ) = y_table.to_numpy_MA("A")

            multitarget = True
        else:
            raise ValueError("Class variable expected.")

        # Standardize a single response column (zero mean, unit std);
        # multi-response models are fit on the raw responses.
        if self.scale_resp and y.shape[1] == 1:
            sy = y - numpy.mean(y, axis=0)
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        # Forward pass on the (possibly scaled) response.
        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning pass: select the term subset minimizing GCV
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas (least squares coefficients) on the unscaled response.
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               multitarget=multitarget,
                               expanded_class=expanded_class
                               )
261
262
def soft_max(values):
    """ Return the softmax of ``values`` as a numpy array.

    The maximum is subtracted before exponentiation for numerical
    stability (mathematically equivalent; avoids overflow for large
    inputs, which previously produced NaNs).

    """
    values = numpy.asarray(values)
    exps = numpy.exp(values - numpy.max(values))
    return exps / numpy.sum(exps)
266
267
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.

    Holds the fitted Earth model — the terms as ``dirs``/``cuts`` arrays,
    the pruning mask ``best_set`` and the least squares coefficients
    ``betas`` — and predicts by evaluating the basis terms on instances.
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 multitarget=False, expanded_class=None,
                 original_domain=None, **kwargs):
        self.multitarget = multitarget
        self.domain = domain
        self.class_var = domain.class_var
        if self.multitarget:
            self.class_vars = domain.class_vars

        # best_set: boolean mask of terms kept by the pruning pass.
        self.best_set = best_set
        # dirs/cuts: per-term hinge directions and cut points
        # (one row per term, one column per attribute).
        self.dirs = dirs
        self.cuts = cuts
        # betas: coefficient matrix, one row per response variable.
        self.betas = betas
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        # instances: training data, if the learner stored it.
        self.instances = instances
        # expanded_class: indicator variables for a discrete class.
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        # Determine the response variables of the underlying model.
        if self.multitarget and self.domain.class_vars:
            resp_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            # A discrete class was fit via one indicator column per value.
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multitarget and is_discrete(self.class_var):
            # Map the indicator responses onto a class distribution.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                # Binary class: a single indicator for the last value.
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous responses: degenerate (point) distributions.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        if not self.multitarget:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied, the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        instances = select_attrs(instances, self.domain.attributes)
        # "A" selects the attribute columns only (masked array).
        (data,) = instances.to_numpy_MA("A")
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def base_features(self):
        """Return a list of features for the included Earth terms.
        The attributes can be used in Orange's domain translation
        (i.e. they define the proper ``get_value_from`` functions).

        """
        terms = []
        # Restrict to the terms kept by pruning.
        dirs = self.dirs[self.best_set]
        cuts = self.cuts[self.best_set]

        for dir, cut in zip(dirs[1:], cuts[1:]):  # Drop the intercept (first column)
            # Build a human-readable name from the term's hinges.
            hinge = [_format_knot(self, attr.name, dir1, cut1) \
                     for (attr, dir1, cut1) in \
                     zip(self.domain.attributes, dir, cut) \
                     if dir1 != 0.0]
            term_name = " * ".join(hinge)
            term = Orange.feature.Continuous(term_name)
            term.get_value_from = term_computer(term, self.domain, dir, cut)
            terms.append(term)
        return terms

    def predict(self, instance):
        """ Predict the response value(s) for ``instance`` and return
        them as a flat numpy array (one entry per response variable).

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """Return the attributes used by term (index). If no term is given,
        return all attributes used in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of the attributes of every term.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())

        attrs = self.domain.attributes

        # An attribute takes part in a term iff its direction is non-zero.
        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support: positional args plus the full __dict__ so that
        # subsets/rss/gcv and other attributes survive a round trip.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.

        :param percision: Number of decimal places in the output.
        :param indent: Indentation (in spaces) of the terms.

        """
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
424
425"""
426Utility functions
427-----------------
428"""
429
430
def base_matrix(data, best_set, dirs, cuts):
    """ Return the base matrix for the earth model.

    :param data: Input data
    :type data: :class:`numpy.ndarray`

    :param best_set: A array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs members
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts members
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_cases = data.shape[0]
    n_terms = best_set.shape[0]

    bx = numpy.zeros((n_cases, n_terms))
    # The first column is the intercept term.
    bx[:, 0] = 1.0
    for term_i in range(1, n_terms):
        t_dirs = dirs[term_i]
        t_cuts = cuts[term_i]

        # Columns entering the term, grouped by hinge direction.
        plus = numpy.where(t_dirs == 1)[0]
        minus = numpy.where(t_dirs == -1)[0]
        linear = numpy.where(t_dirs == 2)[0]

        h_plus = data[:, plus] - t_cuts[plus]
        h_minus = t_cuts[minus] - data[:, minus]

        # Clip the hinges at zero; linear factors enter unchanged.
        h_plus = numpy.where(h_plus > 0.0, h_plus, 0.0)
        h_minus = numpy.where(h_minus > 0.0, h_minus, 0.0)

        factors = numpy.hstack((h_plus, h_minus, data[:, linear]))
        if factors.size:
            bx[:, term_i] = numpy.cumprod(factors, axis=1)[:, -1]
        else:
            bx[:, term_i] = 0.0

    return bx
474
475
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective parameters.

    """
    # Force true division; with plain ints on Python 2,
    # `n_effective_params / n` would truncate to zero.
    n = float(n)
    return rss / (n * (1.0 - n_effective_params / n) ** 2)
487
488
class term_computer(Orange.core.ClassifierFD):
    """A utility class for computing basis terms. Can be used as
    a :obj:`~Orange.feature.Descriptor.get_value_from` member.

    """
    def __init__(self, term_var=None, domain=None, dir=None, cut=None):
        # term_var: the continuous feature this computer defines.
        # domain: domain used to convert incoming instances.
        # dir/cut: the term's hinge directions and cut points
        #   (one entry per attribute of `domain`).
        self.class_var = term_var
        self.domain = domain
        self.dir = dir
        self.cut = cut

    def __call__(self, instance, return_what=Orange.core.GetValue):
        # Convert the instance into a masked numpy row in `self.domain`.
        instance = Orange.data.Table(self.domain, [instance])
        (instance,) = instance.to_numpy_MA("A")
        instance = instance[0]

        # Only attributes with a non-zero direction take part in the term.
        mask = self.dir != 0
        dir = self.dir[mask]
        cut = self.cut[mask]

        # Hinge values: dir * (x - cut), clipped at zero below.
        # NOTE(review): linear factors (dir == 2) get scaled by 2 here,
        # unlike in base_matrix where they enter unchanged — confirm.
        values = instance[mask] - cut
        values *= dir

        values = numpy.where(values > 0, values, 0)
        # Masked (missing) factors are treated as zero in the product.
        value = numpy.prod(values.filled(0))

        return self.class_var(value if value is not numpy.ma.masked else "?")
516
517
518"""
519Multi-label utility functions
520"""
521
522
523"""
524ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
525"""
526
import ctypes
from numpy import ctypeslib
import orange

# The Earth C implementation is compiled into the orange shared library;
# load that library and look up the exported symbols directly.
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# Argument specification for EarthForwardPass. All matrices must be
# Fortran (column-major) contiguous, as the C code expects.
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  # pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  # FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    # Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  # nCases
     ctypes.c_int,  # nResp
     ctypes.c_int,  # nPred
     ctypes.c_int,  # nMaxDegree
     ctypes.c_int,  # nMaxTerms
     ctypes.c_double,   # Penalty
     ctypes.c_double,   # Thresh
     ctypes.c_int,  # nFastK
     ctypes.c_double,   # FastBeta
     ctypes.c_double,   # NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # LinPreds
     ctypes.c_bool, # UseBetaCache
     ctypes.c_char_p    # sPredNames
     ]
559
560
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Do the earth forward pass and return a
    ``(n_terms, full_set, bx, dirs, cuts)`` tuple.

    :param x: Predictor data (2D, cases x predictors).
    :param y: Response data (1D or 2D).
    :param degree: Maximum degree of the terms.
    :param terms: Maximum number of terms.
    :param penalty: GCV penalty per knot (default: 2).
    :param thresh: RSS decrease stopping threshold.
    :param fast_k: Fast MARS k.
    :param fast_beta: Fast MARS aging coefficient.
    :param new_var_penalty: Penalty for adding a new variable.

    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the
    # function calls exit. So we must check them here and raise an
    # exception to avoid a process shutdown.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds,
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
628
629
# Pruning-pass subset evaluation routine; returns a non-zero error code
# on failure (e.g. linearly dependent columns).
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   # PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   # RssVec
     ctypes.c_int,  # nCases
     ctypes.c_int,  # nResp
     ctypes.c_int,  # nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),  # bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),  # y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  # WeightsArg
     ]

_c_eval_subsets_xtx.restype = ctypes.c_int
644
645
def subset_selection_xtx(X, Y):
    """ Subsets selection using EvalSubsetsUsingXtx in the Earth package.

    Return a ``(subsets, rss_vec)`` pair where row ``i`` of ``subsets``
    holds the column indices of the best subset of size ``i + 1`` (the
    rest of the row is zero) and ``rss_vec[i]`` is its residual sum of
    squares.

    :raises numpy.linalg.LinAlgError: if the C routine reports linearly
        dependent terms in ``X``.

    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    var_count = X.shape[1]
    resp_count = Y.shape[1]
    cases = X.shape[0]
    # Output buffers for the C routine (column-major, as it expects).
    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
                              order="F")
    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")

    rval = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count, var_count,
                        X, Y, weights)
    if rval != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert the boolean membership columns into rows of column indices.
    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
    for i, used in enumerate(subsets.T):
        subsets_ind[i, :i + 1] = numpy.where(used)[0]

    return subsets_ind, rss_vec
675
676
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package.
    Using numpy.linalg.lstsq

    Backward elimination: starting from all columns, repeatedly drop the
    term whose removal increases the residual sum of squares the least
    (never the intercept in column 0). Returns ``(subsets, rss_vec)``
    where row ``i`` of ``subsets`` lists the columns of the best subset
    of size ``i + 1`` and ``rss_vec[i]`` is its RSS.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # Use a real list so terms can be deleted during elimination
    # (`range` objects do not support item deletion on Python 3).
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        # rcond=-1 pins the historical (pre numpy-1.14) cutoff behavior.
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y, rcond=-1)
        if res.size > 0:
            rss_vec[subset_size] = numpy.sum(res)
        else:
            # lstsq returns no residuals for rank-deficient systems;
            # compute the RSS explicitly.
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # Drop the term with the smallest RSS increase on removal.
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
711
712
def subset_selection_xtx2(X, Y):
    """ Another implementation (this uses qr decomp).

    Backward elimination of terms using a QR decomposition; returns the
    same ``(subsets, rss_vec)`` pair as :func:`subset_selection_xtx`.

    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    # Use a real list so columns can be deleted during elimination
    # (`range` objects do not support item deletion on Python 3).
    working_set = list(range(col_count))
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # Drop the term whose removal increases RSS the least.
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
749
750
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Do pruning pass

    .. todo:: pruned_terms, Leaps

    """
    # Prefer the fast C routine; fall back to the numpy implementation
    # when it reports linearly dependent terms.
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    n_cases, n_terms = bx.shape

    # Effective parameter count: one per term plus `penalty` per knot
    # (terms - 1 knots, halved as in the earth R package).
    effective_params = numpy.arange(n_terms) + 1.0
    effective_params += penalty * (effective_params - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, n_cases, effective_params)

    # Keep the subset with the minimal GCV.
    best = numpy.argmin(gcv_vec)
    used = numpy.zeros((n_terms), dtype=bool)
    used[subsets[best, :best + 1]] = True

    return used, subsets, rss_vec, gcv_vec
774
775"""
776Printing functions.
777"""
778
779
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model.

    :param model: Earth model.
    :type model: :class:`EarthClassifier`
    :param percision: Number of decimal places in the output.
    :param indent: Indentation (in spaces) of the terms.

    """
    if model.multitarget:
        r_vars = list(model.domain.class_vars)
    elif is_discrete(model.class_var):
        # An expanded discrete class has one response per indicator.
        r_vars = model.expanded_class
    else:
        r_vars = [model.class_var]

    r_names = [v.name for v in r_vars]

    resp = []
    # model.betas holds one row of coefficients per response variable.
    # (Iterate model.betas directly; the original shadowed `betas`.)
    for name, row_betas in zip(r_names, model.betas):
        resp.append(_format_response(model, name, row_betas,
                                     percision, indent))
    return "\n\n".join(resp)
798
799
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """Format a single response column of the model as a string."""
    header = "%s =" % resp_name
    pad = " " * indent
    fmt = "%." + str(percision) + "f"

    # Each entry is (sorted term attributes, formatted term text);
    # the intercept comes first and has no attributes.
    formatted = [([], fmt % betas[0])]
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if not used:
            continue
        beta_i += 1
        coef = betas[beta_i]
        coef_txt = fmt % abs(coef)
        attrs = model.domain.attributes
        knots = [_format_knot(model, attr.name, d, c, percision)
                 for d, c, attr in zip(model.dirs[i], model.cuts[i], attrs)
                 if d != 0]
        term_attrs = sorted([a for a, d in zip(attrs, model.dirs[i])
                             if d != 0])
        sign = "-" if coef < 0 else "+"
        if knots:
            formatted.append((term_attrs,
                              sign + " * ".join([coef_txt] + knots)))
        else:
            formatted.append((term_attrs, sign + coef_txt))
    # Order terms by their number of attributes, then by the attributes.
    formatted.sort(key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [pad + text for _, text in formatted])
826
827
828def _format_knot(model, name, dir, cut, percision=3):
829    fmt = "%%.%if" % percision
830    if dir == 1:
831        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
832    elif dir == -1:
833        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
834    elif dir == 2:
835        txt = name
836    return txt
837
838
839"""\
840Variable importance estimation
841------------------------------
842"""
843
844from collections import defaultdict
845
846
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known source; treat the variable as its own source.
            source[var].append(var)
    return dict(source)
866
867
def map_to_source_var(var, sources):
    """ Map ``var`` back to a variable in ``sources``, following
    ``source_variable`` / ``get_value_from`` links recursively
    (the inverse lookup to :func:`collect_source`).

    :param var: The variable to map.
    :param sources: A container of candidate source variables
        (membership is tested with ``in``).

    :return: The matching source variable, or ``None`` if no link
        leads back to ``sources``.

    """
    if var in sources:
        return var
    elif var.source_variable in sources:
        return var.source_variable
    elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
        return map_to_source_var(var.get_value_from.variable, sources)
    elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
        # Imputed variables wrap a ClassifierFromVar; unwrap one level
        # and recurse.
        var = var.get_value_from.classifier_from_var.variable
        return map_to_source_var(var, sources)
    else:
        return None
882
883
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    For each pruning subset the decrease in RSS and GCV is credited to
    every attribute used by a term in that subset; the per-attribute
    totals are normalized to 0..100 and returned together with the
    number of subsets the attribute appears in.

    :param model: Earth model.
    :type model: `EarthClassifier`
    :param used_only: If True (default), drop attributes that never
        appear in any term subset.

    :return: A list of ``(attribute, (nsubsets, gcv, rss))`` tuples,
        sorted by decreasing importance.
    :raises ValueError: If the model was built without pruning
        (``model.subsets is None``).

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Per-step improvement: the drop in RSS/GCV gained by enlarging the
    # term subset by one (diff is negated because the sequences decrease).
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Columns: [attribute index, nsubsets count, summed gcv, summed rss].
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        # Union of all attributes used by any term in this subset.
        # NOTE(review): relies on the Python 2 builtin `reduce` (the file
        # uses other Py2-only names such as `basestring` elsewhere).
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    #Normalize importances.
    # NOTE(review): divides by (imp_max - imp_min); if all attributes have
    # identical scores this is a division by zero (numpy yields nan/inf
    # with a warning) -- confirm upstream guarantees a spread.
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        # Keep only attributes that appeared in at least one subset.
        importances = importances[importances[:, 1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
931
932
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

    ::

        import Orange
        data = Orange.data.Table("housing")
        c = Orange.regression.earth.EarthLearner(data, degree=3)
        Orange.regression.earth.plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the normalized
    RSS and GCV.

    :param evimp: Importance scores: the list returned by :func:`evimp`,
        an :class:`EarthClassifier`, or a ``BaggedClassifier`` of Earth
        models.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    fig = pylab.figure()
    axes1 = fig.add_subplot(111)
    attrs = [a for a, _ in evimp]
    imp = [s for _, s in evimp]
    imp = numpy.array(imp)
    X = range(len(attrs))
    # Unpack the single Line2D artist from each plot call:
    # `Axes.plot` returns a *list* of lines, and `Axes.legend` expects
    # artists, not lists (passing lists triggers a matplotlib warning
    # and broken legend handles).
    l1, = axes1.plot(X, imp[:, 0], "b-",)
    axes2 = axes1.twinx()

    l2, = axes2.plot(X, imp[:, 1], "g-",)
    l3, = axes2.plot(X, imp[:, 2], "r-",)

    x_axis = axes1.xaxis
    x_axis.set_ticks(X)
    x_axis.set_ticklabels([a.name for a in attrs], rotation=90)

    axes1.yaxis.set_label_text("nsubsets")
    axes2.yaxis.set_label_text("normalized gcv or rss")

    axes1.legend([l1, l2, l3], ["nsubsets", "gcv", "rss"])
    axes1.set_title("Variable importance")
    fig.show()
979
980
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier

    Example::

        from Orange.ensemble.bagging import BaggedLearner
        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        bagged_evimp(bc)

    """
    from Orange.ensemble.bagging import BaggedClassifier

    def check(obj, expected):
        # Guard against being handed the wrong classifier type.
        if not isinstance(obj, expected):
            raise TypeError("Instance of %r expected." % (expected))

    check(classifier, BaggedClassifier)

    # Collect per-model importance scores, keyed by attribute *name*
    # (each bagged model has its own attribute objects).
    scores_by_name = defaultdict(list)
    vars_by_name = defaultdict(list)
    for member in classifier.classifiers:
        check(member, EarthClassifier)
        for attr, score in evimp(member, used_only=used_only):
            scores_by_name[attr.name].append(score)
            vars_by_name[attr.name].append(attr)

    # Average the score tuples element-wise across the ensemble.
    averaged = {}
    for name, scores in scores_by_name.items():
        averaged[name] = tuple(numpy.average(scores, axis=0))

    # Rank by nsubsets, then by gcv, both descending.
    ranked = sorted(averaged.items(),
                    key=lambda item: (item[1][0], item[1][1]),
                    reverse=True)

    # Map names back to a representative attribute object.
    result = [(vars_by_name[name][0], score) for name, score in ranked]

    if used_only:
        result = [(a, s) for a, s in result if s[0] > 0]
    return result
1020
1021"""
1022High level interface for measuring variable importance
1023(compatible with Orange.feature.scoring module).
1024
1025"""
1026from Orange.feature import scoring
1027
1028
class ScoreEarthImportance(scoring.Score):
    """ A subclass of :class:`Orange.feature.scoring.Score` that
    scores features based on their importance in the Earth
    model using ``bagged_evimp``.

    """
    # Return types: which column of the evimp score tuple to report.
    NSUBSETS = 0
    RSS = 1
    GCV = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scoring convention: constructing with (attr, data)
        # computes and returns the score immediately instead of an
        # instance; with no arguments, a reusable scorer is returned.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets",
                 cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).

        :param degree: Maximum degree of terms in each Earth model.

        :param terms: Maximum number of terms in each Earth model.

        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.

        :param cached: Whether to cache results for the last data table.
            NOTE(review): the flag is stored but never consulted in
            ``__call__`` -- caching always happens.

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        if isinstance(score_what, basestring):
            # Map string names onto the class constants; unknown names
            # become None and fail the range check below.
            score_what = {"nsubsets": self.NSUBSETS, "rss": self.RSS,
                          "gcv": self.GCV}.get(score_what, None)

        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        # Identity of the last scored data table and its cached evimp.
        self._cache_ref = None
        self._cached_evimp = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse cached importances when called again on the *same* data
        # table object (identity comparison, not equality).
        ref = self._cache_ref
        if ref is not None and ref is data:
            evimp = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            bc = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            evimp = bagged_evimp(bc, used_only=False)
            self._cache_ref = data
            self._cached_evimp = evimp

        evimp = dict(evimp)
        score = evimp.get(attr, None)

        if score is None:
            # The attribute may be a source variable whose continuized
            # derivatives carry the scores; average over them.
            source = collect_source(evimp.keys())
            if attr in source:
                # Return average of source var scores
                return numpy.average([evimp[v][self.score_what] \
                                      for v in source[attr]])
            else:
                raise ValueError("Attribute %r not in the domain." % attr)
        else:
            return score[self.score_what]
1107
1108
class ScoreRSS(scoring.Score):
    """ Score continuous features by the decrease in residual sum of
    squares attributed to them during linear subset selection
    (presumably forward selection via ``subset_selection_xtx2`` --
    defined elsewhere in this module).

    """

    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Same convention as the other Score subclasses: calling the
        # class with (attr, data) computes and returns the score.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self):
        # Last scored data table (compared by identity) and its cached
        # per-attribute rss scores.
        self._cache_data = None
        self._cache_rss = None

    def __call__(self, attr, data, weight_id=None):
        ref = self._cache_data
        if ref is not None and ref is data:
            rss = self._cache_rss
        else:
            x, y = data.to_numpy_MA("1A/c")
            try:
                subsets, rss = subset_selection_xtx2(x, y)
            except numpy.linalg.LinAlgError:
                # Fall back to the pure-numpy implementation on
                # numerically singular inputs.
                subsets, rss = subset_selection_xtx_numpy(x, y)
            # Convert the per-subset rss sequence into per-attribute
            # improvements: each step's rss drop is credited to every
            # attribute in that subset.
            rss_diff = -numpy.diff(rss)
            rss = numpy.zeros_like(rss)
            for s_size in range(1, subsets.shape[0]):
                subset = subsets[s_size, :s_size + 1]
                rss[subset] += rss_diff[s_size - 1]
            rss = rss[1:]  # Drop the intercept
            self._cache_data = data
            self._cache_rss = rss

        index = list(data.domain.attributes).index(attr)
        return rss[index]
# Note: See TracBrowser for help on using the repository browser.