source: orange/Orange/regression/earth.py @ 10907:fd2b4a470ade

Revision 10907:fd2b4a470ade, 40.0 KB checked in by Ales Erjavec <ales.erjavec@…>, 23 months ago (diff)

Another speedup of earth basis computation.

Line 
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`:
16        http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
17
18.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
19
20Example ::
21
22    >>> import Orange
23    >>> data = Orange.data.Table("housing")
24    >>> c = Orange.regression.earth.EarthLearner(data, degree=2, terms=10)
25    >>> print c
26    MEDV =
27       23.587
28       +11.896 * max(0, RM - 6.431)
29       +1.142 * max(0, 6.431 - RM)
30       -0.612 * max(0, LSTAT - 6.120)
31       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
32       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
33       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
34
35
36.. autoclass:: EarthLearner
37    :members:
38
39.. autoclass:: EarthClassifier
40    :members:
41
42
43Utility functions
44-----------------
45
46.. autofunction:: gcv
47
48.. autofunction:: plot_evimp
49
50.. autofunction:: bagged_evimp
51
52.. autoclass:: ScoreEarthImportance
53
54"""
55
56import Orange
57from Orange.feature import Discrete, Continuous
58from Orange.data import Table, Domain
59from Orange.data.preprocess import DomainContinuizer
60
61import numpy
62
63
def is_discrete(var):
    """Return True if ``var`` is an Orange :class:`Discrete` variable."""
    return isinstance(var, Discrete)
66
67
def is_continuous(var):
    """Return True if ``var`` is an Orange :class:`Continuous` variable."""
    return isinstance(var, Continuous)
70
71
def expand_discrete(var):
    """Expand a discrete variable ``var`` into continuous indicator
    variables.

    One indicator variable is created for each value of ``var`` when the
    number of values is greater than 2; otherwise only a single indicator
    variable is returned.

    """
    n_values = len(var.values)
    if n_values > 2:
        expand_values = var.values
    elif n_values == 2:
        # Two values: one indicator (for the second value) suffices.
        expand_values = var.values[-1:]
    else:
        expand_values = var.values[:1]

    indicators = []
    for val in expand_values:
        indicator = Continuous("{0}={1}".format(var.name, val))
        transform = Orange.core.ClassifierFromVar(whichVar=var)
        transform.transformer = Orange.core.Discrete2Continuous()
        transform.transformer.value = int(Orange.core.Value(var, val))
        indicator.get_value_from = transform
        indicator.source_variable = var
        indicators.append(indicator)
    return indicators
93
94
def select_attrs(table, features, class_var=None,
                 class_vars=None, metas=None):
    """Return a new :class:`Table` from ``table`` containing only
    ``features`` (plus the optional class, multi-class and meta
    variables).
    """
    if class_vars is not None:
        domain = Domain(features, class_var, class_vars=class_vars)
    else:
        domain = Domain(features, class_var)
    if metas:
        domain.add_metas(metas)
    return Table(domain, table)
106
107
class EarthLearner(Orange.regression.base.BaseRegressionLearner):
    """Earth learner class. Supports both regression and classification
    problems. For classification, class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is fit to these
    new columns. The resulting classifier then computes response
    values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        # Orange learner convention: constructing a learner with data
        # immediately trains it and returns the fitted classifier
        # instead of the learner instance.
        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
        if instances is not None:
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                **kwds):
        """Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model (default: 1).
        :type degree: int
        :param terms: Maximum number of terms in the forward pass
                (default: 21).  If set to ``None``, ``min(200, max(20, 2
                * n_attributes)) + 1`` will be used, like the default
                setting in earth R package.
        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). Default is 3.0 if ``degree`` is above 1,
            and 2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default: 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default: 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default: ``None``, no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default:
            ``True``); ignored for models with multiple responses.
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default: ``True``).
        :type store_instances: bool

        .. todo:: min_span, pruning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """

        super(EarthLearner, self).__init__()

        self.degree = degree
        self.terms = terms
        if penalty is None:
            # Default penalty per the earth R package: higher when
            # interaction terms are allowed.
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.__dict__.update(kwds)

        # Leave the class variable alone during continuization; a
        # discrete class is handled separately (expanded into indicator
        # columns in __call__).
        self.continuizer.class_treatment = DomainContinuizer.Ignore

    def __call__(self, instances, weight_id=None):
        """Fit an Earth model on ``instances`` and return an
        :class:`EarthClassifier`.

        :param instances: Training data.
        :type instances: :class:`Orange.data.Table`
        :param weight_id: Meta id of instance weights (not used by the
            forward pass; all weights are set to 1).

        """
        expanded_class = None
        multitarget = False

        if instances.domain.class_var:
            instances = self.impute_table(instances)
            instances = self.continuize_table(instances)

            if is_discrete(instances.domain.class_var):
                # Expand a discrete class with indicator columns
                expanded_class = expand_discrete(instances.domain.class_var)
                y_table = select_attrs(instances, expanded_class)
                (y, ) = y_table.to_numpy_MA("A")
                (x, ) = instances.to_numpy_MA("A")
            elif is_continuous(instances.domain.class_var):
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")
        elif instances.domain.class_vars:
            # Multi-target domain
            if not all(map(is_continuous, instances.domain.class_vars)):
                raise TypeError("Only continuous multi-target classes are supported.")
            x_table = select_attrs(instances, instances.domain.attributes)
            y_table = select_attrs(instances, instances.domain.class_vars)

            # Impute and continuize only the x_table
            x_table = self.impute_table(x_table)
            x_table = self.continuize_table(x_table)
            domain = Domain(x_table.domain.attributes,
                            class_vars=instances.domain.class_vars)

            (x, ) = x_table.to_numpy_MA("A")
            (y, ) = y_table.to_numpy_MA("A")

            multitarget = True
        else:
            raise ValueError("Class variable expected.")

        # Standardize a single response before the forward pass (the
        # pruning pass and the final fit still use the unscaled y).
        if self.scale_resp and y.shape[1] == 1:
            sy = y - numpy.mean(y, axis=0)
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               multitarget=multitarget,
                               expanded_class=expanded_class
                               )
261
262
def soft_max(values):
    """Return the softmax of ``values``: exponentials normalized so
    they sum to one.
    """
    exps = numpy.exp(numpy.asarray(values))
    return exps / numpy.sum(exps)
266
267
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.

    Holds a fitted Earth (MARS) model: the basis term definitions
    (``best_set``, ``dirs``, ``cuts``) plus the fitted coefficients
    (``betas``), and predicts by evaluating the basis terms on new
    instances.

    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 multitarget=False, expanded_class=None,
                 original_domain=None, **kwargs):
        self.multitarget = multitarget
        self.domain = domain
        self.class_var = domain.class_var
        if self.multitarget:
            self.class_vars = domain.class_vars

        # Model structure (see module-level base_matrix for the
        # semantics of dirs/cuts; best_set flags the terms kept after
        # the pruning pass).
        self.best_set = best_set
        self.dirs = dirs
        self.cuts = cuts
        self.betas = betas
        # Pruning-pass diagnostics (per candidate subset size).
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """Classify ``instance``; return a value, a distribution, or
        both, depending on ``result_type``.
        """
        if self.multitarget and self.domain.class_vars:
            resp_vars = list(self.domain.class_vars)
        elif is_discrete(self.class_var):
            # A discrete class was fit as multiple indicator responses.
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multitarget and is_discrete(self.class_var):
            # Map the continuous indicator responses back to a class
            # distribution and predict its mode.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous response(s): one point distribution per value.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        if not self.multitarget:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied, the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        instances = select_attrs(instances, self.domain.attributes)
        (data,) = instances.to_numpy_MA("A")
        # Delegate to the module-level base_matrix function.
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def base_features(self):
        """Return a list of features for the included Earth terms.
        The attributes can be used in Orange's domain translation
        (i.e. they define the proper ``get_value_from`` functions).

        """
        terms = []
        dirs = self.dirs[self.best_set]
        cuts = self.cuts[self.best_set]
        # For faster domain translation all the features share
        # this _instance_cache.
        _instance_cache = {}
        for dir, cut in zip(dirs[1:], cuts[1:]):  # Drop the intercept (first column)
            hinge = [_format_knot(self, attr.name, dir1, cut1) \
                     for (attr, dir1, cut1) in \
                     zip(self.domain.attributes, dir, cut) \
                     if dir1 != 0.0]
            term_name = " * ".join(hinge)
            term = Orange.feature.Continuous(term_name)
            term.get_value_from = term_computer(
                term, self.domain, dir, cut,
                _instance_cache=_instance_cache
            )

            terms.append(term)
        return terms

    def predict(self, instance):
        """ Predict the response value(s)

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        # Linear combination of the active basis terms.
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """Return the used attributes for term (index). If no term is given,
        return all attributes used in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of attributes over all terms.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())

        attrs = self.domain.attributes

        # An attribute takes part in a term iff its direction is nonzero.
        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle via the constructor arguments; the remaining state is
        # restored from the instance __dict__.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
430
431"""
432Utility functions
433-----------------
434"""
435
436
def base_matrix(data, best_set, dirs, cuts):
    """ Return the base matrix for the earth model.

    :param data: Input data
    :type data: :class:`numpy.ndarray`

    :param best_set: A array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs members
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts members
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_cases = data.shape[0]
    n_terms = best_set.shape[0]
    bx = numpy.zeros((n_cases, n_terms))
    bx[:, 0] = 1.0  # The intercept column
    for term in range(1, n_terms):
        direction = dirs[term]
        knots = cuts[term]

        # Column indices by hinge orientation (2 means linear entry).
        plus_idx = numpy.where(direction == 1)[0]
        minus_idx = numpy.where(direction == -1)[0]
        linear_idx = numpy.where(direction == 2)[0]

        hinge_plus = data[:, plus_idx] - knots[plus_idx]
        hinge_minus = knots[minus_idx] - data[:, minus_idx]
        hinge_plus = numpy.where(hinge_plus > 0.0, hinge_plus, 0.0)
        hinge_minus = numpy.where(hinge_minus > 0.0, hinge_minus, 0.0)
        linear_part = data[:, linear_idx]

        factors = numpy.hstack((hinge_plus, hinge_minus, linear_part))
        if factors.size:
            # Product of all hinge/linear factors of this term.
            bx[:, term] = numpy.cumprod(factors, axis=1)[:, -1]
        else:
            bx[:, term] = 0.0

    return bx
480
481
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective parameters.

    """
    # Coerce n to float so the formula is also correct for plain int
    # arguments (guards against Python 2 integer division in
    # n_effective_params / n).
    n = float(n)
    return rss / (n * (1.0 - n_effective_params / n) ** 2)
493
494
class term_computer(Orange.core.ClassifierFD):
    """A utility class for computing basis terms. Can be used as
    a :obj:`~Orange.feature.Descriptor.get_value_from` member.

    """
    def __init__(self, term_var=None, domain=None, dir=None, cut=None,
                 _instance_cache=None):
        self.class_var = term_var
        self.domain = domain

        self.dir = dir
        self.cut = cut

        if dir is not None:
            # Precompute the attributes taking part in this term and
            # their directions/cut points, for fast evaluation.
            self.mask = self.dir != 0
            self.masked_dir = self.dir[self.mask]
            self.masked_cut = self.cut[self.mask]
        else:
            # backcompat. with old pickled format.
            self.mask = self.masked_dir = self.masked_cut = None

        self._instance_cache = _instance_cache

    def __call__(self, instance, return_what=Orange.core.GetValue):
        """Evaluate this term for ``instance`` (the product of its
        hinge values); return '?' if any needed value is missing.
        """
        instance = self._instance_as_masked_array(instance)

        if self.mask is None:
            # Lazily initialize the masks (instance restored from an
            # old pickled format).
            self.mask = self.dir != 0
            self.masked_dir = self.dir[self.mask]
            self.masked_cut = self.cut[self.mask]

        values = instance[self.mask]
        if numpy.ma.is_masked(values):
            # Can't compute the term.
            return self.class_var("?")

        # Works faster with plain arrays
        values = numpy.array(values)
        values -= self.masked_cut
        values *= self.masked_dir

        values[values < 0] = 0
        value = numpy.prod(values)

        return self.class_var(value)

    def _instance_as_masked_array(self, instance):
        # Translate ``instance`` into this term's domain and return its
        # attribute values as a 1D masked array. A shared single-entry
        # cache avoids repeating the domain translation when many terms
        # are evaluated on the same instance.
        array = None
        if self._instance_cache is not None:
            array = self._instance_cache.get(instance, None)

        if array is None:
            table = Orange.data.Table(self.domain, [instance])
            (array,) = table.to_numpy_MA("A")
            array = array[0]

            if self._instance_cache is not None:
                self._instance_cache.clear()
                self._instance_cache[instance] = array
        return array

    def __reduce__(self):
        # Pickle via the constructor arguments plus the instance dict.
        return (type(self), (self.class_var, self.domain, self.dir, self.cut),
                dict(self.__dict__.items()))
559
560
561"""
562Multi-label utility functions
563"""
564
565
566"""
567ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
568"""
569
570import ctypes
571from numpy import ctypeslib
572import orange
573
# Load the compiled Earth implementation shipped inside the orange
# shared library and declare the signature of its ForwardPass entry
# point for ctypes.
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# Argument types of EarthForwardPass (names follow the C source of the
# Earth R package).
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  # pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  # FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    # Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  # nCases
     ctypes.c_int,  # nResp
     ctypes.c_int,  # nPred
     ctypes.c_int,  # nMaxDegree
     ctypes.c_int,  # nMaxTerms
     ctypes.c_double,   # Penalty
     ctypes.c_double,   # Thresh
     ctypes.c_int,  # nFastK
     ctypes.c_double,   # FastBeta
     ctypes.c_double,   # NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # LinPreds
     ctypes.c_bool, # UseBetaCache
     ctypes.c_char_p    # sPredNames
     ]
602
603
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Do the earth forward pass (grow the basis term set) by calling
    the C ``EarthForwardPass`` routine.

    :param x: Predictor data (2D array, one row per instance).
    :param y: Response data (1D or 2D array).
    :param degree: Maximum degree of interaction terms.
    :param terms: Maximum number of terms to grow.
    :param penalty: GCV penalty per knot (default: 2).
    :param thresh: Threshold for the RSS decrease stopping criterion.
    :param fast_k: Fast MARS K parameter.
    :param fast_beta: Fast MARS aging coefficient.
    :param new_var_penalty: Penalty for introducing a new variable.
    :return: A ``(n_terms, used, bx, dirs, cuts)`` tuple.

    """
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]

    # Output variables
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the function
    # calls exit. So we must check them here and raise an exception to avoid
    # a process shutdown.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds,
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
671
672
# Declare the ctypes signature of the EvalSubsetsUsingXtx routine from
# the compiled Earth implementation (used by the pruning pass).
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   # PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   # RssVec
     ctypes.c_int,  # nCases
     ctypes.c_int,  # nResp
     ctypes.c_int,  # nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),  # bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),  # y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  # WeightsArg
     ]

# Nonzero return value signals linearly dependent terms.
_c_eval_subsets_xtx.restype = ctypes.c_int
687
688
def subset_selection_xtx(X, Y):
    """ Term subset selection using EvalSubsetsUsingXtx from the Earth
    package.
    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    cases, var_count = X.shape
    resp_count = Y.shape[1]
    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
                          order="F")
    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")

    error = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count,
                                var_count, X, Y, weights)
    if error != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert the boolean subset masks into index form (row k holds the
    # indices of the best subset of size k + 1).
    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
    for size, used in enumerate(subsets.T):
        subsets_ind[size, :size + 1] = numpy.where(used)[0]

    return subsets_ind, rss_vec
718
719
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package.
    Using numpy.linalg.lstsq

    Performs backward elimination over the columns (terms) of ``X``,
    recording for each subset size the retained column indices and the
    residual sum of squares of the corresponding least squares fit.

    :param X: Basis matrix (2D array); column 0 is the intercept.
    :param Y: Response matrix (2D array).
    :return: ``(subsets, rss_vec)`` where ``subsets[k, :k + 1]`` holds
        the column indices of the subset of size ``k + 1`` and
        ``rss_vec[k]`` its residual sum of squares.

    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # Use an explicit list so terms can be deleted below (on Python 3
    # range() is not a mutable sequence).
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            rss_vec[subset_size] = numpy.sum(res)
        else:
            # lstsq returns an empty residual array for rank-deficient
            # systems; compute the RSS explicitly in that case.
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # Drop the term whose removal increases the RSS the least,
        # never removing the intercept (index 0 of the working set).
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
754
755
def subset_selection_xtx2(X, Y):
    """ Another implementation (this uses qr decomp).

    Backward elimination over the columns of ``X`` using QR
    decomposition for the least squares fits; rank-deficient columns
    are removed up front and their subset sizes receive an ``inf`` RSS.

    NOTE(review): ``del working_set[i]`` relies on ``range`` returning
    a list, i.e. Python 2 semantics.
    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    working_set = range(col_count)
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        # RSS of the fit with the current working set.
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # Remove the term whose deletion increases the RSS the least.
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
792
793
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Do the pruning pass: pick the term subset with the lowest GCV.

    .. todo:: pruned_terms, Leaps

    """
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        # The C routine rejects linearly dependent terms; fall back to
        # the pseudo-inverse based numpy implementation.
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    cases, terms = bx.shape
    # Effective parameter count: one per term plus `penalty` per knot.
    n_effective_params = numpy.arange(terms) + 1.0
    n_effective_params += penalty * (n_effective_params - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, cases, n_effective_params)

    best_i = numpy.argmin(gcv_vec)
    used = numpy.zeros((terms), dtype=bool)
    used[subsets[best_i, :best_i + 1]] = True

    return used, subsets, rss_vec, gcv_vec
817
818"""
819Printing functions.
820"""
821
822
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model:
    one block per response variable.
    """
    if model.multitarget:
        response_vars = list(model.domain.class_vars)
    elif is_discrete(model.class_var):
        response_vars = model.expanded_class
    else:
        response_vars = [model.class_var]

    blocks = [_format_response(model, var.name, beta_row,
                               percision, indent)
              for var, beta_row in zip(response_vars, model.betas)]
    return "\n\n".join(blocks)
841
842
def _format_response(model, resp_name, betas, percision=3, indent=3):
    # Format a single response of the model as a "<name> =" header
    # followed by one "<sign><beta>[ * max(0, ...) ...]" line per term.
    header = "%s =" % resp_name
    indent = " " * indent
    fmt = "%." + str(percision) + "f"
    # terms: (sorted term attributes, formatted term string) pairs;
    # the first entry is the intercept.
    terms = [([], fmt % betas[0])]
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if used:
            beta_i += 1
            beta = fmt % abs(betas[beta_i])
            knots = [_format_knot(model, attr.name, d, c, percision) \
                     for d, c, attr in \
                     zip(model.dirs[i], model.cuts[i], model.domain.attributes) \
                     if d != 0]
            term_attrs = [a for a, d in zip(model.domain.attributes, model.dirs[i]) \
                          if d != 0]
            term_attrs = sorted(term_attrs)
            # The coefficient's sign becomes the leading +/-.
            sign = "-" if betas[beta_i] < 0 else "+"
            if knots:
                terms.append((term_attrs,
                              sign + " * ".join([beta] + knots)))
            else:
                terms.append((term_attrs, sign + beta))
    # Sort by len(term_attrs), then by term_attrs
    terms = sorted(terms, key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [indent + t for _, t in terms])
869
870
871def _format_knot(model, name, dir, cut, percision=3):
872    fmt = "%%.%if" % percision
873    if dir == 1:
874        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
875    elif dir == -1:
876        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
877    elif dir == 2:
878        txt = name
879    return txt
880
881
882"""\
883Variable importance estimation
884------------------------------
885"""
886
887from collections import defaultdict
888
889
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        # Removed dead local (`svar = None` was never read).
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known source - the variable is its own source.
            source[var].append(var)
    return dict(source)
909
910
def map_to_source_var(var, sources):
    """ Map ``var`` to a variable in ``sources`` by following its
    ``source_variable``/``get_value_from`` links; return None when no
    source variable can be found.

    """
    if var in sources:
        return var
    if var.source_variable in sources:
        return var.source_variable
    transformer = var.get_value_from
    if isinstance(transformer, Orange.core.ClassifierFromVar):
        return map_to_source_var(transformer.variable, sources)
    if isinstance(transformer, Orange.core.ImputeClassifier):
        inner = transformer.classifier_from_var.variable
        return map_to_source_var(inner, sources)
    return None
925
926
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    For each pruning-pass subset, the drop in RSS/GCV between
    consecutive subset sizes is credited to every attribute used by the
    subset's terms; the accumulated credits are then normalized to
    [0, 100].

    :param model: Earth model.
    :type model: `EarthClassifier`
    :param used_only: If True, only attributes appearing in at least one
        subset are returned.

    Returns a list of ``(attribute, (nsubsets, gcv, rss))`` pairs,
    sorted by nsubsets then gcv, descending.

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Per-step decrease in RSS/GCV as the subset grows by one term.
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    attr2ind = dict(zip(attributes, range(len(attributes))))
    # Columns: [attribute index, nsubsets count, gcv credit, rss credit].
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        # All attributes participating in any term of this subset.
        # NOTE: bare `reduce` - this module targets Python 2.
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    # Normalize the gcv and rss importances to the [0, 100] range.
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        # Keep only attributes used in at least one subset.
        importances = importances[importances[:, 1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
974
975
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.

    ::

        import Orange
        data = Orange.data.Table("housing")
        c = Orange.regression.earth.EarthLearner(data, degree=3)
        Orange.regression.earth.plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the normalized
    RSS and GCV.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Also accept a classifier directly and extract its importances.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    figure = pylab.figure()
    left_axes = figure.add_subplot(111)
    attrs = [attr for attr, _ in evimp]
    scores = numpy.array([score for _, score in evimp])
    xs = range(len(attrs))
    # nsubsets on the left axis; normalized gcv/rss on a twin right axis.
    nsubsets_line = left_axes.plot(xs, scores[:, 0], "b-",)
    right_axes = left_axes.twinx()

    gcv_line = right_axes.plot(xs, scores[:, 1], "g-",)
    rss_line = right_axes.plot(xs, scores[:, 2], "r-",)

    x_axis = left_axes.xaxis
    x_axis.set_ticks(xs)
    x_axis.set_ticklabels([attr.name for attr in attrs], rotation=90)

    left_axes.yaxis.set_label_text("nsubsets")
    right_axes.yaxis.set_label_text("normalized gcv or rss")

    left_axes.legend([nsubsets_line, gcv_line, rss_line],
                     ["nsubsets", "gcv", "rss"])
    left_axes.set_title("Variable importance")
    figure.show()
1022
1023
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier

    Example::

        from Orange.ensemble.bagging import BaggedLearner
        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        bagged_evimp(bc)

    """
    def assert_type(object, class_):
        if not isinstance(object, class_):
            raise TypeError("Instance of %r expected." % (class_))

    from Orange.ensemble.bagging import BaggedClassifier

    assert_type(classifier, BaggedClassifier)
    # Group the per-model scores by attribute *name*, since the bagged
    # models hold distinct attribute objects.
    scores_by_name = defaultdict(list)
    attrs_by_name = defaultdict(list)
    for member in classifier.classifiers:
        assert_type(member, EarthClassifier)
        for attr, score in evimp(member, used_only=used_only):
            scores_by_name[attr.name].append(score)
            attrs_by_name[attr.name].append(attr)

    # Average the scores of each attribute across the ensemble members.
    averaged = {}
    for name, scores in scores_by_name.items():
        averaged[name] = tuple(numpy.average(scores, axis=0))

    # Order by mean nsubsets, then mean gcv, descending.
    ordered = sorted(averaged.items(),
                     key=lambda item: (item[1][0], item[1][1]),
                     reverse=True)

    result = [(attrs_by_name[name][0], score) for name, score in ordered]

    if used_only:
        result = [(attr, score) for attr, score in result if score[0] > 0]
    return result
1063
1064"""
1065High level interface for measuring variable importance
1066(compatible with Orange.feature.scoring module).
1067
1068"""
1069from Orange.feature import scoring
1070
1071
class ScoreEarthImportance(scoring.Score):
    """ A subclass of :class:`Orange.feature.scoring.Score` that
    scores features based on their importance in the Earth
    model using ``bagged_evimp``.

    """
    # Return types (indices into the evimp score tuple).
    NSUBSETS = 0
    RSS = 1
    GCV = 2

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scorer convention: calling the class with (attr, data)
        # constructs the scorer and immediately scores the attribute;
        # with neither, it returns a plain instance for later calls.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets",
                 cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).

        :param degree: ``degree`` argument passed to the underlying
            :class:`EarthLearner`.

        :param terms: ``terms`` argument passed to the underlying
            :class:`EarthLearner`.

        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.

        :param cached: Cache the evimp scores of the last scored data
            table.  NOTE(review): the flag is stored but ``__call__``
            currently caches regardless of its value - confirm intent.

        """
        self.t = t
        self.degree = degree
        self.terms = terms
        # Accept either a string name or one of the class constants.
        if isinstance(score_what, basestring):
            score_what = {"nsubsets": self.NSUBSETS, "rss": self.RSS,
                          "gcv": self.GCV}.get(score_what, None)

        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        # Identity of the last scored table and its evimp result.
        self._cache_ref = None
        self._cached_evimp = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse the cached evimp when scoring the same data table again
        # (identity check, not equality).
        ref = self._cache_ref
        if ref is not None and ref is data:
            evimp = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            # Train `t` bagged earth models and pool their importances.
            bc = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            evimp = bagged_evimp(bc, used_only=False)
            self._cache_ref = data
            self._cached_evimp = evimp

        evimp = dict(evimp)
        score = evimp.get(attr, None)

        if score is None:
            # `attr` may be a source of one or more expanded/continuized
            # variables in the model domain.
            source = collect_source(evimp.keys())
            if attr in source:
                # Return average of source var scores
                return numpy.average([evimp[v][self.score_what] \
                                      for v in source[attr]])
            else:
                raise ValueError("Attribute %r not in the domain." % attr)
        else:
            return score[self.score_what]
1150
1151
class ScoreRSS(scoring.Score):
    """ Score continuous features by the residual sum of squares (RSS)
    decrease credited to them during term subset selection on the data.

    """

    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scorer convention: calling the class with (attr, data)
        # constructs the instance and immediately returns the score.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")

    def __init__(self):
        # Identity of the last scored table and its per-attribute rss.
        self._cache_data = None
        self._cache_rss = None

    def __call__(self, attr, data, weight_id=None):
        # Reuse cached rss when scoring the same table again (identity).
        ref = self._cache_data
        if ref is not None and ref is data:
            rss = self._cache_rss
        else:
            # presumably "1A/c" = intercept + attributes, class as y
            # - TODO confirm against Table.to_numpy_MA docs.
            x, y = data.to_numpy_MA("1A/c")
            try:
                # NOTE(review): other call sites in this module use
                # `subset_selection_xtx`; confirm `subset_selection_xtx2`
                # exists and is not a typo.
                subsets, rss = subset_selection_xtx2(x, y)
            except numpy.linalg.LinAlgError:
                subsets, rss = subset_selection_xtx_numpy(x, y)
            # Credit each step's rss decrease to the columns of the
            # corresponding subset.
            rss_diff = -numpy.diff(rss)
            rss = numpy.zeros_like(rss)
            for s_size in range(1, subsets.shape[0]):
                subset = subsets[s_size, :s_size + 1]
                rss[subset] += rss_diff[s_size - 1]
            rss = rss[1:]  # Drop the intercept
            self._cache_data = data
            self._cache_rss = rss

        index = list(data.domain.attributes).index(attr)
        return rss[index]
Note: See TracBrowser for help on using the repository browser.