source: orange/Orange/regression/earth.py @ 10953:a3b2254b1843

Revision 10953:a3b2254b1843, 40.7 KB, checked in by Ales Erjavec <ales.erjavec@…>, 21 months ago

Check for non-finite values in the data.

The earth C code calls 'exit()' if they are present in the data (fixes #1213).

1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`:
16        http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
17
18.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
19
20Example ::
21
22    >>> import Orange
23    >>> data = Orange.data.Table("housing")
24    >>> c = Orange.regression.earth.EarthLearner(data, degree=2, terms=10)
25    >>> print c
26    MEDV =
27       23.587
28       +11.896 * max(0, RM - 6.431)
29       +1.142 * max(0, 6.431 - RM)
30       -0.612 * max(0, LSTAT - 6.120)
31       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
32       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
33       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
34
35
36.. autoclass:: EarthLearner
37    :members:
38
39.. autoclass:: EarthClassifier
40    :members:
41
42
43Utility functions
44-----------------
45
46.. autofunction:: gcv
47
48.. autofunction:: plot_evimp
49
50.. autofunction:: bagged_evimp
51
52.. autoclass:: ScoreEarthImportance
53
54"""
55
56import Orange
57from Orange.feature import Discrete, Continuous
58from Orange.data import Table, Domain
59from Orange.data.preprocess import DomainContinuizer
60
61import numpy
62
63
64def is_discrete(var):
65    return isinstance(var, Discrete)
66
67
68def is_continuous(var):
69    return isinstance(var, Continuous)
70
71
72def expand_discrete(var):
73    """ Expand a discrete variable ``var``, returning one continuous indicator
74    variable for each value of ``var`` if the number of values is greater
75    than 2; otherwise return only a single indicator variable.
76
77    """
78    if len(var.values) > 2:
79        values = var.values
80    elif len(var.values) == 2:
81        values = var.values[-1:]
82    else:
83        values = var.values[:1]
84    new_vars = []
85    for value in values:
86        new = Continuous("{0}={1}".format(var.name, value))
87        new.get_value_from = cls = Orange.core.ClassifierFromVar(whichVar=var)
88        cls.transformer = Orange.core.Discrete2Continuous()
89        cls.transformer.value = int(Orange.core.Value(var, value))
90        new.source_variable = var
91        new_vars.append(new)
92    return new_vars
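# A minimal sketch of what expand_discrete returns (hypothetical feature;
# kept as a comment so nothing runs at import time):
#
#   color = Orange.feature.Discrete("color", values=["red", "green", "blue"])
#   indicators = expand_discrete(color)
#   # -> Continuous features "color=red", "color=green", "color=blue"
#
# A two-valued variable yields a single indicator for its second value.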
93
94
95def select_attrs(table, features, class_var=None,
96                 class_vars=None, metas=None):
97    """ Select only the given ``features`` from the ``table``.
98    """
99    if class_vars is None:
100        domain = Domain(features, class_var)
101    else:
102        domain = Domain(features, class_var, class_vars=class_vars)
103    if metas:
104        domain.add_metas(metas)
105    return Table(domain, table)
106
107
108class EarthLearner(Orange.regression.base.BaseRegressionLearner):
109    """Earth learner class. Supports both regression and classification
110    problems. For classification, class values are expanded into
111    continuous indicator columns (one for each value if the number of
112    values is greater than 2), and a multi-response model is fit to these
113    new columns. The resulting classifier then computes the response
114    values on new instances to select the final predicted class.
115
116    """
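    # Usage sketch for a discrete class (assumes the bundled "iris" data set
    # is available; kept as a comment):
    #
    #   data = Orange.data.Table("iris")
    #   classifier = EarthLearner(data, degree=2, terms=10)
    #   print classifier(data[0])
    #
    # The class values are expanded into indicator columns, a single
    # multi-response model is fit, and the value with the largest response
    # (after the soft-max mapping in EarthClassifier) is predicted.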
117    def __new__(cls, instances=None, weight_id=None, **kwargs):
118        self = Orange.regression.base.BaseRegressionLearner.__new__(cls)
119        if instances is not None:
120            self.__init__(**kwargs)
121            return self.__call__(instances, weight_id)
122        else:
123            return self
124
125    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
126                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
127                 pruned_terms=None, scale_resp=True, store_instances=True,
128                **kwds):
129        """Initialize the learner instance.
130
131        :param degree: Maximum degree (num. of hinge functions per term)
132            of the terms in the model (default: 1).
133        :type degree: int
134        :param terms: Maximum number of terms in the forward pass
135                (default: 21).  If set to ``None``, ``min(200, max(20, 2
136                * n_attributes)) + 1`` will be used, matching the default
137                setting in the earth R package.
138        :type terms: int
139        :param penalty: Penalty for hinges in the GCV computation (used
140            in the pruning pass). Default is 3.0 if ``degree`` is above 1,
141            and 2.0 otherwise.
142        :type penalty: float
143        :param thresh: Threshold for RSS decrease in the forward pass
144            (default: 0.001).
145        :type thresh: float
146        :param min_span: TODO.
147        :param new_var_penalty: Penalty for introducing a new variable
148            in the model during the forward pass (default: 0).
149        :type new_var_penalty: float
150        :param fast_k: Fast MARS parameter (maximum number of parent terms considered).
151        :param fast_beta: Fast MARS ageing coefficient.
152        :param pruned_terms: Maximum number of terms in the model after
153            pruning (default: ``None``, no limit).
154        :type pruned_terms: int
155        :param scale_resp: Scale responses prior to forward pass (default:
156            ``True``); ignored for models with multiple responses.
157        :type scale_resp: bool
158        :param store_instances: Store training instances in the model
159            (default: ``True``).
160        :type store_instances: bool
161
162        .. todo:: min_span, pruning_method (needs Leaps-like functionality;
163            currently only eval_subsets_using_xtx is implemented).
164
165        """
166
167        super(EarthLearner, self).__init__()
168
169        self.degree = degree
170        self.terms = terms
171        if penalty is None:
172            penalty = 3 if degree > 1 else 2
173        self.penalty = penalty
174        self.thresh = thresh
175        self.min_span = min_span
176        self.new_var_penalty = new_var_penalty
177        self.fast_k = fast_k
178        self.fast_beta = fast_beta
179        self.pruned_terms = pruned_terms
180        self.scale_resp = scale_resp
181        self.store_instances = store_instances
182        self.__dict__.update(kwds)
183
184        self.continuizer.class_treatment = DomainContinuizer.Ignore
185
186    def __call__(self, instances, weight_id=None):
187        expanded_class = None
188        multitarget = False
189
190        if instances.domain.class_var:
191            instances = self.impute_table(instances)
192            instances = self.continuize_table(instances)
193
194            if is_discrete(instances.domain.class_var):
195                # Expand a discrete class with indicator columns
196                expanded_class = expand_discrete(instances.domain.class_var)
197                y_table = select_attrs(instances, expanded_class)
198                (y, ) = y_table.to_numpy_MA("A")
199                (x, ) = instances.to_numpy_MA("A")
200            elif is_continuous(instances.domain.class_var):
201                x, y, _ = instances.to_numpy_MA()
202                y = y.reshape((-1, 1))
203            else:
204                raise ValueError("Cannot handle the response.")
205        elif instances.domain.class_vars:
206            # Multi-target domain
207            if not all(map(is_continuous, instances.domain.class_vars)):
208                raise TypeError("Only continuous multi-target classes are supported.")
209            x_table = select_attrs(instances, instances.domain.attributes)
210            y_table = select_attrs(instances, instances.domain.class_vars)
211
212            # Impute and continuize only the x_table
213            x_table = self.impute_table(x_table)
214            x_table = self.continuize_table(x_table)
215            domain = Domain(x_table.domain.attributes,
216                            class_vars=instances.domain.class_vars)
217
218            (x, ) = x_table.to_numpy_MA("A")
219            (y, ) = y_table.to_numpy_MA("A")
220
221            multitarget = True
222        else:
223            raise ValueError("Class variable expected.")
224
225        # check for non-finite values in y.
226        if not numpy.isfinite(y).all():
227            raise ValueError("Non-finite values present in Y")
228
229        # mask non-finite values in x.
230        x = numpy.ma.masked_invalid(x, copy=False)
231
232        if self.scale_resp and y.shape[1] == 1:
233            sy = y - numpy.ma.mean(y, axis=0)
234            sy = sy / numpy.ma.std(sy, axis=0)
235        else:
236            sy = y
237
238        # replace masked values with means.
239        if numpy.ma.is_masked(sy):
240            mean_sy = numpy.ma.mean(sy, axis=0)
241            sy = numpy.where(sy.mask, mean_sy, sy)
242
243        if numpy.ma.is_masked(x):
244            mean_x = numpy.ma.mean(x, axis=0)
245            x = numpy.where(x.mask, mean_x, x)
246
247        terms = self.terms
248        if terms is None:
249            # Automatic maximum number of terms
250            terms = min(200, max(20, 2 * x.shape[1])) + 1
251
252        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
253            degree=self.degree, terms=terms, penalty=self.penalty,
254            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
255            new_var_penalty=self.new_var_penalty)
256
257        # discard unused terms from bx, dirs, cuts
258        bx = bx[:, used]
259        dirs = dirs[used, :]
260        cuts = cuts[used, :]
261
262        # pruning
263        used, subsets, rss_per_subset, gcv_per_subset = \
264            pruning_pass(bx, y, self.penalty,
265                         pruned_terms=self.pruned_terms)
266
267        # Fit betas
268        bx_used = bx[:, used]
269        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)
270
271        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
272                               subsets, rss_per_subset, gcv_per_subset,
273                               instances=instances if self.store_instances else None,
274                               multitarget=multitarget,
275                               expanded_class=expanded_class
276                               )
277
278
279def soft_max(values):
280    values = numpy.asarray(values)
281    return numpy.exp(values) / numpy.sum(numpy.exp(values))
282
283
284class EarthClassifier(Orange.core.ClassifierFD):
285    """ Earth classifier.
286    """
287    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
288                 rss_per_subset=None, gcv_per_subset=None, instances=None,
289                 multitarget=False, expanded_class=None,
290                 original_domain=None, **kwargs):
291        self.multitarget = multitarget
292        self.domain = domain
293        self.class_var = domain.class_var
294        if self.multitarget:
295            self.class_vars = domain.class_vars
296
297        self.best_set = best_set
298        self.dirs = dirs
299        self.cuts = cuts
300        self.betas = betas
301        self.subsets = subsets
302        self.rss_per_subset = rss_per_subset
303        self.gcv_per_subset = gcv_per_subset
304        self.instances = instances
305        self.expanded_class = expanded_class
306        self.original_domain = original_domain
307        self.__dict__.update(kwargs)
308
309    def __call__(self, instance, result_type=Orange.core.GetValue):
310        if self.multitarget and self.domain.class_vars:
311            resp_vars = list(self.domain.class_vars)
312        elif is_discrete(self.class_var):
313            resp_vars = self.expanded_class
314        else:
315            resp_vars = [self.class_var]
316
317        vals = self.predict(instance)
318        vals = [var(val) for var, val in zip(resp_vars, vals)]
319
320        from Orange.statistics.distribution import Distribution
321
322        if not self.multitarget and is_discrete(self.class_var):
323            dist = Distribution(self.class_var)
324            if len(self.class_var.values) == 2:
325                probs = [1 - float(vals[0]), float(vals[0])]
326            else:
327                probs = soft_max(map(float, vals))
328
329            for val, p in zip(self.class_var.values, probs):
330                dist[val] = p
331            value = dist.modus()
332            vals, probs = [value], [dist]
333        else:
334            probs = []
335            for var, val in zip(resp_vars, vals):
336                dist = Distribution(var)
337                dist[val] = 1.0
338                probs.append(dist)
339
340        if not self.multitarget:
341            vals, probs = vals[0], probs[0]
342
343        if result_type == Orange.core.GetValue:
344            return vals
345        elif result_type == Orange.core.GetBoth:
346            return vals, probs
347        else:
348            return probs
349
350    def base_matrix(self, instances=None):
351        """Return the base matrix (bx) of the Earth model for the table.
352        If ``instances`` is not supplied, the base matrix of the training
353        instances is returned.
354        The base matrix is a len(instances) x num_terms matrix of the computed
355        values of the model's terms (not multiplied by beta) for each instance.
356
357        :param instances: Input instances for the base matrix.
358        :type instances: :class:`Orange.data.Table`
359
360        """
361        if instances is None:
362            instances = self.instances
363        instances = select_attrs(instances, self.domain.attributes)
364        (data,) = instances.to_numpy_MA("A")
365        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
366        return bx
367
368    def base_features(self):
369        """Return a list of features for the included Earth terms.
370        The attributes can be used in Orange's domain translation
371        (i.e. they define the proper ``get_value_from`` functions).
372
373        """
374        terms = []
375        dirs = self.dirs[self.best_set]
376        cuts = self.cuts[self.best_set]
377        # For faster domain translation all the features share
378        # this _instance_cache.
379        _instance_cache = {}
380        for dir, cut in zip(dirs[1:], cuts[1:]):  # Drop the intercept (first column)
381            hinge = [_format_knot(self, attr.name, dir1, cut1) \
382                     for (attr, dir1, cut1) in \
383                     zip(self.domain.attributes, dir, cut) \
384                     if dir1 != 0.0]
385            term_name = " * ".join(hinge)
386            term = Orange.feature.Continuous(term_name)
387            term.get_value_from = term_computer(
388                term, self.domain, dir, cut,
389                _instance_cache=_instance_cache
390            )
391
392            terms.append(term)
393        return terms
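    # Sketch of translating data into the space of model terms (``model`` is a
    # fitted EarthClassifier; the classless Domain constructor below is an
    # assumption of standard Orange 2.x behaviour):
    #
    #   term_domain = Orange.data.Domain(model.base_features(), False)
    #   term_table = Orange.data.Table(term_domain, model.instances)
    #
    # Each column of ``term_table`` then holds one hinge-product term value.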
394
395    def predict(self, instance):
396        """ Predict the response value(s).
397
398        :param instance: Data instance
399        :type instance: :class:`Orange.data.Instance`
400
401        """
402        data = Orange.data.Table(self.domain, [instance])
403        bx = self.base_matrix(data)
404        bx_used = bx[:, self.best_set]
405        vals = numpy.dot(bx_used, self.betas.T).ravel()
406        return vals
407
408    def used_attributes(self, term=None):
409        """Return the attributes used by the term at index ``term``. If no
410        term is given, return all attributes used in the model.
411
412        :param term: term index
413        :type term: int
414
415        """
416        if term is None:
417            return reduce(set.union, [self.used_attributes(i) \
418                                      for i in range(self.best_set.size)],
419                          set())
420
421        attrs = self.domain.attributes
422
423        used_mask = self.dirs[term, :] != 0.0
424        return [a for a, u in zip(attrs, used_mask) if u]
425
426    def evimp(self, used_only=True):
427        """ Return the estimated variable importances.
428
429        :param used_only: if ``True``, return only the used attributes.
430
431        """
432        return evimp(self, used_only)
433
434    def __reduce__(self):
435        return (type(self), (self.domain, self.best_set, self.dirs,
436                            self.cuts, self.betas),
437                dict(self.__dict__))
438
439    def to_string(self, percision=3, indent=3):
440        """ Return a string representation of the model.
441        """
442        return format_model(self, percision, indent)
443
444    def __str__(self):
445        return self.to_string()
446
447"""
448Utility functions
449-----------------
450"""
451
452
453def base_matrix(data, best_set, dirs, cuts):
454    """ Return the base matrix for the earth model.
455
456    :param data: Input data
457    :type data: :class:`numpy.ndarray`
458
459    :param best_set: An array of booleans indicating the used terms.
460    :type best_set: :class:`numpy.ndarray`
461
462    :param dirs: The Earth model's ``dirs`` member.
463    :type dirs: :class:`numpy.ndarray`
464
465    :param cuts: The Earth model's ``cuts`` member.
466    :type cuts: :class:`numpy.ndarray`
467
468    """
469    data = numpy.asarray(data)
470    best_set = numpy.asarray(best_set)
471    dirs = numpy.asarray(dirs)
472    cuts = numpy.asarray(cuts)
473
474    bx = numpy.zeros((data.shape[0], best_set.shape[0]))
475    bx[:, 0] = 1.0  # The intercept
476    for termi in range(1, best_set.shape[0]):
477        term_dirs = dirs[termi]
478        term_cuts = cuts[termi]
479
480        dir_p1 = numpy.where(term_dirs == 1)[0]
481        dir_m1 = numpy.where(term_dirs == -1)[0]
482        dir_2 = numpy.where(term_dirs == 2)[0]
483
484        x1 = data[:, dir_p1] - term_cuts[dir_p1]
485        x2 = term_cuts[dir_m1] - data[:, dir_m1]
486        x3 = data[:, dir_2]
487
488        x1 = numpy.where(x1 > 0.0, x1, 0.0)
489        x2 = numpy.where(x2 > 0.0, x2, 0.0)
490
491        X = numpy.hstack((x1, x2, x3))
492        X = numpy.cumprod(X, axis=1)
493        bx[:, termi] = X[:, -1] if X.size else 0.0
494
495    return bx
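# Sketch of how a single term's column is built (hypothetical dirs/cuts row):
# with dirs[i] = [1, -1, 0] and cuts[i] = [6.4, 0.6, 0.0], the column for term
# i is max(0, x0 - 6.4) * max(0, 0.6 - x1). Attributes with a direction of 0
# are skipped; a direction of 2 enters the product linearly (no hinge).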
496
497
498def gcv(rss, n, n_effective_params):
499    """ Return the generalized cross validation (GCV) error.
500
501    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)
502
503    :param rss: Residual sum of squares.
504    :param n: Number of training instances.
505    :param n_effective_params: Number of effective parameters.
506
507    """
508    return  rss / (n * (1. - n_effective_params / n) ** 2)
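# Worked sketch of the GCV formula above (hypothetical numbers; floats avoid
# Python 2 integer division):
#
#   gcv(250.0, 100.0, 7.0) == 250.0 / (100.0 * (1 - 7.0 / 100.0) ** 2)
#
# which is roughly 2.89.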
509
510
511class term_computer(Orange.core.ClassifierFD):
512    """A utility class for computing basis terms. Can be used as
513    a :obj:`~Orange.feature.Descriptor.get_value_from` member.
514
515    """
516    def __init__(self, term_var=None, domain=None, dir=None, cut=None,
517                 _instance_cache=None):
518        self.class_var = term_var
519        self.domain = domain
520
521        self.dir = dir
522        self.cut = cut
523
524        if dir is not None:
525            self.mask = self.dir != 0
526            self.masked_dir = self.dir[self.mask]
527            self.masked_cut = self.cut[self.mask]
528        else:
529            # backcompat. with old pickled format.
530            self.mask = self.masked_dir = self.masked_cut = None
531
532        self._instance_cache = _instance_cache
533
534    def __call__(self, instance, return_what=Orange.core.GetValue):
535        instance = self._instance_as_masked_array(instance)
536
537        if self.mask is None:
538            self.mask = self.dir != 0
539            self.masked_dir = self.dir[self.mask]
540            self.masked_cut = self.cut[self.mask]
541
542        values = instance[self.mask]
543        if numpy.ma.is_masked(values):
544            # Can't compute the term.
545            return self.class_var("?")
546
547        # Works faster with plain arrays
548        values = numpy.array(values)
549        values -= self.masked_cut
550        values *= self.masked_dir
551
552        values[values < 0] = 0
553        value = numpy.prod(values)
554
555        return self.class_var(value)
556
557    def _instance_as_masked_array(self, instance):
558        array = None
559        if self._instance_cache is not None:
560            array = self._instance_cache.get(instance, None)
561
562        if array is None:
563            table = Orange.data.Table(self.domain, [instance])
564            (array,) = table.to_numpy_MA("A")
565            array = array[0]
566
567            if self._instance_cache is not None:
568                self._instance_cache.clear()
569                self._instance_cache[instance] = array
570        return array
571
572    def __reduce__(self):
573        return (type(self), (self.class_var, self.domain, self.dir, self.cut),
574                dict(self.__dict__.items()))
575
576
577"""
578Multi-label utility functions
579"""
580
581
582"""
583ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
584"""
585
586import ctypes
587from numpy import ctypeslib
588import orange
589
590_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
591_c_forward_pass_ = _c_orange_lib.EarthForwardPass
592
593_c_forward_pass_.argtypes = \
594    [ctypes.POINTER(ctypes.c_int),  # pnTerms:
595     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  # FullSet
596     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # bx
597     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    # Dirs
598     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # Cuts
599     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # nFactorsInTerms
600     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # nUses
601     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # x
602     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), # y
603     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
604     ctypes.c_int,  # nCases
605     ctypes.c_int,  # nResp
606     ctypes.c_int,  # nPred
607     ctypes.c_int,  # nMaxDegree
608     ctypes.c_int,  # nMaxTerms
609     ctypes.c_double,   # Penalty
610     ctypes.c_double,   # Thresh
611     ctypes.c_int,  # nFastK
612     ctypes.c_double,   # FastBeta
613     ctypes.c_double,   # NewVarPenalty
614     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  # LinPreds
615     ctypes.c_bool, # UseBetaCache
616     ctypes.c_char_p    # sPredNames
617     ]
618
619
620def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
621                  fast_k=21, fast_beta=1, new_var_penalty=2):
622    """ Do earth forward pass.
623    """
624    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
625    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
626    if x.shape[0] != y.shape[0]:
627        raise ValueError("First dimensions of x and y must be the same.")
628    if y.ndim == 1:
629        y = y.reshape((-1, 1), order="F")
630    if penalty is None:
631        penalty = 2
632    n_cases = x.shape[0]
633    n_preds = x.shape[1]
634
635    n_resp = y.shape[1] if y.ndim == 2 else y.shape[0]
636
637    # Output variables
638    n_term = ctypes.c_int()
639    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
640    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
641    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
642    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
643    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
644    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
645    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
646    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
647    use_beta_cache = True
648
649    # These tests are performed in ForwardPass, and if they fail the function
650    # calls exit. So we must check here and raise an exception to avoid a
651    # process shutdown.
652    if n_cases < 8:
653        raise ValueError("Need at least 8 data instances.")
654    if n_cases > 1e8:
655        raise ValueError("Too many data instances.")
656    if n_resp < 1:
657        raise ValueError("No response column.")
658    if n_resp > 1e6:
659        raise ValueError("Too many response columns.")
660    if n_preds < 1:
661        raise ValueError("No predictor columns.")
662    if n_preds > 1e5:
663        raise ValueError("Too many predictor columns.")
664    if degree <= 0 or degree > 100:
665        raise ValueError("Invalid 'degree'.")
666    if terms < 3 or terms > 10000:
667        raise ValueError("'terms' must be >= 3 and <= 10000.")
668    if penalty < 0 and penalty != -1:
669        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
670    if penalty > 1000:
671        raise ValueError("Invalid 'penalty' (must be <= 1000).")
672    if thresh < 0.0 or thresh >= 1.0:
673        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
674    if fast_beta < 0 or fast_beta > 1000:
675        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
676    if new_var_penalty < 0 or new_var_penalty > 10:
677        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
678    if (numpy.var(y, axis=0) <= 1e-8).any():
679        raise ValueError("Variance of y is zero (or near zero).")
680
681    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
682                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
683                     n_resp, n_preds, degree, terms, penalty, thresh,
684                     fast_k, fast_beta, new_var_penalty, lin_preds,
685                     use_beta_cache, None)
686    return n_term.value, full_set, bx, dirs, cuts
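# Minimal sketch of calling the forward pass directly on numpy arrays
# (hypothetical random data; the checks above require at least 8 rows and a
# non-constant response):
#
#   x = numpy.random.uniform(size=(30, 3))
#   y = x[:, :1] + 0.1 * numpy.random.normal(size=(30, 1))
#   n_terms, used, bx, dirs, cuts = forward_pass(x, y, degree=1, terms=11)
#
# ``used`` flags the selected terms; bx[:, used] is the basis matrix that is
# later handed to the pruning pass.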
687
688
689_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx
690
691_c_eval_subsets_xtx.argtypes = \
692    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   # PruneTerms
693     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   # RssVec
694     ctypes.c_int,  # nCases
695     ctypes.c_int,  # nResp
696     ctypes.c_int,  # nMaxTerms
697     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),  # bx
698     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),  # y
699     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  # WeightsArg
700     ]
701
702_c_eval_subsets_xtx.restype = ctypes.c_int
703
704
705def subset_selection_xtx(X, Y):
706    """ Subset selection using EvalSubsetsUsingXtx from the Earth package.
707    """
708    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
709    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
710    if Y.ndim == 1:
711        Y = Y.reshape((-1, 1), order="F")
712
713    if X.shape[0] != Y.shape[0]:
714        raise ValueError("First dimensions of bx and y must be the same")
715
716    var_count = X.shape[1]
717    resp_count = Y.shape[1]
718    cases = X.shape[0]
719    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
720                              order="F")
721    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
722    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")
723
724    rval = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count, var_count,
725                        X, Y, weights)
726    if rval == 1:
727        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")
728    elif rval == 2:
729        raise Exception("Trying to prune the intercept.")
730    elif rval != 0:
731        raise Exception("Error %i" % rval)
732
733    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
734    for i, used in enumerate(subsets.T):
735        subsets_ind[i, :i + 1] = numpy.where(used)[0]
736
737    return subsets_ind, rss_vec
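# Sketch of the expected shapes (``bx`` and ``y`` as prepared in EarthLearner):
#
#   subsets, rss_per_size = subset_selection_xtx(bx, y)
#
# Row i of ``subsets`` lists the i + 1 term indices of the best subset of that
# size and ``rss_per_size[i]`` holds its residual sum of squares.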
738
739
740def subset_selection_xtx_numpy(X, Y):
741    """ A numpy implementation of EvalSubsetsUsingXtx from the Earth package,
742    using numpy.linalg.lstsq.
743
744    """
745    X = numpy.asarray(X)
746    Y = numpy.asarray(Y)
747
748    var_count = X.shape[1]
749    rss_vec = numpy.zeros(var_count)
750    working_set = range(var_count)
751    subsets = numpy.zeros((var_count, var_count), dtype=int)
752
753    for subset_size in reversed(range(var_count)):
754        subsets[subset_size, :subset_size + 1] = working_set
755        X_work = X[:, working_set]
756        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
757        if res.size > 0:
758            rss_vec[subset_size] = numpy.sum(res)
759        else:
760            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)
761
762        XtX = numpy.dot(X_work.T, X_work)
763        iXtX = numpy.linalg.pinv(XtX)
764        diag = numpy.diag(iXtX).reshape((-1, 1))
765
766        if subset_size == 0:
767            break
768
769        delta_rss = b ** 2 / diag
770        delta_rss = numpy.sum(delta_rss, axis=1)
771        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
772        del working_set[delete_i]
773    return subsets, rss_vec
774
775
776def subset_selection_xtx2(X, Y):
777    """ Another subset selection implementation (this one uses QR decomposition).
778    """
779    from Orange.misc import linalg
780    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
781    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
782    col_count = X.shape[1]
783    working_set = range(col_count)
784    subsets = numpy.zeros((col_count, col_count), dtype=int)
785    rss_vec = numpy.zeros((col_count,))
786    QR, k, _, jpvt = linalg.qr_decomp(X)
787
788    if k < col_count:
789        # remove jpvt[k:] from the work set. Will have zero
790        # entries in the subsets matrix, and inf rss
791        for i in sorted(jpvt[k:], reverse=True):
792            del working_set[i]
793            rss_vec[len(working_set)] = float("inf")
794        col_count = len(working_set)
795
796    for subset_size in reversed(range(col_count)):
797        subsets[subset_size, :subset_size + 1] = working_set
798        X_work = X[:, working_set]
799        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
800        rss_vec[subset_size] = numpy.sum(rsd ** 2)
801        XtX = numpy.dot(X_work.T, X_work)
802        iXtX = numpy.linalg.pinv(XtX)
803        diag = numpy.diag(iXtX)
804
805        if subset_size == 0:
806            break
807
808        delta_rss = b ** 2 / diag
809        delete_i = numpy.argmin(delta_rss[1:]) + 1  # Keep the intercept
810        del working_set[delete_i]
811    return subsets, rss_vec
812
813
814def pruning_pass(bx, y, penalty, pruned_terms=-1):
815    """ Run the pruning pass.
816
817    .. todo:: pruned_terms, Leaps
818
819    """
820    try:
821        subsets, rss_vec = subset_selection_xtx(bx, y)
822    except numpy.linalg.LinAlgError:
823        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)
824
825    cases, terms = bx.shape
826    n_effective_params = numpy.arange(terms) + 1.0
827    n_effective_params += penalty * (n_effective_params - 1.0) / 2.0
828
829    gcv_vec = gcv(rss_vec, cases, n_effective_params)
830
831    min_i = numpy.argmin(gcv_vec)
832    used = numpy.zeros((terms), dtype=bool)
833
834    used[subsets[min_i, :min_i + 1]] = True
835
836    return used, subsets, rss_vec, gcv_vec
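# Sketch of the effective-parameter count used above (hypothetical numbers):
# for a subset of 5 terms with penalty=2.0 the effective number of parameters
# is 5 + 2.0 * (5 - 1) / 2 = 9.0, and gcv(rss_of_that_subset, cases, 9.0) is
# the value compared across subset sizes to pick the final model.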
837
838"""
839Printing functions.
840"""
841
842
843def format_model(model, percision=3, indent=3):
844    """ Return a formatted string representation of the Earth model.
845    """
846    if model.multitarget:
847        r_vars = list(model.domain.class_vars)
848    elif is_discrete(model.class_var):
849        r_vars = model.expanded_class
850    else:
851        r_vars = [model.class_var]
852
853    r_names = [v.name for v in r_vars]
854    betas = model.betas
855
856    resp = []
857    for name, betas in zip(r_names, betas):
858        resp.append(_format_response(model, name, betas,
859                                     percision, indent))
860    return "\n\n".join(resp)
861
862
863def _format_response(model, resp_name, betas, percision=3, indent=3):
864    header = "%s =" % resp_name
865    indent = " " * indent
866    fmt = "%." + str(percision) + "f"
867    terms = [([], fmt % betas[0])]
868    beta_i = 0
869    for i, used in enumerate(model.best_set[1:], 1):
870        if used:
871            beta_i += 1
872            beta = fmt % abs(betas[beta_i])
873            knots = [_format_knot(model, attr.name, d, c, percision) \
874                     for d, c, attr in \
875                     zip(model.dirs[i], model.cuts[i], model.domain.attributes) \
876                     if d != 0]
877            term_attrs = [a for a, d in zip(model.domain.attributes, model.dirs[i]) \
878                          if d != 0]
879            term_attrs = sorted(term_attrs)
880            sign = "-" if betas[beta_i] < 0 else "+"
881            if knots:
882                terms.append((term_attrs,
883                              sign + " * ".join([beta] + knots)))
884            else:
885                terms.append((term_attrs, sign + beta))
886    # Sort by len(term_attrs), then by term_attrs
887    terms = sorted(terms, key=lambda t: (len(t[0]), t[0]))
888    return "\n".join([header] + [indent + t for _, t in terms])
889
890
891def _format_knot(model, name, dir, cut, percision=3):
892    fmt = "%%.%if" % percision
893    if dir == 1:
894        txt = ("max(0, %s - " + fmt + ")") % (name, cut)
895    elif dir == -1:
896        txt = ("max(0, " + fmt + " - %s)") % (cut, name)
897    elif dir == 2:
898        txt = name
899    return txt
900
901
902"""\
903Variable importance estimation
904------------------------------
905"""
906
907from collections import defaultdict
908
909
910def collect_source(vars):
911    """ Given a list of variables ``vars``, return a mapping from source
912    variables (``source_variable`` or ``get_value_from.variable`` members)
913    back to the variables in ``vars`` (assumes the default preprocessor in
914    EarthLearner).
915
916    """
917    source = defaultdict(list)
918    for var in vars:
919        svar = None
920        if var.source_variable:
921            source[var.source_variable].append(var)
922        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
923            source[var.get_value_from.variable].append(var)
924        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
925            source[var.get_value_from.classifier_from_var.variable].append(var)
926        else:
927            source[var].append(var)
928    return dict(source)
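# Sketch: for indicator features produced by expand_discrete (for example
# "color=red" and "color=green" derived from a hypothetical discrete ``color``
# variable), collect_source maps the original ``color`` variable back to the
# list of its indicator features.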
929
930
931def map_to_source_var(var, sources):
932    """ Map ``var`` back to its source variable in ``sources``, or return
933    ``None`` if no source variable can be found. """
934    if var in sources:
935        return var
936    elif var.source_variable in sources:
937        return var.source_variable
938    elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
939        return map_to_source_var(var.get_value_from.variable, sources)
940    elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
941        var = var.get_value_from.classifier_from_var.variable
942        return map_to_source_var(var, sources)
943    else:
944        return None
945
946
947def evimp(model, used_only=True):
948    """ Return the estimated variable importance for the model.
949
950    :param model: Earth model.
951    :type model: `EarthClassifier`
952
953    """
954    if model.subsets is None:
955        raise ValueError("No subsets. Use the learner with 'prune=True'.")
956
957    subsets = model.subsets
958    n_subsets = numpy.sum(model.best_set)
959
960    rss = -numpy.diff(model.rss_per_subset)
961    gcv = -numpy.diff(model.gcv_per_subset)
962    attributes = list(model.domain.variables)
963
964    attr2ind = dict(zip(attributes, range(len(attributes))))
965    importances = numpy.zeros((len(attributes), 4))
966    importances[:, 0] = range(len(attributes))
967
968    for i in range(1, n_subsets):
969        term_subset = subsets[i, :i + 1]
970        used_attributes = reduce(set.union, [model.used_attributes(term) \
971                                             for term in term_subset], set())
972        for attr in used_attributes:
973            importances[attr2ind[attr]][1] += 1.0
974            importances[attr2ind[attr]][2] += gcv[i - 1]
975            importances[attr2ind[attr]][3] += rss[i - 1]
976    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
977    imp_max = numpy.max(importances[:, [2, 3]], axis=0)
978
979    # Normalize importances.
980    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
981                            - [imp_min]) / ([imp_max - imp_min])
982
983    importances = list(importances)
984    # Sort by n_subsets and gcv.
985    importances = sorted(importances, key=lambda row: (row[1], row[2]),
986                         reverse=True)
987    importances = numpy.array(importances)
988
989    if used_only:
990        importances = importances[importances[:, 1] > 0.0]
991
992    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
993    return res
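# Usage sketch (housing data, as in the module-level example):
#
#   data = Orange.data.Table("housing")
#   c = EarthLearner(data, degree=2, terms=10)
#   for attr, (nsubsets, gcv_imp, rss_imp) in c.evimp():
#       print attr.name, nsubsets, gcv_imp, rss_imp
#
# Each row reports in how many pruning subsets the attribute appears and its
# normalized GCV and RSS importances (0-100).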
994
995
996def plot_evimp(evimp):
997    """ Plot the variable importances as returned from
998    :obj:`EarthClassifier.evimp` call.
999
1000    ::
1001
1002        import Orange
1003        data = Orange.data.Table("housing")
1004        c = Orange.regression.earth.EarthLearner(data, degree=3)
1005        Orange.regression.earth.plot_evimp(c.evimp())
1006
1007    .. image:: files/earth-evimp.png
1008
1009    The left axis is the nsubsets measure and on the right are the normalized
1010    RSS and GCV.
1011
1012    """
1013    from Orange.ensemble.bagging import BaggedClassifier
1014    if isinstance(evimp, EarthClassifier):
1015        evimp = evimp.evimp()
1016    elif isinstance(evimp, BaggedClassifier):
1017        evimp = bagged_evimp(evimp)
1018
1019    import pylab
1020    fig = pylab.figure()
1021    axes1 = fig.add_subplot(111)
1022    attrs = [a for a, _ in evimp]
1023    imp = [s for _, s in evimp]
1024    imp = numpy.array(imp)
1025    X = range(len(attrs))
1026    l1 = axes1.plot(X, imp[:, 0], "b-",)
1027    axes2 = axes1.twinx()
1028
1029    l2 = axes2.plot(X, imp[:, 1], "g-",)
1030    l3 = axes2.plot(X, imp[:, 2], "r-",)
1031
1032    x_axis = axes1.xaxis
1033    x_axis.set_ticks(X)
1034    x_axis.set_ticklabels([a.name for a in attrs], rotation=90)
1035
1036    axes1.yaxis.set_label_text("nsubsets")
1037    axes2.yaxis.set_label_text("normalized gcv or rss")
1038
1039    axes1.legend([l1, l2, l3], ["nsubsets", "gcv", "rss"])
1040    axes1.set_title("Variable importance")
1041    fig.show()
1042
1043
1044def bagged_evimp(classifier, used_only=True):
1045    """ Extract combined (average) evimp from an instance of BaggedClassifier.
1046
1047    Example::
1048
1049        from Orange.ensemble.bagging import BaggedLearner
1050        bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
1051        bagged_evimp(bc)
1052
1053    """
1054    def assert_type(object, class_):
1055        if not isinstance(object, class_):
1056            raise TypeError("Instance of %r expected." % (class_))
1057
1058    from Orange.ensemble.bagging import BaggedClassifier
1059
1060    assert_type(classifier, BaggedClassifier)
1061    bagged_imp = defaultdict(list)
1062    attrs_by_name = defaultdict(list)
1063    for c in classifier.classifiers:
1064        assert_type(c, EarthClassifier)
1065        imp = evimp(c, used_only=used_only)
1066        for attr, score in imp:
1067            bagged_imp[attr.name].append(score)  # map by name
1068            attrs_by_name[attr.name].append(attr)
1069
1070    for attr, scores in bagged_imp.items():
1071        scores = numpy.average(scores, axis=0)
1072        bagged_imp[attr] = tuple(scores)
1073
1074    bagged_imp = sorted(bagged_imp.items(),
1075                        key=lambda t: (t[1][0], t[1][1]),
1076                        reverse=True)
1077
1078    bagged_imp = [(attrs_by_name[name][0], scores) for name, scores in bagged_imp]
1079
1080    if used_only:
1081        bagged_imp = [(a, r) for a, r in bagged_imp if r[0] > 0]
1082    return bagged_imp
1083
1084"""
1085High level interface for measuring variable importance
1086(compatible with Orange.feature.scoring module).
1087
1088"""
1089from Orange.feature import scoring
1090
1091
1092class ScoreEarthImportance(scoring.Score):
1093    """ A subclass of :class:`Orange.feature.scoring.Score` that
1094    scores features based on their importance in the Earth
1095    model using ``bagged_evimp``.
1096
1097    """
1098    # Return types
1099    NSUBSETS = 0
1100    RSS = 1
1101    GCV = 2
1102
1103    handles_discrete = True
1104    handles_continuous = True
1105    computes_thresholds = False
1106    needs = scoring.Score.Generator
1107
1108    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
1109        self = scoring.Score.__new__(cls)
1110        if attr is not None and data is not None:
1111            self.__init__(**kwargs)
1112            # TODO: Should raise a warning, about caching
1113            return self.__call__(attr, data, weight_id)
1114        elif not attr and not data:
1115            return self
1116        else:
1117            raise ValueError("Both 'attr' and 'data' arguments expected.")
1118
1119    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets",
1120                 cached=True):
1121        """
1122        :param t: Number of earth models to train on the data
1123            (using BaggedLearner).
1124
1125        :param score_what: What to return as a score.
1126            Can be one of: "nsubsets", "rss", "gcv" or class constants
1127            NSUBSETS, RSS, GCV.
1128
1129        """
1130        self.t = t
1131        self.degree = degree
1132        self.terms = terms
1133        if isinstance(score_what, basestring):
1134            score_what = {"nsubsets": self.NSUBSETS, "rss": self.RSS,
1135                          "gcv": self.GCV}.get(score_what, None)
1136
1137        if score_what not in range(3):
1138            raise ValueError("Invalid 'score_what' parameter.")
1139
1140        self.score_what = score_what
1141        self.cached = cached
1142        self._cache_ref = None
1143        self._cached_evimp = None
1144
1145    def __call__(self, attr, data, weight_id=None):
1146        ref = self._cache_ref
1147        if ref is not None and ref is data:
1148            evimp = self._cached_evimp
1149        else:
1150            from Orange.ensemble.bagging import BaggedLearner
1151            bc = BaggedLearner(EarthLearner(degree=self.degree,
1152                            terms=self.terms), t=self.t)(data, weight_id)
1153            evimp = bagged_evimp(bc, used_only=False)
1154            self._cache_ref = data
1155            self._cached_evimp = evimp
1156
1157        evimp = dict(evimp)
1158        score = evimp.get(attr, None)
1159
1160        if score is None:
1161            source = collect_source(evimp.keys())
1162            if attr in source:
1163                # Return average of source var scores
1164                return numpy.average([evimp[v][self.score_what] \
1165                                      for v in source[attr]])
1166            else:
1167                raise ValueError("Attribute %r not in the domain." % attr)
1168        else:
1169            return score[self.score_what]
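    # Usage sketch (scoring every attribute of a hypothetical table ``data``;
    # the bagged ensemble is trained once and reused through the cache above):
    #
    #   score = ScoreEarthImportance(t=5, degree=2, terms=10, score_what="rss")
    #   scored = [(score(attr, data), attr.name) for attr in data.domain.attributes]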
1170
1171
1172class ScoreRSS(scoring.Score):
1173
1174    handles_discrete = False
1175    handles_continuous = True
1176    computes_thresholds = False
1177
1178    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
1179        self = scoring.Score.__new__(cls)
1180        if attr is not None and data is not None:
1181            self.__init__(**kwargs)
1182            # TODO: Should raise a warning, about caching
1183            return self.__call__(attr, data, weight_id)
1184        elif not attr and not data:
1185            return self
1186        else:
1187            raise ValueError("Both 'attr' and 'data' arguments expected.")
1188
1189    def __init__(self):
1190        self._cache_data = None
1191        self._cache_rss = None
1192
1193    def __call__(self, attr, data, weight_id=None):
1194        ref = self._cache_data
1195        if ref is not None and ref is data:
1196            rss = self._cache_rss
1197        else:
1198            x, y = data.to_numpy_MA("1A/c")
1199            try:
1200                subsets, rss = subset_selection_xtx2(x, y)
1201            except numpy.linalg.LinAlgError:
1202                subsets, rss = subset_selection_xtx_numpy(x, y)
1203            rss_diff = -numpy.diff(rss)
1204            rss = numpy.zeros_like(rss)
1205            for s_size in range(1, subsets.shape[0]):
1206                subset = subsets[s_size, :s_size + 1]
1207                rss[subset] += rss_diff[s_size - 1]
1208            rss = rss[1:]  # Drop the intercept
1209            self._cache_data = data
1210            self._cache_rss = rss
1211
1212        index = list(data.domain.attributes).index(attr)
1213        return rss[index]