source: orange/Orange/regression/earth.py @ 9671:a7b056375472

Revision 9671:a7b056375472, 47.9 KB checked in by anze <anze.staric@…>, 2 years ago (diff)

Moved orange to Orange (part 2)

Line 
1"""\
2====================================================
3Multivariate Adaptive Regression Splines (``earth``)
4====================================================
5
6.. index:: regression, linear model
7
8`Multivariate adaptive regression splines (MARS)`_ is a non-parametric
9regression method that extends a linear model with non-linear
10interactions.
11
12This module borrows the implementation of the technique from the `Earth R
13package`_ by Stephen Milborrow.
14
15.. _`Multivariate adaptive regression splines (MARS)`: http://en.wikipedia.org/wiki/Multivariate_adaptive_regression_splines
16.. _`Earth R package`: http://cran.r-project.org/web/packages/earth/index.html
17
18Example ::
19
20    >>> from Orange.regression import earth
21    >>> data = Orange.data.Table("housing")
22    >>> c = earth.EarthLearner(data, degree=2)
23    >>> print c
24    MEDV =
25       23.587
26       +11.896 * max(0, RM - 6.431)
27       +1.142 * max(0, 6.431 - RM)
28       -0.612 * max(0, LSTAT - 6.120)
29       -228.795 * max(0, NOX - 0.647) * max(0, RM - 6.431)
30       +0.023 * max(0, TAX - 307.000) * max(0, 6.120 - LSTAT)
31       +0.029 * max(0, 307.000 - TAX) * max(0, 6.120 - LSTAT)
32       
33    >>>
34
35
36.. autoclass:: EarthLearner
37    :members:
38
39.. autoclass:: EarthClassifier
40    :members:
41
42
43Utility functions
44-----------------
45
46.. autofunction:: gcv
47
48.. autofunction:: plot_evimp
49
50.. autofunction:: bagged_evimp
51
52.. autoclass:: ScoreEarthImportance
53
54"""
55
56import Orange
57from Orange.data.variable import Discrete, Continuous
58from Orange.data import Table, Domain
59from Orange.preprocess import Preprocessor_continuize, \
60                              Preprocessor_impute, \
61                              Preprocessor_preprocessorList, \
62                              DomainContinuizer
63
64import numpy
65
def is_discrete(var):
    """ Return True if ``var`` is a :class:`Discrete` variable.
    """
    return isinstance(var, Discrete)
68
def is_continuous(var):
    """ Return True if ``var`` is a :class:`Continuous` variable.
    """
    return isinstance(var, Continuous)
71
def expand_discrete(var):
    """ Expand the discrete variable ``var``, returning one continuous
    indicator variable per value of ``var``.  When ``var`` has two or
    fewer values only a single indicator variable is returned.

    """
    n_values = len(var.values)
    if n_values > 2:
        values = var.values
    elif n_values == 2:
        values = var.values[-1:]
    else:
        values = var.values[:1]

    indicators = []
    for value in values:
        indicator = Continuous("{0}={1}".format(var.name, value))
        # Computed from `var` through a 0/1 transformer for this value.
        transform = Orange.core.ClassifierFromVar(whichVar=var)
        transform.transformer = Orange.core.Discrete2Continuous()
        transform.transformer.value = int(Orange.core.Value(var, value))
        indicator.get_value_from = transform
        indicator.source_variable = var
        indicators.append(indicator)
    return indicators
93   
class EarthLearner(Orange.core.LearnerFD):
    """ Earth learner class. Supports both regression and classification
    problems. In case of classification the class values are expanded into
    continuous indicator columns (one for each value if the number of
    values is greater than 2), and a multi response model is learned on
    these new columns. The resulting classifier will then use the computed
    response values on new instances to select the final predicted class.

    """
    def __new__(cls, instances=None, weight_id=None, **kwargs):
        self = Orange.core.LearnerFD.__new__(cls)
        if instances is not None:
            # Convenience shortcut: EarthLearner(data, ...) trains
            # immediately and returns the fitted classifier.
            self.__init__(**kwargs)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __init__(self, degree=1, terms=21, penalty=None, thresh=1e-3,
                 min_span=0, new_var_penalty=0, fast_k=20, fast_beta=1,
                 pruned_terms=None, scale_resp=True, store_instances=True,
                 multi_label=False, **kwds):
        """ Initialize the learner instance.

        :param degree: Maximum degree (num. of hinge functions per term)
            of the terms in the model.
        :type degree: int
        :param terms: Maximum number of terms in the forward pass (default 21).

            .. note:: If this parameter is None then
                ``min(200, max(20, 2 * n_attributes)) + 1`` will be used. This
                is the same as the default setting in earth R package.

        :type terms: int
        :param penalty: Penalty for hinges in the GCV computation (used
            in the pruning pass). By default it is 3.0 if the degree > 1,
            2.0 otherwise.
        :type penalty: float
        :param thresh: Threshold for RSS decrease in the forward pass
            (default 0.001).
        :type thresh: float
        :param min_span: TODO.
        :param new_var_penalty: Penalty for introducing a new variable
            in the model during the forward pass (default 0).
        :type new_var_penalty: float
        :param fast_k: Fast k.
        :param fast_beta: Fast beta.
        :param pruned_terms: Maximum number of terms in the model after
            pruning (default None - no limit).
        :type pruned_terms: int
        :param scale_resp: Scale responses prior to forward pass (default
            True - ignored for multi response models).
        :type scale_resp: bool
        :param store_instances: Store training instances in the model
            (default True).
        :type store_instances: bool
        :param multi_label: If True build a multi label model (default False).
        :type multi_label: bool

        .. todo:: min_span, prunning_method (need Leaps like functionality,
            currently only eval_subsets_using_xtx is implemented).

        """
        self.degree = degree
        self.terms = terms
        if penalty is None:
            penalty = 3 if degree > 1 else 2
        self.penalty = penalty
        self.thresh = thresh
        self.min_span = min_span
        self.new_var_penalty = new_var_penalty
        self.fast_k = fast_k
        self.fast_beta = fast_beta
        self.pruned_terms = pruned_terms
        self.scale_resp = scale_resp
        self.store_instances = store_instances
        self.multi_label = multi_label
        self.__dict__.update(kwds)

        # Default preprocessing pipeline: impute missing values, then
        # continuize discrete attributes (treated as ordinal).
        impute = Preprocessor_impute()
        cont = Preprocessor_continuize(multinomialTreatment=
                                       DomainContinuizer.AsOrdinal)
        self.preproc = Preprocessor_preprocessorList(preprocessors=\
                                                     [impute, cont])

    def __call__(self, instances, weight_id=None):
        """ Fit the model on ``instances`` and return an
        :class:`EarthClassifier`.  ``weight_id`` is accepted for API
        compatibility but instance weights are not used.
        """
        original_domain = instances.domain # Needed by classifier and evimp
        instances = self.preproc(instances)
        expanded_class = None
        if self.multi_label:
            label_mask = data_label_mask(instances.domain)
            data = instances.to_numpy_MA("Ac")[0]
            y = data[:, label_mask]
            x = data[:, ~ label_mask]
        else:
            # Expand a discrete class with indicator columns
            if is_discrete(instances.domain.class_var):
                expanded_class = expand_discrete(instances.domain.class_var)
                y = Table(Domain(expanded_class, None), instances)
                y = y.to_numpy_MA("A")[0]
                x = instances.to_numpy_MA("A")[0]
                label_mask = [False] * x.shape[1] + [True] * y.shape[1]
                label_mask = numpy.array(label_mask)
            elif is_continuous(instances.domain.class_var):
                label_mask = numpy.zeros(len(instances.domain.variables),
                                         dtype=bool)
                label_mask[-1] = True
                x, y, _ = instances.to_numpy_MA()
                y = y.reshape((-1, 1))
            else:
                raise ValueError("Cannot handle the response.")

        if self.scale_resp and y.shape[1] == 1:
            sy = y - numpy.mean(y, axis=0)
            # BUG FIX: standardize over instances (axis=0).  The previous
            # axis=1 took the std of each single-element row (always 0),
            # broadcasting sy into an (n, n) array of inf/nan values.
            sy = sy / numpy.std(sy, axis=0)
        else:
            sy = y

        terms = self.terms
        if terms is None:
            # Automatic maximum number of terms (same as earth R default).
            terms = min(200, max(20, 2 * x.shape[1])) + 1

        n_terms, used, bx, dirs, cuts = forward_pass(x, sy,
            degree=self.degree, terms=terms, penalty=self.penalty,
            thresh=self.thresh, fast_k=self.fast_k, fast_beta=self.fast_beta,
            new_var_penalty=self.new_var_penalty)

        # discard unused terms from bx, dirs, cuts
        bx = bx[:, used]
        dirs = dirs[used, :]
        cuts = cuts[used, :]

        # pruning
        used, subsets, rss_per_subset, gcv_per_subset = \
            pruning_pass(bx, y, self.penalty,
                         pruned_terms=self.pruned_terms)

        # Fit betas (least squares on the pruned basis matrix).
        bx_used = bx[:, used]
        betas, res, rank, s = numpy.linalg.lstsq(bx_used, y)

        return EarthClassifier(instances.domain, used, dirs, cuts, betas.T,
                               subsets, rss_per_subset, gcv_per_subset,
                               instances=instances if self.store_instances else None,
                               label_mask=label_mask, multi_flag=self.multi_label,
                               expanded_class=expanded_class,
                               # BUG FIX: the pre-preprocessing domain was
                               # computed above but never forwarded.
                               original_domain=original_domain)
240   
241   
def soft_max(values):
    """ Return the softmax of ``values``: exponentials of the entries,
    normalized to sum to one.
    """
    exps = numpy.exp(numpy.asarray(values))
    return exps / exps.sum()
245
246
class EarthClassifier(Orange.core.ClassifierFD):
    """ Earth classifier.

    Holds the fitted MARS model: the pruned term set (``best_set``), the
    hinge directions and knots (``dirs``, ``cuts``), the fitted
    coefficients (``betas``, one row per response), and optionally the
    pruning-pass bookkeeping (``subsets``, ``rss_per_subset``,
    ``gcv_per_subset``) and the training instances.
    """
    def __init__(self, domain, best_set, dirs, cuts, betas, subsets=None,
                 rss_per_subset=None, gcv_per_subset=None, instances=None,
                 label_mask=None, multi_flag=False, expanded_class=None,
                 original_domain=None, **kwargs):
        self.multi_flag = multi_flag
        self.domain = domain
        self.class_var = domain.class_var
        self.best_set = best_set    # boolean mask of terms kept after pruning
        self.dirs = dirs            # per-term hinge directions (0, 1, -1, 2)
        self.cuts = cuts            # per-term hinge knot values
        self.betas = betas          # fitted coefficients, one row per response
        self.subsets = subsets
        self.rss_per_subset = rss_per_subset
        self.gcv_per_subset = gcv_per_subset
        self.instances = instances
        self.label_mask = label_mask
        self.expanded_class = expanded_class
        self.original_domain = original_domain
        self.__dict__.update(kwargs)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """ Classify ``instance``, returning a value, a distribution, or
        both depending on ``result_type``.
        """
        # Determine which variables the raw numeric predictions map onto.
        if self.multi_flag:
            resp_vars = [v for v, m in zip(self.domain.variables,
                                           self.label_mask) if m]
        elif is_discrete(self.class_var):
            resp_vars = self.expanded_class
        else:
            resp_vars = [self.class_var]

        vals = self.predict(instance)
        vals = [var(val) for var, val in zip(resp_vars, vals)]

        from Orange.statistics.distribution import Distribution

        if not self.multi_flag and is_discrete(self.class_var):
            # Single discrete class: convert the indicator-column responses
            # into a class distribution and predict its mode.
            dist = Distribution(self.class_var)
            if len(self.class_var.values) == 2:
                # Only one indicator column was fitted; its response is
                # treated as the probability of the second class value.
                probs = [1 - float(vals[0]), float(vals[0])]
            else:
                probs = soft_max(map(float, vals))

            for val, p in zip(self.class_var.values, probs):
                dist[val] = p
            value = dist.modus()
            vals, probs = [value], [dist]
        else:
            # Continuous or multi-label responses: degenerate distributions
            # with all mass on the predicted value.
            probs = []
            for var, val in zip(resp_vars, vals):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)

        # Single-response models return scalars, not one-element lists.
        if not self.multi_flag:
            vals, probs = vals[0], probs[0]

        if result_type == Orange.core.GetValue:
            return vals
        elif result_type == Orange.core.GetBoth:
            return vals, probs
        else:
            return probs

    def base_matrix(self, instances=None):
        """Return the base matrix (bx) of the Earth model for the table.
        If table is not supplied the base matrix of the training instances
        is returned.
        Base matrix is a len(instances) x num_terms matrix of computed values
        of terms in the model (not multiplied by beta) for each instance.

        :param instances: Input instances for the base matrix.
        :type instances: :class:`Orange.data.Table`

        """
        if instances is None:
            instances = self.instances
        (data,) = instances.to_numpy_MA("Ac")
        # Keep only predictor columns (drop the response/label columns).
        data = data[:, ~ self.label_mask]
        bx = base_matrix(data, self.best_set, self.dirs, self.cuts)
        return bx

    def predict(self, instance):
        """ Predict the response values for the instance

        :param instance: Data instance
        :type instance: :class:`Orange.data.Instance`

        """
        data = Orange.data.Table(self.domain, [instance])
        bx = self.base_matrix(data)
        bx_used = bx[:, self.best_set]
        # Linear combination of the used basis terms.
        vals = numpy.dot(bx_used, self.betas.T).ravel()
        return vals

    def used_attributes(self, term=None):
        """ Return the used terms for term (index). If no term is given
        return all attributes in the model.

        :param term: term index
        :type term: int

        """
        if term is None:
            # Union of the attributes used by every term.
            return reduce(set.union, [self.used_attributes(i) \
                                      for i in range(self.best_set.size)],
                          set())
        attrs = [a for a, m in zip(self.domain.variables, self.label_mask)
                 if not m]

        # An attribute is used by a term iff its direction entry is nonzero.
        used_mask = self.dirs[term, :] != 0.0
        return [a for a, u in zip(attrs, used_mask) if u]

    def evimp(self, used_only=True):
        """ Return the estimated variable importances.

        :param used_only: if True return only used attributes

        """
        return evimp(self, used_only)

    def __reduce__(self):
        # Pickle support: reconstruct from the positional model arrays,
        # restoring everything else via __dict__.
        return (type(self), (self.domain, self.best_set, self.dirs,
                            self.cuts, self.betas),
                dict(self.__dict__))

    def to_string(self, percision=3, indent=3):
        """ Return a string representation of the model.
        """
        # NOTE: "percision" (sic) is kept for backward compatibility.
        return format_model(self, percision, indent)

    def __str__(self):
        return self.to_string()
381   
382
383"""
384Utility functions
385-----------------
386"""
387   
def base_matrix(data, best_set, dirs, cuts):
    """ Return the base matrix for the earth model.

    :param data: Input data
    :type data: :class:`numpy.ndarray`

    :param best_set: An array of booleans indicating used terms.
    :type best_set: :class:`numpy.ndarray`

    :param dirs: Earth model's dirs members
    :type dirs: :class:`numpy.ndarray`

    :param cuts: Earth model's cuts members
    :type cuts: :class:`numpy.ndarray`

    """
    data = numpy.asarray(data)
    best_set = numpy.asarray(best_set)
    dirs = numpy.asarray(dirs)
    cuts = numpy.asarray(cuts)

    n_terms = best_set.shape[0]
    bx = numpy.zeros((data.shape[0], n_terms))
    bx[:, 0] = 1.0  # the intercept column
    for term in range(1, n_terms):
        # Columns entering this term with a positive hinge, a negative
        # hinge, or linearly (direction code 2).
        plus = numpy.where(dirs[term] == 1)[0]
        minus = numpy.where(dirs[term] == -1)[0]
        linear = numpy.where(dirs[term] == 2)[0]

        h_plus = data[:, plus] - cuts[term][plus]
        h_minus = cuts[term][minus] - data[:, minus]

        # max(0, .) hinge clipping.
        h_plus = numpy.where(h_plus > 0.0, h_plus, 0.0)
        h_minus = numpy.where(h_minus > 0.0, h_minus, 0.0)

        factors = numpy.hstack((h_plus, h_minus, data[:, linear]))
        # The term value is the product of all its factors.
        prods = numpy.cumprod(factors, axis=1)
        bx[:, term] = prods[:, -1] if prods.size else 0.0

    return bx
431
432   
def gcv(rss, n, n_effective_params):
    """ Return the generalized cross validation.

    .. math:: gcv = rss / (n * (1 - NumEffectiveParams / n) ^ 2)

    :param rss: Residual sum of squares.
    :param n: Number of training instances.
    :param n_effective_params: Number of effective parameters.

    """
    # Coerce to float so integer arguments do not truncate the
    # n_effective_params / n ratio under Python 2 integer division.
    n = float(n)
    return rss / (n * (1. - n_effective_params / n) ** 2)
444
445"""
446Multi-label utility functions
447"""
448
def is_label_attr(attr):
    """ Is attribute a label (i.e. does it carry a "label" entry in its
    ``attributes`` dictionary).
    """
    # `in` instead of the deprecated dict.has_key (removed in Python 3).
    return "label" in attr.attributes
453   
def data_label_indices(domain):
    """ Return the indices of label attributes in data.
    """
    # flatnonzero(mask) == where(mask)[0] for a 1-D boolean mask.
    return numpy.flatnonzero(data_label_mask(domain))
458
def data_label_mask(domain):
    """ Return an array of booleans indicating whether a variable in the
    domain is a label.  The class variable (last position) always counts
    as a label.
    """
    # A list comprehension (not a lazy map object) so the class-variable
    # slot can be overwritten below; also Python 3 compatible.
    is_label = [is_label_attr(var) for var in domain.variables]
    if domain.class_var:
        is_label[-1] = True
    return numpy.array(is_label, dtype=bool)
467
468"""
469ctypes interface to ForwardPass and EvalSubsetsUsingXtx.
470"""
471       
472import ctypes
473from numpy import ctypeslib
474import orange
475
# Load the C implementation of the Earth forward pass from the compiled
# orange extension module and declare its argument types for ctypes.
_c_orange_lib = ctypeslib.load_library(orange.__file__, "")
_c_forward_pass_ = _c_orange_lib.EarthForwardPass

# All matrices are passed in Fortran (column-major) order, as the Earth
# C code expects.
_c_forward_pass_.argtypes = \
    [ctypes.POINTER(ctypes.c_int),  #pnTerms:
     ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=1),  #FullSet
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #bx
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=2, flags="F_CONTIGUOUS"),    #Dirs
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #Cuts
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nFactorsInTerms
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #nUses
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #x
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"), #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1), # Weights
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nPred
     ctypes.c_int,  #nMaxDegree
     ctypes.c_int,  #nMaxTerms
     ctypes.c_double,   #Penalty
     ctypes.c_double,   #Thresh
     ctypes.c_int,  #nFastK
     ctypes.c_double,   #FastBeta
     ctypes.c_double,   #NewVarPenalty
     ctypeslib.ndpointer(dtype=ctypes.c_int, ndim=1),  #LinPreds
     ctypes.c_bool, #UseBetaCache
     ctypes.c_char_p    #sPredNames
     ]
504   
def forward_pass(x, y, degree=1, terms=21, penalty=None, thresh=0.001,
                  fast_k=21, fast_beta=1, new_var_penalty=2):
    """ Do the earth forward pass.

    Returns ``(n_terms, used, bx, dirs, cuts)``: the number of terms
    built, a boolean mask of the used terms, the basis matrix and the
    per-term hinge directions and knot values.
    """
    import ctypes
    x = numpy.asfortranarray(x, dtype=ctypes.c_double)
    y = numpy.asfortranarray(y, dtype=ctypes.c_double)
    if x.shape[0] != y.shape[0]:
        raise ValueError("First dimensions of x and y must be the same.")
    if y.ndim == 1:
        y = y.reshape((-1, 1), order="F")
    if penalty is None:
        penalty = 2
    n_cases = x.shape[0]
    n_preds = x.shape[1]

    n_resp = y.shape[1]  # y is guaranteed to be 2-D after the reshape above

    # Output variables
    n_term = ctypes.c_int()
    full_set = numpy.zeros((terms,), dtype=ctypes.c_bool, order="F")
    bx = numpy.zeros((n_cases, terms), dtype=ctypes.c_double, order="F")
    dirs = numpy.zeros((terms, n_preds), dtype=ctypes.c_int, order="F")
    cuts = numpy.zeros((terms, n_preds), dtype=ctypes.c_double, order="F")
    n_factors_in_terms = numpy.zeros((terms,), dtype=ctypes.c_int, order="F")
    n_uses = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    weights = numpy.ones((n_cases,), dtype=ctypes.c_double, order="F")
    lin_preds = numpy.zeros((n_preds,), dtype=ctypes.c_int, order="F")
    use_beta_cache = True

    # These tests are performed in ForwardPass, and if they fail the function
    # calls exit. So we must check them here and raise an exception to avoid
    # a process shutdown.
    if n_cases < 8:
        raise ValueError("Need at least 8 data instances.")
    if n_cases > 1e8:
        raise ValueError("Too many data instances.")
    if n_resp < 1:
        raise ValueError("No response column.")
    if n_resp > 1e6:
        raise ValueError("Too many response columns.")
    if n_preds < 1:
        raise ValueError("No predictor columns.")
    if n_preds > 1e5:
        raise ValueError("Too many predictor columns.")
    if degree <= 0 or degree > 100:
        raise ValueError("Invalid 'degree'.")
    if terms < 3 or terms > 10000:
        raise ValueError("'terms' must be >= 3 and <= 10000.")
    if penalty < 0 and penalty != -1:
        raise ValueError("Invalid 'penalty' (the only legal negative value is -1).")
    if penalty > 1000:
        raise ValueError("Invalid 'penalty' (must be <= 1000).")
    if thresh < 0.0 or thresh >= 1.0:
        raise ValueError("Invalid 'thresh' (must be in [0.0, 1.0) ).")
    if fast_beta < 0 or fast_beta > 1000:
        raise ValueError("Invalid 'fast_beta' (must be in [0, 1000] ).")
    if new_var_penalty < 0 or new_var_penalty > 10:
        raise ValueError("Invalid 'new_var_penalty' (must be in [0, 10] ).")
    if (numpy.var(y, axis=0) <= 1e-8).any():
        raise ValueError("Variance of y is zero (or near zero).")

    _c_forward_pass_(ctypes.byref(n_term), full_set, bx, dirs, cuts,
                     n_factors_in_terms, n_uses, x, y, weights, n_cases,
                     n_resp, n_preds, degree, terms, penalty, thresh,
                     fast_k, fast_beta, new_var_penalty, lin_preds,
                     use_beta_cache, None)
    return n_term.value, full_set, bx, dirs, cuts
573
574
# Declare the signature of EvalSubsetsUsingXtx (backward-elimination
# subset search over the forward-pass terms) from the same library.
_c_eval_subsets_xtx = _c_orange_lib.EarthEvalSubsetsUsingXtx

_c_eval_subsets_xtx.argtypes = \
    [ctypeslib.ndpointer(dtype=ctypes.c_bool, ndim=2, flags="F_CONTIGUOUS"),   #PruneTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1),   #RssVec
     ctypes.c_int,  #nCases
     ctypes.c_int,  #nResp
     ctypes.c_int,  #nMaxTerms
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #bx
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=2, flags="F_CONTIGUOUS"),   #y
     ctypeslib.ndpointer(dtype=ctypes.c_double, ndim=1)  #WeightsArg
     ]

# Non-zero return value signals linearly dependent terms in bx.
_c_eval_subsets_xtx.restype = ctypes.c_int
589
def subset_selection_xtx(X, Y):
    """ Subsets selection using EvalSubsetsUsingXtx in the Earth package.

    Returns ``(subsets_ind, rss_vec)`` where row ``i`` of ``subsets_ind``
    holds the column indices of the best (i + 1)-term subset and
    ``rss_vec[i]`` its residual sum of squares.

    :raises numpy.linalg.LinAlgError: if X contains linearly dependent
        columns (terms).
    """
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    if Y.ndim == 1:
        Y = Y.reshape((-1, 1), order="F")

    if X.shape[0] != Y.shape[0]:
        raise ValueError("First dimensions of bx and y must be the same")

    var_count = X.shape[1]
    resp_count = Y.shape[1]
    cases = X.shape[0]
    # Output buffers for the C routine (Fortran order as required).
    subsets = numpy.zeros((var_count, var_count), dtype=ctypes.c_bool,
                              order="F")
    rss_vec = numpy.zeros((var_count,), dtype=ctypes.c_double, order="F")
    weights = numpy.ones((cases,), dtype=ctypes.c_double, order="F")

    rval = _c_eval_subsets_xtx(subsets, rss_vec, cases, resp_count, var_count,
                        X, Y, weights)
    if rval != 0:
        raise numpy.linalg.LinAlgError("Lin. dep. terms in X")

    # Convert the boolean membership matrix into rows of column indices.
    subsets_ind = numpy.zeros((var_count, var_count), dtype=int)
    for i, used in enumerate(subsets.T):
        subsets_ind[i, :i + 1] = numpy.where(used)[0]

    return subsets_ind, rss_vec
619   
def subset_selection_xtx_numpy(X, Y):
    """ A numpy implementation of EvalSubsetsUsingXtx in the Earth package,
    using numpy.linalg.lstsq.

    Performs backward elimination: starting from all columns of ``X``, the
    term whose removal increases the residual sum of squares the least is
    dropped at each step (the intercept, column 0, is never dropped).

    Returns ``(subsets, rss_vec)`` where row ``i`` of ``subsets`` holds
    the column indices of the selected (i + 1)-term subset and
    ``rss_vec[i]`` its residual sum of squares.
    """
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    var_count = X.shape[1]
    rss_vec = numpy.zeros(var_count)
    # Explicit list so columns can be deleted below (under Python 3,
    # range() returns an immutable object).
    working_set = list(range(var_count))
    subsets = numpy.zeros((var_count, var_count), dtype=int)

    for subset_size in reversed(range(var_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, res, rank, s = numpy.linalg.lstsq(X_work, Y)
        if res.size > 0:
            rss_vec[subset_size] = numpy.sum(res)
        else:
            # lstsq returns no residuals for rank-deficient systems;
            # compute the RSS explicitly.
            rss_vec[subset_size] = numpy.sum((Y - numpy.dot(X_work, b)) ** 2)

        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX).reshape((-1, 1))

        if subset_size == 0:
            break

        # Estimated RSS increase per deleted term; drop the cheapest one,
        # always keeping the intercept (column 0).
        delta_rss = b ** 2 / diag
        delta_rss = numpy.sum(delta_rss, axis=1)
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
654   
def subset_selection_xtx2(X, Y):
    """ Another implementation (this uses qr decomp).

    Same backward-elimination contract as :func:`subset_selection_xtx`,
    but solves the least-squares problems through a pivoted QR
    decomposition from :mod:`Orange.misc.linalg`.
    """
    from Orange.misc import linalg
    X = numpy.asfortranarray(X, dtype=ctypes.c_double)
    Y = numpy.asfortranarray(Y, dtype=ctypes.c_double)
    col_count = X.shape[1]
    working_set = range(col_count)
    subsets = numpy.zeros((col_count, col_count), dtype=int)
    rss_vec = numpy.zeros((col_count,))
    QR, k, _, jpvt = linalg.qr_decomp(X)

    if k < col_count:
        # remove jpvt[k:] from the work set. Will have zero
        # entries in the subsets matrix, and inf rss
        for i in sorted(jpvt[k:], reverse=True):
            del working_set[i]
            rss_vec[len(working_set)] = float("inf")
        col_count = len(working_set)

    for subset_size in reversed(range(col_count)):
        subsets[subset_size, :subset_size + 1] = working_set
        X_work = X[:, working_set]
        b, rsd, rank = linalg.qr_lstsq(X_work, Y)
        rss_vec[subset_size] = numpy.sum(rsd ** 2)
        XtX = numpy.dot(X_work.T, X_work)
        iXtX = numpy.linalg.pinv(XtX)
        diag = numpy.diag(iXtX)

        if subset_size == 0:
            break

        # Estimated RSS increase per deleted term; drop the cheapest one.
        delta_rss = b ** 2 / diag
        delete_i = numpy.argmin(delta_rss[1:]) + 1 # Keep the intercept
        del working_set[delete_i]
    return subsets, rss_vec
692   
def pruning_pass(bx, y, penalty, pruned_terms=-1):
    """ Do the pruning pass over the forward-pass basis matrix.

    .. todo:: pruned_terms, Leaps

    """
    # Prefer the C subset search; fall back to the numpy implementation
    # when bx contains linearly dependent terms.
    try:
        subsets, rss_vec = subset_selection_xtx(bx, y)
    except numpy.linalg.LinAlgError:
        subsets, rss_vec = subset_selection_xtx_numpy(bx, y)

    n_cases, n_terms = bx.shape
    # Effective parameter count: one per term plus `penalty`/2 per knot
    # (each term beyond the intercept contributes one knot).
    effective_params = numpy.arange(n_terms) + 1.0
    effective_params += penalty * (effective_params - 1.0) / 2.0

    gcv_vec = gcv(rss_vec, n_cases, effective_params)

    # Keep the subset with the smallest GCV score.
    best = numpy.argmin(gcv_vec)
    used = numpy.zeros((n_terms), dtype=bool)
    used[subsets[best, :best + 1]] = True

    return used, subsets, rss_vec, gcv_vec
716
717"""
718Printing functions.
719"""
720   
def format_model(model, percision=3, indent=3):
    """ Return a formatted string representation of the earth model,
    one section per response variable.
    """
    if model.multi_flag:
        resp_vars = [var for var, is_label in
                     zip(model.domain.variables, model.label_mask)
                     if is_label]
    elif is_discrete(model.class_var):
        resp_vars = model.expanded_class
    else:
        resp_vars = [model.class_var]

    sections = []
    for var, var_betas in zip(resp_vars, model.betas):
        sections.append(_format_response(model, var.name, var_betas,
                                         percision, indent))
    return "\n\n".join(sections)
742
def _format_response(model, resp_name, betas, percision=3, indent=3):
    """ Format the terms of a single response as a multi-line string. """
    header = "%s =" % resp_name
    pad = " " * indent
    num_fmt = "%." + str(percision) + "f"
    # Each entry: (attributes appearing in the term, formatted term text);
    # the intercept comes first and uses no attributes.
    terms = [([], num_fmt % betas[0])]
    beta_i = 0
    for i, used in enumerate(model.best_set[1:], 1):
        if not used:
            continue
        beta_i += 1
        coef = num_fmt % abs(betas[beta_i])
        knots = []
        term_attrs = []
        for d, c, attr in zip(model.dirs[i], model.cuts[i],
                              model.domain.attributes):
            if d != 0:
                knots.append(_format_knot(model, attr.name, d, c))
                term_attrs.append(attr)
        term_attrs = sorted(term_attrs)
        sign = "-" if betas[beta_i] < 0 else "+"
        if knots:
            terms.append((term_attrs,
                          sign + " * ".join([coef] + knots)))
        else:
            terms.append((term_attrs, sign + coef))
    # Sort by len(term_attrs), then by term_attrs
    terms.sort(key=lambda t: (len(t[0]), t[0]))
    return "\n".join([header] + [pad + text for _, text in terms])
768       
769def _format_knot(model, name, dir, cut):
770    if dir == 1:
771        txt = "max(0, %s - %.3f)" % (name, cut)
772    elif dir == -1:
773        txt = "max(0, %.3f - %s)" % (cut, name)
774    elif dir == 2:
775        txt = name
776    return txt
777
def _format_term(model, i, attr_name):
    """ Return the hinge-product string for term ``i`` of the model.

    Passes ``attr.name`` (not the variable object itself) to
    ``_format_knot``, for consistency with ``_format_response``.
    """
    knots = [_format_knot(model, attr.name, d, c) for d, c, attr in \
             zip(model.dirs[i], model.cuts[i], model.domain.attributes) \
             if d != 0]
    return " * ".join(knots)
783   
784"""\
785Variable importance estimation
786------------------------------
787"""
788
789from collections import defaultdict
790
def collect_source(vars):
    """ Given a list of variables ``vars``, return a mapping from source
    variables (``source_variable`` or ``get_value_from.variable`` members)
    back to the variables in ``vars`` (assumes the default preprocessor in
    EarthLearner).

    """
    source = defaultdict(list)
    for var in vars:
        # (removed an unused `svar = None` local)
        if var.source_variable:
            source[var.source_variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
            source[var.get_value_from.variable].append(var)
        elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
            # Imputed variables wrap a ClassifierFromVar one level deeper.
            source[var.get_value_from.classifier_from_var.variable].append(var)
        else:
            # No known derivation; the variable is its own source.
            source[var].append(var)
    return dict(source)
810
def map_to_source_var(var, sources):
    """ Map ``var`` back to the source variable it was derived from.

    ``sources`` is a container of candidate source variables (e.g. the
    dict returned by :func:`collect_source`).  Follows the
    ``source_variable`` / ``get_value_from`` chain recursively and
    returns None when no variable in ``sources`` can be reached.
    """
    if var in sources:
        return var
    elif var.source_variable in sources:
        return var.source_variable
    elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
        return map_to_source_var(var.get_value_from.variable, sources)
    elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
        # Imputed variables wrap a ClassifierFromVar one level deeper.
        var = var.get_value_from.classifier_from_var.variable
        return map_to_source_var(var, sources)
    else:
        return None
825   
def evimp(model, used_only=True):
    """ Return the estimated variable importance for the model.

    For each pruning-pass subset, the attributes used in that subset are
    credited with the step-wise decrease in RSS and GCV relative to the
    next-smaller subset.  Returns a list of
    ``(attribute, (nsubsets, gcv, rss))`` pairs sorted by nsubsets then
    gcv, descending; the gcv/rss scores are normalized to [0, 100].

    :param model: Earth model.
    :type model: `EarthClassifier`
    :param used_only: If True (default), drop attributes that are used
        in no subset (nsubsets == 0).
    :raises ValueError: If the model was built without pruning.

    """
    if model.subsets is None:
        raise ValueError("No subsets. Use the learner with 'prune=True'.")

    subsets = model.subsets
    n_subsets = numpy.sum(model.best_set)

    # Step-wise improvements: negate the forward differences so that a
    # decrease in RSS/GCV becomes a positive credit.
    rss = -numpy.diff(model.rss_per_subset)
    gcv = -numpy.diff(model.gcv_per_subset)
    attributes = list(model.domain.variables)

    # Columns: 0 = attribute index, 1 = nsubsets, 2 = gcv, 3 = rss.
    attr2ind = dict(zip(attributes, range(len(attributes))))
    importances = numpy.zeros((len(attributes), 4))
    importances[:, 0] = range(len(attributes))

    for i in range(1, n_subsets):
        term_subset = subsets[i, :i + 1]
        used_attributes = reduce(set.union, [model.used_attributes(term) \
                                             for term in term_subset], set())
        for attr in used_attributes:
            importances[attr2ind[attr]][1] += 1.0
            importances[attr2ind[attr]][2] += gcv[i - 1]
            importances[attr2ind[attr]][3] += rss[i - 1]
    imp_min = numpy.min(importances[:, [2, 3]], axis=0)
    imp_max = numpy.max(importances[:, [2, 3]], axis=0)

    # Normalize the gcv/rss columns to the range [0, 100].
    importances[:, [2, 3]] = 100.0 * (importances[:, [2, 3]] \
                            - [imp_min]) / ([imp_max - imp_min])

    importances = list(importances)
    # Sort by n_subsets and gcv.
    importances = sorted(importances, key=lambda row: (row[1], row[2]),
                         reverse=True)
    importances = numpy.array(importances)

    if used_only:
        importances = importances[importances[:,1] > 0.0]

    res = [(attributes[int(row[0])], tuple(row[1:])) for row in importances]
    return res
873
874
def plot_evimp(evimp):
    """ Plot the variable importances as returned from
    :obj:`EarthClassifier.evimp` call.  Also accepts an
    :obj:`EarthClassifier` or a ``BaggedClassifier`` directly, in which
    case the importances are computed first.

        >>> data = Orange.data.Table("housing")
        >>> c = EarthLearner(data, degree=3)
        >>> plot_evimp(c.evimp())

    .. image:: files/earth-evimp.png

    The left axis is the nsubsets measure and on the right are the
    normalized RSS and GCV.

    """
    from Orange.ensemble.bagging import BaggedClassifier
    # Accept a classifier instead of a precomputed importance list.
    if isinstance(evimp, EarthClassifier):
        evimp = evimp.evimp()
    elif isinstance(evimp, BaggedClassifier):
        evimp = bagged_evimp(evimp)

    import pylab
    fig = pylab.figure()
    axes1 = fig.add_subplot(111)
    attrs = [a for a, _ in evimp]
    imp = [s for _, s in evimp]
    imp = numpy.array(imp)
    X = range(len(attrs))
    # nsubsets on the left (primary) axis.
    l1 = axes1.plot(X, imp[:,0], "b-",)
    # Second y-axis (shared x) for the normalized gcv/rss scores.
    axes2 = axes1.twinx()

    l2 = axes2.plot(X, imp[:,1], "g-",)
    l3 = axes2.plot(X, imp[:,2], "r-",)

    # Label the x ticks with attribute names, rotated for readability.
    x_axis = axes1.xaxis
    x_axis.set_ticks(X)
    x_axis.set_ticklabels([a.name for a in attrs], rotation=90)

    axes1.yaxis.set_label_text("nsubsets")
    axes2.yaxis.set_label_text("normalized gcv or rss")

    axes1.legend([l1, l2, l3], ["nsubsets", "gcv", "rss"])
    axes1.set_title("Variable importance")
    fig.show()
918
919   
def bagged_evimp(classifier, used_only=True):
    """ Extract combined (average) evimp from an instance of BaggedClassifier.

    :param classifier: Bagged ensemble of :obj:`EarthClassifier` instances.
    :type classifier: :class:`Orange.ensemble.bagging.BaggedClassifier`
    :param used_only: If True (default), drop attributes whose averaged
        nsubsets score is zero.
    :raises TypeError: If ``classifier`` or any of its members is not of
        the expected type.

    Example ::

        >>> from Orange.ensemble.bagging import BaggedLearner
        >>> bc = BaggedLearner(EarthLearner(degree=3, terms=10), data)
        >>> bagged_evimp(bc)

    """
    def assert_type(obj, class_):
        # Parameter renamed from 'object' so the builtin is not shadowed.
        if not isinstance(obj, class_):
            raise TypeError("Instance of %r expected." % (class_))
    from Orange.ensemble.bagging import BaggedClassifier
    # Note: 'defaultdict' is already imported at module level.

    assert_type(classifier, BaggedClassifier)
    bagged_imp = defaultdict(list)  # attribute name -> list of score tuples
    attrs_by_name = defaultdict(list)  # attribute name -> attribute objects
    for c in classifier.classifiers:
        assert_type(c, EarthClassifier)
        imp = evimp(c, used_only=used_only)
        for attr, score in imp:
            bagged_imp[attr.name].append(score) # map by name
            attrs_by_name[attr.name].append(attr)

    # Average scores over the ensemble.  Build a new dict instead of
    # mutating 'bagged_imp' while iterating over it (unsafe in Python 3).
    averaged = dict((name, tuple(numpy.average(scores, axis=0)))
                    for name, scores in bagged_imp.items())

    # Sort by nsubsets, then gcv, descending.
    averaged = sorted(averaged.items(), key=lambda t: (t[1][0], t[1][1]),
                      reverse=True)

    bagged_imp = [(attrs_by_name[name][0], scores) for name, scores in averaged]

    if used_only:
        bagged_imp = [(a, r) for a, r in bagged_imp if r[0] > 0]
    return bagged_imp
959
960"""
961High level interface for measuring variable importance
962(compatible with Orange.feature.scoring module).
963
964"""
965from Orange.feature import scoring
966           
class ScoreEarthImportance(scoring.Score):
    """ An :class:`Orange.feature.scoring.Score` subclass.
    Scores features based on their importance in the Earth
    model using ``bagged_evimp``'s function return value.
    
    """
    # Score-type constants: indices into an evimp score tuple
    # (nsubsets, gcv, rss).
    NSUBSETS = 0
    RSS = 1
    GCV = 2
    
    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = scoring.Score.Generator
    
    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scorer calling convention: when both 'attr' and 'data'
        # are given, build the scorer and immediately return the score;
        # otherwise return the (configured) scorer instance itself.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")
        
    def __init__(self, t=10, degree=2, terms=10, score_what="nsubsets", cached=True):
        """
        :param t: Number of earth models to train on the data
            (using BaggedLearner).
        :param degree: 'degree' argument passed to :obj:`EarthLearner`.
        :param terms: 'terms' argument passed to :obj:`EarthLearner`.
        :param score_what: What to return as a score.
            Can be one of: "nsubsets", "rss", "gcv" or class constants
            NSUBSETS, RSS, GCV.
        :param cached: Stored on the instance.
            NOTE(review): ``__call__`` currently caches the last scored
            table unconditionally; this flag is never consulted — confirm
            intended behavior.
           
        """
        self.t = t
        self.degree = degree
        self.terms = terms
        if isinstance(score_what, basestring):
            score_what = {"nsubsets":self.NSUBSETS, "rss":self.RSS,
                          "gcv":self.GCV}.get(score_what, None)
                         
        if score_what not in range(3):
            raise ValueError("Invalid  'score_what' parameter.")

        self.score_what = score_what
        self.cached = cached
        self._cache_ref = None
        
    def __call__(self, attr, data, weight_id=None):
        # Reuse the cached evimp when re-scoring the same data table
        # (compared by object identity, not equality).
        ref = self._cache_ref
        if ref is not None and ref is data:
            evimp = self._cached_evimp
        else:
            from Orange.ensemble.bagging import BaggedLearner
            bc = BaggedLearner(EarthLearner(degree=self.degree,
                            terms=self.terms), t=self.t)(data, weight_id)
            evimp = bagged_evimp(bc, used_only=False)
            self._cache_ref = data
            self._cached_evimp = evimp
        
        evimp = dict(evimp)
        score = evimp.get(attr, None)
        
        if score is None:
            # 'attr' may be a source (pre-preprocessing) variable; average
            # the scores of the variables derived from it.
            source = collect_source(evimp.keys())
            if attr in source:
                # Return average of source var scores
                return numpy.average([evimp[v][self.score_what] \
                                      for v in source[attr]])
            else:
                raise ValueError("Attribute %r not in the domain." % attr)
        else:
            return score[self.score_what]
1043   
class ScoreRSS(scoring.Score):
    """ Score continuous attributes by the residual-sum-of-squares
    decrease they account for in a linear subset-selection pass over
    the whole data table (``subset_selection_xtx2``).

    """
    handles_discrete = False
    handles_continuous = True
    computes_thresholds = False
    
    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Orange scorer calling convention: score immediately when both
        # 'attr' and 'data' are supplied, otherwise return the instance.
        self = scoring.Score.__new__(cls)
        if attr is not None and data is not None:
            self.__init__(**kwargs)
            # TODO: Should raise a warning, about caching
            return self.__call__(attr, data, weight_id)
        elif not attr and not data:
            return self
        else:
            raise ValueError("Both 'attr' and 'data' arguments expected.")
        
    def __init__(self):
        # Cache of the last scored data table (compared by identity)
        # and its per-attribute rss decreases.
        self._cache_data = None
        self._cache_rss = None
        
    def __call__(self, attr, data, weight_id=None):
        ref = self._cache_data
        if ref is not None and ref is data:
            rss = self._cache_rss
        else:
            x, y = data.to_numpy_MA("1A/c")
            try:
                subsets, rss = subset_selection_xtx2(x, y)
            except numpy.linalg.LinAlgError:
                # Fall back to the numpy-based subset selection.
                subsets, rss = subset_selection_xtx_numpy(x, y)
            # Credit each step's rss decrease to every attribute in the
            # corresponding subset.
            rss_diff = -numpy.diff(rss)
            rss = numpy.zeros_like(rss)
            for s_size in range(1, subsets.shape[0]):
                subset = subsets[s_size, :s_size + 1]
                rss[subset] += rss_diff[s_size - 1]
            rss = rss[1:] #Drop the intercept
            self._cache_data = data
            self._cache_rss = rss
#        print sorted(zip(rss, data.domain.attributes))
        index = list(data.domain.attributes).index(attr)
        return rss[index]
1086   
1087
# NOTE: Everything below is a commented-out, pure-numpy reimplementation of
# the Earth learner/classifier, kept for reference only.  It is never
# executed and contains known typos (e.g. ``prunes``, ``penaty``, ``sotred``).
#from Orange.core import EarthLearner as BaseEarthLearner, \
#                        EarthClassifier as BaseEarthClassifier
#from Orange.misc import member_set
1091#
1092#class _EarthLearner(BaseEarthLearner):
1093#    """ An earth learner.
1094#    """
1095#    def __new__(cls, data=None, weightId=None, **kwargs):
1096#        self = BaseEarthLearner.__new__(cls, **kwargs)
1097#        if data is not None:
1098#            self.__init__(**kwargs)
1099#            return self.__call__(data, weightId)
1100#        return self
1101#   
1102#    def __init__(self, max_degree=1, max_terms=21, new_var_penalty=0.0,
1103#                 threshold=0.001, prune=True, penalty=None, fast_k=20,
1104#                 fast_beta=0.0, store_examples=True, **kwargs):
1105#        """ Initialize the learner instance.
1106#       
1107#        :param max_degree:
1108#        """
1109#        self.max_degree = max_degree
1110#        self.max_terms = max_terms
1111#        self.new_var_penalty = new_var_penalty
1112#        self.threshold = threshold
1113#        self.prune = prunes
1114#        if penaty is None:
1115#            penalty = 2.0 if degree > 1 else 3.0
1116#        self.penalty = penalty
1117#        self.fast_k = fast_k
1118#        self.fast_beta = fast_beta
1119#        self.store_examples = store_examples
1120#       
1121#        for key, val in kwargs.items():
1122#            setattr(self, key, val)
1123#   
1124#    def __call__(self, data, weightId=None):
1125#        if not data.domain.class_var:
1126#            raise ValueError("No class var in the domain.")
1127#       
1128#        with member_set(self, "prune", False):
1129#            # We overwrite the prune argument (will do the pruning in python).
1130#            base_clsf =  BaseEarthLearner.__call__(self, data, weightId)
1131#       
1132#        if self.prune:
1133#            (best_set, betas, rss, subsets, rss_per_subset,
1134#             gcv_per_subset) = self.pruning_pass(base_clsf, data)
1135#           
1136#            return _EarthClassifier(base_clsf, data if self.store_examples else None,
1137#                                   best_set=best_set, dirs=base_clsf.dirs,
1138#                                   cuts=base_clsf.cuts,
1139#                                   betas=betas,
1140#                                   subsets=subsets,
1141#                                   rss_per_subset=rss_per_subset,
1142#                                   gcv_per_subset=gcv_per_subset)
1143#        else:
1144#            return _EarthClassifier(base_clsf, data if self.store_examples else None)
1145#   
1146#   
1147#    def pruning_pass(self, base_clsf, examples):
1148#        """ Prune the terms constructed in the forward pass.
1149#        (Pure numpy reimplementation)
1150#        """
1151#        n_terms = numpy.sum(base_clsf.best_set)
1152#        n_eff_params = n_terms + self.penalty * (n_terms - 1) / 2.0
1153#        data, y, _ = examples.to_numpy_MA()
1154#        data = data.filled(0.0)
1155#        best_set = numpy.asarray(base_clsf.best_set, dtype=bool)
1156#       
1157#        bx = base_matrix(data, base_clsf.best_set,
1158#                         base_clsf.dirs, base_clsf.cuts,
1159#                         )
1160#       
1161#        bx_used = bx[:, best_set]
1162#        subsets, rss_per_subset = subsets_selection_xtx(bx_used, y) # TODO: Use leaps like library
1163#        gcv_per_subset = [gcv(rss, bx.shape[0], i + self.penalty * (i - 1) / 2.0) \
1164#                              for i, rss in enumerate(rss_per_subset, 1)]
1165#        gcv_per_subset = numpy.array(gcv_per_subset)
1166#       
1167#        best_i = numpy.argmin(gcv_per_subset[1:]) + 1 # Ignore the intercept
1168#        best_ind = subsets[best_i, :best_i + 1]
1169#        bs_i = 0
1170#        for i, b in enumerate(best_set):
1171#            if b:
1172#                best_set[i] = bs_i in best_ind
1173#                bs_i += 1
1174#               
1175#        bx_subset = bx[:, best_set]
1176#        betas, rss, rank, s = numpy.linalg.lstsq(bx_subset, y)
1177#        return best_set, betas, rss, subsets, rss_per_subset, gcv_per_subset
1178#   
1179#       
1180#class _EarthClassifier(Orange.core.ClassifierFD):
1181#    def __init__(self, base_classifier=None, examples=None, best_set=None,
1182#                 dirs=None, cuts=None, betas=None, subsets=None,
1183#                 rss_per_subset=None,
1184#                 gcv_per_subset=None):
1185#        self._base_classifier = base_classifier
1186#        self.examples = examples
1187#        self.domain = base_classifier.domain
1188#        self.class_var = base_classifier.class_var
1189#       
1190#        best_set = base_classifier.best_set if best_set is None else best_set
1191#        dirs = base_classifier.dirs if dirs is None else dirs
1192#        cuts = base_classifier.cuts if cuts is None else cuts
1193#        betas = base_classifier.betas if betas is None else betas
1194#       
1195#        self.best_set = numpy.asarray(best_set, dtype=bool)
1196#        self.dirs = numpy.array(dirs, dtype=int)
1197#        self.cuts = numpy.array(cuts)
1198#        self.betas = numpy.array(betas)
1199#       
1200#        self.subsets = subsets
1201#        self.rss_per_subset = rss_per_subset
1202#        self.gcv_per_subset = gcv_per_subset
1203#       
1204#    @property
1205#    def num_terms(self):
1206#        """ Number of terms in the model (including the intercept).
1207#        """
1208#        return numpy.sum(numpy.asarray(self.best_set, dtype=int))
1209#   
1210#    @property
1211#    def max_terms(self):
1212#        """ Maximum number of terms (as specified in the learning step).
1213#        """
1214#        return self.best_set.size
1215#   
1216#    @property
1217#    def num_preds(self):
1218#        """ Number of predictors (variables) included in the model.
1219#        """
1220#        return len(self.used_attributes(term))
1221#   
1222#    def __call__(self, example, what=Orange.core.GetValue):
1223#        value = self.predict(example)
1224#        if isinstance(self.class_var, Orange.data.variable.Continuous):
1225#            value = self.class_var(value)
1226#        else:
1227#            value = self.class_var(int(round(value)))
1228#           
1229#        dist = Orange.statistics.distribution.Distribution(self.class_var)
1230#        dist[value] = 1.0
1231#        if what == Orange.core.GetValue:
1232#            return value
1233#        elif what == Orange.core.GetProbabilities:
1234#            return dist
1235#        else:
1236#            return (value, dist)
1237#   
1238#    def base_matrix(self, examples=None):
1239#        """ Return the base matrix (bx) of the Earth model for the table.
1240#        If table is not supplied the base matrix of the training examples
1241#        is returned.
1242#       
1243#       
1244#        :param examples: Input examples for the base matrix.
1245#        :type examples: Orange.data.Table
1246#       
1247#        """
1248#        if examples is None:
1249#            examples = self.examples
1250#           
1251#        if examples is None:
1252#            raise ValueError("base matrix is only available if 'store_examples=True'")
1253#       
1254#        if isinstance(examples, Orange.data.Table):
1255#            data, cls, w = examples.to_numpy_MA()
1256#            data = data.filled(0.0)
1257#        else:
1258#            data = numpy.asarray(examples)
1259#           
1260#        return base_matrix(data, self.best_set, self.dirs, self.cuts)
1261#   
1262#    def _anova_order(self):
1263#        """ Return indices that sort the terms into the 'ANOVA' format.
1264#        """
1265#        terms = [([], 0)] # intercept
1266#        for i, used in enumerate(self.best_set[1:], 1):
1267#            attrs = sorted(self.used_attributes(i))
1268#            if used and attrs:
1269#                terms.append((attrs, i))
1270#        terms = sotred(terms, key=lambda t:(len(t[0]), t[0]))
1271#        return [i for _, i in terms]
1272#   
1273#    def format_model(self, percision=3, indent=3):
1274#        return format_model(self, percision, indent)
1275#   
1276#    def print_model(self, percision=3, indent=3):
1277#        print self.format_model(percision, indent)
1278#       
1279#    def predict(self, ex):
1280#        """ Return the predicted value (float) for example.
1281#        """
1282#        x = Orange.data.Table(self.domain, [ex])
1283#        x, c, w = x.to_numpy_MA()
1284#        x = x.filled(0.0)[0]
1285#       
1286#        bx = numpy.ones(self.best_set.shape)
1287#        betas = numpy.zeros_like(self.betas)
1288#        betas[0] = self.betas[0]
1289#        beta_i = 0
1290#        for termi in range(1, len(self.best_set)):
1291#            dirs = self.dirs[termi]
1292#            cuts = self.cuts[termi]
1293#            dir_p1 = numpy.where(dirs == 1)[0]
1294#            dir_m1 = numpy.where(dirs == -1)[0]
1295#            dir_2 = numpy.where(dirs == 2)[0]
1296#           
1297#            x1 = x[dir_p1] - cuts[dir_p1]
1298#            x2 = cuts[dir_m1] - x[dir_m1]
1299#            x3 = x[dir_2]
1300#           
1301#            x1 = numpy.maximum(x1, 0.0)
1302#            x2 = numpy.maximum(x2, 0.0)
1303#
1304#            X = numpy.hstack((x1, x2, x3))
1305#            X = numpy.cumprod(X)
1306#           
1307#            bx[termi] = X[-1] if X.size else 0.0
1308#            if self.best_set[termi] != 0:
1309#                beta_i += 1
1310#                betas[beta_i] = self.betas[beta_i]
1311#
1312#        return numpy.sum(bx[self.best_set] * betas)
1313#           
1314#    def used_attributes(self, term=None):
1315#        """ Return a list of used attributes. If term (index) is given
1316#        return only attributes used in that single term.
1317#       
1318#        """
1319#        if term is None:
1320#            terms = numpy.where(self.best_set)[0]
1321#        else:
1322#            terms = [term]
1323#        attrs = set()
1324#        for ti in terms:
1325#            attri = numpy.where(self.dirs[ti] != 0.0)[0]
1326#            attrs.update([self.domain.attributes[i] for i in attri])
1327#        return attrs
1328#       
1329#    def evimp(self, used_only=True):
1330#        """ Return the estimated variable importance.
1331#        """
1332#        return evimp(self, used_only)
1333#       
1334#    def __reduce__(self):
1335#        return (EarthClassifier, (self._base_classifier, self.examples,
1336#                                  self.best_set, self.dirs, self.cuts,
1337#                                  self.betas, self.subsets,
1338#                                  self.rss_per_subset, self.gcv_per_subset),
1339#                {})
1340                                 