source: orange/Orange/regression/lasso.py @ 10308:70e5c7d0b9fe

Revision 10308:70e5c7d0b9fe, 14.1 KB checked in by Miha Stajdohar <miha.stajdohar@…>, 2 years ago (diff)

Disabled prints (iprint=0).

Line 
1"""\
2############################
3Lasso regression (``lasso``)
4############################
5
6.. index:: regression
7
8.. _`Lasso regression. Regression shrinkage and selection via the lasso`:
9    http://www-stat.stanford.edu/~tibs/lasso/lasso.pdf
10
11
12`The Lasso <http://www-stat.stanford.edu/~tibs/lasso/lasso.pdf>`_ is a shrinkage
13and selection method for linear regression. It minimizes the usual sum of squared
14errors, with a bound on the sum of the absolute values of the coefficients.
15
16To fit the regression parameters on housing data set use the following code:
17
18.. literalinclude:: code/lasso-example.py
19   :lines: 7,9,10,11
20
21.. autoclass:: LassoRegressionLearner
22    :members:
23
24.. autoclass:: LassoRegression
25    :members:
26
27
28.. autoclass:: LassoRegressionLearner
29    :members:
30
31.. autoclass:: LassoRegression
32    :members:
33
34Utility functions
35-----------------
36
37.. autofunction:: center
38
39.. autofunction:: get_bootstrap_sample
40
41.. autofunction:: permute_responses
42
43
44========
45Examples
46========
47
48To predict values of the response for the first five instances
49use the code
50
51.. literalinclude:: code/lasso-example.py
52   :lines: 14,15
53
54Output
55
56::
57
58    Actual: 24.00, predicted: 24.58
59    Actual: 21.60, predicted: 23.30
60    Actual: 34.70, predicted: 24.98
61    Actual: 33.40, predicted: 24.78
62    Actual: 36.20, predicted: 24.66
63
64To see the fitted regression coefficients, print the model
65
66.. literalinclude:: code/lasso-example.py
67   :lines: 17
68
69The output
70
71::
72
73    Variable  Coeff Est  Std Error          p
74     Intercept     22.533
75          CRIM     -0.000      0.023      0.480     
76         INDUS     -0.010      0.023      0.300     
77            RM      1.303      0.994      0.000   ***
78           AGE     -0.002      0.000      0.320     
79       PTRATIO     -0.191      0.209      0.050     .
80         LSTAT     -0.126      0.105      0.000   ***
81    Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1
82
83
84    For 7 variables the regression coefficient equals 0:
85    ZN
86    CHAS
87    NOX
88    DIS
89    RAD
90    TAX
91    B
92
93shows that some of the regression coefficients are equal to 0.   
94
95
96
97
98
99"""
100
101import Orange
102import numpy
103
104from Orange.regression import base
105
106from Orange.misc import deprecated_members, deprecated_keywords
107
108def center(X):
109    """Centers the data, i.e. subtracts the column means.
110    Returns the centered data and the mean.
111
112    :param X: the data arry
113    :type table: :class:`numpy.array`
114    """
115    mu = X.mean(axis=0)
116    return X - mu, mu
117
118def standardize(X):
119    """Standardizes the data, i.e. subtracts the column means and divide by
120    standard deviation.
121    Returns the centered data, the mean, and deviations.
122
123    :param X: the data arry
124    :type table: :class:`numpy.array`
125    """
126    mu = numpy.mean(X, axis=0)
127    std = numpy.std(X, axis=0)
128    return (X - mu) / std, mu, std
129
130def get_bootstrap_sample(table):
131    """Generates boostrap sample from an Orange Example Table
132    and stores it in a new :class:`Orange.data.Table` object
133
134    :param table: the original data sample
135    :type table: :class:`Orange.data.Table`
136    """
137    n = len(table)
138    bootTable = Orange.data.Table(table.domain)
139    for i in range(n):
140        id = numpy.random.randint(0, n)
141        bootTable.append(table[id])
142    return bootTable
143
144def permute_responses(table):
145    """ Permutes values of the class (response) variable.
146    The independence between independent variables and the response
147    is obtained but the distribution of the response variable is kept.
148
149    :param table: the original data sample
150    :type table: :class:`Orange.data.Table`
151    """
152    n = len(table)
153    perm = numpy.random.permutation(n)
154    permTable = Orange.data.Table(table.domain, table)
155    for i, ins in enumerate(table):
156        permTable[i].set_class(table[perm[i]].get_class())
157    return permTable
158
159class LassoRegressionLearner(base.BaseRegressionLearner):
160    """Fits the lasso regression model, i.e. learns the regression parameters
161    The class is derived from
162    :class:`Orange.regression.base.BaseRegressionLearner`
163    which is used for preprocessing the data (continuization and imputation)
164    before fitting the regression parameters
165
166    """
167
168    def __init__(self, name='lasso regression', t=1, s=None, tol=0.001, \
169                 n_boot=100, n_perm=100, imputer=None, continuizer=None):
170        """
171        :param name: name of the linear model, default 'lasso regression'
172        :type name: string
173       
174        :param t: tuning parameter, upper bound for the L1-norm of the
175            regression coefficients
176        :type t: float
177       
178        :param s: An alternative way to specify the tuning parameter ``t``.
179            Here ``t`` is taken to be t = s * sum(abs(B)) where B are the
180            coefficients of an ordinary least square linear fit. ``t`` parameter is ignored if ``s`` is specified (by default it
181            is None).
182        :type s: float
183       
184        :param tol: tolerance parameter, regression coefficients
185            (absoulute value) under tol are set to 0,
186            default=0.001
187        :type tol: float
188       
189        :param n_boot: number of bootstrap samples used for non-parametric
190            estimation of standard errors
191        :type n_boot: int
192       
193        :param n_perm: number of permuations used for non-parametric
194            estimation of p-values
195        :type n_perm: int
196       
197        """
198
199        self.name = name
200        self.t = t
201        self.s = s
202        self.tol = tol
203        self.n_boot = n_boot
204        self.n_perm = n_perm
205        self.set_imputer(imputer=imputer)
206        self.set_continuizer(continuizer=continuizer)
207
208
209    def __call__(self, table, weight=None):
210        """
211        :param table: data instances.
212        :type table: :class:`Orange.data.Table`
213        :param weight: the weights for instances. Default: None, i.e.
214            all data instances are eqaully important in fitting
215            the regression parameters
216        :type weight: None or list of Orange.feature.Continuous
217            which stores weights for instances
218       
219        """
220        # dicrete values are continuized       
221        table = self.continuize_table(table)
222        # missing values are imputed
223        table = self.impute_table(table)
224
225        domain = table.domain
226        X, y, w = table.to_numpy()
227        n, m = numpy.shape(X)
228
229        X, mu_x, sigma_x = standardize(X)
230        y, coef0 = center(y)
231
232        t = self.t
233
234        if self.s is not None:
235            beta_full, rss, _, _ = numpy.linalg.lstsq(X, y)
236            t = self.s * numpy.sum(numpy.abs(beta_full))
237            print "t =", t
238
239        import scipy.optimize
240
241        # objective function to be minimized
242        objective = lambda beta: numpy.linalg.norm(y - numpy.dot(X, beta))
243        # initial guess for the regression parameters
244        beta_init = numpy.random.random(m)
245        # constraints for the regression coefficients
246        cnstr = lambda beta: t - numpy.sum(numpy.abs(beta))
247        # optimal solution
248        coefficients = scipy.optimize.fmin_cobyla(objective, beta_init, cnstr, iprint=0)
249
250        # set small coefficients to 0
251        def set_2_0(c): return c if abs(c) > self.tol else 0
252        coefficients = numpy.array(map(set_2_0, coefficients))
253        coefficients /= sigma_x
254
255        # bootstrap estimator of standard error of the coefficient estimators
256        # assumption: fixed t
257        if self.n_boot > 0:
258            coeff_b = [] # bootstrapped coefficients
259            for i in range(self.n_boot):
260                tmp_table = get_bootstrap_sample(table)
261                l = LassoRegressionLearner(t=t, n_boot=0, n_perm=0)
262                c = l(tmp_table)
263                coeff_b.append(c.coefficients)
264            std_errors_fixed_t = numpy.std(coeff_b, axis=0)
265        else:
266            std_errors_fixed_t = [float("nan")] * m
267
268        # permutation test to obtain the significance of the regression
269        #coefficients
270        if self.n_perm > 0:
271            coeff_p = []
272            for i in range(self.n_perm):
273                tmp_table = permute_responses(table)
274                l = LassoRegressionLearner(t=t, n_boot=0, n_perm=0)
275                c = l(tmp_table)
276                coeff_p.append(c.coefficients)
277            p_vals = \
278                   numpy.sum(abs(numpy.array(coeff_p)) > \
279                             abs(numpy.array(coefficients)), \
280                             axis=0) / float(self.n_perm)
281        else:
282            p_vals = [float("nan")] * m
283
284        # dictionary of regression coefficients with standard errors
285        # and p-values
286        dict_model = {}
287        for i, var in enumerate(domain.attributes):
288            dict_model[var.name] = (coefficients[i], std_errors_fixed_t[i], p_vals[i])
289
290        return LassoRegression(domain=domain, class_var=domain.class_var,
291                               coef0=coef0, coefficients=coefficients,
292                               std_errors_fixed_t=std_errors_fixed_t,
293                               p_vals=p_vals,
294                               dict_model=dict_model,
295                               mu_x=mu_x)
296
297deprecated_members({"nBoot": "n_boot",
298                    "nPerm": "n_perm"},
299                   wrap_methods=["__init__"],
300                   in_place=True)(LassoRegressionLearner)
301
302class LassoRegression(Orange.classification.Classifier):
303    """Lasso regression predicts value of the response variable
304    based on the values of independent variables.
305
306    .. attribute:: coef0
307
308        Intercept (sample mean of the response variable).   
309
310    .. attribute:: coefficients
311
312        Regression coefficients, sotred in list.
313
314    .. attribute:: std_errors_fixed_t
315
316        Standard errors of the coefficient estimator for the fixed
317        tuning parameter t. The standard errors are estimated using
318        bootstrapping method.
319
320    .. attribute:: p_vals
321
322        List of p-values for the null hypothesis that the regression
323        coefficients equal 0 based on non-parametric permutation test.
324
325    .. attribute:: dict_model
326
327        Statistical properties of the model stored in dictionary:
328        Keys - names of the independent variables
329        Values - tuples (coefficient, standard error, p-value)
330
331    .. attribute:: mu_x
332
333        Sample mean of the all independent variables.   
334
335    """
336    def __init__(self, domain=None, class_var=None, coef0=None,
337                 coefficients=None, std_errors_fixed_t=None, p_vals=None,
338                 dict_model=None, mu_x=None):
339        self.domain = domain
340        self.class_var = class_var
341        self.coef0 = coef0
342        self.coefficients = coefficients
343        self.std_errors_fixed_t = std_errors_fixed_t
344        self.p_vals = p_vals
345        self.dict_model = dict_model
346        self.mu_x = mu_x
347
348    @deprecated_keywords({"resultType": "result_type"})
349    def __call__(self, instance, result_type=Orange.core.GetValue):
350        """
351        :param instance: data instance for which the value of the response
352            variable will be predicted
353        :type instance:
354        """
355        ins = Orange.data.Instance(self.domain, instance)
356        if "?" in ins: # missing value -> corresponding coefficient omitted
357            def miss_2_0(x): return x if x != "?" else 0
358            ins = map(miss_2_0, ins)
359            ins = numpy.array(ins)[:-1] - self.mu_x
360        else:
361            ins = numpy.array(ins.native())[:-1] - self.mu_x
362
363        y_hat = numpy.dot(self.coefficients, ins) + self.coef0
364        y_hat = self.class_var(y_hat)
365        dist = Orange.statistics.distribution.Continuous(self.class_var)
366        dist[y_hat] = 1.0
367        if result_type == Orange.core.GetValue:
368            return y_hat
369        if result_type == Orange.core.GetProbabilities:
370            return dist
371        else:
372            return (y_hat, dist)
373
374    @deprecated_keywords({"skipZero": "skip_zero"})
375    def to_string(self, skip_zero=True):
376        """Pretty-prints Lasso regression model,
377        i.e. estimated regression coefficients with standard errors
378        and significances. Standard errors are obtained using bootstrapping
379        method and significances by the permuation test
380
381        :param skip_zero: if True variables with estimated coefficient equal to 0
382            are omitted
383        :type skip_zero: boolean
384        """
385
386        from string import join
387        labels = ('Variable', 'Coeff Est', 'Std Error', 'p')
388        lines = [join(['%10s' % l for l in labels], ' ')]
389
390        fmt = "%10s " + join(["%10.3f"] * 3, " ") + " %5s"
391        fmt1 = "%10s %10.3f"
392
393        def get_star(p):
394            if p < 0.001: return  "*"*3
395            elif p < 0.01: return "*"*2
396            elif p < 0.05: return "*"
397            elif p < 0.1: return  "."
398            else: return " "
399
400        stars = get_star(self.p_vals[0])
401        lines.append(fmt1 % ('Intercept', self.coef0))
402        skipped = []
403        for i in range(len(self.domain.attributes)):
404            if self.coefficients[i] == 0. and skip_zero:
405                skipped.append(self.domain.attributes[i].name)
406                continue
407            stars = get_star(self.p_vals[i])
408            lines.append(fmt % (self.domain.attributes[i].name,
409                         self.coefficients[i], self.std_errors_fixed_t[i],
410                         self.p_vals[i], stars))
411        lines.append("Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1")
412        lines.append("\n")
413        if skip_zero:
414            k = len(skipped)
415            if k == 0:
416                lines.append("All variables have non-zero regression coefficients. ")
417            else:
418                suff = "s" if k > 1 else ""
419                lines.append("For %d variable%s the regression coefficient equals 0: " \
420                      % (k, suff))
421                for var in skipped:
422                    lines.append(var)
423        return "\n".join(lines)
424
425    def __str__(self):
426        return self.to_string(skip_zero=True)
427
428deprecated_members({"muX": "mu_x",
429                    "stdErrorsFixedT": "std_errors_fixed_t",
430                    "pVals": "p_vals",
431                    "dictModel": "dict_model"},
432                   wrap_methods=["__init__"],
433                   in_place=True)(LassoRegression)
434
435if __name__ == "__main__":
436
437    import Orange
438
439    table = Orange.data.Table("housing.tab")
440
441    c = LassoRegressionLearner(table, t=len(table.domain))
442    print c
Note: See TracBrowser for help on using the repository browser.