source: orange/Orange/regression/lasso.py @ 9927:d6ca7b346864

Revision 9927:d6ca7b346864, 14.3 KB checked in by markotoplak, 2 years ago (diff)

data.variable -> feature.

Line 
1"""\
2============================
3Lasso regression (``lasso``)
4============================
5
6.. index:: regression
7
8.. _`Lasso regression. Regression shrinkage and selection via the lasso`:
9    http://www-stat.stanford.edu/~tibs/lasso/lasso.pdf
10
11
12`The Lasso <http://www-stat.stanford.edu/~tibs/lasso/lasso.pdf>`_ is a shrinkage
13and selection method for linear regression. It minimizes the usual sum of squared
14errors, with a bound on the sum of the absolute values of the coefficients.
15
16To fit the regression parameters on housing data set use the following code:
17
18.. literalinclude:: code/lasso-example.py
19   :lines: 7,9,10,11
20
.. autoclass:: LassoRegressionLearner
    :members:

.. autoclass:: LassoRegression
    :members:
33
34Utility functions
35-----------------
36
37.. autofunction:: center
38
39.. autofunction:: get_bootstrap_sample
40
41.. autofunction:: permute_responses
42
43
44========
45Examples
46========
47
48To predict values of the response for the first five instances
49use the code
50
51.. literalinclude:: code/lasso-example.py
52   :lines: 14,15
53
54Output
55
56::
57
58    Actual: 24.00, predicted: 24.58
59    Actual: 21.60, predicted: 23.30
60    Actual: 34.70, predicted: 24.98
61    Actual: 33.40, predicted: 24.78
62    Actual: 36.20, predicted: 24.66
63
64To see the fitted regression coefficients, print the model
65
66.. literalinclude:: code/lasso-example.py
67   :lines: 17
68
69The output
70
71::
72
73    Variable  Coeff Est  Std Error          p
74     Intercept     22.533
75          CRIM     -0.000      0.023      0.480     
76         INDUS     -0.010      0.023      0.300     
77            RM      1.303      0.994      0.000   ***
78           AGE     -0.002      0.000      0.320     
79       PTRATIO     -0.191      0.209      0.050     .
80         LSTAT     -0.126      0.105      0.000   ***
81    Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1
82
83
84    For 7 variables the regression coefficient equals 0:
85    ZN
86    CHAS
87    NOX
88    DIS
89    RAD
90    TAX
91    B
92
93shows that some of the regression coefficients are equal to 0.   
94
95
96
97
98
99"""
100
101import Orange
102import numpy
103
104from Orange.regression import base
105
106from Orange.misc import deprecated_members, deprecated_keywords
107
def center(X):
    """Center the data by subtracting each column's mean.

    Returns the centered data together with the vector of column means.

    :param X: the data array
    :type X: :class:`numpy.array`
    """
    column_means = X.mean(axis=0)
    centered = X - column_means
    return centered, column_means
117
def standardize(X):
    """Standardize the data: subtract column means and divide by the
    column standard deviations.

    Returns the standardized data, the column means, and the column
    standard deviations.

    :param X: the data array
    :type X: :class:`numpy.array`
    """
    column_means = numpy.mean(X, axis=0)
    column_stds = numpy.std(X, axis=0)
    standardized = (X - column_means) / column_stds
    return standardized, column_means, column_stds
129
def get_bootstrap_sample(table):
    """Generates a bootstrap sample (sampling with replacement) from an
    Orange Example Table and stores it in a new
    :class:`Orange.data.Table` object.

    :param table: the original data sample
    :type table: :class:`Orange.data.Table`
    """
    n = len(table)
    bootTable = Orange.data.Table(table.domain)
    for i in range(n):
        # draw a row index uniformly at random, with replacement;
        # renamed from 'id' to avoid shadowing the builtin
        selected = numpy.random.randint(0, n)
        bootTable.append(table[selected])
    return bootTable
143
def permute_responses(table):
    """Permutes values of the class (response) variable.
    The independence between independent variables and the response
    is obtained but the distribution of the response variable is kept.

    :param table: the original data sample
    :type table: :class:`Orange.data.Table`
    """
    n = len(table)
    perm = numpy.random.permutation(n)
    # copy the table, then overwrite each class value with a permuted one
    permTable = Orange.data.Table(table.domain, table)
    for i in range(n):
        permTable[i].set_class(table[perm[i]].get_class())
    return permTable
158
class LassoRegressionLearner(base.BaseRegressionLearner):
    """Fits the lasso regression model, i.e. learns the regression
    parameters. The class is derived from
    :class:`Orange.regression.base.BaseRegressionLearner`
    which is used for preprocessing the data (continuization and imputation)
    before fitting the regression parameters.

    """

    def __init__(self, name='lasso regression', t=1, s=None, tol=0.001, \
                 n_boot=100, n_perm=100, imputer=None, continuizer=None):
        """
        :param name: name of the linear model, default 'lasso regression'
        :type name: string

        :param t: tuning parameter, upper bound for the L1-norm of the
            regression coefficients
        :type t: float

        :param s: An alternative way to specify the tuning parameter ``t``.
            Here ``t`` is taken to be t = s * sum(abs(B)) where B are the
            coefficients of an ordinary least square linear fit. The ``t``
            parameter is ignored if ``s`` is specified (by default it
            is None).
        :type s: float

        :param tol: tolerance parameter; regression coefficients whose
            absolute value is below tol are set to 0, default=0.001
        :type tol: float

        :param n_boot: number of bootstrap samples used for non-parametric
            estimation of standard errors
        :type n_boot: int

        :param n_perm: number of permutations used for non-parametric
            estimation of p-values
        :type n_perm: int

        :param imputer: imputer constructor passed to
            :class:`Orange.regression.base.BaseRegressionLearner`
        :param continuizer: continuizer passed to
            :class:`Orange.regression.base.BaseRegressionLearner`
        """

        self.name = name
        self.t = t
        self.s = s
        self.tol = tol
        self.n_boot = n_boot
        self.n_perm = n_perm
        self.set_imputer(imputer=imputer)
        self.set_continuizer(continuizer=continuizer)

    def __call__(self, table, weight=None):
        """Fit the model and return a :class:`LassoRegression` instance.

        :param table: data instances.
        :type table: :class:`Orange.data.Table`
        :param weight: the weights for instances. Default: None, i.e.
            all data instances are equally important in fitting
            the regression parameters
        :type weight: None or list of Orange.feature.Continuous
            which stores weights for instances
        """
        # discrete values are continuized
        table = self.continuize_table(table)
        # missing values are imputed
        table = self.impute_table(table)

        domain = table.domain
        X, y, w = table.to_numpy()
        n, m = numpy.shape(X)

        # standardize features and center the response; the response mean
        # becomes the intercept (coef0)
        X, mu_x, sigma_x = standardize(X)
        y, coef0 = center(y)

        t = self.t

        if self.s is not None:
            # derive t from the L1-norm of the ordinary least squares fit
            # (removed a stray debug print of t that polluted stdout)
            beta_full, rss, _, _ = numpy.linalg.lstsq(X, y)
            t = self.s * numpy.sum(numpy.abs(beta_full))

        import scipy.optimize

        # objective function to be minimized: residual L2-norm
        objective = lambda beta: numpy.linalg.norm(y - numpy.dot(X, beta))
        # initial guess for the regression parameters
        beta_init = numpy.random.random(m)
        # constraint: L1-norm of the coefficients must not exceed t
        cnstr = lambda beta: t - numpy.sum(numpy.abs(beta))
        # optimal solution via constrained optimization (COBYLA)
        coefficients = scipy.optimize.fmin_cobyla(objective, beta_init,
                                                  cnstr, disp=0)

        # set small coefficients to 0
        def set_2_0(c): return c if abs(c) > self.tol else 0
        coefficients = numpy.array(map(set_2_0, coefficients))
        # rescale back to the original (unstandardized) feature scale
        coefficients /= sigma_x

        # bootstrap estimator of standard error of the coefficient estimators
        # assumption: fixed t
        if self.n_boot > 0:
            coeff_b = []  # bootstrapped coefficients
            for i in range(self.n_boot):
                tmp_table = get_bootstrap_sample(table)
                l = LassoRegressionLearner(t=t, n_boot=0, n_perm=0)
                c = l(tmp_table)
                coeff_b.append(c.coefficients)
            std_errors_fixed_t = numpy.std(coeff_b, axis=0)
        else:
            std_errors_fixed_t = [float("nan")] * m

        # permutation test to obtain the significance of the regression
        # coefficients
        if self.n_perm > 0:
            coeff_p = []
            for i in range(self.n_perm):
                tmp_table = permute_responses(table)
                l = LassoRegressionLearner(t=t, n_boot=0, n_perm=0)
                c = l(tmp_table)
                coeff_p.append(c.coefficients)
            # p-value: fraction of permuted fits whose coefficient magnitude
            # exceeds the observed one
            p_vals = \
                   numpy.sum(abs(numpy.array(coeff_p)) > \
                             abs(numpy.array(coefficients)), \
                             axis=0) / float(self.n_perm)
        else:
            p_vals = [float("nan")] * m

        # dictionary of regression coefficients with standard errors
        # and p-values
        dict_model = {}
        for i, var in enumerate(domain.attributes):
            dict_model[var.name] = (coefficients[i], std_errors_fixed_t[i], p_vals[i])

        return LassoRegression(domain=domain, class_var=domain.class_var,
                               coef0=coef0, coefficients=coefficients,
                               std_errors_fixed_t=std_errors_fixed_t,
                               p_vals=p_vals,
                               dict_model=dict_model,
                               mu_x=mu_x)
297
# Backward-compatible aliases for the old camelCase constructor keywords.
deprecated_members({"nBoot": "n_boot",
                    "nPerm": "n_perm"}, 
                   wrap_methods=["__init__"],
                   in_place=True)(LassoRegressionLearner)
302
class LassoRegression(Orange.classification.Classifier):
    """Lasso regression predicts value of the response variable
    based on the values of independent variables.

    .. attribute:: coef0

        Intercept (sample mean of the response variable).

    .. attribute:: coefficients

        Regression coefficients, stored in list.

    .. attribute:: std_errors_fixed_t

        Standard errors of the coefficient estimator for the fixed
        tuning parameter t. The standard errors are estimated using
        bootstrapping method.

    .. attribute:: p_vals

        List of p-values for the null hypothesis that the regression
        coefficients equal 0 based on non-parametric permutation test.

    .. attribute:: dict_model

        Statistical properties of the model stored in dictionary:
        Keys - names of the independent variables
        Values - tuples (coefficient, standard error, p-value)

    .. attribute:: mu_x

        Sample mean of the all independent variables.

    """
    def __init__(self, domain=None, class_var=None, coef0=None,
                 coefficients=None, std_errors_fixed_t=None, p_vals=None,
                 dict_model=None, mu_x=None):
        self.domain = domain
        self.class_var = class_var
        self.coef0 = coef0
        self.coefficients = coefficients
        self.std_errors_fixed_t = std_errors_fixed_t
        self.p_vals = p_vals
        self.dict_model = dict_model
        self.mu_x = mu_x

    @deprecated_keywords({"resultType": "result_type"})
    def __call__(self, instance, result_type=Orange.core.GetValue):
        """Predict the response value for an instance.

        :param instance: data instance for which the value of the response
            variable will be predicted
        :type instance: :class:`Orange.data.Instance`
        """
        ins = Orange.data.Instance(self.domain, instance)
        if "?" in ins:  # missing value -> corresponding coefficient omitted
            def miss_2_0(x): return x if x != "?" else 0
            ins = map(miss_2_0, ins)
            ins = numpy.array(ins)[:-1] - self.mu_x
        else:
            ins = numpy.array(ins.native())[:-1] - self.mu_x

        y_hat = numpy.dot(self.coefficients, ins) + self.coef0
        y_hat = self.class_var(y_hat)
        # degenerate distribution concentrated on the predicted value
        dist = Orange.statistics.distribution.Continuous(self.class_var)
        dist[y_hat] = 1.0
        if result_type == Orange.core.GetValue:
            return y_hat
        if result_type == Orange.core.GetProbabilities:
            return dist
        else:
            return (y_hat, dist)

    @deprecated_keywords({"skipZero": "skip_zero"})
    def to_string(self, skip_zero=True):
        """Pretty-prints Lasso regression model,
        i.e. estimated regression coefficients with standard errors
        and significances. Standard errors are obtained using bootstrapping
        method and significances by the permutation test

        :param skip_zero: if True variables with estimated coefficient equal to 0
            are omitted
        :type skip_zero: boolean
        """

        from string import join
        labels = ('Variable', 'Coeff Est', 'Std Error', 'p')
        lines = [join(['%10s' % l for l in labels], ' ')]

        fmt = "%10s " + join(["%10.3f"] * 3, " ") + " %5s"
        fmt1 = "%10s %10.3f"

        def get_star(p):
            # significance codes mirroring R's summary.lm convention
            if p < 0.001: return "*" * 3
            elif p < 0.01: return "*" * 2
            elif p < 0.05: return "*"
            elif p < 0.1: return "."
            else: return " "

        # (removed a dead pre-loop assignment of `stars`; it was always
        # overwritten inside the loop before use)
        lines.append(fmt1 % ('Intercept', self.coef0))
        skipped = []
        for i in range(len(self.domain.attributes)):
            if self.coefficients[i] == 0. and skip_zero:
                skipped.append(self.domain.attributes[i].name)
                continue
            stars = get_star(self.p_vals[i])
            lines.append(fmt % (self.domain.attributes[i].name,
                         self.coefficients[i], self.std_errors_fixed_t[i],
                         self.p_vals[i], stars))
        lines.append("Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1")
        lines.append("\n")
        if skip_zero:
            k = len(skipped)
            if k == 0:
                lines.append("All variables have non-zero regression coefficients. ")
            else:
                suff = "s" if k > 1 else ""
                lines.append("For %d variable%s the regression coefficient equals 0: " \
                      % (k, suff))
                for var in skipped:
                    lines.append(var)
        return "\n".join(lines)

    def __str__(self):
        return self.to_string(skip_zero=True)
428
# Backward-compatible aliases for the old camelCase attribute names.
deprecated_members({"muX": "mu_x",
                    "stdErrorsFixedT": "std_errors_fixed_t",
                    "pVals": "p_vals",
                    "dictModel": "dict_model"},
                   wrap_methods=["__init__"],
                   in_place=True)(LassoRegression)
435
if __name__ == "__main__":

    import Orange

    table = Orange.data.Table("housing.tab")

    # NOTE(review): the table is passed positionally into the slot of the
    # `name` parameter; this presumably relies on the Orange Learner
    # convention where calling a learner class with data trains and returns
    # a fitted model -- confirm base.BaseRegressionLearner supports it,
    # otherwise use LassoRegressionLearner(t=...)(table).
    c = LassoRegressionLearner(table, t=len(table.domain))
    print c
Note: See TracBrowser for help on using the repository browser.