# Changeset 10859:08a0a35c1687 in orange

Ignore:
Timestamp:
05/04/12 18:07:12 (23 months ago)
Branch:
default
Message:

Reimplemented lasso. Breaks compatibility.

It now uses a proximal gradient method for optimization instead of using scipy.optimize (see #1118).
The formulation is slightly different so there are new parameters (mainly lasso_lambda instead of t/s).
Improved some other things as well.

Files:
4 edited

Unmodified
Removed
• ## Orange/regression/lasso.py

 r10580 from numpy import dot, std, array, zeros, maximum, sqrt, sign, log, abs, \ ascontiguousarray, random as rnd from scipy.linalg import norm, eigh import Orange import numpy from Orange.regression import base from Orange.utils import deprecated_members, deprecated_keywords def center(X): """Centers the data, i.e. subtracts the column means. Returns the centered data and the mean. :param X: the data array :type table: :class:numpy.array def get_bootstrap_sample(data): """Generate a bootstrap sample of a given data set. :param data: the original data sample :type data: :class:Orange.data.Table """ mu = X.mean(axis=0) return X - mu, mu def standardize(X): """Standardizes the data, i.e. subtracts the column means and divide by standard deviation. Returns the centered data, the mean, and deviations. :param X: the data array :type table: :class:numpy.array """ mu = numpy.mean(X, axis=0) std = numpy.std(X, axis=0) return (X - mu) / std, mu, std def get_bootstrap_sample(table): """Generates bootstrap sample from an Orange Example Table and stores it in a new :class:Orange.data.Table object :param table: the original data sample :type table: :class:Orange.data.Table """ n = len(table) bootTable = Orange.data.Table(table.domain) for i in range(n): id = numpy.random.randint(0, n) bootTable.append(table[id]) return bootTable def permute_responses(table): """ Permutes values of the class (response) variable. n = len(data) bootstrap = Orange.data.Table(data.domain) for id in rnd.randint(0, n, n): bootstrap.append(data[id]) return bootstrap def permute_responses(data): """Permute values of the class (response) variable. The independence between independent variables and the response is obtained but the distribution of the response variable is kept. :param table: the original data sample :type table: :class:Orange.data.Table :param data: Original data. 
:type data: :class:Orange.data.Table """ n = len(table) perm = numpy.random.permutation(n) permTable = Orange.data.Table(table.domain, table) for i, ins in enumerate(table): permTable[i].set_class(table[perm[i]].get_class()) return permTable class LassoRegressionLearner(base.BaseRegressionLearner): """Fits the lasso regression model, i.e. learns the regression parameters The class is derived from :class:Orange.regression.base.BaseRegressionLearner which is used for preprocessing the data (continuization and imputation) before fitting the regression parameters n = len(data) perm = rnd.permutation(n) perm_data = Orange.data.Table(data.domain, data) for i, ins in enumerate(data): perm_data[i].set_class(data[perm[i]].get_class()) return perm_data class LassoRegressionLearner(Orange.regression.base.BaseRegressionLearner): """Fits the lasso regression model using FISTA (Fast Iterative Shrinkage-Thresholding Algorithm). """ def __init__(self, name='lasso regression', t=1, s=None, tol=0.001, \ n_boot=100, n_perm=100, imputer=None, continuizer=None): """ :param name: name of the linear model, default 'lasso regression' :type name: string def __init__(self, lasso_lambda=0.1, max_iter=20000, eps=1e-6, n_boot=0, n_perm=0, imputer=None, continuizer=None, name='Lasso'): """ :param lasso_lambda: Regularization parameter. :type lasso_lambda: float :param max_iter: Maximum number of iterations for the optimization method. :type max_iter: int :param eps: Stop optimization when improvements are lower than eps. :type eps: float :param n_boot: Number of bootstrap samples used for non-parametric estimation of standard errors. :type n_boot: int :param n_perm: Number of permutations used for non-parametric estimation of p-values. :type n_perm: int :param name: Learner name. :type name: str :param t: tuning parameter, upper bound for the L1-norm of the regression coefficients :type t: float :param s: An alternative way to specify the tuning parameter t. 
Here t is taken to be t = s * sum(abs(B)) where B are the coefficients of an ordinary least square linear fit. t parameter is ignored if s is specified (by default it is None). :type s: float :param tol: tolerance parameter, regression coefficients (absolute value) under tol are set to 0, default=0.001 :type tol: float :param n_boot: number of bootstrap samples used for non-parametric estimation of standard errors :type n_boot: int :param n_perm: number of permutations used for non-parametric estimation of p-values :type n_perm: int """ self.name = name self.t = t self.s = s self.tol = tol """ self.lasso_lambda = lasso_lambda self.max_iter = max_iter self.eps = eps self.n_boot = n_boot self.n_perm = n_perm self.set_imputer(imputer=imputer) self.set_continuizer(continuizer=continuizer) def __call__(self, table, weight=None): """ :param table: data instances. :type table: :class:Orange.data.Table :param weight: the weights for instances. Default: None, i.e. all data instances are equally important in fitting the regression parameters :type weight: None or list of Orange.feature.Continuous which stores weights for instances self.name = name def get_lipschitz(self, X): """Return the Lipschitz constant of :math:\\nabla f, where :math:f(w) = \\frac{1}{2}||Xw-y||^2. """ n, m = X.shape if n > m: X = ascontiguousarray(X.T) k = min(n, m) - 1 eigvals = eigh(dot(X, X.T), eigvals_only=True, eigvals=(k, k)) return eigvals[-1] def fista(self, X, y, l, lipschitz, w_init=None): """Fast Iterative Shrinkage-Thresholding Algorithm (FISTA).""" z = w_old = zeros(X.shape[1]) if w_init is None else w_init t_old, obj_old = 1, 1e400 XT = ascontiguousarray(X.T) for i in range(self.max_iter): z -= 1. 
/ lipschitz * dot(XT, dot(X, z) - y) w = maximum(0, abs(z) - l / lipschitz) * sign(z) t = (1 + sqrt(1 + 4 * t_old**2)) / 2 z = w + (t_old - 1) / t * (w - w_old) obj = ((y - dot(X, w))**2).sum() + l * norm(w, 1) if abs(obj_old - obj) / obj < self.eps: stop += 1 if obj < obj_old and stop > log(i + 1): break else: stop = 0 w_old, t_old = w, t obj_old = obj return w def __call__(self, data, weight=None): """ :param data: Training data. :type data: :class:Orange.data.Table :param weight: Weights for instances. Not implemented yet. """ # discrete values are continuized table = self.continuize_table(table) data = self.continuize_table(data) # missing values are imputed table = self.impute_table(table) domain = table.domain X, y, w = table.to_numpy() n, m = numpy.shape(X) X, mu_x, sigma_x = standardize(X) y, coef0 = center(y) t = self.t if self.s is not None: beta_full, rss, _, _ = numpy.linalg.lstsq(X, y) t = self.s * numpy.sum(numpy.abs(beta_full)) print "t =", t import scipy.optimize # objective function to be minimized objective = lambda beta: numpy.linalg.norm(y - numpy.dot(X, beta)) # initial guess for the regression parameters beta_init = numpy.random.random(m) # constraints for the regression coefficients cnstr = lambda beta: t - numpy.sum(numpy.abs(beta)) # optimal solution coefficients = scipy.optimize.fmin_cobyla(objective, beta_init, cnstr, iprint=0) # set small coefficients to 0 def set_2_0(c): return c if abs(c) > self.tol else 0 coefficients = numpy.array(map(set_2_0, coefficients)) coefficients /= sigma_x data = self.impute_table(data) domain = data.domain # prepare numpy matrices X, y, _ = data.to_numpy() n, m = X.shape coefficients = zeros(m) std_errors = array([float('nan')] * m) p_vals = array([float('nan')] * m) # standardize y coef0, sigma_y = y.mean(), y.std() + 1e-6 y = (y - coef0) / sigma_y # standardize X and remove constant vars mu_x = X.mean(axis=0) X -= mu_x sigma_x = X.std(axis=0) nz = sigma_x != 0 X = ascontiguousarray(X[:, nz]) sigma_x = 
sigma_x[nz] X /= sigma_x m = sum(nz) # run optimization method lipschitz = self.get_lipschitz(X) l = 0.5 * self.lasso_lambda * n / m coefficients[nz] = self.fista(X, y, l, lipschitz) coefficients[nz] *= sigma_y / sigma_x d = dict(self.__dict__) d.update({'n_boot': 0, 'n_perm': 0}) # bootstrap estimator of standard error of the coefficient estimators # assumption: fixed t # assumption: fixed regularization parameter if self.n_boot > 0: coeff_b = [] # bootstrapped coefficients for i in range(self.n_boot): tmp_table = get_bootstrap_sample(table) l = LassoRegressionLearner(t=t, n_boot=0, n_perm=0) c = l(tmp_table) tmp_data = get_bootstrap_sample(data) l = LassoRegressionLearner(**d) c = l(tmp_data) coeff_b.append(c.coefficients) std_errors_fixed_t = numpy.std(coeff_b, axis=0) else: std_errors_fixed_t = [float("nan")] * m # permutation test to obtain the significance of the regression #coefficients std_errors[nz] = std(coeff_b, axis=0) # permutation test to obtain the significance of # the regression coefficients if self.n_perm > 0: coeff_p = [] for i in range(self.n_perm): tmp_table = permute_responses(table) l = LassoRegressionLearner(t=t, n_boot=0, n_perm=0) c = l(tmp_table) tmp_data = permute_responses(data) l = LassoRegressionLearner(**d) c = l(tmp_data) coeff_p.append(c.coefficients) p_vals = \ numpy.sum(abs(numpy.array(coeff_p)) > \ abs(numpy.array(coefficients)), \ axis=0) / float(self.n_perm) else: p_vals = [float("nan")] * m p_vals[nz] = (abs(coeff_p) > abs(coefficients)).sum(axis=0) p_vals[nz] /= float(self.n_perm) # dictionary of regression coefficients with standard errors # and p-values dict_model = {} model = {} for i, var in enumerate(domain.attributes): dict_model[var.name] = (coefficients[i], std_errors_fixed_t[i], p_vals[i]) model[var.name] = (coefficients[i], std_errors[i], p_vals[i]) return LassoRegression(domain=domain, class_var=domain.class_var, coef0=coef0, coefficients=coefficients, std_errors_fixed_t=std_errors_fixed_t, p_vals=p_vals, 
dict_model=dict_model, mu_x=mu_x) coef0=coef0, coefficients=coefficients, std_errors=std_errors, p_vals=p_vals, model=model, mu_x=mu_x) deprecated_members({"nBoot": "n_boot", class LassoRegression(Orange.classification.Classifier): """Lasso regression predicts value of the response variable """Lasso regression predicts the value of the response variable based on the values of independent variables. .. attribute:: coefficients Regression coefficients, stored in a list. .. attribute:: std_errors_fixed_t Standard errors of the coefficient estimator for the fixed tuning parameter t. The standard errors are estimated using bootstrapping method. Regression coefficients. .. attribute:: std_errors Standard errors of coefficient estimates for a fixed regularization parameter. The standard errors are estimated using the bootstrapping method. .. attribute:: p_vals List of p-values for the null hypothesis that the regression coefficients equal 0 based on non-parametric permutation test. .. attribute:: dict_model Statistical properties of the model stored in dictionary: List of p-values for the null hypotheses that the regression coefficients equal 0 based on a non-parametric permutation test. .. attribute:: model Dictionary with the statistical properties of the model: Keys - names of the independent variables Values - tuples (coefficient, standard error, p-value) .. attribute:: mu_x Sample mean of all the independent variables. Sample mean of independent variables. 
""" def __init__(self, domain=None, class_var=None, coef0=None, coefficients=None, std_errors_fixed_t=None, p_vals=None, dict_model=None, mu_x=None): coefficients=None, std_errors=None, p_vals=None, model=None, mu_x=None): self.domain = domain self.class_var = class_var self.coef0 = coef0 self.coefficients = coefficients self.std_errors_fixed_t = std_errors_fixed_t self.std_errors = std_errors self.p_vals = p_vals self.dict_model = dict_model self.model = model self.mu_x = mu_x def _miss_2_0(self, x): return x if x != '?' else 0 @deprecated_keywords({"resultType": "result_type"}) def __call__(self, instance, result_type=Orange.core.GetValue): """ :param instance: data instance for which the value of the response variable will be predicted :type instance: :param instance: Data instance for which the value of the response variable will be predicted. :type instance: :obj:Orange.data.Instance """ ins = Orange.data.Instance(self.domain, instance) if "?" in ins: # missing value -> corresponding coefficient omitted def miss_2_0(x): return x if x != "?" else 0 ins = map(miss_2_0, ins) ins = numpy.array(ins)[:-1] - self.mu_x if '?' in ins: # missing value -> corresponding coefficient omitted ins = map(self._miss_2_0, ins) ins = array(ins)[:-1] - self.mu_x else: ins = numpy.array(ins.native())[:-1] - self.mu_x y_hat = numpy.dot(self.coefficients, ins) + self.coef0 ins = array(ins.native())[:-1] - self.mu_x y_hat = dot(self.coefficients, ins) + self.coef0 y_hat = self.class_var(y_hat) dist = Orange.statistics.distribution.Continuous(self.class_var) dist[y_hat] = 1.0 dist[y_hat] = 1. if result_type == Orange.core.GetValue: return y_hat @deprecated_keywords({"skipZero": "skip_zero"}) def to_string(self, skip_zero=True): """Pretty-prints Lasso regression model, """Pretty-prints a lasso regression model, i.e. estimated regression coefficients with standard errors and significances. 
Standard errors are obtained using bootstrapping method and significances by the permutation test :param skip_zero: if True variables with estimated coefficient equal to 0 are omitted :type skip_zero: boolean """ from string import join and significances. Standard errors are obtained using the bootstrapping method and significances by a permutation test. :param skip_zero: If True, variables with estimated coefficient equal to 0 are omitted. :type skip_zero: bool """ labels = ('Variable', 'Coeff Est', 'Std Error', 'p') lines = [join(['%10s' % l for l in labels], ' ')] fmt = "%10s " + join(["%10.3f"] * 3, " ") + " %5s" fmt1 = "%10s %10.3f" lines = [' '.join(['%10s' % l for l in labels])] fmt = '%10s ' + ' '.join(['%10.3f'] * 3) + ' %5s' fmt1 = '%10s %10.3f' def get_star(p): if p < 0.001: return  "*"*3 elif p < 0.01: return "*"*2 elif p < 0.05: return "*" elif p < 0.1: return  "." else: return " " if p < 0.001: return  '*' * 3 elif p < 0.01: return '*' * 2 elif p < 0.05: return '*' elif p < 0.1: return  '.' else: return ' ' stars = get_star(self.p_vals[0]) stars = get_star(self.p_vals[i]) lines.append(fmt % (self.domain.attributes[i].name, self.coefficients[i], self.std_errors_fixed_t[i], self.coefficients[i], self.std_errors[i], self.p_vals[i], stars)) lines.append("Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1") lines.append("\n") lines.append('Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1\n') if skip_zero: k = len(skipped) if k == 0: lines.append("All variables have non-zero regression coefficients. 
") lines.append('All variables have non-zero regression coefficients.') else: suff = "s" if k > 1 else "" lines.append("For %d variable%s the regression coefficient equals 0: " \ % (k, suff)) for var in skipped: lines.append(var) return "\n".join(lines) lines.append('For %d variable%s the regression coefficient equals 0:' % (k, 's' if k > 1 else '')) lines.append(', '.join(var for var in skipped)) return '\n'.join(lines) def __str__(self): deprecated_members({"muX": "mu_x", "stdErrorsFixedT": "std_errors_fixed_t", "stdErrorsFixedT": "std_errors", "pVals": "p_vals", "dictModel": "dict_model"}, "dictModel": "model"}, wrap_methods=["__init__"], in_place=True)(LassoRegression) if __name__ == "__main__": import Orange table = Orange.data.Table("housing.tab") c = LassoRegressionLearner(table, t=len(table.domain)) data = Orange.data.Table('housing') c = LassoRegressionLearner(data) print c
• ## Orange/testing/regression/results_reference/lasso-example.py.txt

 r10044 Actual: 24.00, predicted: 25.81 Actual: 21.60, predicted: 24.11 Actual: 34.70, predicted: 27.29 Actual: 33.40, predicted: 27.01 Actual: 36.20, predicted: 26.77 Actual: 24.00, predicted: 30.45 Actual: 21.60, predicted: 25.60 Actual: 34.70, predicted: 31.48 Actual: 33.40, predicted: 30.18 Actual: 36.20, predicted: 29.59 Variable  Coeff Est  Std Error          p Intercept     22.533 RM      2.464      0.781      0.000   *** DIS      0.001      0.022      0.450 TAX     -0.002      0.001      0.130 PTRATIO     -0.138      0.205      0.070     . LSTAT     -0.254      0.098      0.000   *** CRIM     -0.023      0.024      0.050     . CHAS      1.970      1.331      0.040     * NOX     -4.226      2.944      0.010     * RM      4.270      0.934      0.000   *** DIS     -0.373      0.170      0.010     * PTRATIO     -0.798      0.117      0.000   *** B      0.007      0.003      0.020     * LSTAT     -0.519      0.102      0.000   *** Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1 For 8 variables the regression coefficient equals 0: CRIM ZN INDUS CHAS NOX AGE RAD B For 5 variables the regression coefficient equals 0: ZN, INDUS, AGE, RAD, TAX
• ## docs/reference/rst/Orange.regression.lasso.rst

 r10536 The Lasso _ is a shrinkage and selection method for linear regression. It minimizes the usual sum of squared errors, with a bound on the sum of the absolute values of the coefficients. The lasso _ (least absolute shrinkage and selection operator) is a regularized version of least squares regression. It minimizes the sum of squared errors while also penalizing the :math:L_1 norm (sum of absolute values) of the coefficients. To fit the regression parameters on housing data set use the following code: Concretely, the function that is minimized in Orange is: .. literalinclude:: code/lasso-example.py :lines: 9,10,11 .. math:: \frac{1}{n}\|Xw - y\|_2^2 + \frac{\lambda}{m} \|w\|_1 Where :math:X is a :math:n \times m data matrix, :math:y the vector of class values and :math:w the regression coefficients to be estimated. .. autoclass:: LassoRegressionLearner :members: :show-inheritance: .. autoclass:: LassoRegression :members: .. autoclass:: LassoRegressionLearner :members: .. autoclass:: LassoRegression :members: :show-inheritance: Utility functions ----------------- .. autofunction:: center .. autofunction:: get_bootstrap_sample ======== To predict values of the response for the first five instances use the code To fit the regression parameters on housing data set use the following code: .. literalinclude:: code/lasso-example.py :lines: 14,15 :lines: 9,10,11 Output :: Actual: 24.00, predicted: 24.58 Actual: 21.60, predicted: 23.30 Actual: 34.70, predicted: 24.98 Actual: 33.40, predicted: 24.78 Actual: 36.20, predicted: 24.66 To see the fitted regression coefficients, print the model To predict values of the response for the first five instances: .. 
literalinclude:: code/lasso-example.py :lines: 17 :lines: 15,16 The output Output:: :: Actual: 24.00, predicted: 30.45 Actual: 21.60, predicted: 25.60 Actual: 34.70, predicted: 31.48 Actual: 33.40, predicted: 30.18 Actual: 36.20, predicted: 29.59 Variable  Coeff Est  Std Error          p To see the fitted regression coefficients, print the model: .. literalinclude:: code/lasso-example.py :lines: 19 Output:: Variable  Coeff Est  Std Error          p Intercept     22.533 CRIM     -0.000      0.023      0.480 INDUS     -0.010      0.023      0.300 RM      1.303      0.994      0.000   *** AGE     -0.002      0.000      0.320 PTRATIO     -0.191      0.209      0.050     . LSTAT     -0.126      0.105      0.000   *** CRIM     -0.023      0.024      0.050     . CHAS      1.970      1.331      0.040     * NOX     -4.226      2.944      0.010     * RM      4.270      0.934      0.000   *** DIS     -0.373      0.170      0.010     * PTRATIO     -0.798      0.117      0.000   *** B      0.007      0.003      0.020     * LSTAT     -0.519      0.102      0.000   *** Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1 For 5 variables the regression coefficient equals 0: ZN, INDUS, AGE, RAD, TAX For 7 variables the regression coefficient equals 0: ZN CHAS NOX DIS RAD TAX B Note that some of the regression coefficients are equal to 0. shows that some of the regression coefficients are equal to 0.
• ## docs/reference/rst/code/lasso-example.py

 r10042 housing = Orange.data.Table("housing") learner = Orange.regression.lasso.LassoRegressionLearner() learner = Orange.regression.lasso.LassoRegressionLearner( lasso_lambda=1, n_boot=100, n_perm=100) classifier = learner(housing) # prediction for five data instances and comparison to actual values # prediction for five data instances for ins in housing[:5]: print "Actual: %3.2f, predicted: %3.2f " % (ins.get_class(), classifier(ins)) print "Actual: %3.2f, predicted: %3.2f" % ( ins.get_class(), classifier(ins)) print classifier
Note: See TracChangeset for help on using the changeset viewer.