source: orange/Orange/regression/lasso.py @ 10580:c4cbae8dcf8b

Revision 10580:c4cbae8dcf8b, 12.0 KB checked in by markotoplak, 2 years ago (diff)

Moved deprecation functions, progress bar support and environ into Orange.utils. Orange imports cleanly, although it is not tested yet.

Line 
1import Orange
2import numpy
3
4from Orange.regression import base
5
6from Orange.utils import deprecated_members, deprecated_keywords
7
8def center(X):
9    """Centers the data, i.e. subtracts the column means.
10    Returns the centered data and the mean.
11
12    :param X: the data arry
13    :type table: :class:`numpy.array`
14    """
15    mu = X.mean(axis=0)
16    return X - mu, mu
17
18def standardize(X):
19    """Standardizes the data, i.e. subtracts the column means and divide by
20    standard deviation.
21    Returns the centered data, the mean, and deviations.
22
23    :param X: the data arry
24    :type table: :class:`numpy.array`
25    """
26    mu = numpy.mean(X, axis=0)
27    std = numpy.std(X, axis=0)
28    return (X - mu) / std, mu, std
29
30def get_bootstrap_sample(table):
31    """Generates boostrap sample from an Orange Example Table
32    and stores it in a new :class:`Orange.data.Table` object
33
34    :param table: the original data sample
35    :type table: :class:`Orange.data.Table`
36    """
37    n = len(table)
38    bootTable = Orange.data.Table(table.domain)
39    for i in range(n):
40        id = numpy.random.randint(0, n)
41        bootTable.append(table[id])
42    return bootTable
43
44def permute_responses(table):
45    """ Permutes values of the class (response) variable.
46    The independence between independent variables and the response
47    is obtained but the distribution of the response variable is kept.
48
49    :param table: the original data sample
50    :type table: :class:`Orange.data.Table`
51    """
52    n = len(table)
53    perm = numpy.random.permutation(n)
54    permTable = Orange.data.Table(table.domain, table)
55    for i, ins in enumerate(table):
56        permTable[i].set_class(table[perm[i]].get_class())
57    return permTable
58
59class LassoRegressionLearner(base.BaseRegressionLearner):
60    """Fits the lasso regression model, i.e. learns the regression parameters
61    The class is derived from
62    :class:`Orange.regression.base.BaseRegressionLearner`
63    which is used for preprocessing the data (continuization and imputation)
64    before fitting the regression parameters
65
66    """
67
68    def __init__(self, name='lasso regression', t=1, s=None, tol=0.001, \
69                 n_boot=100, n_perm=100, imputer=None, continuizer=None):
70        """
71        :param name: name of the linear model, default 'lasso regression'
72        :type name: string
73       
74        :param t: tuning parameter, upper bound for the L1-norm of the
75            regression coefficients
76        :type t: float
77       
78        :param s: An alternative way to specify the tuning parameter ``t``.
79            Here ``t`` is taken to be t = s * sum(abs(B)) where B are the
80            coefficients of an ordinary least square linear fit. ``t`` parameter is ignored if ``s`` is specified (by default it
81            is None).
82        :type s: float
83       
84        :param tol: tolerance parameter, regression coefficients
85            (absoulute value) under tol are set to 0,
86            default=0.001
87        :type tol: float
88       
89        :param n_boot: number of bootstrap samples used for non-parametric
90            estimation of standard errors
91        :type n_boot: int
92       
93        :param n_perm: number of permuations used for non-parametric
94            estimation of p-values
95        :type n_perm: int
96       
97        """
98
99        self.name = name
100        self.t = t
101        self.s = s
102        self.tol = tol
103        self.n_boot = n_boot
104        self.n_perm = n_perm
105        self.set_imputer(imputer=imputer)
106        self.set_continuizer(continuizer=continuizer)
107
108
109    def __call__(self, table, weight=None):
110        """
111        :param table: data instances.
112        :type table: :class:`Orange.data.Table`
113        :param weight: the weights for instances. Default: None, i.e.
114            all data instances are eqaully important in fitting
115            the regression parameters
116        :type weight: None or list of Orange.feature.Continuous
117            which stores weights for instances
118       
119        """
120        # dicrete values are continuized       
121        table = self.continuize_table(table)
122        # missing values are imputed
123        table = self.impute_table(table)
124
125        domain = table.domain
126        X, y, w = table.to_numpy()
127        n, m = numpy.shape(X)
128
129        X, mu_x, sigma_x = standardize(X)
130        y, coef0 = center(y)
131
132        t = self.t
133
134        if self.s is not None:
135            beta_full, rss, _, _ = numpy.linalg.lstsq(X, y)
136            t = self.s * numpy.sum(numpy.abs(beta_full))
137            print "t =", t
138
139        import scipy.optimize
140
141        # objective function to be minimized
142        objective = lambda beta: numpy.linalg.norm(y - numpy.dot(X, beta))
143        # initial guess for the regression parameters
144        beta_init = numpy.random.random(m)
145        # constraints for the regression coefficients
146        cnstr = lambda beta: t - numpy.sum(numpy.abs(beta))
147        # optimal solution
148        coefficients = scipy.optimize.fmin_cobyla(objective, beta_init, cnstr, iprint=0)
149
150        # set small coefficients to 0
151        def set_2_0(c): return c if abs(c) > self.tol else 0
152        coefficients = numpy.array(map(set_2_0, coefficients))
153        coefficients /= sigma_x
154
155        # bootstrap estimator of standard error of the coefficient estimators
156        # assumption: fixed t
157        if self.n_boot > 0:
158            coeff_b = [] # bootstrapped coefficients
159            for i in range(self.n_boot):
160                tmp_table = get_bootstrap_sample(table)
161                l = LassoRegressionLearner(t=t, n_boot=0, n_perm=0)
162                c = l(tmp_table)
163                coeff_b.append(c.coefficients)
164            std_errors_fixed_t = numpy.std(coeff_b, axis=0)
165        else:
166            std_errors_fixed_t = [float("nan")] * m
167
168        # permutation test to obtain the significance of the regression
169        #coefficients
170        if self.n_perm > 0:
171            coeff_p = []
172            for i in range(self.n_perm):
173                tmp_table = permute_responses(table)
174                l = LassoRegressionLearner(t=t, n_boot=0, n_perm=0)
175                c = l(tmp_table)
176                coeff_p.append(c.coefficients)
177            p_vals = \
178                   numpy.sum(abs(numpy.array(coeff_p)) > \
179                             abs(numpy.array(coefficients)), \
180                             axis=0) / float(self.n_perm)
181        else:
182            p_vals = [float("nan")] * m
183
184        # dictionary of regression coefficients with standard errors
185        # and p-values
186        dict_model = {}
187        for i, var in enumerate(domain.attributes):
188            dict_model[var.name] = (coefficients[i], std_errors_fixed_t[i], p_vals[i])
189
190        return LassoRegression(domain=domain, class_var=domain.class_var,
191                               coef0=coef0, coefficients=coefficients,
192                               std_errors_fixed_t=std_errors_fixed_t,
193                               p_vals=p_vals,
194                               dict_model=dict_model,
195                               mu_x=mu_x)
196
197deprecated_members({"nBoot": "n_boot",
198                    "nPerm": "n_perm"},
199                   wrap_methods=["__init__"],
200                   in_place=True)(LassoRegressionLearner)
201
202class LassoRegression(Orange.classification.Classifier):
203    """Lasso regression predicts value of the response variable
204    based on the values of independent variables.
205
206    .. attribute:: coef0
207
208        Intercept (sample mean of the response variable).   
209
210    .. attribute:: coefficients
211
212        Regression coefficients, sotred in list.
213
214    .. attribute:: std_errors_fixed_t
215
216        Standard errors of the coefficient estimator for the fixed
217        tuning parameter t. The standard errors are estimated using
218        bootstrapping method.
219
220    .. attribute:: p_vals
221
222        List of p-values for the null hypothesis that the regression
223        coefficients equal 0 based on non-parametric permutation test.
224
225    .. attribute:: dict_model
226
227        Statistical properties of the model stored in dictionary:
228        Keys - names of the independent variables
229        Values - tuples (coefficient, standard error, p-value)
230
231    .. attribute:: mu_x
232
233        Sample mean of the all independent variables.   
234
235    """
236    def __init__(self, domain=None, class_var=None, coef0=None,
237                 coefficients=None, std_errors_fixed_t=None, p_vals=None,
238                 dict_model=None, mu_x=None):
239        self.domain = domain
240        self.class_var = class_var
241        self.coef0 = coef0
242        self.coefficients = coefficients
243        self.std_errors_fixed_t = std_errors_fixed_t
244        self.p_vals = p_vals
245        self.dict_model = dict_model
246        self.mu_x = mu_x
247
248    @deprecated_keywords({"resultType": "result_type"})
249    def __call__(self, instance, result_type=Orange.core.GetValue):
250        """
251        :param instance: data instance for which the value of the response
252            variable will be predicted
253        :type instance:
254        """
255        ins = Orange.data.Instance(self.domain, instance)
256        if "?" in ins: # missing value -> corresponding coefficient omitted
257            def miss_2_0(x): return x if x != "?" else 0
258            ins = map(miss_2_0, ins)
259            ins = numpy.array(ins)[:-1] - self.mu_x
260        else:
261            ins = numpy.array(ins.native())[:-1] - self.mu_x
262
263        y_hat = numpy.dot(self.coefficients, ins) + self.coef0
264        y_hat = self.class_var(y_hat)
265        dist = Orange.statistics.distribution.Continuous(self.class_var)
266        dist[y_hat] = 1.0
267        if result_type == Orange.core.GetValue:
268            return y_hat
269        if result_type == Orange.core.GetProbabilities:
270            return dist
271        else:
272            return (y_hat, dist)
273
274    @deprecated_keywords({"skipZero": "skip_zero"})
275    def to_string(self, skip_zero=True):
276        """Pretty-prints Lasso regression model,
277        i.e. estimated regression coefficients with standard errors
278        and significances. Standard errors are obtained using bootstrapping
279        method and significances by the permuation test
280
281        :param skip_zero: if True variables with estimated coefficient equal to 0
282            are omitted
283        :type skip_zero: boolean
284        """
285
286        from string import join
287        labels = ('Variable', 'Coeff Est', 'Std Error', 'p')
288        lines = [join(['%10s' % l for l in labels], ' ')]
289
290        fmt = "%10s " + join(["%10.3f"] * 3, " ") + " %5s"
291        fmt1 = "%10s %10.3f"
292
293        def get_star(p):
294            if p < 0.001: return  "*"*3
295            elif p < 0.01: return "*"*2
296            elif p < 0.05: return "*"
297            elif p < 0.1: return  "."
298            else: return " "
299
300        stars = get_star(self.p_vals[0])
301        lines.append(fmt1 % ('Intercept', self.coef0))
302        skipped = []
303        for i in range(len(self.domain.attributes)):
304            if self.coefficients[i] == 0. and skip_zero:
305                skipped.append(self.domain.attributes[i].name)
306                continue
307            stars = get_star(self.p_vals[i])
308            lines.append(fmt % (self.domain.attributes[i].name,
309                         self.coefficients[i], self.std_errors_fixed_t[i],
310                         self.p_vals[i], stars))
311        lines.append("Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1 empty 1")
312        lines.append("\n")
313        if skip_zero:
314            k = len(skipped)
315            if k == 0:
316                lines.append("All variables have non-zero regression coefficients. ")
317            else:
318                suff = "s" if k > 1 else ""
319                lines.append("For %d variable%s the regression coefficient equals 0: " \
320                      % (k, suff))
321                for var in skipped:
322                    lines.append(var)
323        return "\n".join(lines)
324
325    def __str__(self):
326        return self.to_string(skip_zero=True)
327
328deprecated_members({"muX": "mu_x",
329                    "stdErrorsFixedT": "std_errors_fixed_t",
330                    "pVals": "p_vals",
331                    "dictModel": "dict_model"},
332                   wrap_methods=["__init__"],
333                   in_place=True)(LassoRegression)
334
335if __name__ == "__main__":
336
337    import Orange
338
339    table = Orange.data.Table("housing.tab")
340
341    c = LassoRegressionLearner(table, t=len(table.domain))
342    print c
Note: See TracBrowser for help on using the repository browser.