# source:orange/Orange/regression/pls.py@9776:600ac31393ee

Revision 9776:600ac31393ee, 16.7 KB checked in by lanumek, 2 years ago (diff)

documentation for regression: lasso, linear, pls

Line
1"""\
2==========================================
Partial least squares regression (``PLS``)
4==========================================
5
6.. index:: regression
7
.. _`Partial Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10`Partial least squares
11<http://en.wikipedia.org/wiki/Partial_least_squares_regression>`_
12regression is a statistical method which can be used to predict
multiple response variables simultaneously. Implementation is based on
14`Scikit learn python implementation
15<https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py>`_.
16
17The following code shows how to fit a PLS regression model on a multi-target data set.
18
19.. literalinclude:: code/pls-example.py
20    :lines: 7,9,13,14
21
22.. autoclass:: PLSRegressionLearner
23    :members:
24
25.. autoclass:: PLSRegression
26    :members:
27
28Utility functions
29-----------------
30
31.. autofunction:: normalize_matrix
32
33.. autofunction:: nipals_xy
34
35.. autofunction:: svd_xy
36
37
38========
39Examples
40========
41
42To predict values for the first two data instances
use the following code
44
45.. literalinclude:: code/pls-example.py
46    :lines: 16-20
47
48Output
49
50::
51
52    Actual     [<orange.Value 'Y1'='0.490'>, <orange.Value 'Y2'='1.237'>, <orange.Value 'Y3'='1.808'>, <orange.Value 'Y4'='0.422'>]
53    Predicted  [<orange.Value 'Y1'='0.613'>, <orange.Value 'Y2'='0.826'>, <orange.Value 'Y3'='1.084'>, <orange.Value 'Y4'='0.534'>]
54
55    Actual     [<orange.Value 'Y1'='0.167'>, <orange.Value 'Y2'='-0.664'>, <orange.Value 'Y3'='-1.378'>, <orange.Value 'Y4'='0.589'>]
56    Predicted  [<orange.Value 'Y1'='0.058'>, <orange.Value 'Y2'='-0.706'>, <orange.Value 'Y3'='-1.420'>, <orange.Value 'Y4'='0.599'>]
57
58To see the coefficient of the model (in this case they are stored in a matrix)
59print the model
60
61.. literalinclude:: code/pls-example.py
62    :lines: 22
63
The output looks like
65
66::
67
68    Regression coefficients:
69                       Y1           Y2           Y3           Y4
70          X1        0.714        2.153        3.590       -0.078
71          X2       -0.238       -2.500       -4.797       -0.036
72          X3        0.230       -0.314       -0.880       -0.060
73
74
75"""
76
77import Orange
78import numpy
79
80from Orange.regression import base
82from numpy import dot, zeros
83from numpy import linalg
84from numpy.linalg import svd, pinv
85
86from Orange.misc import deprecated_members, deprecated_keywords
87
88
def normalize_matrix(X):
    """ Standardize the columns of a matrix: subtract each column's
    mean and divide by its standard deviation (columns with zero
    deviation are divided by 1 to avoid division by zero).
    Returns the standardized matrix together with the column means
    and the column standard deviations.

    :param X: data matrix
    :type X: :class:`numpy.array`

    """
    mu_x = numpy.mean(X, axis=0)
    sigma_x = numpy.std(X, axis=0)
    # Constant columns would trigger a division by zero; leave them
    # merely centered by treating their deviation as 1.
    sigma_x = numpy.where(sigma_x == 0, 1., sigma_x)
    return (X - mu_x) / sigma_x, mu_x, sigma_x
102
@deprecated_keywords({"maxIter": "max_iter"})
def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
    """
    NIPALS algorithm. Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    :param mode: possible values "PLS" (default) or "CCA"
    :type mode: string

    :param max_iter: maximal number of iterations (default 500)
    :type max_iter: int

    :param tol: tolerance parameter, if norm of difference
        between two successive left singular vectors is less than tol,
        iteration is stopped
    :type tol: a not negative float

    """
    yScore, uOld, ite = Y[:, [0]], 0, 1
    Xpinv = Ypinv = None
    # Inner loop of the Wold algorithm.
    while ite < max_iter:
        # Update u: the X weights
        if mode == "CCA":
            if Xpinv is None:
                Xpinv = linalg.pinv(X) # compute pinv(X) only once
            u = dot(Xpinv, yScore)
        else: # mode PLS
            # Mode PLS: regress each X column on yScore
            u = dot(X.T, yScore) / dot(yScore.T, yScore)
        # Normalize u
        u /= numpy.sqrt(dot(u.T, u))
        # Update xScore: the X latent scores
        xScore = dot(X, u)

        # Update v: the Y weights
        if mode == "CCA":
            if Ypinv is None:
                Ypinv = linalg.pinv(Y) # compute pinv(Y) only once
            v = dot(Ypinv, xScore)
        else:
            # Mode PLS: regress each Y column on xScore
            v = dot(Y.T, xScore) / dot(xScore.T, xScore)
        # Normalize v
        v /= numpy.sqrt(dot(v.T, v))
        # Update yScore: the Y latent scores
        yScore = dot(Y, v)

        # Converged when the X weights stop changing; with a single
        # response column one pass already yields the exact solution.
        uDiff = u - uOld
        if dot(uDiff.T, uDiff) < tol or Y.shape[1] == 1:
            break
        uOld = u
        ite += 1
    return u, v
160
def svd_xy(X, Y):
    """ Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    """
    # Only the leading pair of singular vectors is needed, so the
    # reduced decomposition suffices.
    left, _, right_t = svd(dot(X.T, Y), full_matrices=False)
    return left[:, :1], right_t.T[:, :1]
173
174
def select_attrs(table, attributes, class_var=None, metas=None):
    """ Select only ``attributes`` from the ``table``.

    :param table: data table to project
    :param attributes: variables to keep in the new domain
    :param class_var: optional class variable of the new domain
    :param metas: optional meta attributes to carry over
    """
    domain = Orange.data.Domain(attributes, class_var)
    if metas:
        # NOTE(review): the body of this branch was missing from this
        # capture; restored so requested metas are registered on the
        # new domain -- confirm against the repository original.
        domain.add_metas(metas)
    return Orange.data.Table(domain, table)
182
183
class PLSRegressionLearner(base.BaseRegressionLearner):
    """ Fits the partial least squares regression model,
    i.e. learns the regression parameters. The implementation is based on
    `Scikit learn python implementation`_

    The class is derived from
    :class:`Orange.regression.base.BaseRegressionLearner`
    which is used for preprocessing the data (continuization and imputation)
    before fitting the regression parameters

    """

    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
                 algorithm="nipals", max_iter=500,
                 imputer=None, continuizer=None,
                 **kwds):
        """
        .. attribute:: n_comp

            number of components to keep. Default: 2

        .. attribute:: deflation_mode

            "canonical" or "regression" (default)

        .. attribute:: mode

            "CCA" or "PLS" (default)

        .. attribute:: algorithm

            The algorithm used to estimate the weights:
            "nipals" or "svd" (default)

        .. attribute:: max_iter

            maximal number of iterations of the NIPALS inner loop
            (default 500)

        """
        self.n_comp = n_comp
        self.deflation_mode = deflation_mode
        self.mode = mode
        self.algorithm = algorithm
        self.max_iter = max_iter
        self.set_imputer(imputer=imputer)
        self.set_continuizer(continuizer=continuizer)
        self.__dict__.update(kwds)

    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
        """
        :param table: data instances.
        :type table: :class:`Orange.data.Table`

        :param x_vars, y_vars: List of input and response variables
            (:obj:`Orange.data.variable.Continuous` or
            :obj:`Orange.data.variable.Discrete`). If None (default) it is
            assumed that the data domain provides information which variables
            are responses and which are not. If data has
            :obj:`~Orange.data.Domain.class_var` defined in its domain, a
            single-target regression learner is constructed. Otherwise a
            multi-target learner predicting response variables defined by
            :obj:`~Orange.data.Domain.class_vars` is constructed.
        :type x_vars, y_vars: list

        """
        domain = table.domain
        if x_vars is None and y_vars is None:
            # Response variables are defined in the table.
            x_vars = domain.features
            if domain.class_var:
                y_vars = [domain.class_var]
            elif domain.class_vars:
                y_vars = domain.class_vars
            else:
                raise TypeError('Class-less domain (x-vars and y-vars needed).')
        elif not (x_vars and y_vars):
            raise ValueError("Both x_vars and y_vars must be defined.")

        # Select once for both branches (the original redundantly
        # selected the tables inside the first branch as well).
        x_table = select_attrs(table, x_vars)
        y_table = select_attrs(table, y_vars)

        # discrete values are continuized
        x_table = self.continuize_table(x_table)
        y_table = self.continuize_table(y_table)
        # missing values are imputed
        x_table = self.impute_table(x_table)
        y_table = self.impute_table(y_table)

        # Collect the new transformed x_vars/y_vars
        x_vars = list(x_table.domain.variables)
        y_vars = list(y_table.domain.variables)

        domain = Orange.data.Domain(x_vars + y_vars, False)
        multitarget = len(y_vars) > 1

        x = x_table.to_numpy()[0]
        y = y_table.to_numpy()[0]

        kwargs = self.fit(x, y)
        return PLSRegression(domain=domain, x_vars=x_vars, y_vars=y_vars,
                             multitarget=multitarget, **kwargs)

    def fit(self, X, Y):
        """ Fits all unknown parameters of the PLS model: scores,
        weights, loadings, rotations and regression coefficients.
        Returns a dict with all of the parameters.

        :param X: predictor data matrix (n x p)
        :param Y: response data matrix (n x q) or a vector of length n

        """
        # Copy, since X and Y will hold the residual (deflated) matrices.
        X, Y = X.copy(), Y.copy()
        if Y.ndim == 1:
            Y = Y.reshape((Y.size, 1))
        n, p = X.shape
        q = Y.shape[1]

        # normalization of data matrices
        X, muX, sigmaX = normalize_matrix(X)
        Y, muY, sigmaY = normalize_matrix(Y)
        # Residuals (deflated) matrices
        Xk, Yk = X, Y
        # Results matrices
        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))
        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))
        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))

        # NIPALS over components
        for k in xrange(self.n_comp):
            # Weights estimation (inner loop)
            if self.algorithm == "nipals":
                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode,
                                 max_iter=self.max_iter)
            elif self.algorithm == "svd":
                u, v = svd_xy(X=Xk, Y=Yk)
            else:
                # Original code silently fell through to a NameError;
                # fail with an explicit message instead.
                raise ValueError("Unknown weight estimation algorithm: %r"
                                 % self.algorithm)
            # compute scores
            xScore, yScore = dot(Xk, u), dot(Yk, v)
            # Deflation (in place)
            # NOTE(review): the deflation statements below were lost in
            # this capture and have been restored following the
            # scikit-learn NIPALS implementation the module docstring
            # cites -- confirm against the repository original.
            # - regress Xk's columns on xScore
            xLoadings = dot(Xk.T, xScore) / dot(xScore.T, xScore)
            # - subtract rank-one approximation to obtain remainder matrix
            Xk -= dot(xScore, xLoadings.T)
            if self.deflation_mode == "canonical":
                # - regress Yk's columns on yScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, yScore) / dot(yScore.T, yScore)
                Yk -= dot(yScore, yLoadings.T)
            if self.deflation_mode == "regression":
                # - regress Yk's columns on xScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, xScore) / dot(xScore.T, xScore)
                Yk -= dot(xScore, yLoadings.T)
            T[:, k] = xScore.ravel() # x-scores
            U[:, k] = yScore.ravel() # y-scores
            W[:, k] = u.ravel() # x-weights
            C[:, k] = v.ravel() # y-weights
            P[:, k] = xLoadings.ravel() # x-loadings
            Q[:, k] = yLoadings.ravel() # y-loadings
        # X = TP' + E and Y = UQ' + E

        # Rotations from input space to transformed space (scores)
        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
        # U = Y C(Q'C)^-1 = YC* (C* : q x k matrix)
        xRotations = dot(W, pinv(dot(P.T, W)))
        if Y.shape[1] > 1:
            yRotations = dot(C, pinv(dot(Q.T, C)))
        else:
            yRotations = numpy.ones(1)

        # Estimate regression coefficients (meaningful for the
        # "regression" deflation mode; computed unconditionally, as in
        # the original, so canonical-mode models still get coefs).
        # Y = TQ' + E = X W(P'W)^-1 Q' + E = XB + E  =>  B = W*Q' (p x q)
        coefs = dot(xRotations, Q.T)
        # Undo the normalization so coefs apply to the raw variables.
        coefs = 1. / sigmaX.reshape((p, 1)) * \
                coefs * sigmaY

        # Bug fix: the original mapped "W" to U; W is the x-weights matrix.
        return {"mu_x": muX, "mu_y": muY, "sigma_x": sigmaX,
                "sigma_y": sigmaY, "T": T, "U": U, "W": W,
                "C": C, "P": P, "Q": Q, "x_rotations": xRotations,
                "y_rotations": yRotations, "coefs": coefs}
364
# Accept the old camelCase attribute/keyword names on
# PLSRegressionLearner for backward compatibility.
deprecated_members({"nComp": "n_comp",
                    "deflationMode": "deflation_mode",
                    "maxIter": "max_iter"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegressionLearner)
370
class PLSRegression(Orange.classification.Classifier):
    """ PLSRegression predicts value of the response variables
    based on the values of independent variables.

    Basic notations:
    n - number of data instances
    p - number of independent variables
    q - number of response variables

    .. attribute:: T

        A n x n_comp numpy array of x-scores

    .. attribute:: U

        A n x n_comp numpy array of y-scores

    .. attribute:: W

        A p x n_comp numpy array of x-weights

    .. attribute:: C

        A q x n_comp numpy array of y-weights

    .. attribute:: P

        A p x n_comp numpy array of x-loadings

    .. attribute:: Q

        A q x n_comp numpy array of y-loadings

    .. attribute:: coefs

        A p x q numpy array coefficients
        of the linear model: Y = X coefs + E

    .. attribute:: x_vars

        Predictor variables

    .. attribute:: y_vars

        Response variables

    """
    def __init__(self, domain=None, multitarget=False, coefs=None, sigma_x=None, sigma_y=None,
                 mu_x=None, mu_y=None, x_vars=None, y_vars=None, **kwargs):
        self.domain = domain
        self.multitarget = multitarget
        self.coefs = coefs
        self.mu_x, self.mu_y = mu_x, mu_y
        self.sigma_x, self.sigma_y = sigma_x, sigma_y
        self.x_vars, self.y_vars = x_vars, y_vars

        # Remaining fitted parameters (T, U, W, C, P, Q, rotations, ...)
        # are passed through as keyword arguments by the learner.
        for name, val in kwargs.items():
            setattr(self, name, val)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        :param instance: data instance for which the value of the response
            variable will be predicted
        :type instance: :class:`Orange.data.Instance`

        """
        # Convert into the model's domain so the same continuization /
        # imputation applied at training time is applied here.
        instance = Orange.data.Instance(self.domain, instance)
        ins = [instance[v].native() for v in self.x_vars]

        if "?" in ins: # missing value -> corresponding coefficient omitted
            def miss_2_0(x): return x if x != "?" else 0
            ins = map(miss_2_0, ins)
        ins = numpy.array(ins)
        # Linear model on centered inputs: y = (x - mu_x) coefs + mu_y
        xc = (ins - self.mu_x)
        predicted = dot(xc, self.coefs) + self.mu_y
        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
        if result_type == Orange.core.GetValue:
            return y_hat if self.multitarget else y_hat[0]
        else:
            from Orange.statistics.distribution import Distribution
            # Degenerate distributions: all probability mass on the
            # predicted value of each response variable.
            probs = []
            for var, val in zip(self.y_vars, y_hat):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)
            if result_type == Orange.core.GetBoth:
                return (y_hat, probs) if self.multitarget else (y_hat[0], probs[0])
            else:
                return probs if self.multitarget else probs[0]

    def to_string(self):
        """ Pretty-prints the coefficient of the PLS regression model.
        """
        # Header row of response names followed by one row of
        # coefficients per predictor variable.
        x_vars, y_vars = [x.name for x in self.x_vars], [y.name for y in self.y_vars]
        fmt = "%8s " + "%12.3f " * len(y_vars)
        first = [" " * 8 + "%13s" * len(y_vars) % tuple(y_vars)]
        lines = [fmt % tuple([x_vars[i]] + list(coef))
                 for i, coef in enumerate(self.coefs)]
        return '\n'.join(first + lines)

    def __str__(self):
        return self.to_string()

    # NOTE(review): dead code kept as a string literal; it references
    # deprecated attribute names (muX, sigmaX, xRotations) and would
    # need updating before being re-enabled.
    """
    def transform(self, X, Y=None):

        # Normalize
        Xc = (X - self.muX) / self.sigmaX
        if Y is not None:
            Yc = (Y - self.muY) / self.sigmaY
        # Apply rotation
        xScores = dot(Xc, self.xRotations)
        if Y is not None:
            yScores = dot(Yc, self.yRotations)
            return xScores, yScores

        return xScores
    """
489
# Accept the old camelCase attribute names on PLSRegression for
# backward compatibility.
deprecated_members({"xVars": "x_vars",
                    "yVars": "y_vars",
                    "muX": "mu_x",
                    "muY": "mu_y",
                    "sigmaX": "sigma_x",
                    "sigmaY": "sigma_y"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegression)
498
if __name__ == "__main__":
    # Smoke test: fit a multi-target PLS model on a bundled data set
    # and print the fitted regression coefficients (Python 2 syntax).

    import Orange
    from Orange.regression import pls

    data = Orange.data.Table("multitarget-synthetic")
    l = pls.PLSRegressionLearner()

    x = data.domain.features
    y = data.domain.class_vars
    print x, y
    # c = l(data, x_vars=x, y_vars=y)
    c = l(data)
    print c
Note: See TracBrowser for help on using the repository browser.