source: orange/Orange/regression/pls.py @ 10294:c9f3541fa3f9

Revision 10294:c9f3541fa3f9, 16.7 KB checked in by markotoplak, 2 years ago (diff)

Fixed the level of some titles in regression section.

Line 
1"""\
2##########################################
3Partial least squares regression (``PLS``)
4##########################################
5
6.. index:: regression
7
8.. _`Partial Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10`Partial least squares
11<http://en.wikipedia.org/wiki/Partial_least_squares_regression>`_
12regression is a statistical method which can be used to predict
13multiple response variables simultaneously. Implementation is based on
14`Scikit learn python implementation
15<https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py>`_.
16
17The following code shows how to fit a PLS regression model on a multi-target data set.
18
19.. literalinclude:: code/pls-example.py
20    :lines: 7,9,13,14
21
22.. autoclass:: PLSRegressionLearner
23    :members:
24
25.. autoclass:: PLSRegression
26    :members:
27
28Utility functions
29-----------------
30
31.. autofunction:: normalize_matrix
32
33.. autofunction:: nipals_xy
34
35.. autofunction:: svd_xy
36
37
38========
39Examples
40========
41
42To predict values for the first two data instances
43use the following code
44
45.. literalinclude:: code/pls-example.py
46    :lines: 16-20
47
48Output
49
50::
51
52    Actual     [<orange.Value 'Y1'='0.490'>, <orange.Value 'Y2'='1.237'>, <orange.Value 'Y3'='1.808'>, <orange.Value 'Y4'='0.422'>]
53    Predicted  [<orange.Value 'Y1'='0.613'>, <orange.Value 'Y2'='0.826'>, <orange.Value 'Y3'='1.084'>, <orange.Value 'Y4'='0.534'>]
54
55    Actual     [<orange.Value 'Y1'='0.167'>, <orange.Value 'Y2'='-0.664'>, <orange.Value 'Y3'='-1.378'>, <orange.Value 'Y4'='0.589'>]
56    Predicted  [<orange.Value 'Y1'='0.058'>, <orange.Value 'Y2'='-0.706'>, <orange.Value 'Y3'='-1.420'>, <orange.Value 'Y4'='0.599'>]
57
58To see the coefficients of the model (in this case they are stored in a matrix)
59print the model
60
61.. literalinclude:: code/pls-example.py
62    :lines: 22
63
64The output looks like
65
66::
67
68    Regression coefficients:
69                       Y1           Y2           Y3           Y4
70          X1        0.714        2.153        3.590       -0.078
71          X2       -0.238       -2.500       -4.797       -0.036
72          X3        0.230       -0.314       -0.880       -0.060
73
74
75"""
76
77import Orange
78import numpy
79
80from Orange.regression import base
81from Orange.regression.earth import data_label_mask
82from numpy import dot, zeros
83from numpy import linalg
84from numpy.linalg import svd, pinv
85
86from Orange.misc import deprecated_members, deprecated_keywords
87
88
def normalize_matrix(X):
    """ Normalizes matrix, i.e. subtracts column means
    and divides them by column standard deviations.
    Returns the standardized matrix, sample mean and
    standard deviation

    :param X: data matrix
    :type X: :class:`numpy.array`

    """
    mean = numpy.mean(X, axis=0)
    std = numpy.std(X, axis=0)
    # constant columns get unit scale so the division is well defined
    std = numpy.where(std == 0, 1., std)
    return (X - mean) / std, mean, std
102
@deprecated_keywords({"maxIter": "max_iter"})
def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
    """
    NIPALS algorithm. Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    :param mode: possible values "PLS" (default) or "CCA"
    :type mode: string

    :param max_iter: maximal number of iterations (default 500)
    :type max_iter: int

    :param tol: tolerance parameter, if norm of difference
        between two successive left singular vectors is less than tol,
        iteration is stopped
    :type tol: a not negative float

    """
    yScore, uOld, ite = Y[:, [0]], 0, 1
    Xpinv = Ypinv = None
    # Inner loop of the Wold algorithm: alternate between the X and Y
    # weight updates until the X weights converge.
    # (Fixed: dropped the redundant "True and" from the loop condition.)
    while ite < max_iter:
        # Update u: the X weights
        if mode == "CCA":
            if Xpinv is None:
                Xpinv = linalg.pinv(X) # compute pinv(X) only once
            u = dot(Xpinv, yScore)
        else:
            # Mode PLS: regress each X column on yScore
            u = dot(X.T, yScore) / dot(yScore.T, yScore)
        # Normalize u
        u /= numpy.sqrt(dot(u.T, u))
        # Update xScore: the X latent scores
        xScore = dot(X, u)

        # Update v: the Y weights
        if mode == "CCA":
            if Ypinv is None:
                Ypinv = linalg.pinv(Y) # compute pinv(Y) only once
            v = dot(Ypinv, xScore)
        else:
            # Mode PLS: regress each Y column on xScore
            v = dot(Y.T, xScore) / dot(xScore.T, xScore)
        # Normalize v
        v /= numpy.sqrt(dot(v.T, v))
        # Update yScore: the Y latent scores
        yScore = dot(Y, v)

        # Stop when u has converged; with a univariate Y a single pass
        # suffices, so break immediately in that case.
        uDiff = u - uOld
        if dot(uDiff.T, uDiff) < tol or Y.shape[1] == 1:
            break
        uOld = u
        ite += 1
    return u, v
160
def svd_xy(X, Y):
    """ Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    """
    # Full SVD of the cross-product matrix; keep only the leading pair.
    left, _, right_t = svd(dot(X.T, Y), full_matrices=False)
    return left[:, [0]], right_t.T[:, [0]]
173
174
def select_attrs(table, attributes, class_var=None, metas=None):
    """ Select only ``attributes`` from the ``table``.
    """
    reduced = Orange.data.Domain(attributes, class_var)
    if metas:
        reduced.add_metas(metas)
    return Orange.data.Table(reduced, table)
182
183
class PLSRegressionLearner(base.BaseRegressionLearner):
    """ Fits the partial least squares regression model,
    i.e. learns the regression parameters. The implementation is based on
    `Scikit learn python implementation`_

    The class is derived from
    :class:`Orange.regression.base.BaseRegressionLearner`
    which is used for preprocessing the data (continuization and imputation)
    before fitting the regression parameters

    """

    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
                 algorithm="nipals", max_iter=500,
                 imputer=None, continuizer=None,
                 **kwds):
        """
        .. attribute:: n_comp

            number of components to keep. Default: 2

        .. attribute:: deflation_mode

            "canonical" or "regression" (default)

        .. attribute:: mode

            "CCA" or "PLS" (default)

        .. attribute:: algorithm

            The algorithm used to estimate the weights:
            "nipals" or "svd" (default)

        """
        self.n_comp = n_comp
        self.deflation_mode = deflation_mode
        self.mode = mode
        self.algorithm = algorithm
        self.max_iter = max_iter
        self.set_imputer(imputer=imputer)
        self.set_continuizer(continuizer=continuizer)
        self.__dict__.update(kwds)

    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
        """
        :param table: data instances.
        :type table: :class:`Orange.data.Table`

        :param x_vars, y_vars: List of input and response variables
            (:obj:`Orange.feature.Continuous` or
            :obj:`Orange.feature.Discrete`). If None (default) it is
            assumed that the data domain provides information which variables
            are responses and which are not. If data has
            :obj:`~Orange.data.Domain.class_var` defined in its domain, a
            single-target regression learner is constructed. Otherwise a
            multi-target learner predicting response variables defined by
            :obj:`~Orange.data.Domain.class_vars` is constructed.
        :type x_vars, y_vars: list

        """
        domain = table.domain
        if x_vars is None and y_vars is None:
            # Response variables are defined in the table.
            x_vars = domain.features
            if domain.class_var:
                y_vars = [domain.class_var]
            elif domain.class_vars:
                y_vars = domain.class_vars
            else:
                raise TypeError('Class-less domain (x-vars and y-vars needed).')
        elif not (x_vars and y_vars):
            raise ValueError("Both x_vars and y_vars must be defined.")

        # (Removed a duplicated pair of select_attrs calls that built the
        # same tables twice inside the branch above.)
        x_table = select_attrs(table, x_vars)
        y_table = select_attrs(table, y_vars)

        # discrete values are continuized
        x_table = self.continuize_table(x_table)
        y_table = self.continuize_table(y_table)
        # missing values are imputed
        x_table = self.impute_table(x_table)
        y_table = self.impute_table(y_table)

        # Collect the new transformed x_vars/y_vars
        x_vars = list(x_table.domain.variables)
        y_vars = list(y_table.domain.variables)

        domain = Orange.data.Domain(x_vars + y_vars, False)
        multitarget = len(y_vars) > 1

        x = x_table.to_numpy()[0]
        y = y_table.to_numpy()[0]

        kwargs = self.fit(x, y)
        return PLSRegression(domain=domain, x_vars=x_vars, y_vars=y_vars,
                             multitarget=multitarget, **kwargs)

    def fit(self, X, Y):
        """ Fits all unknown parameters, i.e.
        weights, scores, loadings (for x and y) and regression coefficients.
        Returns a dict with all of the parameters.

        """
        # copy since this will contain the residuals (deflated) matrices
        X, Y = X.copy(), Y.copy()
        if Y.ndim == 1:
            Y = Y.reshape((Y.size, 1))
        n, p = X.shape
        q = Y.shape[1]

        # normalization of data matrices
        X, muX, sigmaX = normalize_matrix(X)
        Y, muY, sigmaY = normalize_matrix(Y)
        # Residuals (deflated) matrices
        Xk, Yk = X, Y
        # Results matrices
        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))
        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))
        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))

        # NIPALS over components
        for k in xrange(self.n_comp):
            # Weights estimation (inner loop)
            if self.algorithm == "nipals":
                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode,
                                 max_iter=self.max_iter)
            elif self.algorithm == "svd":
                u, v = svd_xy(X=Xk, Y=Yk)
            # compute scores
            xScore, yScore = dot(Xk, u), dot(Yk, v)
            # Deflation (in place)
            # - regress Xk's on xScore
            xLoadings = dot(Xk.T, xScore) / dot(xScore.T, xScore)
            # - subtract rank-one approximations to obtain remainder matrix
            Xk -= dot(xScore, xLoadings.T)
            if self.deflation_mode == "canonical":
                # - regress Yk's on yScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, yScore) / dot(yScore.T, yScore)
                Yk -= dot(yScore, yLoadings.T)
            if self.deflation_mode == "regression":
                # - regress Yk's on xScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, xScore) / dot(xScore.T, xScore)
                Yk -= dot(xScore, yLoadings.T)
            # Store weights, scores and loadings
            T[:, k] = xScore.ravel() # x-scores
            U[:, k] = yScore.ravel() # y-scores
            W[:, k] = u.ravel() # x-weights
            C[:, k] = v.ravel() # y-weights
            P[:, k] = xLoadings.ravel() # x-loadings
            Q[:, k] = yLoadings.ravel() # y-loadings
        # X = TP' + E and Y = UQ' + E

        # Rotations from input space to transformed space (scores)
        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
        # U = Y C(Q'C)^-1 = YC* (C* : q x k matrix)
        xRotations = dot(W, pinv(dot(P.T, W)))
        if Y.shape[1] > 1:
            yRotations = dot(C, pinv(dot(Q.T, C)))
        else:
            yRotations = numpy.ones(1)

        # Estimate regression coefficients (always computed; the previous
        # "if True or ..." guard was a no-op and has been removed).
        # Y = TQ' + E = X W(P'W)^-1Q' + E = XB + E  =>  B = W*Q' (p x q)
        coefs = dot(xRotations, Q.T)
        # Undo the normalization so coefs apply to the original units.
        coefs = 1. / sigmaX.reshape((p, 1)) * \
                coefs * sigmaY

        # BUG FIX: the "W" entry previously mapped to U (the y-scores)
        # instead of the x-weights matrix W documented in PLSRegression.
        return {"mu_x": muX, "mu_y": muY, "sigma_x": sigmaX,
                "sigma_y": sigmaY, "T": T, "U": U, "W": W,
                "C": C, "P": P, "Q": Q, "x_rotations": xRotations,
                "y_rotations": yRotations, "coefs": coefs}
364
# Backward-compatibility shim: expose the old camelCase attribute names
# (nComp, deflationMode, maxIter) on PLSRegressionLearner, wrapping
# __init__ so the deprecated keywords still work.
deprecated_members({"nComp": "n_comp",
                    "deflationMode": "deflation_mode",
                    "maxIter": "max_iter"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegressionLearner)
370
class PLSRegression(Orange.classification.Classifier):
    """ PLSRegression predicts value of the response variables
    based on the values of independent variables.

    Basic notations:
    n - number of data instances
    p - number of independent variables
    q - number of response variables

    .. attribute:: T

        A n x n_comp numpy array of x-scores

    .. attribute:: U

        A n x n_comp numpy array of y-scores

    .. attribute:: W

        A p x n_comp numpy array of x-weights

    .. attribute:: C

        A q x n_comp numpy array of y-weights

    .. attribute:: P

        A p x n_comp numpy array of x-loadings

    .. attribute:: Q

        A q x n_comp numpy array of y-loadings

    .. attribute:: coefs

        A p x q numpy array coefficients
        of the linear model: Y = X coefs + E

    .. attribute:: x_vars

        Predictor variables

    .. attribute:: y_vars

        Response variables

    """
    def __init__(self, domain=None, multitarget=False, coefs=None,
                 sigma_x=None, sigma_y=None, mu_x=None, mu_y=None,
                 x_vars=None, y_vars=None, **kwargs):
        self.domain = domain
        self.multitarget = multitarget
        self.coefs = coefs
        self.mu_x, self.mu_y = mu_x, mu_y
        self.sigma_x, self.sigma_y = sigma_x, sigma_y
        self.x_vars, self.y_vars = x_vars, y_vars

        # Store any extra fitted parameters (T, U, W, C, P, Q,
        # x_rotations, y_rotations, ...) passed by the learner.
        for name, val in kwargs.items():
            setattr(self, name, val)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        :param instance: data instance for which the value of the response
            variable will be predicted
        :type instance: :class:`Orange.data.Instance`

        """
        instance = Orange.data.Instance(self.domain, instance)
        ins = [instance[v].native() for v in self.x_vars]

        if "?" in ins: # missing value -> corresponding coefficient omitted
            ins = [0 if val == "?" else val for val in ins]
        ins = numpy.array(ins)
        # Center the input, apply the linear model, then shift back to
        # the response means.
        xc = (ins - self.mu_x)
        predicted = dot(xc, self.coefs) + self.mu_y
        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
        if result_type == Orange.core.GetValue:
            return y_hat if self.multitarget else y_hat[0]
        else:
            from Orange.statistics.distribution import Distribution
            # Degenerate distributions: all probability mass on the
            # predicted value of each response variable.
            probs = []
            for var, val in zip(self.y_vars, y_hat):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)
            if result_type == Orange.core.GetBoth:
                return (y_hat, probs) if self.multitarget else (y_hat[0], probs[0])
            else:
                return probs if self.multitarget else probs[0]

    def to_string(self):
        """ Pretty-prints the coefficients of the PLS regression model.
        """
        x_vars = [x.name for x in self.x_vars]
        y_vars = [y.name for y in self.y_vars]
        fmt = "%8s " + "%12.3f " * len(y_vars)
        # Header row with the response variable names, then one row of
        # coefficients per predictor.
        first = [" " * 8 + "%13s" * len(y_vars) % tuple(y_vars)]
        lines = [fmt % tuple([x_vars[i]] + list(coef))
                 for i, coef in enumerate(self.coefs)]
        return '\n'.join(first + lines)

    def __str__(self):
        return self.to_string()
489             
# Backward-compatibility shim: expose the old camelCase attribute names
# (xVars, yVars, muX, muY, sigmaX, sigmaY) on PLSRegression.
deprecated_members({"xVars": "x_vars",
                    "yVars": "y_vars",
                    "muX": "mu_x",
                    "muY": "mu_y",
                    "sigmaX": "sigma_x",
                    "sigmaY": "sigma_y"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegression)
498                   
if __name__ == "__main__":

    import Orange
    from Orange.regression import pls

    # Smoke test: fit a multi-target PLS model on the bundled synthetic
    # data set and print the learned coefficient matrix.
    data = Orange.data.Table("multitarget-synthetic")
    l = pls.PLSRegressionLearner()

    x = data.domain.features
    y = data.domain.class_vars
    print x, y
    # c = l(data, x_vars=x, y_vars=y)
    c = l(data)
    print c
Note: See TracBrowser for help on using the repository browser.