source: orange/Orange/regression/pls.py @ 10367:058e9b1cc258

1"""\
2##########################################
3Partial least sqaures regression (``PLS``)
4##########################################
5
6.. index:: regression
7
8.. _`Parital Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10`Partial least squares
11<http://en.wikipedia.org/wiki/Partial_least_squares_regression>`_
12regression is a statistical method which can be used to predict
13multiple response variables simultaniously. Implementation is based on
14`Scikit learn python implementation
15<https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py>`_.
16
17The following code shows how to fit a PLS regression model on a multi-target data set.
18
19.. literalinclude:: code/pls-example.py
20    :lines: 7,9,13,14
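
Roughly, fitting amounts to loading a data table and calling the learner
on it (a minimal sketch; ``multitarget-synthetic`` is the data set used
by this module's own demo code)::

    import Orange
    from Orange.regression import pls

    data = Orange.data.Table("multitarget-synthetic")
    learner = pls.PLSRegressionLearner()
    model = learner(data)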

.. autoclass:: PLSRegressionLearner
    :members:

.. autoclass:: PLSRegression
    :members:

Utility functions
-----------------

.. autofunction:: normalize_matrix

.. autofunction:: nipals_xy

.. autofunction:: svd_xy


========
Examples
========

To predict values for the first two data instances
use the following code:

.. literalinclude:: code/pls-example.py
    :lines: 16-20

Output::

    Actual     [<orange.Value 'Y1'='0.490'>, <orange.Value 'Y2'='1.237'>, <orange.Value 'Y3'='1.808'>, <orange.Value 'Y4'='0.422'>]
    Predicted  [<orange.Value 'Y1'='0.613'>, <orange.Value 'Y2'='0.826'>, <orange.Value 'Y3'='1.084'>, <orange.Value 'Y4'='0.534'>]

    Actual     [<orange.Value 'Y1'='0.167'>, <orange.Value 'Y2'='-0.664'>, <orange.Value 'Y3'='-1.378'>, <orange.Value 'Y4'='0.589'>]
    Predicted  [<orange.Value 'Y1'='0.058'>, <orange.Value 'Y2'='-0.706'>, <orange.Value 'Y3'='-1.420'>, <orange.Value 'Y4'='0.599'>]
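
The prediction step in the included script corresponds roughly to this
sketch (``data`` and ``model`` as in the fitting sketch above; the exact
example file may differ)::

    for instance in data[:2]:
        print "Actual    ", instance.get_classes()
        print "Predicted ", model(instance)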

To see the coefficients of the model (here stored in a matrix),
print the model:

.. literalinclude:: code/pls-example.py
    :lines: 22

The output looks like this::

    Regression coefficients:
                       Y1           Y2           Y3           Y4
          X1        0.714        2.153        3.590       -0.078
          X2       -0.238       -2.500       -4.797       -0.036
          X3        0.230       -0.314       -0.880       -0.060


"""

import Orange
import numpy

from Orange.regression import base
from numpy import dot, zeros
from numpy import linalg
from numpy.linalg import svd, pinv

from Orange.misc import deprecated_members, deprecated_keywords


def normalize_matrix(X):
    """ Normalizes a matrix, i.e. subtracts the column means
    and divides the columns by their standard deviations.
    Returns the standardized matrix, the sample means and the
    standard deviations.

    :param X: data matrix
    :type X: :class:`numpy.array`
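
    Example (illustrative; a constant column gets a standard deviation
    of 1, so it is left unscaled)::

        X = numpy.array([[1., 2.], [3., 2.], [5., 2.]])
        Xs, mu_x, sigma_x = normalize_matrix(X)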
93    """
94    mu_x, sigma_x = numpy.mean(X, axis=0), numpy.std(X, axis=0)
95    sigma_x[sigma_x == 0] = 1.
96    return (X - mu_x)/sigma_x, mu_x, sigma_x
97
@deprecated_keywords({"maxIter": "max_iter"})
def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
    """
    NIPALS algorithm. Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    :param mode: possible values "PLS" (default) or "CCA"
    :type mode: string

    :param max_iter: maximal number of iterations (default: 500)
    :type max_iter: int

    :param tol: tolerance parameter; if the norm of the difference
        between two successive left singular vectors is less than tol,
        the iteration is stopped
    :type tol: a non-negative float
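
    Example (illustrative)::

        X = numpy.random.random((10, 3))
        Y = numpy.random.random((10, 2))
        u, v = nipals_xy(X, Y)  # first left/right singular vectors of X'Y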
118    """
119    yScore, uOld, ite = Y[:, [0]], 0, 1
120    Xpinv = Ypinv = None
121    # Inner loop of the Wold algo.
122    while True and ite < max_iter:
123        # Update u: the X weights
124        if mode == "CCA":
125            if Xpinv is None:
126                Xpinv = linalg.pinv(X) # compute once pinv(X)
127            u = dot(Xpinv, yScore)
128        else: # mode PLS
129        # Mode PLS regress each X column on yScore
130            u = dot(X.T, yScore) / dot(yScore.T, yScore)
131        # Normalize u
132        u /= numpy.sqrt(dot(u.T, u))
133        # Update xScore: the X latent scores
134        xScore = dot(X, u)
135
136        # Update v: the Y weights
137        if mode == "CCA":
138            if Ypinv is None:
139                Ypinv = linalg.pinv(Y) # compute once pinv(Y)
140            v = dot(Ypinv, xScore)
141        else:
142            # Mode PLS regress each X column on yScore
143            v = dot(Y.T, xScore) / dot(xScore.T, xScore)
144        # Normalize v
145        v /= numpy.sqrt(dot(v.T, v))
146        # Update yScore: the Y latent scores
147        yScore = dot(Y, v)
148
149        uDiff = u - uOld
150        if dot(uDiff.T, uDiff) < tol or Y.shape[1] == 1:
151            break
152        uOld = u
153        ite += 1
154    return u, v
155
def svd_xy(X, Y):
    """ Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`
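
    Example (illustrative; same interface as :func:`nipals_xy`)::

        u, v = svd_xy(numpy.random.random((10, 3)),
                      numpy.random.random((10, 2)))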
163    """
164    U, s, V = svd(dot(X.T, Y), full_matrices=False)
165    u = U[:, [0]]
166    v = V.T[:, [0]]
167    return u, v
168
169
def select_attrs(table, attributes, class_var=None, metas=None):
    """ Select only ``attributes`` from the ``table``.
172    """
173    domain = Orange.data.Domain(attributes, class_var)
174    if metas:
175        domain.add_metas(metas)
176    return Orange.data.Table(domain, table)
177
178
class PLSRegressionLearner(base.BaseRegressionLearner):
    """ Fits the partial least squares regression model,
    i.e. learns the regression parameters. The implementation is based
    on the `scikit-learn implementation
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py>`_.

    The class is derived from
    :class:`Orange.regression.base.BaseRegressionLearner`,
    which is used for preprocessing the data (continuization and
    imputation) before fitting the regression parameters.

    """

    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
                 algorithm="nipals", max_iter=500,
                 imputer=None, continuizer=None,
                 **kwds):
        """
        .. attribute:: n_comp

            Number of components to keep (default: 2).

        .. attribute:: deflation_mode

            "canonical" or "regression" (default).

        .. attribute:: mode

            "CCA" or "PLS" (default).

        .. attribute:: algorithm

            The algorithm used to estimate the weights:
            "nipals" (default) or "svd".

        .. attribute:: max_iter

            Maximal number of iterations of the NIPALS algorithm
            (default: 500); only used when ``algorithm`` is "nipals".
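
        For example, a learner with three components and SVD-based
        weight estimation (illustrative)::

            learner = PLSRegressionLearner(n_comp=3, algorithm="svd")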
215        """       
216        self.n_comp = n_comp
217        self.deflation_mode = deflation_mode
218        self.mode = mode
219        self.algorithm = algorithm
220        self.max_iter = max_iter
221        self.set_imputer(imputer=imputer)
222        self.set_continuizer(continuizer=continuizer)
223        self.__dict__.update(kwds)
224
    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
        """
        :param table: data instances.
        :type table: :class:`Orange.data.Table`

        :param x_vars, y_vars: List of input and response variables
            (:obj:`Orange.feature.Continuous` or
            :obj:`Orange.feature.Discrete`). If None (default) it is
            assumed that the data domain provides information on which
            variables are responses and which are not. If the data has
            :obj:`~Orange.data.Domain.class_var` defined in its domain, a
            single-target regression learner is constructed. Otherwise a
            multi-target learner predicting response variables defined by
            :obj:`~Orange.data.Domain.class_vars` is constructed.
        :type x_vars, y_vars: list
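
        For example, explicitly specifying the input and response
        variables (illustrative)::

            model = learner(data, x_vars=data.domain.features,
                            y_vars=data.domain.class_vars)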
242        """     
243        domain = table.domain
244        if x_vars is None and y_vars is None:
245            # Response variables are defined in the table.
246            x_vars = domain.features
247            if domain.class_var:
248                y_vars = [domain.class_var]
249            elif domain.class_vars:
250                y_vars = domain.class_vars
251            else:
252                raise TypeError('Class-less domain (x-vars and y-vars needed).')
253            x_table = select_attrs(table, x_vars)
254            y_table = select_attrs(table, y_vars)
255        elif not (x_vars and y_vars):
256            raise ValueError("Both x_vars and y_vars must be defined.")
257
258        x_table = select_attrs(table, x_vars)
259        y_table = select_attrs(table, y_vars)
260
261        # dicrete values are continuized       
262        x_table = self.continuize_table(x_table)
263        y_table = self.continuize_table(y_table)
264        # missing values are imputed
265        x_table = self.impute_table(x_table)
266        y_table = self.impute_table(y_table)
267       
268        # Collect the new transformed x_vars/y_vars
269        x_vars = list(x_table.domain.variables)
270        y_vars = list(y_table.domain.variables)
271       
272        domain = Orange.data.Domain(x_vars + y_vars, False)
273        multitarget = True if len(y_vars) > 1 else False
274
275        x = x_table.to_numpy()[0]
276        y = y_table.to_numpy()[0]
277       
278        kwargs = self.fit(x, y)
279        return PLSRegression(domain=domain, x_vars=x_vars, y_vars=y_vars,
280                             multitarget=multitarget, **kwargs)
281
    def fit(self, X, Y):
        """ Fits all unknown parameters, i.e.
        weights, scores, loadings (for x and y) and regression coefficients.
        Returns a dict with all of the parameters.

        """
        # copy since these will hold the residual (deflated) matrices
        X, Y = X.copy(), Y.copy()
        if Y.ndim == 1:
            Y = Y.reshape((Y.size, 1))
        n, p = X.shape
        q = Y.shape[1]

        # normalization of data matrices
        X, muX, sigmaX = normalize_matrix(X)
        Y, muY, sigmaY = normalize_matrix(Y)
        # Residual (deflated) matrices
        Xk, Yk = X, Y
        # Result matrices
        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))
        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))
        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))

        # NIPALS over components
        for k in xrange(self.n_comp):
            # Weights estimation (inner loop)
            if self.algorithm == "nipals":
                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode,
                                 max_iter=self.max_iter)
            elif self.algorithm == "svd":
                u, v = svd_xy(X=Xk, Y=Yk)
            # compute scores
            xScore, yScore = dot(Xk, u), dot(Yk, v)
            # Deflation (in place)
            # - regress Xk's columns on xScore
            xLoadings = dot(Xk.T, xScore) / dot(xScore.T, xScore)
            # - subtract rank-one approximation to obtain remainder matrix
            Xk -= dot(xScore, xLoadings.T)
            if self.deflation_mode == "canonical":
                # - regress Yk's columns on yScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, yScore) / dot(yScore.T, yScore)
                Yk -= dot(yScore, yLoadings.T)
            if self.deflation_mode == "regression":
                # - regress Yk's columns on xScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, xScore) / dot(xScore.T, xScore)
                Yk -= dot(xScore, yLoadings.T)
            # Store weights, scores and loadings
            T[:, k] = xScore.ravel() # x-scores
            U[:, k] = yScore.ravel() # y-scores
            W[:, k] = u.ravel() # x-weights
            C[:, k] = v.ravel() # y-weights
            P[:, k] = xLoadings.ravel() # x-loadings
            Q[:, k] = yLoadings.ravel() # y-loadings
        # X = TP' + E and Y = UQ' + E

        # Rotations from input space to transformed space (scores)
        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
        # U = Y C(Q'C)^-1 = YC* (C* : q x k matrix)
        xRotations = dot(W, pinv(dot(P.T, W)))
        if Y.shape[1] > 1:
            yRotations = dot(C, pinv(dot(Q.T, C)))
        else:
            yRotations = numpy.ones(1)

        # Estimate regression coefficients (needed for prediction
        # regardless of the deflation mode):
        # Y = TQ' + E = X W(P'W)^-1 Q' + E = XB + E
        # => B = W*Q' (p x q)
        coefs = dot(xRotations, Q.T)
        # rescale to the original units of X and Y
        coefs = 1. / sigmaX.reshape((p, 1)) * coefs * sigmaY

        return {"mu_x": muX, "mu_y": muY, "sigma_x": sigmaX,
                "sigma_y": sigmaY, "T": T, "U": U, "W": W,
                "C": C, "P": P, "Q": Q, "x_rotations": xRotations,
                "y_rotations": yRotations, "coefs": coefs}

deprecated_members({"nComp": "n_comp",
                    "deflationMode": "deflation_mode",
                    "maxIter": "max_iter"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegressionLearner)

class PLSRegression(Orange.classification.Classifier):
    """ PLSRegression predicts the value of the response variables
    based on the values of the independent variables.

    Basic notation:

    - n - number of data instances
    - p - number of independent variables
    - q - number of response variables

    .. attribute:: T

        A n x n_comp numpy array of x-scores

    .. attribute:: U

        A n x n_comp numpy array of y-scores

    .. attribute:: W

        A p x n_comp numpy array of x-weights

    .. attribute:: C

        A q x n_comp numpy array of y-weights

    .. attribute:: P

        A p x n_comp numpy array of x-loadings

    .. attribute:: Q

        A q x n_comp numpy array of y-loadings

    .. attribute:: coefs

        A p x q numpy array of coefficients
        of the linear model: Y = X coefs + E

    .. attribute:: x_vars

        Predictor variables

    .. attribute:: y_vars

        Response variables

    """
    def __init__(self, domain=None, multitarget=False, coefs=None,
                 sigma_x=None, sigma_y=None, mu_x=None, mu_y=None,
                 x_vars=None, y_vars=None, **kwargs):
        self.domain = domain
        self.multitarget = multitarget
        self.coefs = coefs
        self.mu_x, self.mu_y = mu_x, mu_y
        self.sigma_x, self.sigma_y = sigma_x, sigma_y
        self.x_vars, self.y_vars = x_vars, y_vars

        for name, val in kwargs.items():
            setattr(self, name, val)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        :param instance: data instance for which the value of the response
            variable will be predicted
        :type instance: :class:`Orange.data.Instance`

        """
        instance = Orange.data.Instance(self.domain, instance)
        ins = [instance[v].native() for v in self.x_vars]

        if "?" in ins: # missing values are replaced by zeros
            def miss_2_0(x): return x if x != "?" else 0
            ins = map(miss_2_0, ins)
        ins = numpy.array(ins)
        # center the input; coefs were rescaled to original units in fit()
        xc = ins - self.mu_x
        predicted = dot(xc, self.coefs) + self.mu_y
        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
        if result_type == Orange.core.GetValue:
            return y_hat if self.multitarget else y_hat[0]
        else:
            from Orange.statistics.distribution import Distribution
            probs = []
            for var, val in zip(self.y_vars, y_hat):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)
            if result_type == Orange.core.GetBoth:
                return (y_hat, probs) if self.multitarget else (y_hat[0], probs[0])
            else:
                return probs if self.multitarget else probs[0]

    def to_string(self):
        """ Pretty-prints the coefficients of the PLS regression model.
        """
        x_vars, y_vars = [x.name for x in self.x_vars], [y.name for y in self.y_vars]
        fmt = "%8s " + "%12.3f " * len(y_vars)
        first = [" " * 8 + "%13s" * len(y_vars) % tuple(y_vars)]
        lines = [fmt % tuple([x_vars[i]] + list(coef))
                 for i, coef in enumerate(self.coefs)]
        return '\n'.join(first + lines)

    def __str__(self):
        return self.to_string()

469    """
470    def transform(self, X, Y=None):
471
472        # Normalize
473        Xc = (X - self.muX) / self.sigmaX
474        if Y is not None:
475            Yc = (Y - self.muY) / self.sigmaY
476        # Apply rotation
477        xScores = dot(Xc, self.xRotations)
478        if Y is not None:
479            yScores = dot(Yc, self.yRotations)
480            return xScores, yScores
481
482        return xScores
483    """
484             
deprecated_members({"xVars": "x_vars",
                    "yVars": "y_vars",
                    "muX": "mu_x",
                    "muY": "mu_y",
                    "sigmaX": "sigma_x",
                    "sigmaY": "sigma_y"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegression)

if __name__ == "__main__":

    import Orange
    from Orange.regression import pls

    data = Orange.data.Table("multitarget-synthetic")
    l = pls.PLSRegressionLearner()

    x = data.domain.features
    y = data.domain.class_vars
    print x, y
    # c = l(data, x_vars=x, y_vars=y)
    c = l(data)
    print c
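
    # Mirror the documentation example above: print actual and predicted
    # values for the first two data instances.
    for instance in data[:2]:
        print "Actual    ", instance.get_classes()
        print "Predicted ", c(instance)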