source: orange/Orange/regression/pls.py @ 10580:c4cbae8dcf8b

Revision 10580:c4cbae8dcf8b, 16.7 KB checked in by markotoplak, 2 years ago (diff)

Moved deprecation functions, progress bar support and environ into Orange.utils. Orange imports cleanly, although it is not tested yet.

Line 
1"""\
2##########################################
3Partial least sqaures regression (``PLS``)
4##########################################
5
6.. index:: regression
7
8.. _`Parital Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10`Partial least squares
11<http://en.wikipedia.org/wiki/Partial_least_squares_regression>`_
12regression is a statistical method for simultaneous prediction of
13multiple response variables. Orange's implementation is
14based on `Scikit learn python implementation
15<https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py>`_.
16
17The following code shows how to fit a PLS regression model on a multi-target data set.
18
19.. literalinclude:: code/pls-example.py
20    :lines: 7,9,13,14
21
22.. autoclass:: PLSRegressionLearner
23    :members:
24
25.. autoclass:: PLSRegression
26    :members:
27
28Utility functions
29-----------------
30
31.. autofunction:: normalize_matrix
32
33.. autofunction:: nipals_xy
34
35.. autofunction:: svd_xy
36
37========
38Examples
39========
40
41The following code predicts the values of output variables for the
42first two instances in ``data``.
43
44
45.. literalinclude:: code/pls-example.py
46    :lines: 16-20
47
48::
49
50    Actual     [<orange.Value 'Y1'='0.490'>, <orange.Value 'Y2'='1.237'>, <orange.Value 'Y3'='1.808'>, <orange.Value 'Y4'='0.422'>]
51    Predicted  [<orange.Value 'Y1'='0.613'>, <orange.Value 'Y2'='0.826'>, <orange.Value 'Y3'='1.084'>, <orange.Value 'Y4'='0.534'>]
52
53    Actual     [<orange.Value 'Y1'='0.167'>, <orange.Value 'Y2'='-0.664'>, <orange.Value 'Y3'='-1.378'>, <orange.Value 'Y4'='0.589'>]
54    Predicted  [<orange.Value 'Y1'='0.058'>, <orange.Value 'Y2'='-0.706'>, <orange.Value 'Y3'='-1.420'>, <orange.Value 'Y4'='0.599'>]
55
56To see the coefficient of the model, print the model:
57
58.. literalinclude:: code/pls-example.py
59    :lines: 22
60
61::
62
63    Regression coefficients:
64                       Y1           Y2           Y3           Y4
65          X1        0.714        2.153        3.590       -0.078
66          X2       -0.238       -2.500       -4.797       -0.036
67          X3        0.230       -0.314       -0.880       -0.060
68
69Note that coefficients are stored in a matrix since the model predicts
70values of multiple outputs.
71"""
72
73import Orange
74import numpy
75
76from Orange.regression import base
77from numpy import dot, zeros
78from numpy import linalg
79from numpy.linalg import svd, pinv
80
81from Orange.utils import deprecated_members, deprecated_keywords
82
83
84def normalize_matrix(X):
85    """
86    Normalize a matrix column-wise: subtract the means and divide by
87    standard deviations. Returns the standardized matrix, sample mean
88    and standard deviation
89
90    :param X: data matrix
91    :type X: :class:`numpy.array`
92   
93    """
94    mu_x, sigma_x = numpy.mean(X, axis=0), numpy.std(X, axis=0)
95    sigma_x[sigma_x == 0] = 1.
96    return (X - mu_x)/sigma_x, mu_x, sigma_x
97
98@deprecated_keywords({"maxIter": "max_iter"})
99def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
100    """
101    NIPALS algorithm; returns the first left and rigth singular
102    vectors of X'Y.
103
104    :param X, Y: data matrix
105    :type X, Y: :class:`numpy.array`
106
107    :param mode: possible values "PLS" (default) or "CCA"
108    :type mode: string
109
110    :param max_iter: maximal number of iterations (default: 500)
111    :type max_iter: int
112
113    :param tol: tolerance parameter; if norm of difference
114        between two successive left singular vectors is less than tol,
115        iteration is stopped
116    :type tol: a not negative float
117           
118    """
119    yScore, uOld, ite = Y[:, [0]], 0, 1
120    Xpinv = Ypinv = None
121    # Inner loop of the Wold algo.
122    while True and ite < max_iter:
123        # Update u: the X weights
124        if mode == "CCA":
125            if Xpinv is None:
126                Xpinv = linalg.pinv(X) # compute once pinv(X)
127            u = dot(Xpinv, yScore)
128        else: # mode PLS
129        # Mode PLS regress each X column on yScore
130            u = dot(X.T, yScore) / dot(yScore.T, yScore)
131        # Normalize u
132        u /= numpy.sqrt(dot(u.T, u))
133        # Update xScore: the X latent scores
134        xScore = dot(X, u)
135
136        # Update v: the Y weights
137        if mode == "CCA":
138            if Ypinv is None:
139                Ypinv = linalg.pinv(Y) # compute once pinv(Y)
140            v = dot(Ypinv, xScore)
141        else:
142            # Mode PLS regress each X column on yScore
143            v = dot(Y.T, xScore) / dot(xScore.T, xScore)
144        # Normalize v
145        v /= numpy.sqrt(dot(v.T, v))
146        # Update yScore: the Y latent scores
147        yScore = dot(Y, v)
148
149        uDiff = u - uOld
150        if dot(uDiff.T, uDiff) < tol or Y.shape[1] == 1:
151            break
152        uOld = u
153        ite += 1
154    return u, v
155
156def svd_xy(X, Y):
157    """ Return the first left and right singular
158    vectors of X'Y.
159
160    :param X, Y: data matrix
161    :type X, Y: :class:`numpy.array`   
162   
163    """
164    U, s, V = svd(dot(X.T, Y), full_matrices=False)
165    u = U[:, [0]]
166    v = V.T[:, [0]]
167    return u, v
168
169
170def select_attrs(table, attributes, class_var=None, metas=None):
171    """ Select ``attributes`` from the ``table`` and return a new data table.
172    """
173    domain = Orange.data.Domain(attributes, class_var)
174    if metas:
175        domain.add_metas(metas)
176    return Orange.data.Table(domain, table)
177
178
179class PLSRegressionLearner(base.BaseRegressionLearner):
180    """
181    Fit the partial least squares regression model, i.e. learn the
182    regression parameters. The implementation is based on `Scikit
183    learn python implementation`_
184   
185    The class is derived from
186    :class:`Orange.regression.base.BaseRegressionLearner` that is
187    used for preprocessing the data (continuization and imputation)
188    before fitting the regression parameters
189   
190    """
191
192    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
193                 algorithm="nipals", max_iter=500, 
194                 imputer=None, continuizer=None,
195                 **kwds):
196        """
197        .. attribute:: n_comp
198   
199            number of components to keep (default: 2)
200
201        .. attribute:: deflation_mode
202   
203            "canonical" or "regression" (default)
204
205        .. attribute:: mode
206   
207            "CCA" or "PLS" (default)
208
209
210        .. attribute:: algorithm
211   
212            The algorithm for estimating the weights:
213            "nipals" or "svd" (default)
214
215
216        """       
217        self.n_comp = n_comp
218        self.deflation_mode = deflation_mode
219        self.mode = mode
220        self.algorithm = algorithm
221        self.max_iter = max_iter
222        self.set_imputer(imputer=imputer)
223        self.set_continuizer(continuizer=continuizer)
224        self.__dict__.update(kwds)
225
226    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
227    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
228        """
229        :param table: data instances.
230        :type table: :class:`Orange.data.Table`
231
232        :param x_vars, y_vars: List of input and response variables
233            (:obj:`Orange.feature.Continuous` or
234            :obj:`Orange.feature.Discrete`). If ``None`` (default) it is
235            assumed that the data domain provides information which variables
236            are reponses and which are not. If data has
237            :obj:`~Orange.data.Domain.class_var` defined in its domain, a
238            single-target regression learner is constructed. Otherwise a
239            multi-target learner predicting response variables defined by
240            :obj:`~Orange.data.Domain.class_vars` is constructed.
241        :type x_vars, y_vars: list           
242
243        """     
244        domain = table.domain
245        if x_vars is None and y_vars is None:
246            # Response variables are defined in the table.
247            x_vars = domain.features
248            if domain.class_var:
249                y_vars = [domain.class_var]
250            elif domain.class_vars:
251                y_vars = domain.class_vars
252            else:
253                raise TypeError('Class-less domain (x-vars and y-vars needed).')
254            x_table = select_attrs(table, x_vars)
255            y_table = select_attrs(table, y_vars)
256        elif not (x_vars and y_vars):
257            raise ValueError("Both x_vars and y_vars must be defined.")
258
259        x_table = select_attrs(table, x_vars)
260        y_table = select_attrs(table, y_vars)
261
262        # dicrete values are continuized       
263        x_table = self.continuize_table(x_table)
264        y_table = self.continuize_table(y_table)
265        # missing values are imputed
266        x_table = self.impute_table(x_table)
267        y_table = self.impute_table(y_table)
268       
269        # Collect the new transformed x_vars/y_vars
270        x_vars = list(x_table.domain.variables)
271        y_vars = list(y_table.domain.variables)
272       
273        domain = Orange.data.Domain(x_vars + y_vars, False)
274        multitarget = True if len(y_vars) > 1 else False
275
276        x = x_table.to_numpy()[0]
277        y = y_table.to_numpy()[0]
278       
279        kwargs = self.fit(x, y)
280        return PLSRegression(domain=domain, x_vars=x_vars, y_vars=y_vars,
281                             multitarget=multitarget, **kwargs)
282
283    def fit(self, X, Y):
284        """ Fit all unknown parameters, i.e.
285        weights, scores, loadings (for x and y) and regression coefficients.
286        Return a dict with all of the parameters.
287        """
288        # copy since this will contain the residuals (deflated) matrices
289
290        X, Y = X.copy(), Y.copy()
291        if Y.ndim == 1:
292            Y = Y.reshape((Y.size, 1))
293        n, p = X.shape
294        q = Y.shape[1]
295
296        # normalization of data matrices
297        X, muX, sigmaX = normalize_matrix(X)
298        Y, muY, sigmaY = normalize_matrix(Y)
299        # Residuals (deflated) matrices
300        Xk, Yk = X, Y
301        # Results matrices
302        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))
303        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))
304        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))     
305
306        # NIPALS over components
307        for k in xrange(self.n_comp):
308            # Weights estimation (inner loop)
309            if self.algorithm == "nipals":
310                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode, 
311                                 max_iter=self.max_iter)
312            elif self.algorithm == "svd":
313                u, v = svd_xy(X=Xk, Y=Yk)
314            # compute scores
315            xScore, yScore = dot(Xk, u), dot(Yk, v)
316            # Deflation (in place)
317            # - regress Xk's on xScore
318            xLoadings = dot(Xk.T, xScore) / dot(xScore.T, xScore)
319            # - substract rank-one approximations to obtain remainder matrix
320            Xk -= dot(xScore, xLoadings.T)
321            if self.deflation_mode == "canonical":
322                # - regress Yk's on yScore, then substract rank-one approx.
323                yLoadings = dot(Yk.T, yScore) / dot(yScore.T, yScore)
324                Yk -= dot(yScore, yLoadings.T)
325            if self.deflation_mode == "regression":
326                # - regress Yk's on xScore, then substract rank-one approx.
327                yLoadings = dot(Yk.T, xScore) / dot(xScore.T, xScore)
328                Yk -= dot(xScore, yLoadings.T)
329            # Store weights, scores and loadings
330            T[:, k] = xScore.ravel() # x-scores
331            U[:, k] = yScore.ravel() # y-scores
332            W[:, k] = u.ravel() # x-weights
333            C[:, k] = v.ravel() # y-weights
334            P[:, k] = xLoadings.ravel() # x-loadings
335            Q[:, k] = yLoadings.ravel() # y-loadings
336        # X = TP' + E and Y = UQ' + E
337
338        # Rotations from input space to transformed space (scores)
339        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
340        # U = Y C(Q'C)^-1 = YC* (W* : q x k matrix)
341        xRotations = dot(W, pinv(dot(P.T, W)))
342        if Y.shape[1] > 1:
343            yRotations = dot(C, pinv(dot(Q.T, C)))
344        else:
345            yRotations = numpy.ones(1)
346
347        if True or self.deflation_mode == "regression":
348            # Estimate regression coefficient
349            # Y = TQ' + E = X W(P'W)^-1Q' + E = XB + E
350            # => B = W*Q' (p x q)
351            coefs = dot(xRotations, Q.T)
352            coefs = 1. / sigmaX.reshape((p, 1)) * \
353                    coefs * sigmaY
354       
355        return {"mu_x": muX, "mu_y": muY, "sigma_x": sigmaX,
356                "sigma_y": sigmaY, "T": T, "U":U, "W":U, 
357                "C": C, "P":P, "Q":Q, "x_rotations": xRotations,
358                "y_rotations": yRotations, "coefs": coefs}
359
360deprecated_members({"nComp": "n_comp",
361                    "deflationMode": "deflation_mode",
362                    "maxIter": "max_iter"},
363                   wrap_methods=["__init__"],
364                   in_place=True)(PLSRegressionLearner)
365
366class PLSRegression(Orange.classification.Classifier):
367    """ Predict values of the response variables
368    based on the values of independent variables.
369   
370    Basic notations:
371    n - number of data instances
372    p - number of independent variables
373    q - number of reponse variables
374
375    .. attribute:: T
376   
377        A n x n_comp numpy array of x-scores
378
379    .. attribute:: U
380   
381        A n x n_comp numpy array of y-scores
382
383    .. attribute:: W
384   
385        A p x n_comp numpy array of x-weights
386
387    .. attribute:: C
388   
389        A q x n_comp numpy array of y-weights
390
391    .. attribute:: P
392   
393        A p x n_comp numpy array of x-loadings
394
395    .. attribute:: Q
396   
397        A q x n_comp numpy array of y-loading
398
399    .. attribute:: coefs
400   
401        A p x q numpy array coefficients
402        of the linear model: Y = X coefs + E
403
404    .. attribute:: x_vars
405   
406        Predictor variables
407
408    .. attribute:: y_vars
409   
410        Response variables
411       
412    """
413    def __init__(self, domain=None, multitarget=False, coefs=None, sigma_x=None, sigma_y=None,
414                 mu_x=None, mu_y=None, x_vars=None, y_vars=None, **kwargs):
415        self.domain = domain
416        self.multitarget = multitarget
417        self.coefs = coefs
418        self.mu_x, self.mu_y = mu_x, mu_y
419        self.sigma_x, self.sigma_y = sigma_x, sigma_y
420        self.x_vars, self.y_vars = x_vars, y_vars
421           
422        for name, val in kwargs.items():
423            setattr(self, name, val)
424
425    def __call__(self, instance, result_type=Orange.core.GetValue):
426        """
427        :param instance: data instance for which the value of the response
428            variable will be predicted
429        :type instance: :class:`Orange.data.Instance`
430       
431        """ 
432        instance = Orange.data.Instance(self.domain, instance)
433        ins = [instance[v].native() for v in self.x_vars]
434       
435        if "?" in ins: # missing value -> corresponding coefficient omitted
436            def miss_2_0(x): return x if x != "?" else 0
437            ins = map(miss_2_0, ins)
438        ins = numpy.array(ins)
439        xc = (ins - self.mu_x)
440        predicted = dot(xc, self.coefs) + self.mu_y
441        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
442        if result_type == Orange.core.GetValue:
443            return y_hat if self.multitarget else y_hat[0]
444        else:
445            from Orange.statistics.distribution import Distribution
446            probs = []
447            for var, val in zip(self.y_vars, y_hat):
448                dist = Distribution(var)
449                dist[val] = 1.0
450                probs.append(dist)
451            if result_type == Orange.core.GetBoth:
452                return (y_hat, probs) if self.multitarget else (y_hat[0], probs[0])
453            else:
454                return probs if self.multitarget else probs[0]
455           
456    def to_string(self):
457        """ Pretty-prints the coefficient of the PLS regression model.
458        """       
459        x_vars, y_vars = [x.name for x in self.x_vars], [y.name for y in self.y_vars]
460        fmt = "%8s " + "%12.3f " * len(y_vars)
461        first = [" " * 8 + "%13s" * len(y_vars) % tuple(y_vars)]
462        lines = [fmt % tuple([x_vars[i]] + list(coef))
463                 for i, coef in enumerate(self.coefs)]
464        return '\n'.join(first + lines)
465           
466    def __str__(self):
467        return self.to_string()
468
469    """
470    def transform(self, X, Y=None):
471
472        # Normalize
473        Xc = (X - self.muX) / self.sigmaX
474        if Y is not None:
475            Yc = (Y - self.muY) / self.sigmaY
476        # Apply rotation
477        xScores = dot(Xc, self.xRotations)
478        if Y is not None:
479            yScores = dot(Yc, self.yRotations)
480            return xScores, yScores
481
482        return xScores
483    """
484             
485deprecated_members({"xVars": "x_vars", 
486                    "yVars": "y_vars",
487                    "muX": "mu_x",
488                    "muY": "mu_y",
489                    "sigmaX": "sigma_x",
490                    "sigmaY": "sigma_y"},
491                   wrap_methods=["__init__"],
492                   in_place=True)(PLSRegression)
493                   
494if __name__ == "__main__":
495
496    import Orange
497    from Orange.regression import pls
498
499    data = Orange.data.Table("multitarget-synthetic")
500    l = pls.PLSRegressionLearner()
501
502    x = data.domain.features
503    y = data.domain.class_vars
504    print x, y
505    # c = l(data, x_vars=x, y_vars=y)
506    c = l(data)
507    print c
Note: See TracBrowser for help on using the repository browser.