source: orange/Orange/regression/pls.py @ 10330:616584f3fd52

Revision 10330:616584f3fd52, 16.7 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Removed old unneeded code (for old mult-label domains) from earth module.

Line 
1"""\
2##########################################
3Partial least sqaures regression (``PLS``)
4##########################################
5
6.. index:: regression
7
8.. _`Parital Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10`Partial least squares
11<http://en.wikipedia.org/wiki/Partial_least_squares_regression>`_
12regression is a statistical method which can be used to predict
13multiple response variables simultaniously. Implementation is based on
14`Scikit learn python implementation
15<https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py>`_.
16
17The following code shows how to fit a PLS regression model on a multi-target data set.
18
19.. literalinclude:: code/pls-example.py
20    :lines: 7,9,13,14
21
22.. autoclass:: PLSRegressionLearner
23    :members:
24
25.. autoclass:: PLSRegression
26    :members:
27
28Utility functions
29-----------------
30
31.. autofunction:: normalize_matrix
32
33.. autofunction:: nipals_xy
34
35.. autofunction:: svd_xy
36
37
38========
39Examples
40========
41
42To predict values for the first two data instances
43use the followin code
44
45.. literalinclude:: code/pls-example.py
46    :lines: 16-20
47
48Output
49
50::
51
52    Actual     [<orange.Value 'Y1'='0.490'>, <orange.Value 'Y2'='1.237'>, <orange.Value 'Y3'='1.808'>, <orange.Value 'Y4'='0.422'>]
53    Predicted  [<orange.Value 'Y1'='0.613'>, <orange.Value 'Y2'='0.826'>, <orange.Value 'Y3'='1.084'>, <orange.Value 'Y4'='0.534'>]
54
55    Actual     [<orange.Value 'Y1'='0.167'>, <orange.Value 'Y2'='-0.664'>, <orange.Value 'Y3'='-1.378'>, <orange.Value 'Y4'='0.589'>]
56    Predicted  [<orange.Value 'Y1'='0.058'>, <orange.Value 'Y2'='-0.706'>, <orange.Value 'Y3'='-1.420'>, <orange.Value 'Y4'='0.599'>]
57
58To see the coefficient of the model (in this case they are stored in a matrix)
59print the model
60
61.. literalinclude:: code/pls-example.py
62    :lines: 22
63
64The ouptut looks like
65
66::
67
68    Regression coefficients:
69                       Y1           Y2           Y3           Y4
70          X1        0.714        2.153        3.590       -0.078
71          X2       -0.238       -2.500       -4.797       -0.036
72          X3        0.230       -0.314       -0.880       -0.060
73
74
75"""
76
77import Orange
78import numpy
79
80from Orange.regression import base
81from numpy import dot, zeros
82from numpy import linalg
83from numpy.linalg import svd, pinv
84
85from Orange.misc import deprecated_members, deprecated_keywords
86
87
88def normalize_matrix(X):
89    """ Normalizes matrix, i.e. subtracts column means
90    and divides them by column standard deviations.
91    Returns the standardized matrix, sample mean and
92    standard deviation
93
94    :param X: data matrix
95    :type X: :class:`numpy.array`
96   
97    """
98    mu_x, sigma_x = numpy.mean(X, axis=0), numpy.std(X, axis=0)
99    sigma_x[sigma_x == 0] = 1.
100    return (X - mu_x)/sigma_x, mu_x, sigma_x
101
102@deprecated_keywords({"maxIter": "max_iter"})
103def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
104    """
105    NIPALS algorithm. Returns the first left and rigth singular
106    vectors of X'Y.
107
108    :param X, Y: data matrix
109    :type X, Y: :class:`numpy.array`
110
111    :param mode: possible values "PLS" (default) or "CCA"
112    :type mode: string
113
114    :param max_iter: maximal number of iterations (default 500)
115    :type max_iter: int
116
117    :param tol: tolerance parameter, if norm of difference
118        between two successive left singular vectors is less than tol,
119        iteration is stopped
120    :type tol: a not negative float
121           
122    """
123    yScore, uOld, ite = Y[:, [0]], 0, 1
124    Xpinv = Ypinv = None
125    # Inner loop of the Wold algo.
126    while True and ite < max_iter:
127        # Update u: the X weights
128        if mode == "CCA":
129            if Xpinv is None:
130                Xpinv = linalg.pinv(X) # compute once pinv(X)
131            u = dot(Xpinv, yScore)
132        else: # mode PLS
133        # Mode PLS regress each X column on yScore
134            u = dot(X.T, yScore) / dot(yScore.T, yScore)
135        # Normalize u
136        u /= numpy.sqrt(dot(u.T, u))
137        # Update xScore: the X latent scores
138        xScore = dot(X, u)
139
140        # Update v: the Y weights
141        if mode == "CCA":
142            if Ypinv is None:
143                Ypinv = linalg.pinv(Y) # compute once pinv(Y)
144            v = dot(Ypinv, xScore)
145        else:
146            # Mode PLS regress each X column on yScore
147            v = dot(Y.T, xScore) / dot(xScore.T, xScore)
148        # Normalize v
149        v /= numpy.sqrt(dot(v.T, v))
150        # Update yScore: the Y latent scores
151        yScore = dot(Y, v)
152
153        uDiff = u - uOld
154        if dot(uDiff.T, uDiff) < tol or Y.shape[1] == 1:
155            break
156        uOld = u
157        ite += 1
158    return u, v
159
160def svd_xy(X, Y):
161    """ Returns the first left and right singular
162    vectors of X'Y.
163
164    :param X, Y: data matrix
165    :type X, Y: :class:`numpy.array`   
166   
167    """
168    U, s, V = svd(dot(X.T, Y), full_matrices=False)
169    u = U[:, [0]]
170    v = V.T[:, [0]]
171    return u, v
172
173
174def select_attrs(table, attributes, class_var=None, metas=None):
175    """ Select only ``attributes`` from the ``table``.
176    """
177    domain = Orange.data.Domain(attributes, class_var)
178    if metas:
179        domain.add_metas(metas)
180    return Orange.data.Table(domain, table)
181
182
183class PLSRegressionLearner(base.BaseRegressionLearner):
184    """ Fits the partial least squares regression model,
185    i.e. learns the regression parameters. The implementation is based on
186    `Scikit learn python implementation`_
187   
188    The class is derived from
189    :class:`Orange.regression.base.BaseRegressionLearner`
190    which is used for preprocessing the data (continuization and imputation)
191    before fitting the regression parameters
192   
193    """
194
195    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
196                 algorithm="nipals", max_iter=500, 
197                 imputer=None, continuizer=None,
198                 **kwds):
199        """
200        .. attribute:: n_comp
201   
202            number of components to keep. Default: 2
203
204        .. attribute:: deflation_mode
205   
206            "canonical" or "regression" (default)
207
208        .. attribute:: mode
209   
210            "CCA" or "PLS" (default)
211
212
213        .. attribute:: algorithm
214   
215            The algorithm used to estimate the weights:
216            "nipals" or "svd" (default)
217
218
219        """       
220        self.n_comp = n_comp
221        self.deflation_mode = deflation_mode
222        self.mode = mode
223        self.algorithm = algorithm
224        self.max_iter = max_iter
225        self.set_imputer(imputer=imputer)
226        self.set_continuizer(continuizer=continuizer)
227        self.__dict__.update(kwds)
228
229    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
230    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
231        """
232        :param table: data instances.
233        :type table: :class:`Orange.data.Table`
234
235        :param x_vars, y_vars: List of input and response variables
236            (:obj:`Orange.feature.Continuous` or
237            :obj:`Orange.feature.Discrete`). If None (default) it is
238            assumed that the data domain provides information which variables
239            are reponses and which are not. If data has
240            :obj:`~Orange.data.Domain.class_var` defined in its domain, a
241            single-target regression learner is constructed. Otherwise a
242            multi-target learner predicting response variables defined by
243            :obj:`~Orange.data.Domain.class_vars` is constructed.
244        :type x_vars, y_vars: list           
245
246        """     
247        domain = table.domain
248        if x_vars is None and y_vars is None:
249            # Response variables are defined in the table.
250            x_vars = domain.features
251            if domain.class_var:
252                y_vars = [domain.class_var]
253            elif domain.class_vars:
254                y_vars = domain.class_vars
255            else:
256                raise TypeError('Class-less domain (x-vars and y-vars needed).')
257            x_table = select_attrs(table, x_vars)
258            y_table = select_attrs(table, y_vars)
259        elif not (x_vars and y_vars):
260            raise ValueError("Both x_vars and y_vars must be defined.")
261
262        x_table = select_attrs(table, x_vars)
263        y_table = select_attrs(table, y_vars)
264
265        # dicrete values are continuized       
266        x_table = self.continuize_table(x_table)
267        y_table = self.continuize_table(y_table)
268        # missing values are imputed
269        x_table = self.impute_table(x_table)
270        y_table = self.impute_table(y_table)
271       
272        # Collect the new transformed x_vars/y_vars
273        x_vars = list(x_table.domain.variables)
274        y_vars = list(y_table.domain.variables)
275       
276        domain = Orange.data.Domain(x_vars + y_vars, False)
277        multitarget = True if len(y_vars) > 1 else False
278
279        x = x_table.to_numpy()[0]
280        y = y_table.to_numpy()[0]
281       
282        kwargs = self.fit(x, y)
283        return PLSRegression(domain=domain, x_vars=x_vars, y_vars=y_vars,
284                             multitarget=multitarget, **kwargs)
285
286    def fit(self, X, Y):
287        """ Fits all unknown parameters, i.e.
288        weights, scores, loadings (for x and y) and regression coefficients.
289        Returns a dict with all of the parameters.
290       
291        """
292        # copy since this will contain the residuals (deflated) matrices
293
294        X, Y = X.copy(), Y.copy()
295        if Y.ndim == 1:
296            Y = Y.reshape((Y.size, 1))
297        n, p = X.shape
298        q = Y.shape[1]
299
300        # normalization of data matrices
301        X, muX, sigmaX = normalize_matrix(X)
302        Y, muY, sigmaY = normalize_matrix(Y)
303        # Residuals (deflated) matrices
304        Xk, Yk = X, Y
305        # Results matrices
306        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))
307        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))
308        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))     
309
310        # NIPALS over components
311        for k in xrange(self.n_comp):
312            # Weights estimation (inner loop)
313            if self.algorithm == "nipals":
314                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode, 
315                                 max_iter=self.max_iter)
316            elif self.algorithm == "svd":
317                u, v = svd_xy(X=Xk, Y=Yk)
318            # compute scores
319            xScore, yScore = dot(Xk, u), dot(Yk, v)
320            # Deflation (in place)
321            # - regress Xk's on xScore
322            xLoadings = dot(Xk.T, xScore) / dot(xScore.T, xScore)
323            # - substract rank-one approximations to obtain remainder matrix
324            Xk -= dot(xScore, xLoadings.T)
325            if self.deflation_mode == "canonical":
326                # - regress Yk's on yScore, then substract rank-one approx.
327                yLoadings = dot(Yk.T, yScore) / dot(yScore.T, yScore)
328                Yk -= dot(yScore, yLoadings.T)
329            if self.deflation_mode == "regression":
330                # - regress Yk's on xScore, then substract rank-one approx.
331                yLoadings = dot(Yk.T, xScore) / dot(xScore.T, xScore)
332                Yk -= dot(xScore, yLoadings.T)
333            # Store weights, scores and loadings
334            T[:, k] = xScore.ravel() # x-scores
335            U[:, k] = yScore.ravel() # y-scores
336            W[:, k] = u.ravel() # x-weights
337            C[:, k] = v.ravel() # y-weights
338            P[:, k] = xLoadings.ravel() # x-loadings
339            Q[:, k] = yLoadings.ravel() # y-loadings
340        # X = TP' + E and Y = UQ' + E
341
342        # Rotations from input space to transformed space (scores)
343        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
344        # U = Y C(Q'C)^-1 = YC* (W* : q x k matrix)
345        xRotations = dot(W, pinv(dot(P.T, W)))
346        if Y.shape[1] > 1:
347            yRotations = dot(C, pinv(dot(Q.T, C)))
348        else:
349            yRotations = numpy.ones(1)
350
351        if True or self.deflation_mode == "regression":
352            # Estimate regression coefficient
353            # Y = TQ' + E = X W(P'W)^-1Q' + E = XB + E
354            # => B = W*Q' (p x q)
355            coefs = dot(xRotations, Q.T)
356            coefs = 1. / sigmaX.reshape((p, 1)) * \
357                    coefs * sigmaY
358       
359        return {"mu_x": muX, "mu_y": muY, "sigma_x": sigmaX,
360                "sigma_y": sigmaY, "T": T, "U":U, "W":U, 
361                "C": C, "P":P, "Q":Q, "x_rotations": xRotations,
362                "y_rotations": yRotations, "coefs": coefs}
363
364deprecated_members({"nComp": "n_comp",
365                    "deflationMode": "deflation_mode",
366                    "maxIter": "max_iter"},
367                   wrap_methods=["__init__"],
368                   in_place=True)(PLSRegressionLearner)
369
370class PLSRegression(Orange.classification.Classifier):
371    """ PLSRegression predicts value of the response variables
372    based on the values of independent variables.
373   
374    Basic notations:
375    n - number of data instances
376    p - number of independent variables
377    q - number of reponse variables
378
379    .. attribute:: T
380   
381        A n x n_comp numpy array of x-scores
382
383    .. attribute:: U
384   
385        A n x n_comp numpy array of y-scores
386
387    .. attribute:: W
388   
389        A p x n_comp numpy array of x-weights
390
391    .. attribute:: C
392   
393        A q x n_comp numpy array of y-weights
394
395    .. attribute:: P
396   
397        A p x n_comp numpy array of x-loadings
398
399    .. attribute:: Q
400   
401        A q x n_comp numpy array of y-loading
402
403    .. attribute:: coefs
404   
405        A p x q numpy array coefficients
406        of the linear model: Y = X coefs + E
407
408    .. attribute:: x_vars
409   
410        Predictor variables
411
412    .. attribute:: y_vars
413   
414        Response variables
415       
416    """
417    def __init__(self, domain=None, multitarget=False, coefs=None, sigma_x=None, sigma_y=None,
418                 mu_x=None, mu_y=None, x_vars=None, y_vars=None, **kwargs):
419        self.domain = domain
420        self.multitarget = multitarget
421        self.coefs = coefs
422        self.mu_x, self.mu_y = mu_x, mu_y
423        self.sigma_x, self.sigma_y = sigma_x, sigma_y
424        self.x_vars, self.y_vars = x_vars, y_vars
425           
426        for name, val in kwargs.items():
427            setattr(self, name, val)
428
429    def __call__(self, instance, result_type=Orange.core.GetValue):
430        """
431        :param instance: data instance for which the value of the response
432            variable will be predicted
433        :type instance: :class:`Orange.data.Instance`
434       
435        """ 
436        instance = Orange.data.Instance(self.domain, instance)
437        ins = [instance[v].native() for v in self.x_vars]
438       
439        if "?" in ins: # missing value -> corresponding coefficient omitted
440            def miss_2_0(x): return x if x != "?" else 0
441            ins = map(miss_2_0, ins)
442        ins = numpy.array(ins)
443        xc = (ins - self.mu_x)
444        predicted = dot(xc, self.coefs) + self.mu_y
445        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
446        if result_type == Orange.core.GetValue:
447            return y_hat if self.multitarget else y_hat[0]
448        else:
449            from Orange.statistics.distribution import Distribution
450            probs = []
451            for var, val in zip(self.y_vars, y_hat):
452                dist = Distribution(var)
453                dist[val] = 1.0
454                probs.append(dist)
455            if result_type == Orange.core.GetBoth:
456                return (y_hat, probs) if self.multitarget else (y_hat[0], probs[0])
457            else:
458                return probs if self.multitarget else probs[0]
459           
460    def to_string(self):
461        """ Pretty-prints the coefficient of the PLS regression model.
462        """       
463        x_vars, y_vars = [x.name for x in self.x_vars], [y.name for y in self.y_vars]
464        fmt = "%8s " + "%12.3f " * len(y_vars)
465        first = [" " * 8 + "%13s" * len(y_vars) % tuple(y_vars)]
466        lines = [fmt % tuple([x_vars[i]] + list(coef))
467                 for i, coef in enumerate(self.coefs)]
468        return '\n'.join(first + lines)
469           
470    def __str__(self):
471        return self.to_string()
472
473    """
474    def transform(self, X, Y=None):
475
476        # Normalize
477        Xc = (X - self.muX) / self.sigmaX
478        if Y is not None:
479            Yc = (Y - self.muY) / self.sigmaY
480        # Apply rotation
481        xScores = dot(Xc, self.xRotations)
482        if Y is not None:
483            yScores = dot(Yc, self.yRotations)
484            return xScores, yScores
485
486        return xScores
487    """
488             
489deprecated_members({"xVars": "x_vars", 
490                    "yVars": "y_vars",
491                    "muX": "mu_x",
492                    "muY": "mu_y",
493                    "sigmaX": "sigma_x",
494                    "sigmaY": "sigma_y"},
495                   wrap_methods=["__init__"],
496                   in_place=True)(PLSRegression)
497                   
498if __name__ == "__main__":
499
500    import Orange
501    from Orange.regression import pls
502
503    data = Orange.data.Table("multitarget-synthetic")
504    l = pls.PLSRegressionLearner()
505
506    x = data.domain.features
507    y = data.domain.class_vars
508    print x, y
509    # c = l(data, x_vars=x, y_vars=y)
510    c = l(data)
511    print c
Note: See TracBrowser for help on using the repository browser.