source: orange/orange/Orange/regression/pls.py @ 9559:fe974ae3f768

Revision 9559:fe974ae3f768, 16.3 KB checked in by lanz <lan.zagar@…>, 2 years ago

Fixed multitarget bug in PLS.

1"""\
2==========================================
3Partial least sqaures regression (``PLS``)
4==========================================
5
6.. index:: regression
7
8.. _`Parital Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10Implementation is based on `Scikit learn python implementation`_
11
12Example ::
13
14    >>> import Orange
15    >>> from Orange.regression import pls
16    >>> data = Orange.data.Table("test-pls.tab")
17    >>> # set independent and response variables
18    >>> x = data.domain.features
19    >>> y = data.domain.class_vars
20    >>> print x
21        [FloatVariable 'X1', FloatVariable 'X2', FloatVariable 'X3']
22    >>> print y
23        [FloatVariable 'Y1', FloatVariable 'Y2', FloatVariable 'Y3', FloatVariable 'Y4']
24    >>> # The information which variables are independent and which are responses
25    >>> # can be provided in the data definition.
26    >>> # If a variable var has key "label" in dictionary Orange.data.Domain[var].attributes
27    >>> # it is considered as a response variable.
28    >>> # In such situation x and y do not need to be specified.
29    >>> l = pls.PLSRegressionLearner()
30    >>> c = l(data, x_vars=x, y_vars=y)
31    >>> print c
32           Y1     Y2     Y3     Y4     
33    X1     0.513  0.915  0.341  -0.069 
34    X2     0.641  -1.044  0.249  -0.015 
35    X3     1.393  0.050  0.729  -0.108 
36    >>>
37
38.. autoclass:: PLSRegressionLearner
39    :members:
40
41.. autoclass:: PLSRegression
42    :members:
43
44Utility functions
45-----------------
46
47.. autofunction:: normalize_matrix
48
49.. autofunction:: nipals_xy
50
51.. autofunction:: svd_xy
52
53.. _`Scikit learn python implementation`: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py
54
55"""

import Orange
import numpy

from Orange.regression import base
from Orange.regression.earth import data_label_mask
from numpy import dot, zeros
from numpy import linalg
from numpy.linalg import svd, pinv

from Orange.misc import deprecated_members, deprecated_keywords


def normalize_matrix(X):
    """ Normalizes a matrix, i.e. subtracts the column means
    and divides the columns by their standard deviations.
    Returns the standardized matrix, the column means and
    the column standard deviations.

    :param X: data matrix
    :type X: :class:`numpy.array`

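    Example, a minimal sketch (the values follow from the definition)::

        >>> Xs, mu, sigma = normalize_matrix(numpy.array([[1., 2.], [3., 4.]]))
        >>> print mu.tolist(), sigma.tolist()
        [2.0, 3.0] [1.0, 1.0]
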
    """
    mu_x, sigma_x = numpy.mean(X, axis=0), numpy.std(X, axis=0)
    sigma_x[sigma_x == 0] = 1.
    return (X - mu_x) / sigma_x, mu_x, sigma_x

@deprecated_keywords({"maxIter": "max_iter"})
def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
    """
    NIPALS algorithm. Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    :param mode: possible values "PLS" (default) or "CCA"
    :type mode: string

    :param max_iter: maximum number of iterations (default 500)
    :type max_iter: int

    :param tol: tolerance parameter; if the norm of the difference
        between two successive left singular vectors is less than tol,
        the iteration is stopped
    :type tol: a non-negative float

    """
    yScore, uOld, ite = Y[:, [0]], 0, 1
    Xpinv = Ypinv = None
    # Inner loop of the Wold algorithm.
    while ite < max_iter:
        # Update u: the X weights
        if mode == "CCA":
            if Xpinv is None:
                Xpinv = linalg.pinv(X) # compute pinv(X) only once
            u = dot(Xpinv, yScore)
        else:
            # Mode PLS: regress each X column on yScore
            u = dot(X.T, yScore) / dot(yScore.T, yScore)
        # Normalize u
        u /= numpy.sqrt(dot(u.T, u))
        # Update xScore: the X latent scores
        xScore = dot(X, u)

        # Update v: the Y weights
        if mode == "CCA":
            if Ypinv is None:
                Ypinv = linalg.pinv(Y) # compute pinv(Y) only once
            v = dot(Ypinv, xScore)
        else:
            # Mode PLS: regress each Y column on xScore
            v = dot(Y.T, xScore) / dot(xScore.T, xScore)
        # Normalize v
        v /= numpy.sqrt(dot(v.T, v))
        # Update yScore: the Y latent scores
        yScore = dot(Y, v)

        uDiff = u - uOld
        if dot(uDiff.T, uDiff) < tol or Y.shape[1] == 1:
            break
        uOld = u
        ite += 1
    return u, v

def svd_xy(X, Y):
    """ Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    """
    U, s, V = svd(dot(X.T, Y), full_matrices=False)
    u = U[:, [0]]
    v = V.T[:, [0]]
    return u, v

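# Sanity-check sketch: nipals_xy (mode="PLS") and svd_xy estimate the same
# leading singular pair of X'Y, so their results should agree up to sign:
#     X, Y = numpy.random.rand(20, 5), numpy.random.rand(20, 3)
#     u_n, v_n = nipals_xy(X, Y)
#     u_s, v_s = svd_xy(X, Y)
#     numpy.allclose(numpy.abs(u_n), numpy.abs(u_s), atol=1e-3)  # expect True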

def select_attrs(table, attributes, class_var=None, metas=None):
    """ Select only ``attributes`` from the ``table``.
    """
    domain = Orange.data.Domain(attributes, class_var)
    if metas:
        domain.add_metas(metas)
    return Orange.data.Table(domain, table)

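# Usage sketch (assuming `data` is a loaded Orange.data.Table):
#     x_table = select_attrs(data, data.domain.features)
#     # x_table holds the same instances, restricted to the feature columns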

class PLSRegressionLearner(base.BaseRegressionLearner):
    """ Fits the partial least squares regression model,
    i.e. learns the regression parameters. The implementation is based on
    the `Scikit learn python implementation`_.

    The class is derived from
    :class:`Orange.regression.base.BaseRegressionLearner`,
    which is used for preprocessing the data (continuization and imputation)
    before fitting the regression parameters.

    """

    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
                 algorithm="nipals", max_iter=500,
                 imputer=None, continuizer=None,
                 **kwds):
        """
        .. attribute:: n_comp

            number of components to keep. Default: 2

        .. attribute:: deflation_mode

            "canonical" or "regression" (default)

        .. attribute:: mode

            "CCA" or "PLS" (default)

        .. attribute:: algorithm

            The algorithm used to estimate the weights:
            "nipals" (default) or "svd"

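        Example, a minimal sketch::

            >>> learner = PLSRegressionLearner(n_comp=2, algorithm="nipals")
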
        """
        self.n_comp = n_comp
        self.deflation_mode = deflation_mode
        self.mode = mode
        self.algorithm = algorithm
        self.max_iter = max_iter
        self.set_imputer(imputer=imputer)
        self.set_continuizer(continuizer=continuizer)
        self.__dict__.update(kwds)

    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
        """
        :param table: data instances.
        :type table: :class:`Orange.data.Table`

        :param x_vars, y_vars: List of input and response variables
            (`Orange.data.variable.Continuous` or `Orange.data.variable.Discrete`).
            If None (default), the data definition is assumed to specify
            which variables are responses and which are not. If a variable var
            has the key "label" in the dictionary Orange.data.Domain[var].attributes,
            it is treated as a response variable.
        :type x_vars, y_vars: list

        """
        domain = table.domain
        if x_vars is None and y_vars is None:
            # Response variables are defined in the table.
            x_vars = domain.features
            if domain.class_var:
                y_vars = [domain.class_var]
                multitarget = False
            elif domain.class_vars:
                y_vars = domain.class_vars
                multitarget = True
            else:
                raise TypeError('Class-less domain (x-vars and y-vars needed).')
        elif x_vars and y_vars:
            # Independent and response variables are passed by the caller.
            multitarget = True
        else:
            raise ValueError("Both x_vars and y_vars must be defined.")

        x_table = select_attrs(table, x_vars)
        y_table = select_attrs(table, y_vars)

        # discrete values are continuized
        x_table = self.continuize_table(x_table)
        y_table = self.continuize_table(y_table)
        # missing values are imputed
        x_table = self.impute_table(x_table)
        y_table = self.impute_table(y_table)

        # Collect the new transformed x_vars/y_vars
        x_vars = list(x_table.domain.variables)
        y_vars = list(y_table.domain.variables)

        domain = Orange.data.Domain(x_vars + y_vars, False)

        x = x_table.to_numpy()[0]
        y = y_table.to_numpy()[0]

        kwargs = self.fit(x, y)
        return PLSRegression(domain=domain, x_vars=x_vars, y_vars=y_vars,
                             multitarget=multitarget, **kwargs)

    def fit(self, X, Y):
        """ Fits all unknown parameters, i.e.
        weights, scores, loadings (for x and y) and regression coefficients.
        Returns a dict with all of the parameters.

        """
        # copy since this will contain the residual (deflated) matrices
        X, Y = X.copy(), Y.copy()
        if Y.ndim == 1:
            Y = Y.reshape((Y.size, 1))
        n, p = X.shape
        q = Y.shape[1]

        # normalization of data matrices
        X, muX, sigmaX = normalize_matrix(X)
        Y, muY, sigmaY = normalize_matrix(Y)
        # Residual (deflated) matrices
        Xk, Yk = X, Y
        # Result matrices
        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))
        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))
        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))

        # NIPALS over components
        for k in xrange(self.n_comp):
            # Weights estimation (inner loop)
            if self.algorithm == "nipals":
                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode,
                                 max_iter=self.max_iter)
            elif self.algorithm == "svd":
                u, v = svd_xy(X=Xk, Y=Yk)
            # compute scores
            xScore, yScore = dot(Xk, u), dot(Yk, v)
            # Deflation (in place)
            # - regress Xk on xScore
            xLoadings = dot(Xk.T, xScore) / dot(xScore.T, xScore)
            # - subtract rank-one approximation to obtain remainder matrix
            Xk -= dot(xScore, xLoadings.T)
            if self.deflation_mode == "canonical":
                # - regress Yk on yScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, yScore) / dot(yScore.T, yScore)
                Yk -= dot(yScore, yLoadings.T)
            if self.deflation_mode == "regression":
                # - regress Yk on xScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, xScore) / dot(xScore.T, xScore)
                Yk -= dot(xScore, yLoadings.T)
            # Store weights, scores and loadings
            T[:, k] = xScore.ravel() # x-scores
            U[:, k] = yScore.ravel() # y-scores
            W[:, k] = u.ravel() # x-weights
            C[:, k] = v.ravel() # y-weights
            P[:, k] = xLoadings.ravel() # x-loadings
            Q[:, k] = yLoadings.ravel() # y-loadings
        # X = TP' + E and Y = UQ' + F

        # Rotations from input space to transformed space (scores)
        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
        # U = Y C(Q'C)^-1 = YC* (C* : q x k matrix)
        xRotations = dot(W, pinv(dot(P.T, W)))
        if Y.shape[1] > 1:
            yRotations = dot(C, pinv(dot(Q.T, C)))
        else:
            yRotations = numpy.ones(1)

        # Estimate the regression coefficients (needed for prediction
        # regardless of the deflation mode):
        # Y = TQ' + E = X W(P'W)^-1Q' + E = XB + E
        # => B = W*Q' (p x q)
        coefs = dot(xRotations, Q.T)
        coefs = 1. / sigmaX.reshape((p, 1)) * \
                coefs * sigmaY

        return {"mu_x": muX, "mu_y": muY, "sigma_x": sigmaX,
                "sigma_y": sigmaY, "T": T, "U": U, "W": W,
                "C": C, "P": P, "Q": Q, "x_rotations": xRotations,
                "y_rotations": yRotations, "coefs": coefs}

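# Sketch of how the fitted parameters map to predictions (this mirrors
# PLSRegression.__call__; `params` stands for the dict returned by fit):
#     Xc = (X - params["mu_x"]) / params["sigma_x"]
#     Y_hat = dot(Xc, params["coefs"]) * params["sigma_y"] + params["mu_y"]
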
deprecated_members({"nComp": "n_comp",
                    "deflationMode": "deflation_mode",
                    "maxIter": "max_iter"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegressionLearner)

class PLSRegression(Orange.classification.Classifier):
    """ PLSRegression predicts the values of the response variables
    based on the values of independent variables.

    Basic notation:
    n - number of data instances
    p - number of independent variables
    q - number of response variables

    .. attribute:: T

        A n x n_comp numpy array of x-scores

    .. attribute:: U

        A n x n_comp numpy array of y-scores

    .. attribute:: W

        A p x n_comp numpy array of x-weights

    .. attribute:: C

        A q x n_comp numpy array of y-weights

    .. attribute:: P

        A p x n_comp numpy array of x-loadings

    .. attribute:: Q

        A q x n_comp numpy array of y-loadings

    .. attribute:: coefs

        A p x q numpy array of coefficients
        of the linear model: Y = X coefs + E

    .. attribute:: x_vars

        list of independent variables

    .. attribute:: y_vars

        list of response variables

    """
    def __init__(self, domain=None, multitarget=False, coefs=None,
                 sigma_x=None, sigma_y=None, mu_x=None, mu_y=None,
                 x_vars=None, y_vars=None, **kwargs):
        self.domain = domain
        self.multitarget = multitarget
        self.coefs = coefs
        self.mu_x, self.mu_y = mu_x, mu_y
        self.sigma_x, self.sigma_y = sigma_x, sigma_y
        self.x_vars, self.y_vars = x_vars, y_vars

        for name, val in kwargs.items():
            setattr(self, name, val)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        :param instance: data instance for which the values of the response
            variables will be predicted
        :type instance: :class:`Orange.data.Instance`

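        For a multitarget model a list of predicted values (one per
        response variable) is returned, otherwise a single value.
        A minimal sketch, reusing the module-level example::

            >>> c = PLSRegressionLearner()(data, x_vars=x, y_vars=y)
            >>> y_hat = c(data[0])
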
        """
        instance = Orange.data.Instance(self.domain, instance)
        ins = [instance[v].native() for v in self.x_vars]

        if "?" in ins: # missing value -> corresponding coefficient omitted
            def miss_2_0(x): return x if x != "?" else 0
            ins = map(miss_2_0, ins)
        ins = numpy.array(ins)
        xc = (ins - self.mu_x) / self.sigma_x
        predicted = dot(xc, self.coefs) * self.sigma_y + self.mu_y
        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
        if result_type == Orange.core.GetValue:
            return y_hat if self.multitarget else y_hat[0]
        else:
            from Orange.statistics.distribution import Distribution
            probs = []
            for var, val in zip(self.y_vars, y_hat):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)
            if result_type == Orange.core.GetBoth:
                return (y_hat, probs) if self.multitarget else (y_hat[0], probs[0])
            else:
                return probs if self.multitarget else probs[0]

    def to_string(self):
        """ Pretty-prints the coefficients of the PLS regression model.
        """
        x_vars, y_vars = [x.name for x in self.x_vars], [y.name for y in self.y_vars]
        fmt = "%-6s " + "%-5.3f  " * len(y_vars)
        first = [" " * 7 + "%-6s " * len(y_vars) % tuple(y_vars)]
        lines = [fmt % tuple([x_vars[i]] + list(coef))
                 for i, coef in enumerate(self.coefs)]
        return '\n'.join(first + lines)

    def __str__(self):
        return self.to_string()

455    """
456    def transform(self, X, Y=None):
457
458        # Normalize
459        Xc = (X - self.muX) / self.sigmaX
460        if Y is not None:
461            Yc = (Y - self.muY) / self.sigmaY
462        # Apply rotation
463        xScores = dot(Xc, self.xRotations)
464        if Y is not None:
465            yScores = dot(Yc, self.yRotations)
466            return xScores, yScores
467
468        return xScores
469    """
470             
deprecated_members({"xVars": "x_vars",
                    "yVars": "y_vars",
                    "muX": "mu_x",
                    "muY": "mu_y",
                    "sigmaX": "sigma_x",
                    "sigmaY": "sigma_y"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegression)

if __name__ == "__main__":

    import Orange
    from Orange.regression import pls

    data = Orange.data.Table("test-pls.tab")
    l = pls.PLSRegressionLearner()

    x = data.domain.features
    y = data.domain.class_vars
    print x, y
    # c = l(data, x_vars=x, y_vars=y)
    c = l(data)
    print c
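
    # Predict the responses for the first instance (sketch; with a
    # multitarget model, as fit above, this prints a list of values):
    print c(data[0])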