source: orange/orange/Orange/regression/pls.py @ 9612:4a74a19879a1

Revision 9612:4a74a19879a1, 16.6 KB checked in by Lan Zagar <lan.zagar@…>, 2 years ago (diff)

PLS documentation updated.

Line 
1"""\
2==========================================
3Partial least sqaures regression (``PLS``)
4==========================================
5
6.. index:: regression
7
8.. _`Parital Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10Implementation is based on `Scikit learn python implementation`_
11
12Example
13-------
14
15The following code shows how to fit a PLS regression model on a multi-target data set.
16
17.. literalinclude:: code/pls-example.py
18
19Output ::
20
21    Input variables:     <FloatVariable 'X1', FloatVariable 'X2', FloatVariable 'X3'>
22    Response variables:  <FloatVariable 'Y1', FloatVariable 'Y2', FloatVariable 'Y3', FloatVariable 'Y4'>
23    Prediction for the first 2 data instances:
24
25    Actual     [<orange.Value 'Y1'='0.490'>, <orange.Value 'Y2'='1.237'>, <orange.Value 'Y3'='1.808'>, <orange.Value 'Y4'='0.422'>]
26    Predicted  [<orange.Value 'Y1'='0.613'>, <orange.Value 'Y2'='0.826'>, <orange.Value 'Y3'='1.084'>, <orange.Value 'Y4'='0.534'>]
27
28    Actual     [<orange.Value 'Y1'='0.167'>, <orange.Value 'Y2'='-0.664'>, <orange.Value 'Y3'='-1.378'>, <orange.Value 'Y4'='0.589'>]
29    Predicted  [<orange.Value 'Y1'='0.058'>, <orange.Value 'Y2'='-0.706'>, <orange.Value 'Y3'='-1.420'>, <orange.Value 'Y4'='0.599'>]
30
31    Regression coefficients:
32                       Y1           Y2           Y3           Y4
33          X1        0.714        2.153        3.590       -0.078
34          X2       -0.238       -2.500       -4.797       -0.036
35          X3        0.230       -0.314       -0.880       -0.060
36
37
38
39.. autoclass:: PLSRegressionLearner
40    :members:
41
42.. autoclass:: PLSRegression
43    :members:
44
45Utility functions
46-----------------
47
48.. autofunction:: normalize_matrix
49
50.. autofunction:: nipals_xy
51
52.. autofunction:: svd_xy
53
54.. _`Scikit learn python implementation`: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py
55
56"""
57
58import Orange
59import numpy
60
61from Orange.regression import base
62from Orange.regression.earth import data_label_mask
63from numpy import dot, zeros
64from numpy import linalg
65from numpy.linalg import svd, pinv
66
67from Orange.misc import deprecated_members, deprecated_keywords
68
69
70def normalize_matrix(X):
71    """ Normalizes matrix, i.e. subtracts column means
72    and divides them by column standard deviations.
73    Returns the standardized matrix, sample mean and
74    standard deviation
75
76    :param X: data matrix
77    :type X: :class:`numpy.array`
78   
79    """
80    mu_x, sigma_x = numpy.mean(X, axis=0), numpy.std(X, axis=0)
81    sigma_x[sigma_x == 0] = 1.
82    return (X - mu_x)/sigma_x, mu_x, sigma_x
83
84@deprecated_keywords({"maxIter": "max_iter"})
85def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
86    """
87    NIPALS algorithm. Returns the first left and rigth singular
88    vectors of X'Y.
89
90    :param X, Y: data matrix
91    :type X, Y: :class:`numpy.array`
92
93    :param mode: possible values "PLS" (default) or "CCA"
94    :type mode: string
95
96    :param max_iter: maximal number of iterations (default 500)
97    :type max_iter: int
98
99    :param tol: tolerance parameter, if norm of difference
100        between two successive left singular vectors is less than tol,
101        iteration is stopped
102    :type tol: a not negative float
103           
104    """
105    yScore, uOld, ite = Y[:, [0]], 0, 1
106    Xpinv = Ypinv = None
107    # Inner loop of the Wold algo.
108    while True and ite < max_iter:
109        # Update u: the X weights
110        if mode == "CCA":
111            if Xpinv is None:
112                Xpinv = linalg.pinv(X) # compute once pinv(X)
113            u = dot(Xpinv, yScore)
114        else: # mode PLS
115        # Mode PLS regress each X column on yScore
116            u = dot(X.T, yScore) / dot(yScore.T, yScore)
117        # Normalize u
118        u /= numpy.sqrt(dot(u.T, u))
119        # Update xScore: the X latent scores
120        xScore = dot(X, u)
121
122        # Update v: the Y weights
123        if mode == "CCA":
124            if Ypinv is None:
125                Ypinv = linalg.pinv(Y) # compute once pinv(Y)
126            v = dot(Ypinv, xScore)
127        else:
128            # Mode PLS regress each X column on yScore
129            v = dot(Y.T, xScore) / dot(xScore.T, xScore)
130        # Normalize v
131        v /= numpy.sqrt(dot(v.T, v))
132        # Update yScore: the Y latent scores
133        yScore = dot(Y, v)
134
135        uDiff = u - uOld
136        if dot(uDiff.T, uDiff) < tol or Y.shape[1] == 1:
137            break
138        uOld = u
139        ite += 1
140    return u, v
141
142def svd_xy(X, Y):
143    """ Returns the first left and right singular
144    vectors of X'Y.
145
146    :param X, Y: data matrix
147    :type X, Y: :class:`numpy.array`   
148   
149    """
150    U, s, V = svd(dot(X.T, Y), full_matrices=False)
151    u = U[:, [0]]
152    v = V.T[:, [0]]
153    return u, v
154
155
156def select_attrs(table, attributes, class_var=None, metas=None):
157    """ Select only ``attributes`` from the ``table``.
158    """
159    domain = Orange.data.Domain(attributes, class_var)
160    if metas:
161        domain.add_metas(metas)
162    return Orange.data.Table(domain, table)
163
164
165class PLSRegressionLearner(base.BaseRegressionLearner):
166    """ Fits the partial least squares regression model,
167    i.e. learns the regression parameters. The implementation is based on
168    `Scikit learn python implementation`_
169   
170    The class is derived from
171    :class:`Orange.regression.base.BaseRegressionLearner`
172    which is used for preprocessing the data (continuization and imputation)
173    before fitting the regression parameters
174   
175    """
176
177    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
178                 algorithm="nipals", max_iter=500, 
179                 imputer=None, continuizer=None,
180                 **kwds):
181        """
182        .. attribute:: n_comp
183   
184            number of components to keep. Default: 2
185
186        .. attribute:: deflation_mode
187   
188            "canonical" or "regression" (default)
189
190        .. attribute:: mode
191   
192            "CCA" or "PLS" (default)
193
194
195        .. attribute:: algorithm
196   
197            The algorithm used to estimate the weights:
198            "nipals" or "svd" (default)
199
200
201        """       
202        self.n_comp = n_comp
203        self.deflation_mode = deflation_mode
204        self.mode = mode
205        self.algorithm = algorithm
206        self.max_iter = max_iter
207        self.set_imputer(imputer=imputer)
208        self.set_continuizer(continuizer=continuizer)
209        self.__dict__.update(kwds)
210
211    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
212    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
213        """
214        :param table: data instances.
215        :type table: :class:`Orange.data.Table`
216
217        :param x_vars, y_vars: List of input and response variables
218            (:obj:`Orange.data.variable.Continuous` or
219            :obj:`Orange.data.variable.Discrete`). If None (default) it is
220            assumed that the data domain provides information which variables
221            are reponses and which are not. If data has
222            :obj:`~Orange.data.Domain.class_var` defined in its domain, a
223            single-target regression learner is constructed. Otherwise a
224            multi-target learner predicting response variables defined by
225            :obj:`~Orange.data.Domain.class_vars` is constructed.
226        :type x_vars, y_vars: list           
227
228        """     
229        domain = table.domain
230        if x_vars is None and y_vars is None:
231            # Response variables are defined in the table.
232            x_vars = domain.features
233            if domain.class_var:
234                y_vars = [domain.class_var]
235                multitarget = False
236            elif domain.class_vars:
237                y_vars = domain.class_vars
238                multitarget = True
239            else:
240                raise TypeError('Class-less domain (x-vars and y-vars needed).')
241            x_table = select_attrs(table, x_vars)
242            y_table = select_attrs(table, y_vars)
243        elif x_vars and y_vars:
244            # independent and response variables are passed by the caller
245            multitarget = True
246        else:
247            raise ValueError("Both x_vars and y_vars must be defined.")
248
249        x_table = select_attrs(table, x_vars)
250        y_table = select_attrs(table, y_vars)
251
252        # dicrete values are continuized       
253        x_table = self.continuize_table(x_table)
254        y_table = self.continuize_table(y_table)
255        # missing values are imputed
256        x_table = self.impute_table(x_table)
257        y_table = self.impute_table(y_table)
258       
259        # Collect the new transformed x_vars/y_vars
260        x_vars = list(x_table.domain.variables)
261        y_vars = list(y_table.domain.variables)
262       
263        domain = Orange.data.Domain(x_vars + y_vars, False)
264       
265        x = x_table.to_numpy()[0]
266        y = y_table.to_numpy()[0]
267       
268        kwargs = self.fit(x, y)
269        return PLSRegression(domain=domain, x_vars=x_vars, y_vars=y_vars,
270                             multitarget=multitarget, **kwargs)
271
272    def fit(self, X, Y):
273        """ Fits all unknown parameters, i.e.
274        weights, scores, loadings (for x and y) and regression coefficients.
275        Returns a dict with all of the parameters.
276       
277        """
278        # copy since this will contain the residuals (deflated) matrices
279
280        X, Y = X.copy(), Y.copy()
281        if Y.ndim == 1:
282            Y = Y.reshape((Y.size, 1))
283        n, p = X.shape
284        q = Y.shape[1]
285
286        # normalization of data matrices
287        X, muX, sigmaX = normalize_matrix(X)
288        Y, muY, sigmaY = normalize_matrix(Y)
289        # Residuals (deflated) matrices
290        Xk, Yk = X, Y
291        # Results matrices
292        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))
293        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))
294        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))     
295
296        # NIPALS over components
297        for k in xrange(self.n_comp):
298            # Weights estimation (inner loop)
299            if self.algorithm == "nipals":
300                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode, 
301                                 max_iter=self.max_iter)
302            elif self.algorithm == "svd":
303                u, v = svd_xy(X=Xk, Y=Yk)
304            # compute scores
305            xScore, yScore = dot(Xk, u), dot(Yk, v)
306            # Deflation (in place)
307            # - regress Xk's on xScore
308            xLoadings = dot(Xk.T, xScore) / dot(xScore.T, xScore)
309            # - substract rank-one approximations to obtain remainder matrix
310            Xk -= dot(xScore, xLoadings.T)
311            if self.deflation_mode == "canonical":
312                # - regress Yk's on yScore, then substract rank-one approx.
313                yLoadings = dot(Yk.T, yScore) / dot(yScore.T, yScore)
314                Yk -= dot(yScore, yLoadings.T)
315            if self.deflation_mode == "regression":
316                # - regress Yk's on xScore, then substract rank-one approx.
317                yLoadings = dot(Yk.T, xScore) / dot(xScore.T, xScore)
318                Yk -= dot(xScore, yLoadings.T)
319            # Store weights, scores and loadings
320            T[:, k] = xScore.ravel() # x-scores
321            U[:, k] = yScore.ravel() # y-scores
322            W[:, k] = u.ravel() # x-weights
323            C[:, k] = v.ravel() # y-weights
324            P[:, k] = xLoadings.ravel() # x-loadings
325            Q[:, k] = yLoadings.ravel() # y-loadings
326        # X = TP' + E and Y = UQ' + E
327
328        # Rotations from input space to transformed space (scores)
329        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
330        # U = Y C(Q'C)^-1 = YC* (W* : q x k matrix)
331        xRotations = dot(W, pinv(dot(P.T, W)))
332        if Y.shape[1] > 1:
333            yRotations = dot(C, pinv(dot(Q.T, C)))
334        else:
335            yRotations = numpy.ones(1)
336
337        if True or self.deflation_mode == "regression":
338            # Estimate regression coefficient
339            # Y = TQ' + E = X W(P'W)^-1Q' + E = XB + E
340            # => B = W*Q' (p x q)
341            coefs = dot(xRotations, Q.T)
342            coefs = 1. / sigmaX.reshape((p, 1)) * \
343                    coefs * sigmaY
344       
345        return {"mu_x": muX, "mu_y": muY, "sigma_x": sigmaX,
346                "sigma_y": sigmaY, "T": T, "U":U, "W":U, 
347                "C": C, "P":P, "Q":Q, "x_rotations": xRotations,
348                "y_rotations": yRotations, "coefs": coefs}
349
350deprecated_members({"nComp": "n_comp",
351                    "deflationMode": "deflation_mode",
352                    "maxIter": "max_iter"},
353                   wrap_methods=["__init__"],
354                   in_place=True)(PLSRegressionLearner)
355
356class PLSRegression(Orange.classification.Classifier):
357    """ PLSRegression predicts value of the response variables
358    based on the values of independent variables.
359   
360    Basic notations:
361    n - number of data instances
362    p - number of independent variables
363    q - number of reponse variables
364
365    .. attribute:: T
366   
367        A n x n_comp numpy array of x-scores
368
369    .. attribute:: U
370   
371        A n x n_comp numpy array of y-scores
372
373    .. attribute:: W
374   
375        A p x n_comp numpy array of x-weights
376
377    .. attribute:: C
378   
379        A q x n_comp numpy array of y-weights
380
381    .. attribute:: P
382   
383        A p x n_comp numpy array of x-loadings
384
385    .. attribute:: Q
386   
387        A q x n_comp numpy array of y-loading
388
389    .. attribute:: coefs
390   
391        A p x q numpy array coefficients
392        of the linear model: Y = X coefs + E
393
394    .. attribute:: x_vars
395   
396        list of independent variables
397
398    .. attribute:: y_vars
399   
400        list of response variables
401       
402    """
403    def __init__(self, domain=None, multitarget=False, coefs=None, sigma_x=None, sigma_y=None,
404                 mu_x=None, mu_y=None, x_vars=None, y_vars=None, **kwargs):
405        self.domain = domain
406        self.multitarget = multitarget
407        self.coefs = coefs
408        self.mu_x, self.mu_y = mu_x, mu_y
409        self.sigma_x, self.sigma_y = sigma_x, sigma_y
410        self.x_vars, self.y_vars = x_vars, y_vars
411           
412        for name, val in kwargs.items():
413            setattr(self, name, val)
414
415    def __call__(self, instance, result_type=Orange.core.GetValue):
416        """
417        :param instance: data instance for which the value of the response
418            variable will be predicted
419        :type instance: :class:`Orange.data.Instance`
420       
421        """ 
422        instance = Orange.data.Instance(self.domain, instance)
423        ins = [instance[v].native() for v in self.x_vars]
424       
425        if "?" in ins: # missing value -> corresponding coefficient omitted
426            def miss_2_0(x): return x if x != "?" else 0
427            ins = map(miss_2_0, ins)
428        ins = numpy.array(ins)
429        xc = (ins - self.mu_x)
430        predicted = dot(xc, self.coefs) + self.mu_y
431        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
432        if result_type == Orange.core.GetValue:
433            return y_hat if self.multitarget else y_hat[0]
434        else:
435            from Orange.statistics.distribution import Distribution
436            probs = []
437            for var, val in zip(self.y_vars, y_hat):
438                dist = Distribution(var)
439                dist[val] = 1.0
440                probs.append(dist)
441            if result_type == Orange.core.GetBoth:
442                return (y_hat, probs) if self.multitarget else (y_hat[0], probs[0])
443            else:
444                return probs if self.multitarget else probs[0]
445           
446    def to_string(self):
447        """ Pretty-prints the coefficient of the PLS regression model.
448        """       
449        x_vars, y_vars = [x.name for x in self.x_vars], [y.name for y in self.y_vars]
450        fmt = "%8s " + "%12.3f " * len(y_vars)
451        first = [" " * 8 + "%13s" * len(y_vars) % tuple(y_vars)]
452        lines = [fmt % tuple([x_vars[i]] + list(coef))
453                 for i, coef in enumerate(self.coefs)]
454        return '\n'.join(first + lines)
455           
456    def __str__(self):
457        return self.to_string()
458
459    """
460    def transform(self, X, Y=None):
461
462        # Normalize
463        Xc = (X - self.muX) / self.sigmaX
464        if Y is not None:
465            Yc = (Y - self.muY) / self.sigmaY
466        # Apply rotation
467        xScores = dot(Xc, self.xRotations)
468        if Y is not None:
469            yScores = dot(Yc, self.yRotations)
470            return xScores, yScores
471
472        return xScores
473    """
474             
475deprecated_members({"xVars": "x_vars", 
476                    "yVars": "y_vars",
477                    "muX": "mu_x",
478                    "muY": "mu_y",
479                    "sigmaX": "sigma_x",
480                    "sigmaY": "sigma_y"},
481                   wrap_methods=["__init__"],
482                   in_place=True)(PLSRegression)
483                   
484if __name__ == "__main__":
485
486    import Orange
487    from Orange.regression import pls
488
489    data = Orange.data.Table("multitarget-synthetic")
490    l = pls.PLSRegressionLearner()
491
492    x = data.domain.features
493    y = data.domain.class_vars
494    print x, y
495    # c = l(data, x_vars=x, y_vars=y)
496    c = l(data)
497    print c
Note: See TracBrowser for help on using the repository browser.