source: orange/Orange/regression/pls.py @ 9671:a7b056375472

Revision 9671:a7b056375472, 16.5 KB checked in by anze <anze.staric@…>, 2 years ago (diff)

Moved orange to Orange (part 2)

Line 
1"""\
2==========================================
3Partial least squares regression (``PLS``)
4==========================================
5
6.. index:: regression
7
8.. _`Partial Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10Implementation is based on `Scikit learn python implementation`_
11
12Example
13-------
14
15The following code shows how to fit a PLS regression model on a multi-target data set.
16
17.. literalinclude:: code/pls-example.py
18
19Output ::
20
21    Input variables:     <FloatVariable 'X1', FloatVariable 'X2', FloatVariable 'X3'>
22    Response variables:  <FloatVariable 'Y1', FloatVariable 'Y2', FloatVariable 'Y3', FloatVariable 'Y4'>
23    Prediction for the first 2 data instances:
24
25    Actual     [<orange.Value 'Y1'='0.490'>, <orange.Value 'Y2'='1.237'>, <orange.Value 'Y3'='1.808'>, <orange.Value 'Y4'='0.422'>]
26    Predicted  [<orange.Value 'Y1'='0.613'>, <orange.Value 'Y2'='0.826'>, <orange.Value 'Y3'='1.084'>, <orange.Value 'Y4'='0.534'>]
27
28    Actual     [<orange.Value 'Y1'='0.167'>, <orange.Value 'Y2'='-0.664'>, <orange.Value 'Y3'='-1.378'>, <orange.Value 'Y4'='0.589'>]
29    Predicted  [<orange.Value 'Y1'='0.058'>, <orange.Value 'Y2'='-0.706'>, <orange.Value 'Y3'='-1.420'>, <orange.Value 'Y4'='0.599'>]
30
31    Regression coefficients:
32                       Y1           Y2           Y3           Y4
33          X1        0.714        2.153        3.590       -0.078
34          X2       -0.238       -2.500       -4.797       -0.036
35          X3        0.230       -0.314       -0.880       -0.060
36
37
38
39.. autoclass:: PLSRegressionLearner
40    :members:
41
42.. autoclass:: PLSRegression
43    :members:
44
45Utility functions
46-----------------
47
48.. autofunction:: normalize_matrix
49
50.. autofunction:: nipals_xy
51
52.. autofunction:: svd_xy
53
54.. _`Scikit learn python implementation`: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py
55
56"""
57
58import Orange
59import numpy
60
61from Orange.regression import base
62from Orange.regression.earth import data_label_mask
63from numpy import dot, zeros
64from numpy import linalg
65from numpy.linalg import svd, pinv
66
67from Orange.misc import deprecated_members, deprecated_keywords
68
69
def normalize_matrix(X):
    """ Standardize a matrix column-wise: subtract each column's mean
    and divide by its standard deviation.  Columns with zero standard
    deviation are divided by 1 instead, so constant columns are only
    centered, never scaled.

    :param X: data matrix
    :type X: :class:`numpy.array`

    :return: tuple of (standardized matrix, column means, column stds)
    """
    mean = numpy.mean(X, axis=0)
    std = numpy.std(X, axis=0)
    # Guard against division by zero for constant columns.
    std[std == 0] = 1.
    standardized = (X - mean) / std
    return standardized, mean, std
83
@deprecated_keywords({"maxIter": "max_iter"})
def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
    """
    NIPALS algorithm. Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    :param mode: possible values "PLS" (default) or "CCA"
    :type mode: string

    :param max_iter: maximal number of iterations (default 500)
    :type max_iter: int

    :param tol: tolerance parameter; if the squared norm of the
        difference between two successive left singular vectors is
        less than tol, iteration is stopped
    :type tol: a non-negative float

    """
    yScore, uOld, ite = Y[:, [0]], 0, 1
    Xpinv = Ypinv = None
    # Inner loop of the Wold algorithm.
    while ite < max_iter:  # was "True and ite < max_iter"; "True and" was redundant
        # Update u: the X weights
        if mode == "CCA":
            if Xpinv is None:
                Xpinv = linalg.pinv(X) # compute pinv(X) only once
            u = dot(Xpinv, yScore)
        else:
            # Mode PLS: regress each X column on yScore
            u = dot(X.T, yScore) / dot(yScore.T, yScore)
        # Normalize u to unit length
        u /= numpy.sqrt(dot(u.T, u))
        # Update xScore: the X latent scores
        xScore = dot(X, u)

        # Update v: the Y weights
        if mode == "CCA":
            if Ypinv is None:
                Ypinv = linalg.pinv(Y) # compute pinv(Y) only once
            v = dot(Ypinv, xScore)
        else:
            # Mode PLS: regress each Y column on xScore
            v = dot(Y.T, xScore) / dot(xScore.T, xScore)
        # Normalize v to unit length
        v /= numpy.sqrt(dot(v.T, v))
        # Update yScore: the Y latent scores
        yScore = dot(Y, v)

        # Convergence check: stop when u has stabilized.  With a single
        # response column a single pass suffices.
        uDiff = u - uOld
        if dot(uDiff.T, uDiff) < tol or Y.shape[1] == 1:
            break
        uOld = u
        ite += 1
    return u, v
141
def svd_xy(X, Y):
    """ Return the first left and right singular vectors of X'Y,
    computed via a (reduced) singular value decomposition.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    """
    left, _, right = svd(dot(X.T, Y), full_matrices=False)
    # Keep each leading vector as a column matrix.
    return left[:, [0]], right.T[:, [0]]
154
155
def select_attrs(table, attributes, class_var=None, metas=None):
    """ Build a new table restricted to ``attributes`` (plus an optional
    class variable and meta attributes) from the given ``table``.
    """
    new_domain = Orange.data.Domain(attributes, class_var)
    if metas:
        new_domain.add_metas(metas)
    return Orange.data.Table(new_domain, table)
163
164
class PLSRegressionLearner(base.BaseRegressionLearner):
    """ Fits the partial least squares regression model,
    i.e. learns the regression parameters. The implementation is based on
    `Scikit learn python implementation`_

    The class is derived from
    :class:`Orange.regression.base.BaseRegressionLearner`
    which is used for preprocessing the data (continuization and imputation)
    before fitting the regression parameters

    """

    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
                 algorithm="nipals", max_iter=500,
                 imputer=None, continuizer=None,
                 **kwds):
        """
        .. attribute:: n_comp

            number of components to keep. Default: 2

        .. attribute:: deflation_mode

            "canonical" or "regression" (default)

        .. attribute:: mode

            "CCA" or "PLS" (default)

        .. attribute:: algorithm

            The algorithm used to estimate the weights:
            "nipals" or "svd" (default)

        """
        self.n_comp = n_comp
        self.deflation_mode = deflation_mode
        self.mode = mode
        self.algorithm = algorithm
        self.max_iter = max_iter
        self.set_imputer(imputer=imputer)
        self.set_continuizer(continuizer=continuizer)
        self.__dict__.update(kwds)

    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
        """
        :param table: data instances.
        :type table: :class:`Orange.data.Table`

        :param x_vars, y_vars: List of input and response variables
            (:obj:`Orange.data.variable.Continuous` or
            :obj:`Orange.data.variable.Discrete`). If None (default) it is
            assumed that the data domain provides information which variables
            are reponses and which are not. If data has
            :obj:`~Orange.data.Domain.class_var` defined in its domain, a
            single-target regression learner is constructed. Otherwise a
            multi-target learner predicting response variables defined by
            :obj:`~Orange.data.Domain.class_vars` is constructed.
        :type x_vars, y_vars: list

        """
        domain = table.domain
        if x_vars is None and y_vars is None:
            # Response variables are defined in the table.
            x_vars = domain.features
            if domain.class_var:
                y_vars = [domain.class_var]
            elif domain.class_vars:
                y_vars = domain.class_vars
            else:
                raise TypeError('Class-less domain (x-vars and y-vars needed).')
        elif not (x_vars and y_vars):
            raise ValueError("Both x_vars and y_vars must be defined.")

        # Build the x/y tables once (the original built them twice when
        # variables came from the domain).
        x_table = select_attrs(table, x_vars)
        y_table = select_attrs(table, y_vars)

        # discrete values are continuized
        x_table = self.continuize_table(x_table)
        y_table = self.continuize_table(y_table)
        # missing values are imputed
        x_table = self.impute_table(x_table)
        y_table = self.impute_table(y_table)

        # Collect the new transformed x_vars/y_vars
        x_vars = list(x_table.domain.variables)
        y_vars = list(y_table.domain.variables)

        domain = Orange.data.Domain(x_vars + y_vars, False)
        multitarget = len(y_vars) > 1

        x = x_table.to_numpy()[0]
        y = y_table.to_numpy()[0]

        kwargs = self.fit(x, y)
        return PLSRegression(domain=domain, x_vars=x_vars, y_vars=y_vars,
                             multitarget=multitarget, **kwargs)

    def fit(self, X, Y):
        """ Fits all unknown parameters, i.e.
        weights, scores, loadings (for x and y) and regression coefficients.
        Returns a dict with all of the parameters.

        """
        # copy since this will contain the residuals (deflated) matrices
        X, Y = X.copy(), Y.copy()
        if Y.ndim == 1:
            Y = Y.reshape((Y.size, 1))
        n, p = X.shape
        q = Y.shape[1]

        # normalization of data matrices
        X, muX, sigmaX = normalize_matrix(X)
        Y, muY, sigmaY = normalize_matrix(Y)
        # Residuals (deflated) matrices
        Xk, Yk = X, Y
        # Results matrices
        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))
        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))
        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))

        # NIPALS over components
        for k in xrange(self.n_comp):
            # Weights estimation (inner loop)
            if self.algorithm == "nipals":
                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode,
                                 max_iter=self.max_iter)
            elif self.algorithm == "svd":
                u, v = svd_xy(X=Xk, Y=Yk)
            # compute scores
            xScore, yScore = dot(Xk, u), dot(Yk, v)
            # Deflation (in place)
            # - regress Xk's on xScore
            xLoadings = dot(Xk.T, xScore) / dot(xScore.T, xScore)
            # - subtract rank-one approximation to obtain remainder matrix
            Xk -= dot(xScore, xLoadings.T)
            if self.deflation_mode == "canonical":
                # - regress Yk's on yScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, yScore) / dot(yScore.T, yScore)
                Yk -= dot(yScore, yLoadings.T)
            if self.deflation_mode == "regression":
                # - regress Yk's on xScore, then subtract rank-one approx.
                yLoadings = dot(Yk.T, xScore) / dot(xScore.T, xScore)
                Yk -= dot(xScore, yLoadings.T)
            # Store weights, scores and loadings
            T[:, k] = xScore.ravel() # x-scores
            U[:, k] = yScore.ravel() # y-scores
            W[:, k] = u.ravel() # x-weights
            C[:, k] = v.ravel() # y-weights
            P[:, k] = xLoadings.ravel() # x-loadings
            Q[:, k] = yLoadings.ravel() # y-loadings
        # X = TP' + E and Y = UQ' + E

        # Rotations from input space to transformed space (scores)
        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
        # U = Y C(Q'C)^-1 = YC* (C* : q x k matrix)
        xRotations = dot(W, pinv(dot(P.T, W)))
        if Y.shape[1] > 1:
            yRotations = dot(C, pinv(dot(Q.T, C)))
        else:
            yRotations = numpy.ones(1)

        # Estimate regression coefficients (always computed; the original
        # "if True or deflation_mode == 'regression'" was a dead condition).
        # Y = TQ' + E = X W(P'W)^-1Q' + E = XB + E  =>  B = W*Q' (p x q)
        coefs = dot(xRotations, Q.T)
        # Rescale so that coefficients apply to the raw (unnormalized) data.
        coefs = 1. / sigmaX.reshape((p, 1)) * \
                coefs * sigmaY

        # Bug fix: "W" previously mapped to U (y-scores) instead of the
        # x-weights matrix W.
        return {"mu_x": muX, "mu_y": muY, "sigma_x": sigmaX,
                "sigma_y": sigmaY, "T": T, "U": U, "W": W,
                "C": C, "P": P, "Q": Q, "x_rotations": xRotations,
                "y_rotations": yRotations, "coefs": coefs}
345
# Backward-compatibility aliases: map the old camelCase constructor
# keyword names onto the new underscore names, rewrapping __init__ of
# PLSRegressionLearner in place.
deprecated_members({"nComp": "n_comp",
                    "deflationMode": "deflation_mode",
                    "maxIter": "max_iter"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegressionLearner)
351
class PLSRegression(Orange.classification.Classifier):
    """ PLSRegression predicts value of the response variables
    based on the values of independent variables.

    Basic notations:
    n - number of data instances
    p - number of independent variables
    q - number of response variables

    .. attribute:: T

        A n x n_comp numpy array of x-scores

    .. attribute:: U

        A n x n_comp numpy array of y-scores

    .. attribute:: W

        A p x n_comp numpy array of x-weights

    .. attribute:: C

        A q x n_comp numpy array of y-weights

    .. attribute:: P

        A p x n_comp numpy array of x-loadings

    .. attribute:: Q

        A q x n_comp numpy array of y-loadings

    .. attribute:: coefs

        A p x q numpy array of coefficients
        of the linear model: Y = X coefs + E

    .. attribute:: x_vars

        list of independent variables

    .. attribute:: y_vars

        list of response variables

    """
    def __init__(self, domain=None, multitarget=False, coefs=None, sigma_x=None, sigma_y=None,
                 mu_x=None, mu_y=None, x_vars=None, y_vars=None, **kwargs):
        self.domain = domain
        self.multitarget = multitarget
        self.coefs = coefs
        self.mu_x, self.mu_y = mu_x, mu_y
        self.sigma_x, self.sigma_y = sigma_x, sigma_y
        self.x_vars, self.y_vars = x_vars, y_vars

        # Any remaining fitted parameters (T, U, W, C, P, Q, rotations, ...)
        # are attached as attributes.
        for name, val in kwargs.items():
            setattr(self, name, val)

    def __call__(self, instance, result_type=Orange.core.GetValue):
        """
        :param instance: data instance for which the value of the response
            variable will be predicted
        :type instance: :class:`Orange.data.Instance`

        """
        instance = Orange.data.Instance(self.domain, instance)
        ins = [instance[v].native() for v in self.x_vars]

        if "?" in ins: # missing value -> corresponding coefficient omitted
            def miss_2_0(x): return x if x != "?" else 0
            ins = map(miss_2_0, ins)
        ins = numpy.array(ins)
        # Center the input; coefs are already rescaled to raw units.
        xc = (ins - self.mu_x)
        predicted = dot(xc, self.coefs) + self.mu_y
        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
        if result_type == Orange.core.GetValue:
            return y_hat if self.multitarget else y_hat[0]
        else:
            from Orange.statistics.distribution import Distribution
            probs = []
            for var, val in zip(self.y_vars, y_hat):
                dist = Distribution(var)
                dist[val] = 1.0
                probs.append(dist)
            if result_type == Orange.core.GetBoth:
                return (y_hat, probs) if self.multitarget else (y_hat[0], probs[0])
            else:
                return probs if self.multitarget else probs[0]

    def to_string(self):
        """ Pretty-prints the coefficients of the PLS regression model.
        """
        x_vars, y_vars = [x.name for x in self.x_vars], [y.name for y in self.y_vars]
        fmt = "%8s " + "%12.3f " * len(y_vars)
        first = [" " * 8 + "%13s" * len(y_vars) % tuple(y_vars)]
        lines = [fmt % tuple([x_vars[i]] + list(coef))
                 for i, coef in enumerate(self.coefs)]
        return '\n'.join(first + lines)

    def __str__(self):
        return self.to_string()

    # NOTE(review): a commented-out ``transform`` method (kept here as a
    # stray triple-quoted string literal) was removed as dead code; it can
    # be recovered from revision history if score-space projection is needed.
470             
# Backward-compatibility aliases: map the old camelCase attribute/keyword
# names onto the new underscore names, rewrapping __init__ of
# PLSRegression in place.
deprecated_members({"xVars": "x_vars", 
                    "yVars": "y_vars",
                    "muX": "mu_x",
                    "muY": "mu_y",
                    "sigmaX": "sigma_x",
                    "sigmaY": "sigma_y"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegression)
479                   
480if __name__ == "__main__":
481
482    import Orange
483    from Orange.regression import pls
484
485    data = Orange.data.Table("multitarget-synthetic")
486    l = pls.PLSRegressionLearner()
487
488    x = data.domain.features
489    y = data.domain.class_vars
490    print x, y
491    # c = l(data, x_vars=x, y_vars=y)
492    c = l(data)
493    print c
Note: See TracBrowser for help on using the repository browser.