source: orange/orange/Orange/regression/pls.py @ 9248:2bba59e63e7b

Revision 9248:2bba59e63e7b, 17.1 KB checked in by ales_erjavec <ales.erjavec@…>, 2 years ago (diff)

CamelCase parameters/variables to underscore.
Moved all model parameters from the learner to the predictor class.

Line 
1"""\
2==========================================
Partial least squares regression (``PLS``)
4==========================================
5
6.. index:: regression
7
.. _`Partial Least Squares Regression`: http://en.wikipedia.org/wiki/Partial_least_squares_regression
9
10Implementation is based on `Scikit learn python implementation`_
11
12Example ::
13
14    >>> import Orange
    >>> from Orange.regression import pls
16    >>> table = Orange.data.Table("test-pls.tab")
17    >>> # set independent and response variables
18    >>> x = [var for var in table.domain if var.name[0]=="X"]
19    >>> y = [var for var in table.domain if var.name[0]=="Y"]
20    >>> print x
21        [FloatVariable 'X1', FloatVariable 'X2', FloatVariable 'X3']
22    >>> print y
23        [FloatVariable 'Y1', FloatVariable 'Y2', FloatVariable 'Y3', FloatVariable 'Y4']
24    >>> # The information which variables are independent and which are responses
25    >>> # can be provided in the data definition.
26    >>> # If a variable var has key "label" in dictionary Orange.data.Domain[var].attributes
27    >>> # it is considered as a response variable.
28    >>> # In such situation x and y do not need to be specified.
29    >>> l = pls.PLSRegressionLearner()
30    >>> c = l(table, x_vars=x, y_vars=y)
31    >>> c.print_pls_regression_coefficients()
32           Y1     Y2     Y3     Y4     
33    X1     0.513  0.915  0.341  -0.069 
34    X2     0.641  -1.044  0.249  -0.015 
35    X3     1.393  0.050  0.729  -0.108 
36    >>>
37
38.. autoclass:: PLSRegressionLearner
39    :members:
40
41.. autoclass:: PLSRegression
42    :members:
43
44Utility functions
45-----------------
46
47.. autofunction:: normalize_matrix
48
49.. autofunction:: nipals_xy
50
51.. autofunction:: svd_xy
52
53.. _`Scikit learn python implementation`: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/pls.py
54
55"""
56
57import Orange
58import numpy
59
60from Orange.regression import base
61from Orange.regression.earth import data_label_mask
62from numpy import dot, zeros
63from numpy import linalg
64from numpy.linalg import svd, pinv
65
66from Orange.misc import deprecated_members, deprecated_keywords
67
68
def normalize_matrix(X):
    """ Standardize a data matrix column-wise.

    Subtracts each column's mean and divides by its standard
    deviation; columns with zero deviation are divided by 1 so
    they pass through unscaled. Returns the standardized matrix
    together with the column means and standard deviations.

    :param X: data matrix
    :type X: :class:`numpy.array`

    """
    mean = numpy.mean(X, axis=0)
    std = numpy.std(X, axis=0)
    # Guard against division by zero for constant columns.
    std[std == 0] = 1.
    return (X - mean) / std, mean, std
82
@deprecated_keywords({"maxIter": "max_iter"})
def nipals_xy(X, Y, mode="PLS", max_iter=500, tol=1e-06):
    """
    NIPALS algorithm. Returns the first left and right singular
    vectors of X'Y.

    :param X, Y: data matrix
    :type X, Y: :class:`numpy.array`

    :param mode: possible values "PLS" (default) or "CCA"
    :type mode: string

    :param max_iter: maximal number of iterations (default 500)
    :type max_iter: int

    :param tol: tolerance parameter; the iteration is stopped when the
        squared norm of the difference between two successive left
        singular vectors is less than tol
    :type tol: a not negative float

    """
    y_score, u_old, ite = Y[:, [0]], 0, 1
    x_pinv = y_pinv = None
    # Inner loop of the Wold algorithm.
    while ite < max_iter:
        # Update u: the X weights
        if mode == "CCA":
            if x_pinv is None:
                x_pinv = linalg.pinv(X)  # compute pinv(X) only once
            u = dot(x_pinv, y_score)
        else:
            # Mode PLS: regress each X column on y_score
            u = dot(X.T, y_score) / dot(y_score.T, y_score)
        # Normalize u
        u /= numpy.sqrt(dot(u.T, u))
        # Update x_score: the X latent scores
        x_score = dot(X, u)

        # Update v: the Y weights
        if mode == "CCA":
            if y_pinv is None:
                y_pinv = linalg.pinv(Y)  # compute pinv(Y) only once
            v = dot(y_pinv, x_score)
        else:
            # Mode PLS: regress each Y column on x_score
            v = dot(Y.T, x_score) / dot(x_score.T, x_score)
        # Normalize v
        v /= numpy.sqrt(dot(v.T, v))
        # Update y_score: the Y latent scores
        y_score = dot(Y, v)

        # Converged when u stops moving; a single-column Y needs no
        # further iterations at all.
        u_diff = u - u_old
        if dot(u_diff.T, u_diff) < tol or Y.shape[1] == 1:
            break
        u_old = u
        ite += 1
    return u, v
140
141def svd_xy(X, Y):
142    """ Returns the first left and right singular
143    vectors of X'Y.
144
145    :param X, Y: data matrix
146    :type X, Y: :class:`numpy.array`   
147   
148    """
149    U, s, V = svd(dot(X.T, Y), full_matrices=False)
150    u = U[:, [0]]
151    v = V.T[:, [0]]
152    return u, v
153
154
def select_attrs(table, attributes, class_var=None, metas=None):
    """ Create a new table that keeps only ``attributes`` from ``table``.
    """
    new_domain = Orange.data.Domain(attributes, class_var)
    if metas:
        new_domain.add_metas(metas)
    return Orange.data.Table(new_domain, table)
162
163
class PLSRegressionLearner(base.BaseRegressionLearner):
    """ Fits the partial least squares regression model,
    i.e. learns the regression parameters. The implementation is based on
    `Scikit learn python implementation`_

    The class is derived from
    :class:`Orange.regression.base.BaseRegressionLearner`
    which is used for preprocessing the data (continuization and imputation)
    before fitting the regression parameters

    """

    def __init__(self, n_comp=2, deflation_mode="regression", mode="PLS",
                 algorithm="nipals", max_iter=500,
                 imputer=None, continuizer=None,
                 **kwds):
        """
        .. attribute:: n_comp

            number of components to keep (default: 2)

        .. attribute:: deflation_mode

            "canonical" or "regression" (default)

        .. attribute:: mode

            "CCA" or "PLS" (default)

        .. attribute:: algorithm

            The algorithm used to estimate the weights:
            "nipals" (default) or "svd"

        .. attribute:: max_iter

            maximal number of iterations of the NIPALS inner loop
            (default: 500)

        """
        self.n_comp = n_comp
        self.deflation_mode = deflation_mode
        self.mode = mode
        self.algorithm = algorithm
        self.max_iter = max_iter
        self.set_imputer(imputer=imputer)
        self.set_continuizer(continuizer=continuizer)
        self.__dict__.update(kwds)

    @deprecated_keywords({"xVars": "x_vars", "yVars": "y_vars"})
    def __call__(self, table, weight_id=None, x_vars=None, y_vars=None):
        """
        :param table: data instances.
        :type table: :class:`Orange.data.Table`

        :param x_vars, y_vars: List of input and response variables
            (`Orange.data.variable.Continuous` or `Orange.data.variable.Discrete`).
            If None (default) it is assumed that the data definition provides
            information about which variables are responses and which are not.
            If a variable var has key "label" in dictionary
            Orange.data.Domain[var].attributes it is treated as a response
            variable.
        :type x_vars, y_vars: list

        """
        domain = table.domain
        if x_vars is None and y_vars is None:
            # Response variables are defined in the table.
            label_mask = data_label_mask(domain)
            multilabel_flag = (sum(label_mask) - (1 if domain.class_var else 0)) > 0
            x_vars = [v for v, label in zip(domain, label_mask) if not label]
            y_vars = [v for v, label in zip(domain, label_mask) if label]
        elif x_vars and y_vars:
            # Independent and response variables are passed by the caller.
            # Work on a copy so the caller's list is not mutated below.
            y_vars = list(y_vars)
            if domain.class_var and domain.class_var not in y_vars:
                # if the original table contains class variable
                # add it to the y_vars
                y_vars.append(domain.class_var)
            label_mask = [v in y_vars for v in domain.variables]
            multilabel_flag = True
        else:
            raise ValueError("Both x_vars and y_vars must be defined.")

        x_table = select_attrs(table, x_vars)
        y_table = select_attrs(table, y_vars)

        # discrete values are continuized
        x_table = self.continuize_table(x_table)
        y_table = self.continuize_table(y_table)
        # missing values are imputed
        x_table = self.impute_table(x_table)
        y_table = self.impute_table(y_table)

        # Collect the new transformed x_vars/y_vars
        x_vars = list(x_table.domain.variables)
        y_vars = list(y_table.domain.variables)

        domain = Orange.data.Domain(x_vars + y_vars, False)
        label_mask = [False for _ in x_vars] + [True for _ in y_vars]

        x = x_table.toNumpy()[0]
        y = y_table.toNumpy()[0]

        kwargs = self.fit(x, y)
        return PLSRegression(label_mask=label_mask, domain=domain,
                             x_vars=x_vars, y_vars=y_vars,
                             multilabel_flag=multilabel_flag, **kwargs)

    def fit(self, X, Y):
        """ Fits all unknown parameters, i.e.
        weights, scores, loadings (for x and y) and regression coefficients.
        Returns a dict with all of the parameters.

        """
        # Copy, since the deflation below modifies the matrices in place.
        X, Y = X.copy(), Y.copy()
        if Y.ndim == 1:
            Y = Y.reshape((Y.size, 1))
        n, p = X.shape
        q = Y.shape[1]

        # normalization of data matrices
        X, mu_x, sigma_x = normalize_matrix(X)
        Y, mu_y, sigma_y = normalize_matrix(Y)
        # Residuals (deflated) matrices
        Xk, Yk = X, Y
        # Results matrices
        T, U = zeros((n, self.n_comp)), zeros((n, self.n_comp))  # scores
        W, C = zeros((p, self.n_comp)), zeros((q, self.n_comp))  # weights
        P, Q = zeros((p, self.n_comp)), zeros((q, self.n_comp))  # loadings

        # NIPALS over components
        for k in xrange(self.n_comp):
            # Weights estimation (inner loop)
            if self.algorithm == "nipals":
                u, v = nipals_xy(X=Xk, Y=Yk, mode=self.mode,
                                 max_iter=self.max_iter)
            elif self.algorithm == "svd":
                u, v = svd_xy(X=Xk, Y=Yk)
            # compute scores
            x_score, y_score = dot(Xk, u), dot(Yk, v)
            # Deflation (in place)
            # - regress Xk's on x_score
            x_loadings = dot(Xk.T, x_score) / dot(x_score.T, x_score)
            # - subtract rank-one approximations to obtain remainder matrix
            Xk -= dot(x_score, x_loadings.T)
            if self.deflation_mode == "canonical":
                # - regress Yk's on y_score, then subtract rank-one approx.
                y_loadings = dot(Yk.T, y_score) / dot(y_score.T, y_score)
                Yk -= dot(y_score, y_loadings.T)
            if self.deflation_mode == "regression":
                # - regress Yk's on x_score, then subtract rank-one approx.
                y_loadings = dot(Yk.T, x_score) / dot(x_score.T, x_score)
                Yk -= dot(x_score, y_loadings.T)
            # Store weights, scores and loadings
            T[:, k] = x_score.ravel()  # x-scores
            U[:, k] = y_score.ravel()  # y-scores
            W[:, k] = u.ravel()  # x-weights
            C[:, k] = v.ravel()  # y-weights
            P[:, k] = x_loadings.ravel()  # x-loadings
            Q[:, k] = y_loadings.ravel()  # y-loadings
        # X = TP' + E and Y = UQ' + E

        # Rotations from input space to transformed space (scores)
        # T = X W(P'W)^-1 = XW* (W* : p x k matrix)
        # U = Y C(Q'C)^-1 = YC* (C* : q x k matrix)
        x_rotations = dot(W, pinv(dot(P.T, W)))
        if Y.shape[1] > 1:
            y_rotations = dot(C, pinv(dot(Q.T, C)))
        else:
            y_rotations = numpy.ones(1)

        # Estimate regression coefficients (previously guarded by a dead
        # "if True or ..." condition, i.e. always executed):
        # Y = TQ' + E = X W(P'W)^-1Q' + E = XB + E
        # => B = W*Q' (p x q)
        coefs = dot(x_rotations, Q.T)
        # Rescale back to the units of the original (unnormalized) data.
        coefs = 1. / sigma_x.reshape((p, 1)) * \
                coefs * sigma_y

        # NOTE(fix): "W" was previously bound to U (y-scores) by mistake;
        # it now correctly maps to the x-weights matrix W.
        return {"mu_x": mu_x, "mu_y": mu_y, "sigma_x": sigma_x,
                "sigma_y": sigma_y, "T": T, "U": U, "W": W,
                "C": C, "P": P, "Q": Q, "x_rotations": x_rotations,
                "y_rotations": y_rotations, "coefs": coefs}
349
# Keep the old CamelCase constructor keywords (nComp, deflationMode,
# maxIter) working as deprecated aliases of the new underscore names.
deprecated_members({"nComp": "n_comp",
                    "deflationMode": "deflation_mode",
                    "maxIter": "max_iter"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegressionLearner)
355
356class PLSRegression(Orange.classification.Classifier):
357    """ PLSRegression predicts value of the response variables
358    based on the values of independent variables.
359   
360    Basic notations:
361    n - number of data instances
362    p - number of independent variables
363    q - number of reponse variables
364
365    .. attribute:: T
366   
367        A n x n_comp numpy array of x-scores
368
369    .. attribute:: U
370   
371        A n x n_comp numpy array of y-scores
372
373    .. attribute:: W
374   
375        A p x n_comp numpy array of x-weights
376
377    .. attribute:: C
378   
379        A q x n_comp numpy array of y-weights
380
381    .. attribute:: P
382   
383        A p x n_comp numpy array of x-loadings
384
385    .. attribute:: Q
386   
387        A q x n_comp numpy array of y-loading
388
389    .. attribute:: coefs
390   
391        A p x q numpy array coefficients
392        of the linear model: Y = X coefs + E
393
394    .. attribute:: x_vars
395   
396        list of independent variables
397
398    .. attribute:: y_vars
399   
400        list of response variables
401       
402    """
403    def __init__(self, label_mask=None, domain=None, \
404                 coefs=None, mu_x=None, mu_y=None, sigma_x=None, sigma_y=None, \
405                 x_vars=None, y_vars=None, multilabel_flag=0, **kwargs):
406        self.label_mask = label_mask
407        self.domain = domain
408        self.coefs = coefs
409        self.mu_x, self.mu_y = mu_x, mu_y
410        self.sigma_x, self.sigma_y = sigma_x, sigma_y
411        self.x_vars, self.y_vars = x_vars, y_vars
412        self.multilabel_flag = multilabel_flag
413        if not multilabel_flag and y_vars:
414            self.class_var = y_vars[0]
415           
416        for name, val in kwargs.items():
417            setattr(self, name, val)
418
419    def __call__(self, instance, result_type=Orange.core.GetValue):
420        """
421        :param instance: data instance for which the value of the response
422            variable will be predicted
423        :type instance: :class:`Orange.data.Instance`
424       
425        """ 
426        instance = Orange.data.Instance(self.domain, instance)
427        ins = [instance[v].native() for v in self.x_vars]
428       
429        if "?" in ins: # missing value -> corresponding coefficient omitted
430            def miss_2_0(x): return x if x != "?" else 0
431            ins = map(miss_2_0, ins)
432        ins = numpy.array(ins)
433        xc = (ins - self.mu_x) / self.sigma_x
434        predicted = dot(xc, self.coefs) * self.sigma_y + self.mu_y
435        y_hat = [var(val) for var, val in zip(self.y_vars, predicted)]
436        if result_type == Orange.core.GetValue:
437            return y_hat if self.multilabel_flag else y_hat[0]
438        else:
439            from Orange.statistics.distribution import Distribution
440            probs = []
441            for var, val in zip(self.y_vars, y_hat):
442                dist = Distribution(var)
443                dist[val] = 1.0
444                probs.append(dist)
445            if result_type == Orange.core.GetBoth:
446                return (y_hat, probs) if self.multilabel_flag else (y_hat[0], probs[0])
447            else:
448                return probs if self.multilabel_flag else probs[0]
449           
450    def print_pls_regression_coefficients(self):
451        """ Pretty-prints the coefficient of the PLS regression model.
452        """       
453        x_vars, y_vars = [x.name for x in self.x_vars], [y.name for y in self.y_vars]
454        print " " * 7 + "%-6s " * len(y_vars) % tuple(y_vars)
455        fmt = "%-6s " + "%-5.3f  " * len(y_vars)
456        for i, coef in enumerate(self.coefs):
457            print fmt % tuple([x_vars[i]] + list(coef))
458           
459    """
460    def transform(self, X, Y=None):
461
462        # Normalize
463        Xc = (X - self.muX) / self.sigmaX
464        if Y is not None:
465            Yc = (Y - self.muY) / self.sigmaY
466        # Apply rotation
467        xScores = dot(Xc, self.xRotations)
468        if Y is not None:
469            yScores = dot(Yc, self.yRotations)
470            return xScores, yScores
471
472        return xScores
473    """
474             
# Keep the old CamelCase attribute names (xVars, muX, sigmaY, ...) working
# as deprecated aliases of the new underscore names.
deprecated_members({"xVars": "x_vars", 
                    "yVars": "y_vars",
                    "muX": "mu_x",
                    "muY": "mu_y",
                    "sigmaX": "sigma_x",
                    "sigmaY": "sigma_y"},
                   wrap_methods=["__init__"],
                   in_place=True)(PLSRegression)
483                   
if __name__ == "__main__":
    # Smoke test: fit a PLS model on the bundled test data set and print
    # the fitted regression coefficients.

    import Orange
    from Orange.regression import pls

    table = Orange.data.Table("test-pls.tab")
    l = pls.PLSRegressionLearner()

    # Variables whose names start with "X" are inputs, those starting
    # with "Y" are responses; here the learner is called without them so
    # the responses are taken from the data definition instead.
    x = [var for var in table.domain if var.name[0]=="X"]
    y = [var for var in table.domain if var.name[0]=="Y"]
    print x, y
#    c = l(table, x_vars=x, y_vars=y)
    c = l(table)
    c.print_pls_regression_coefficients()
Note: See TracBrowser for help on using the repository browser.