Changeset 10612:85f3705b313d in orange for Orange/projection/linear.py
 Timestamp:
 03/21/12 13:48:12 (2 years ago)
 Branch:
 default
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

Orange/projection/linear.py
r10611 r10612 1198 1198 """ 1199 1199 Stores a linear projection of data and uses it to transform any given data with matching input domain. 1200 1201 .. attribute:: input_domain1202 1203 Domain of the data set that was used to construct principal component1204 subspace.1205 1206 .. attribute:: output_domain1207 1208 Domain used in returned data sets. This domain has a continuous1209 variable for each axis in the projected space,1210 and no class variable(s).1211 1212 .. attribute:: mean1213 1214 Array containing means of each variable in the data set that was used1215 to construct the projection.1216 1217 .. attribute:: stdev1218 1219 An array containing standard deviations of each variable in the data1220 set that was used to construct the projection.1221 1222 .. attribute:: standardize1223 1224 True, if standardization was used when constructing the projection. If1225 set, instances will be standardized before being projected.1226 1227 .. attribute:: projection1228 1229 Array containing projection (vectors that describe the1230 transformation from input to output domain).1231 1232 1200 """ 1201 #: Domain of the data set that was used to construct principal component subspace. 1202 input_domain = None 1203 1204 #: Domain used in returned data sets. This domain has a continuous 1205 #: variable for each axis in the projected space, 1206 #: and no class variable(s). 1207 output_domain = None 1208 1209 #: Array containing means of each variable in the data set that was used 1210 #: to construct the projection. 1211 mean = numpy.array(()) 1212 1213 #: An array containing standard deviations of each variable in the data 1214 #: set that was used to construct the projection. 1215 stdev = numpy.array(()) 1216 1217 #: True, if standardization was used when constructing the projection. If 1218 #: set, instances will be standardized before being projected. 1219 standardize = True 1220 1221 #: Array containing projection (vectors that describe the 1222 #: transformation from input to output domain). 1223 projection = numpy.array(()).reshape(0, 0) 1224 1233 1225 def __init__(self, **kwds): 1234 1226 self.__dict__.update(kwds) 1235 if not hasattr(self, "output_domain"): 1236 self.output_domain = Orange.data.Domain([Orange.feature.Continuous("a.%d"%(i+1)) for i in range(len(self.projection))], False) 1237 1238 1239 def __call__(self, data): 1227 1228 features = [] 1229 for i in range(len(self.projection)): 1230 f = feature.Continuous("Comp.%d" % (i + 1)) 1231 f.get_value_from = lambda ex, w: self._project_single(ex, w, f, i) 1232 features.append(f) 1233 1234 self.output_domain = Orange.data.Domain(features, 1235 self.input_domain.class_var, 1236 class_vars=self.input_domain.class_vars) 1237 1238 def _project_single(self, example, return_what, new_feature, feature_idx): 1239 ex = Orange.data.Table([example]).to_numpy("a")[0] 1240 ex = self.mean 1241 if self.standardize: 1242 ex /= self.stdev 1243 return new_feature(numpy.dot(self.projection[feature_idx, :], ex.T)[0]) 1244 1245 def __call__(self, dataset): 1240 1246 """ 1241 1247 Project data. … … 1263 1269 Xd /= self.stdev 1264 1270 1265 self.A = numpy.ma.dot(Xd, U.T) 1266 1267 return Orange.data.Table(self.output_domain, self.A.tolist()) 1271 self.A = numpy.dot(Xd, U.T) 1272 1273 # TODO: Delete when orange will support creating data.Table from masked array. 1274 self.A = self.A.filled(0.) if isinstance(self.A, numpy.ma.core.MaskedArray) else self.A 1275 # append class variable 1276 1277 class_, classes = dataset.to_numpy("c")[0], dataset.to_numpy("m")[0] 1278 return data.Table(self.output_domain, numpy.hstack((self.A, class_, classes))) 1279 1268 1280 1269 1281 #color table for biplot 1270 1282 Colors = ['bo', 'go', 'yo', 'co', 'mo'] 1271 1283 1272 class P ca(object):1284 class PCA(object): 1273 1285 """ 1274 1286 Orthogonal transformation of data into a set of uncorrelated variables called … … 1289 1301 def __new__(cls, dataset=None, **kwds): 1290 1302 optimizer = object.__new__(cls) 1291 optimizer.__init__(**kwds)1292 1303 1293 1304 if dataset: 1305 optimizer.__init__(**kwds) 1294 1306 return optimizer(dataset) 1295 1307 else: … … 1303 1315 self.use_generalized_eigenvectors = use_generalized_eigenvectors 1304 1316 1305 def _pca(self, dataset, Xd, Xg):1306 n,m = Xd.shape1307 if n < m:1308 C = numpy.ma.dot(Xg.T, Xd.T)1309 V, D, T = numpy.linalg.svd(C)1310 U = numpy.ma.dot(V.T, Xd) / numpy.sqrt(D.reshape(1, 1))1311 else:1312 C = numpy.ma.dot(Xg, Xd)1313 U, D, T = numpy.linalg.svd(C)1314 U = U.T # eigenvectors are now in rows1315 return U, D1316 1317 1317 def __call__(self, dataset): 1318 1318 """ … … 1325 1325 :rtype: :class:`~Orange.projection.linear.PcaProjector` 1326 1326 """ 1327 1328 X = dataset.to_numpy_MA("a")[0] 1329 N,M = X.shape 1330 Xm = numpy.mean(X, axis=0) 1331 Xd = X  Xm 1332 1333 #take care of the constant features 1334 stdev = numpy.std(Xd, axis=0) 1335 relevant_features = stdev != 0 1336 Xd = Xd[:, relevant_features] 1337 if self.standardize: 1338 Xd /= stdev[relevant_features] 1327 Xd, stdev, mean, relevant_features = self._normalize_data(dataset) 1339 1328 1340 1329 #use generalized eigenvectors … … 1345 1334 Xg = Xd.T 1346 1335 1347 #actual pca 1336 components, variances = self._perform_pca(dataset, Xd, Xg) 1337 components = self._insert_zeros_for_constant_features(len(dataset.domain.features), 1338 components, 1339 relevant_features) 1340 1341 variances, components, variance_sum = self._select_components(variances, components) 1342 1343 n, m = components.shape 1344 1345 return PcaProjector(input_domain=dataset.domain, 1346 mean=mean, 1347 stdev=stdev, 1348 standardize=self.standardize, 1349 eigen_vectors=components, 1350 projection=components, 1351 eigen_values=variances, 1352 variance_sum=variance_sum) 1353 1354 def _normalize_data(self, dataset): 1355 if not len(dataset) or not len(dataset.domain.features): 1356 raise ValueError("Empty dataset") 1357 X = dataset.to_numpy("a")[0] 1358 1359 Xm = numpy.mean(X, axis=0) 1360 Xd = X  Xm 1361 1362 if not Xd.any(): 1363 raise ValueError("All features are constant") 1364 1365 #take care of the constant features 1366 stdev = numpy.std(Xd, axis=0) 1367 stdev[stdev == 0] = 1. # to prevent division by zero 1368 relevant_features = stdev != 0 1369 Xd = Xd[:, relevant_features] 1370 if self.standardize: 1371 Xd /= stdev[relevant_features] 1372 return Xd, stdev, Xm, relevant_features 1373 1374 def _perform_pca(self, dataset, Xd, Xg): 1348 1375 n, m = Xd.shape 1349 U, D = self._pca(dataset, Xd, Xg) 1350 1351 #insert zeros for constant features 1352 n, m = U.shape 1353 if m != M: 1354 U_ = numpy.zeros((n, M)) 1355 U_[:, relevant_features] = U 1356 U = U_ 1357 1376 if n < m: 1377 C = numpy.dot(Xg.T, Xd.T) 1378 V, D, T = numpy.linalg.svd(C) 1379 U = numpy.dot(V.T, Xd) / numpy.sqrt(D.reshape(1, 1)) 1380 else: 1381 C = numpy.dot(Xg, Xd) 1382 U, D, T = numpy.linalg.svd(C) 1383 U = U.T # eigenvectors are now in rows 1384 return U, D 1385 1386 def _select_components(self, D, U): 1358 1387 variance_sum = D.sum() 1359 1360 1388 #select eigen vectors 1361 1389 if self.variance_covered != 1: … … 1364 1392 U = U[:nfeatures, :] 1365 1393 D = D[:nfeatures] 1366 1367 1394 if self.max_components > 0: 1368 1395 U = U[:self.max_components, :] 1369 1396 D = D[:self.max_components] 1370 1397 return D, U, variance_sum 1398 1399 def _insert_zeros_for_constant_features(self, original_dimension, U, relevant_features): 1371 1400 n, m = U.shape 1372 pc_domain = Orange.data.Domain([Orange.feature.Continuous("Comp.%d"% 1373 (i + 1)) for i in range(n)], False) 1374 1375 return PcaProjector(input_domain=dataset.domain, 1376 output_domain = pc_domain, 1377 pc_domain = pc_domain, 1378 mean = Xm, 1379 stdev = stdev, 1380 standardize = self.standardize, 1381 eigen_vectors = U, 1382 projection = U, 1383 eigen_values = D, 1384 variance_sum = variance_sum) 1401 if m != original_dimension: 1402 U_ = numpy.zeros((n, original_dimension)) 1403 U_[:, relevant_features] = U 1404 U = U_ 1405 return U 1406 1407 Pca = PCA 1385 1408 1386 1409 1387 1410 class Spca(Pca): 1388 def _p ca(self, dataset, Xd, Xg):1411 def _perform_pca(self, dataset, Xd, Xg): 1389 1412 # define the Laplacian matrix 1390 1413 c = dataset.to_numpy("c")[0] … … 1394 1417 Xg = numpy.dot(Xg, l) 1395 1418 1396 return Pca._pca(self, dataset, Xd, Xg) 1397 1419 return Pca._perform_pca(self, dataset, Xd, Xg) 1420 1421 1422 @deprecated_members({"pc_domain": "output_domain"}) 1398 1423 class PcaProjector(Projector): 1399 """ 1400 .. attribute:: pc_domain 1401 1402 Synonymous for :obj:`~Orange.projection.linear.Projector.output_domain`. 1403 1404 .. attribute:: eigen_vectors 1405 1406 Synonymous for :obj:`~Orange.projection.linear.Projector.projection`. 1407 1408 .. attribute:: eigen_values 1409 1410 Array containing standard deviations of principal components. 1411 1412 .. attribute:: variance_sum 1413 1414 Sum of all variances in the data set that was used to construct the PCA 1415 space. 1416 1417 """ 1418 1419 def __init__(self, **kwds): 1420 self.__dict__.update(kwds) 1424 #: Synonymous for :obj:`~Orange.projection.linear.Projector.projection`. 1425 eigen_vectors = numpy.array(()).reshape(0, 0) 1426 1427 #: Array containing standard deviations of principal components. 1428 eigen_values = numpy.array(()) 1429 1430 #: Sum of all variances in the data set that was used to construct the PCA space. 1431 variance_sum = 0. 1421 1432 1422 1433 def __str__(self): … … 1429 1440 "Std. deviation of components:", 1430 1441 " ".join([" "] + 1431 ["%10s" % a.name for a in self. pc_domain.attributes]),1442 ["%10s" % a.name for a in self.output_domain.attributes]), 1432 1443 " ".join(["Std. deviation"] + 1433 1444 ["%10.3f" % a for a in self.eigen_values]), 1434 1445 " ".join(["Proportion Var"] + 1435 ["%10.3f" % a for a in 1446 ["%10.3f" % a for a in self.eigen_values / s * 100]), 1436 1447 " ".join(["Cumulative Var"] + 1437 1448 ["%10.3f" % a for a in cs * 100]), 1438 1449 "", 1439 #"Loadings:", 1440 #" ".join(["%10s"%""] + ["%10s" % a.name for a in self.pc_domain]), 1441 #"\n".join([ 1442 # " ".join([a.name] + ["%10.3f" % b for b in self.eigen_vectors.T[i]]) 1443 # for i, a in enumerate(self.input_domain.attributes) 1444 # ]) 1445 ]) if len(self.pc_domain) <= ncomponents else\ 1450 ]) if len(self.output_domain) <= ncomponents else\ 1446 1451 "\n".join([ 1447 1452 "PCA SUMMARY", … … 1449 1454 "Std. deviation of components:", 1450 1455 " ".join([" "] + 1451 ["%10s" % a.name for a in self. pc_domain.attributes[:ncomponents]] +1456 ["%10s" % a.name for a in self.output_domain.attributes[:ncomponents]] + 1452 1457 ["%10s" % "..."] + 1453 ["%10s" % self. pc_domain.attributes[1].name]),1458 ["%10s" % self.output_domain.attributes[1].name]), 1454 1459 " ".join(["Std. deviation"] + 1455 1460 ["%10.3f" % a for a in self.eigen_values[:ncomponents]] + … … 1465 1470 ["%10.3f" % (cs[1] * 100)]), 1466 1471 "", 1467 #"Loadings:", 1468 #" ".join(["%16s" % ""] + 1469 # ["%8s" % a.name for a in self.pc_domain.attributes[:ncomponents]] + 1470 # ["%8s" % "..."] + 1471 # ["%8s" % self.pc_domain.attributes[1].name]), 1472 #"\n".join([ 1473 # " ".join(["%16.16s" %a.name] + 1474 # ["%8.3f" % b for b in self.eigen_vectors.T[i, :ncomponents]] + 1475 # ["%8s" % ""] + 1476 # ["%8.3f" % self.eigen_vectors.T[i, 1]]) 1477 # for i, a in enumerate(self.input_domain.attributes) 1478 # ]) 1479 ]) 1480 1481 1482 1483 ################ Plotting functions ################### 1484 1485 def scree_plot(self, filename = None, title = 'Scree Plot'): 1472 ]) 1473 1474 1475 def scree_plot(self, filename=None, title='Scree Plot'): 1486 1476 """ 1487 1477 Draw a scree plot of principal components 1488 1478 1489 :param filename: Name of the file to which the plot will be saved. \1490 If None, plot will be displayed instead.1479 :param filename: Name of the file to which the plot will be saved. 1480 If None, plot will be displayed instead. 1491 1481 :type filename: str 1492 1482 :param title: Plot title … … 1503 1493 1504 1494 x_axis = range(len(self.eigen_values)) 1505 # x_labels = ["PC%d" % (i + 1, ) for i in x_axis]1506 1507 # ax.set_xticks(x_axis)1508 # ax.set_xticklabels(x_labels)1509 # plt.setp(ax.get_xticklabels(), "rotation", 90)1510 1495 plt.grid(True) 1511 1496 … … 1527 1512 plt.show() 1528 1513 1529 def biplot(self, filename = None, components = [0,1], title ='Biplot'):1514 def biplot(self, filename=None, components=(0, 1), title='Biplot'): 1530 1515 """ 1531 1516 Draw biplot for PCA. Actual projection must be performed via pca(data) 1532 1517 before bipot can be used. 1533 1518 1534 :param filename: Name of the file to which the plot will be saved. \1535 If None, plot will be displayed instead.1536 :type plot: str1519 :param filename: Name of the file to which the plot will be saved. 1520 If None, plot will be displayed instead. 1521 :type filename: str 1537 1522 :param components: List of two components to plot. 1538 1523 :type components: list
Note: See TracChangeset
for help on using the changeset viewer.