source: orange/Orange/testing/unit/tests/test_projection_linear.py @ 10645:a9cffa7f948c

Revision 10645:a9cffa7f948c, 6.8 KB checked in by anze <anze.staric@…>, 2 years ago (diff)

PCA now returns the same results as prcomp(X, scale=TRUE) in R.
(Fixed normalization in linalg numpy operations and output of st. dev)

Line 
import random

try:
    import unittest2 as unittest
except ImportError:
    import unittest

import numpy as np

from Orange import data, feature
from Orange.projection import linear
11
def normalize(a):
    """Scale ``a`` to unit Euclidean length.

    :param a: vector given as a numpy array or any array-like sequence.
    :return: numpy array equal to ``a`` divided by its norm.
    :raises ValueError: if ``a`` has zero norm; dividing would otherwise
        silently produce a vector of NaNs.
    """
    a = np.asanyarray(a)
    norm = np.linalg.norm(a)
    if norm == 0:
        raise ValueError("cannot normalize a vector with zero norm")
    return a / norm
15
# Module-level placeholder; nothing else in this file reads or assigns it.
datasets = None
17
def prepare_dataset(components=((),), n=150):
    """Generate random data lying in the span of the given components.

    Each of the ``n`` rows is a linear combination of the rows of
    ``components`` with coefficients drawn from a standard normal
    distribution.

    :param components: sequence of component vectors (one per row); a single
        flat vector is accepted and treated as one component.
    :param n: number of rows to generate.
    :return: ``(domain, d)`` where ``domain`` holds one continuous feature
        ``A<i>`` per column and ``d`` is the ``n`` x ``m`` data array.
    """
    # atleast_2d lets a caller pass one component as a plain 1-D sequence;
    # 2-D input is passed through unchanged.
    components = np.atleast_2d(components)

    ncomponents, m = components.shape
    coefficients = np.random.normal(0., 1., (n, ncomponents))

    d = np.dot(coefficients, components)

    # The trailing False presumably marks the domain as class-less
    # (verify against Orange's data.Domain signature).
    attributes = [feature.Continuous("A%d" % i) for i in range(m)]
    domain = data.Domain(attributes, False)
    return domain, d
28
29
30
class TestPca(unittest.TestCase):
    """Tests for ``linear.Pca`` on small synthetic datasets."""

    def _create_single_component_dataset(self, n_features):
        """Build a dataset spanned by one random unit vector.

        The vector is stored in ``self.principal_component`` and the
        resulting table in ``self.dataset``.
        """
        self.principal_component = normalize(
            [random.randint(0, 5) for _ in range(n_features)])
        self.dataset = data.Table(
            *prepare_dataset(components=[self.principal_component]))

    def _assert_first_component_matches(self, pca, places=7):
        """Check the first projection vector against the generating vector.

        Signs are ignored (an eigenvector is only defined up to sign). The
        element-wise deviations are summed by absolute value so that errors
        of opposite sign cannot cancel each other out.
        """
        deviation = np.abs(pca.projection[0]) - np.abs(self.principal_component)
        absolute_error = np.abs(deviation).sum()
        self.assertAlmostEqual(absolute_error, 0., places)

    def create_normal_dataset(self):
        """Dataset with 10 features (fewer features than rows)."""
        self._create_single_component_dataset(10)

    def create_wide_dataset(self):
        """Dataset with 250 features (more features than the 150 rows)."""
        self._create_single_component_dataset(250)

    def create_empty_dataset(self):
        """Dataset with five features and no rows."""
        self.dataset = data.Table(
            *prepare_dataset(components=([0, 0, 0, 0, 0],), n=0))

    def create_constant_dataset(self):
        """Dataset whose five features are all constantly zero."""
        self.dataset = data.Table(
            *prepare_dataset(components=([0, 0, 0, 0, 0],)))

    def create_dataset_with_unknowns(self, percentage=0.05):
        """Single-component dataset with values randomly set to unknown.

        :param percentage: probability with which each individual value is
            replaced by the unknown marker ``"?"``.
        """
        self._create_single_component_dataset(10)

        for ex in self.dataset:
            for i, _ in enumerate(ex):
                if random.random() < percentage:
                    ex[i] = "?"

    def test_pca_on_normal_data(self):
        self.create_normal_dataset()

        pca = linear.Pca(standardize=False)(self.dataset)
        self.assertIsInstance(pca, linear.PcaProjector)

        self._assert_first_component_matches(pca)

    def test_pca_on_wide_data(self):
        self.create_wide_dataset()

        pca = linear.Pca(standardize=False)(self.dataset)
        self.assertIsInstance(pca, linear.PcaProjector)

        # Looser tolerance (1 decimal place) is kept from the original test.
        self._assert_first_component_matches(pca, places=1)

    def test_pca_with_standardization(self):
        self.create_normal_dataset()

        pca = linear.Pca(standardize=True)(self.dataset)
        eigen_vector = pca.projection[0]
        non_zero_elements = eigen_vector[eigen_vector.nonzero()]

        # since values in all dimensions are normally distributed,
        # dimensions should be treated as equally important
        self.assertAlmostEqual(non_zero_elements.min(), non_zero_elements.max())

    def test_pca_with_variance_covered(self):
        self.create_normal_dataset()

        pca = linear.Pca(variance_covered=.99)(self.dataset)
        # all data points lie in one dimension, one component should cover
        # all the variance
        nvectors = pca.projection.shape[0]
        self.assertEqual(nvectors, 1)

    def test_pca_with_max_components(self):
        self.create_normal_dataset()
        max_components = 3

        pca = linear.Pca(max_components=max_components)(self.dataset)
        # the projection is expected to contain exactly the requested
        # number of components
        nvectors = pca.projection.shape[0]
        self.assertEqual(nvectors, max_components)

    def test_pca_handles_unknowns(self):
        self.create_dataset_with_unknowns()

        # Fitting must not raise on data containing unknown values.
        pca = linear.Pca()(self.dataset)
        self.assertIsInstance(pca, linear.PcaProjector)

    def test_pca_on_empty_data(self):
        self.create_empty_dataset()

        with self.assertRaises(ValueError):
            linear.Pca()(self.dataset)

    def test_pca_on_only_constant_features(self):
        self.create_constant_dataset()

        with self.assertRaises(ValueError):
            linear.Pca()(self.dataset)
119
120
class TestProjector(unittest.TestCase):
    """Tests for the projector object returned by ``linear.Pca``."""

    def _fit_projector(self):
        """Fit PCA covering 99% of the variance on ``self.dataset``."""
        return linear.Pca(variance_covered=.99)(self.dataset)

    def create_normal_dataset(self):
        """Class-less dataset spanned by one random unit vector."""
        self.principal_component = normalize(
            [random.randint(0, 5) for _ in range(10)])
        self.dataset = data.Table(
            *prepare_dataset(components=[self.principal_component]))

    def create_dataset_with_classes(self):
        """Dataset with a class variable and four additional class variables."""
        domain, features = prepare_dataset(
            components=[[random.randint(0, 5) for _ in range(10)]])
        domain = data.Domain(
            domain.features,
            feature.Discrete("C", values=["F", "T"]),
            class_vars=[feature.Discrete("MC%i" % i, values=["F", "T"])
                        for i in range(4)])

        # Five extra random columns are appended for the class values
        # (presumably one for "C" plus one per "MC<i>" variable).
        class_columns = np.random.random((len(features), 5))
        self.dataset = data.Table(domain, np.hstack((features, class_columns)))

    def test_projected_domain_can_convert_data_with_class(self):
        self.create_dataset_with_classes()
        projector = self._fit_projector()

        projected_data = projector(self.dataset)
        converted_data = data.Table(projected_data.domain, self.dataset)

        self.assertItemsEqual(projected_data, converted_data)

    def test_projected_domain_can_convert_data_without_class(self):
        self.create_normal_dataset()
        projector = self._fit_projector()

        projected_data = projector(self.dataset)
        converted_data = data.Table(projected_data.domain, self.dataset)

        self.assertItemsEqual(projected_data, converted_data)

    def test_projected_domain_contains_class_vars(self):
        self.create_dataset_with_classes()

        projector = self._fit_projector()
        projected_data = projector(self.dataset)

        self.assertIn(self.dataset.domain.class_var, projected_data.domain)
        for class_ in self.dataset.domain.class_vars:
            self.assertIn(class_, projected_data.domain)
        for ex1, ex2 in zip(self.dataset, projected_data):
            self.assertEqual(ex1.get_class(), ex2.get_class())
            for v1, v2 in zip(ex1.get_classes(), ex2.get_classes()):
                # Compare original against projected value; the earlier
                # version compared v2 with itself and could never fail.
                self.assertEqual(v1, v2)

    def test_projects_example(self):
        self.create_normal_dataset()
        projector = self._fit_projector()

        # Smoke test: projecting a single example must not raise.
        projector(self.dataset[0])

    def test_projects_data_table(self):
        self.create_normal_dataset()
        projector = self._fit_projector()

        # Smoke test: projecting a whole table must not raise.
        projector(self.dataset)

    def test_converts_input_domain_if_needed(self):
        self.create_normal_dataset()
        projector = self._fit_projector()

        # Input built on only the first five of the ten training features.
        partial_domain = data.Domain(self.dataset.domain.features[:5])
        new_examples = data.Table(partial_domain, [[1., 2., 3., 4., 5.]])

        projector(new_examples)
187
188
class TestFda(unittest.TestCase):
    """Placeholder for FDA tests; no test cases are defined yet."""
191
192if __name__ == '__main__':
193    unittest.main()
194
Note: See TracBrowser for help on using the repository browser.