source: orange/Orange/data/outliers.py @ 10054:857b2a2f275a

Revision 10054:857b2a2f275a, 4.5 KB checked in by gregorr, 2 years ago (diff)

Added outlier files.

Line 
1"""
2.. index:: outlier detection
3
4.. index::
5   single: outlier; detection
6
7********************************
8Outlier detection (``outliers``)
9********************************
10
11.. autoclass:: OutlierDetection
12    :members:
13
14.. rubric:: Examples
15
16The following example prints a list of Z-values of examples in the bridges dataset
17(:download:`outlier1.py <code/outlier1.py>`).
18
19.. literalinclude:: code/outlier1.py
20
21The following example prints 5 examples with highest Z-scores. Euclidean
22distance is used as a distance measurement and average distance is calculated
23over 3 nearest neighbours (:download:`outlier2.py <code/outlier2.py>`).
24
25.. literalinclude:: code/outlier2.py
26
27The output::
28
29    ['M', 1838, 'HIGHWAY', ?, 2, 'N', 'THROUGH', 'WOOD', '?', 'S', 'WOOD'] Z-score: 1.732
30    ['M', 1818, 'HIGHWAY', ?, 2, 'N', 'THROUGH', 'WOOD', 'SHORT', 'S', 'WOOD'] Z-score: 1.732
31    ['A', 1853, 'RR', ?, 2, 'N', 'DECK', 'WOOD', '?', 'S', 'WOOD'] Z-score: 1.732
32    ['A', 1829, 'AQUEDUCT', ?, 1, 'N', 'THROUGH', 'WOOD', '?', 'S', 'WOOD'] Z-score: 1.733
33    ['A', 1848, 'AQUEDUCT', ?, 1, 'N', 'DECK', 'WOOD', '?', 'S', 'WOOD'] Z-score: 1.733
34
35"""
36
37import Orange
38from Orange import statc
39
class OutlierDetection:
    """
    A class for detecting outliers.

    It calculates average distances of each example to other examples
    and converts them to Z-scores. Z-scores higher than zero denote an
    example that is more distant to other examples than average.

    Detection of outliers can be performed directly on examples or on
    an existing distance matrix. Also, the number of nearest neighbours
    used for averaging distances can be set. The default 0 means
    that all examples are used when calculating average distances.
    """

    def __init__(self):
        self._clear()
        self.set_knn()

    def _clear(self):
        """Forget the examples, the distance measure and the matrix.

        The number of nearest neighbours (``knn``) is left untouched.
        """
        # Flag: 0 while no distance matrix is available, 1 once
        # ``distmatrix`` has been supplied or computed.
        self.distmatrixC = 0

        # Distance measure used to fill the matrix.
        self.distance = None

        self.examples = None
        self.distmatrix = None

    def set_examples(self, examples, distance=None):
        """Set examples on which the outlier detection will be
        performed. ``distance`` is called with two examples and must
        return the distance between them. If omitted, Manhattan
        distance constructed on ``examples`` is used."""
        self._clear()
        self.examples = examples
        # Identity test on purpose: ``== None`` would invoke the
        # comparison of the distance object itself.
        if distance is None:
            distance = Orange.distance.Manhattan(self.examples)
        self.distance = distance

    def set_distance_matrix(self, distances):
        """Set the distance matrix on which the outlier detection
        will be performed. Previously set examples are discarded.
        """
        self._clear()
        self.distmatrix = distances
        self.distmatrixC = 1

    def set_knn(self, knn=0):
        """
        Set the number of nearest neighbours considered when averaging
        distances. The value 0 means that all other examples are used.
        """
        self.knn = knn

    def _calc_distance_matrix(self):
        """
        Fill ``distmatrix`` with the distances between all pairs of
        examples, computed with the current distance measure.
        """
        self.distmatrix = Orange.misc.SymMatrix(len(self.examples))  # FIXME
        # Only the lower triangle (diagonal included) is assigned.
        for i, example in enumerate(self.examples):
            for j in range(i + 1):
                self.distmatrix[i, j] = self.distance(example,
                                                      self.examples[j])
        self.distmatrixC = 1

    def distance_matrix(self):
        """
        Return the distance matrix of the dataset, computing it first
        if it is not available yet.
        """
        if self.distmatrixC == 0:
            self._calc_distance_matrix()
        return self.distmatrix

    def _average_means(self):
        """Return, for every row of the distance matrix, the mean of
        the distances selected by :obj:`_find_nearest_limited`."""
        return [statc.mean(self._find_nearest_limited(i, dist, self.knn))
                for i, dist in enumerate(self.distance_matrix())]

    def _find_nearest_limited(self, i, dist, knn):
        """Return the distances from row ``dist`` used for averaging.

        The entry at index ``i`` (distance of the element to itself)
        is removed. If ``knn`` is 0 the remaining distances are
        returned in their original order; otherwise they are sorted
        and only the first ``min(len(dist) - 1, knn)`` are returned.
        ``dist`` itself is not modified.
        """
        others = list(dist)
        # Slice deletion, so an out-of-range index removes nothing
        # instead of raising.
        del others[i:i + 1]
        if knn == 0:
            return others
        others.sort()
        return others[:min(len(dist) - 1, knn)]

    def z_values(self):
        """ Return a list of Z values of average distances for each element
        to others. N-th number in the list is the Z-value of N-th example.
        """
        means = self._average_means()
        return [statc.z(means, mean) for mean in means]
138
# Register the old camelCase method names as deprecated aliases of the
# current snake_case methods of OutlierDetection.
Orange.misc.deprecated_members(dict(
    setKNN="set_knn",
    setExamples="set_examples",
    setDistanceMatrix="set_distance_matrix",
    distanceMatrix="distance_matrix",
    zValues="z_values",
))(OutlierDetection)
146
Note: See TracBrowser for help on using the repository browser.