#
source:
orange/Orange/preprocess/outliers.py
@
9765:f0d7e3c1a3ad

Revision 9765:f0d7e3c1a3ad, 4.5 KB checked in by Miha Stajdohar <miha.stajdohar@…>, 2 years ago (diff) |
---|

Line | |
---|---|

1 | """ |

2 | .. index:: outlier detection |

3 | |

4 | .. index:: |

5 | single: outlier; detection |

6 | |

7 | ******************************** |

8 | Outlier detection (``outliers``) |

9 | ******************************** |

10 | |

11 | .. autoclass:: OutlierDetection |

12 | :members: |

13 | |

14 | .. rubric:: Examples |

15 | |

16 | The following example prints a list of Z-values of examples in the bridges dataset |

17 | (:download:`outlier1.py <code/outlier1.py>`). |

18 | |

19 | .. literalinclude:: code/outlier1.py |

20 | |

21 | The following example prints 5 examples with the highest Z-scores. Euclidean |

22 | distance is used as a distance measurement and average distance is calculated |

23 | over 3 nearest neighbours (:download:`outlier2.py <code/outlier2.py>`). |

24 | |

25 | .. literalinclude:: code/outlier2.py |

26 | |

27 | The output:: |

28 | |

29 | ['M', 1838, 'HIGHWAY', ?, 2, 'N', 'THROUGH', 'WOOD', '?', 'S', 'WOOD'] Z-score: 1.732 |

30 | ['M', 1818, 'HIGHWAY', ?, 2, 'N', 'THROUGH', 'WOOD', 'SHORT', 'S', 'WOOD'] Z-score: 1.732 |

31 | ['A', 1853, 'RR', ?, 2, 'N', 'DECK', 'WOOD', '?', 'S', 'WOOD'] Z-score: 1.732 |

32 | ['A', 1829, 'AQUEDUCT', ?, 1, 'N', 'THROUGH', 'WOOD', '?', 'S', 'WOOD'] Z-score: 1.733 |

33 | ['A', 1848, 'AQUEDUCT', ?, 1, 'N', 'DECK', 'WOOD', '?', 'S', 'WOOD'] Z-score: 1.733 |

34 | |

35 | """ |

36 | |

37 | import Orange |

38 | from Orange import statc |

39 | |

class OutlierDetection:
    """
    A class for detecting outliers.

    It calculates average distances of each example to other examples
    and converts them to Z-scores. Z-scores higher than zero denote an
    example that is more distant to other examples than average.

    Detection of outliers can be performed directly on examples or on
    an existing distance matrix. Also, the number of nearest neighbours
    used for averaging distances can be set. The default 0 means
    that all examples are used when calculating average distances.
    """

    def __init__(self):
        self._clear()
        self.set_knn()

    def _clear(self):
        """Forget the examples, the distance measure and any distance matrix.

        The ``knn`` setting is left untouched.
        """
        # Flag: 0 while the distance matrix still has to be calculated,
        # 1 once ``self.distmatrix`` holds a usable matrix.
        self.distmatrixC = 0

        # Distance measure used to compare two examples.
        self.distance = None

        self.examples = None
        self.distmatrix = None

    def set_examples(self, examples, distance=None):
        """Set examples on which the outlier detection will be
        performed.

        ``distance`` is called with two examples and must return the
        distance between them. If omitted, a Manhattan distance built
        from ``examples`` is used.
        """
        self._clear()
        self.examples = examples
        # Identity test: ``== None`` would invoke the distance object's own
        # comparison, which may not behave like a plain "was it omitted" check.
        if distance is None:
            distance = Orange.distance.Manhattan(self.examples)
        self.distance = distance

    def set_distance_matrix(self, distances):
        """Set the distance matrix on which the outlier detection
        will be performed.
        """
        self._clear()
        self.distmatrix = distances
        # The matrix is supplied ready-made, so nothing has to be calculated.
        self.distmatrixC = 1

    def set_knn(self, knn=0):
        """
        Set the number of nearest neighbours used when averaging distances.

        With 0 (the default) the distances to all other examples are averaged.
        """
        self.knn = knn

    def _calc_distance_matrix(self):
        """
        Fill ``self.distmatrix`` with the pairwise distances between the
        examples, using ``self.distance``.
        """
        size = len(self.examples)
        self.distmatrix = Orange.core.SymMatrix(size)  # FIXME
        # Only the pairs with j <= i are assigned; the matrix is symmetric.
        for i in range(size):
            for j in range(i + 1):
                self.distmatrix[i, j] = self.distance(self.examples[i],
                                                      self.examples[j])
        self.distmatrixC = 1

    def distance_matrix(self):
        """
        Return the distance matrix of the dataset.

        The matrix is calculated on first use unless one was supplied
        through :obj:`set_distance_matrix`.
        """
        if self.distmatrixC == 0:
            self._calc_distance_matrix()
        return self.distmatrix

    def _average_means(self):
        """Return, for every row of the distance matrix, the mean distance
        to the neighbours selected by :obj:`_find_nearest_limited`."""
        dm = self.distance_matrix()
        return [statc.mean(self._find_nearest_limited(i, row, self.knn))
                for i, row in enumerate(dm)]

    def _find_nearest_limited(self, i, dist, knn):
        """Return the distances in ``dist`` without the ``i``-th entry.

        If ``knn`` is 0 all remaining distances are returned in their
        original order; otherwise the (at most) ``knn`` smallest ones are
        returned in ascending order. ``dist`` itself is not modified.
        """
        others = list(dist)
        # Remove the distance to the same element.
        del others[i:i + 1]
        if knn == 0:
            return others
        others.sort()
        return others[:min(len(dist) - 1, knn)]

    def z_values(self):
        """ Return a list of Z values of average distances for each element
        to others. N-th number in the list is the Z-value of N-th example.
        """
        means = self._average_means()
        return [statc.z(means, value) for value in means]

138 | |

# Mapping of old camelCase member names to their current snake_case names,
# handed to Orange.misc.deprecated_members and applied to OutlierDetection.
Orange.misc.deprecated_members({
    "setKNN": "set_knn",
    "setExamples": "set_examples",
    "setDistanceMatrix": "set_distance_matrix",
    "distanceMatrix": "distance_matrix",
    "zValues": "z_values",
})(OutlierDetection)

146 |

**Note:** See TracBrowser for help on using the repository browser.