#
source:
orange/Orange/misc/selection.py
@
9732:470857cc675f

Revision 9732:470857cc675f, 8.8 KB checked in by crt.gorup@…, 2 years ago (diff) |
---|

Line | |
---|---|

1 | """ |

2 | ========================= |

3 | Selection (``selection``) |

4 | ========================= |

5 | |

6 | .. index:: selection |

7 | .. index:: |

8 | single: misc; selection |

9 | |

10 | Many machine learning techniques generate a set of different solutions or |

11 | have to choose, as for instance in classification tree induction, between |

12 | different features. The most trivial solution is to iterate through the |

13 | candidates, compare them and remember the optimal one. The problem occurs, |

14 | however, when there are multiple candidates that are equally good, and the |

15 | naive approaches would select the first or the last one, depending upon the |

16 | formulation of the if-statement. |

17 | |

18 | :class:`Orange.misc.selection` provides a class that makes a random choice |

19 | in such cases. Each new candidate is compared with the currently optimal |

20 | one; it replaces the optimal if it is better, while if they are equal, |

21 | one is chosen by random. The number of competing optimal candidates is stored, |

22 | so in this random choice the probability to select the new candidate (over the |

23 | current one) is 1/w, where w is the current number of equal candidates, |

24 | including the present one. One can easily verify that this gives equal |

25 | chances to all candidates, independent of the order in which they are |

26 | presented. |

27 | |

28 | Example |

29 | -------- |

30 | |

31 | The following snippet loads the data set lymphography and prints out the |

32 | feature with the highest information gain. |

33 | |

34 | part of :download:`misc-selection-bestonthefly.py <code/misc-selection-bestonthefly.py>` (uses :download:`lymphography.tab <code/lymphography.tab>`) |

35 | |

36 | .. literalinclude:: code/misc-selection-bestonthefly.py |

37 | :lines: 7-16 |

38 | |

39 | Our candidates are tuples gain ratios and features, so we set |

40 | :obj:`call_compare_on_1st` to make the compare function compare the first |

41 | element (gain ratios). We could achieve the same by initializing the object |

42 | like this: |

43 | |

44 | part of :download:`misc-selection-bestonthefly.py <code/misc-selection-bestonthefly.py>` (uses :download:`lymphography.tab <code/lymphography.tab>`) |

45 | |

46 | .. literalinclude:: code/misc-selection-bestonthefly.py |

47 | :lines: 18-18 |

48 | |

49 | |

50 | The other way to do it is through indices. |

51 | |

52 | :download:`misc-selection-bestonthefly.py <code/misc-selection-bestonthefly.py>` (uses :download:`lymphography.tab <code/lymphography.tab>`) |

53 | |

54 | .. literalinclude:: code/misc-selection-bestonthefly.py |

55 | :lines: 25- |

56 | |

57 | Here we only give gain ratios to :obj:`BestOnTheFly`, so we don't have |

58 | to specify a special compare operator. After checking all features we |

59 | get the index of the optimal one by calling :obj:`winner_index`. |

60 | |

61 | .. autoclass:: BestOnTheFly |

62 | |

63 | .. autofunction:: select_best |

64 | |

65 | .. autofunction:: select_best_index |

66 | |

67 | .. autofunction:: compare_first_bigger |

68 | |

69 | .. autofunction:: compare_first_smaller |

70 | |

71 | .. autofunction:: compare_last_bigger |

72 | |

73 | .. autofunction:: compare_last_smaller |

74 | |

75 | .. autofunction:: compare_bigger |

76 | |

77 | .. autofunction:: compare_smaller |

78 | |

79 | """ |

80 | |

81 | import random |

82 | |

83 | from Orange.misc import deprecated_members, deprecated_function_name, \ |

84 | deprecated_keywords |

85 | |

86 | class BestOnTheFly: |

87 | """ |

88 | Finds the optimal object in a sequence of objects. The class is fed the |

89 | candidates one by one, and remembers the winner. It can thus be used by |

90 | methods that generate different solutions to a problem and need to |

91 | select the optimal one, but do not want to store them all. |

92 | |

93 | :param compare: compare function. |

94 | :param seed: If not given, a random seed of 0 is used to ensure that\ |

95 | the same experiment always gives the same results, despite\ |

96 | pseudo-randomness.random seed. |

97 | :type seed: int |

98 | :param call_compare_on_1st: If set, :obj:`BestOnTheFly` will suppose\ |

99 | that the candidates are lists are tuples, and it will call compare\ |

100 | with the first element of the tuple. |

101 | :type call_compare_on_1st: bool |

102 | """ |

103 | |

104 | def __init__(self, compare=cmp, seed = 0, call_compare_on_1st = False): |

105 | self.randomGenerator = random.RandomGenerator(seed) |

106 | self.compare=compare |

107 | self.wins=0 |

108 | self.bestIndex, self.index = -1, -1 |

109 | self.best = None |

110 | self.call_compare_on_1st = call_compare_on_1st |

111 | |

112 | def candidate(self, x): |

113 | """Add new candidate. |

114 | |

115 | :param x: new candidate. |

116 | :type x: object |

117 | """ |

118 | self.index += 1 |

119 | if not self.wins: |

120 | self.best=x |

121 | self.wins=1 |

122 | self.bestIndex=self.index |

123 | return 1 |

124 | else: |

125 | if self.call_compare_on_1st: |

126 | cmpr=self.compare(x[0], self.best[0]) |

127 | else: |

128 | cmpr=self.compare(x, self.best) |

129 | if cmpr>0: |

130 | self.best=x |

131 | self.wins=1 |

132 | self.bestIndex=self.index |

133 | return 1 |

134 | elif cmpr==0: |

135 | self.wins=self.wins+1 |

136 | if not self.randomGenerator(self.wins): |

137 | self.best=x |

138 | self.bestIndex=self.index |

139 | return 1 |

140 | return 0 |

141 | |

142 | def winner(self): |

143 | """Return (currently) optimal object. This function can be called |

144 | any number of times, even when the candidates are still coming. |

145 | |

146 | :rtype: object |

147 | """ |

148 | return self.best |

149 | |

150 | def winner_index(self): |

151 | """Return the index of the optimal object within the sequence of |

152 | the candidates. |

153 | |

154 | :rtype: int |

155 | """ |

156 | if self.best is not None: |

157 | return self.bestIndex |

158 | else: |

159 | return None |

160 | |

161 | BestOnTheFly = deprecated_members({"callCompareOn1st": "call_compare_on_1st", |

162 | "winnerIndex": "winner_index", |

163 | }, |

164 | wrap_methods=["__init__"])(BestOnTheFly) |

165 | |

166 | |

167 | @deprecated_keywords({"callCompareOn1st": "call_compare_on_1st"}) |

168 | def select_best(x, compare=cmp, seed = 0, call_compare_on_1st = False): |

169 | """Return the optimal object from list x. The function is used if the candidates |

170 | are already in the list, so using the more complicated :obj:`BestOnTheFly` directly is |

171 | not needed. |

172 | |

173 | To demonstrate the use of :obj:`BestOnTheFly` see the implementation of |

174 | :obj:`selectBest`:: |

175 | |

176 | def selectBest(x, compare=cmp, seed = 0, call_compare_on_1st = False): |

177 | bs=BestOnTheFly(compare, seed, call_compare_on_1st) |

178 | for i in x: |

179 | bs.candidate(i) |

180 | return bs.winner() |

181 | |

182 | :param x: list of existing candidates. |

183 | :type x: list |

184 | :param compare: compare function. |

185 | :param seed: If not given, a random seed of 0 is used to ensure that |

186 | the same experiment always gives the same results, despite |

187 | pseudo-randomness.random seed. |

188 | :type seed: int |

189 | :param call_compare_on_1st: If set, :obj:`BestOnTheFly` will suppose |

190 | that the candidates are lists are tuples, and it will call compare |

191 | with the first element of the tuple. |

192 | :type call_compare_on_1st: bool |

193 | :rtype: object |

194 | |

195 | """ |

196 | bs=BestOnTheFly(compare, seed, call_compare_on_1st) |

197 | for i in x: |

198 | bs.candidate(i) |

199 | return bs.winner() |

200 | |

201 | selectBest = deprecated_function_name(select_best) |

202 | |

203 | |

204 | @deprecated_keywords({"callCompareOn1st": "call_compare_on_1st"}) |

205 | def select_best_index(x, compare=cmp, seed = 0, call_compare_on_1st = False): |

206 | """Similar to :obj:`selectBest` except that it doesn't return the best object |

207 | but its index in the list x. |

208 | """ |

209 | bs=BestOnTheFly(compare, seed, call_compare_on_1st) |

210 | for i in x: |

211 | bs.candidate(i) |

212 | return bs.winner_index() |

213 | |

214 | selectBestIndex = deprecated_function_name(select_best_index) |

215 | |

216 | |

217 | def compare_first_bigger(x, y): |

218 | """Function takes two lists and compares first elements. |

219 | |

220 | :param x: list of values. |

221 | :type x: list |

222 | :param y: list of values. |

223 | :type y: list |

224 | :rtype: cmp(x[0], y[0]) |

225 | """ |

226 | return cmp(x[0], y[0]) |

227 | |

228 | def compare_first_smaller(x, y): |

229 | """Function takes two lists and compares first elements. |

230 | |

231 | :param x: list of values. |

232 | :type x: list |

233 | :param y: list of values. |

234 | :type y: list |

235 | :rtype: -cmp(x[0], y[0]) |

236 | """ |

237 | return -cmp(x[0], y[0]) |

238 | |

239 | def compare_last_bigger(x, y): |

240 | """Function takes two lists and compares last elements. |

241 | |

242 | :param x: list of values. |

243 | :type x: list |

244 | :param y: list of values. |

245 | :type y: list |

246 | :rtype: cmp(x[0], y[0]) |

247 | """ |

248 | return cmp(x[-1], y[-1]) |

249 | |

250 | def compare_last_smaller(x, y): |

251 | """Function takes two lists and compares last elements. |

252 | |

253 | :param x: list of values. |

254 | :type x: list |

255 | :param y: list of values. |

256 | :type y: list |

257 | :rtype: -cmp(x[0], y[0]) |

258 | """ |

259 | return -cmp(x[-1], y[-1]) |

260 | |

261 | def compare_bigger(x, y): |

262 | """Function takes and compares two numbers. |

263 | |

264 | :param x: value. |

265 | :type x: int |

266 | :param y: value. |

267 | :type y: int |

268 | :rtype: cmp(x, y) |

269 | """ |

270 | return cmp(x, y) |

271 | |

272 | def compare_smaller(x, y): |

273 | """Function takes and compares two numbers. |

274 | |

275 | :param x: value. |

276 | :type x: int |

277 | :param y: value. |

278 | :type y: int |

279 | :rtype: cmp(x, y) |

280 | """ |

281 | return -cmp(x, y) |

**Note:**See TracBrowser for help on using the repository browser.