#
source:
orange/Orange/preprocess/__init__.py
@
9923:5f55c90e0077

Revision 9923:5f55c90e0077, 13.2 KB checked in by markotoplak, 2 years ago (diff) |
---|

Line | |
---|---|

1 | """ |

2 | .. autoclass:: Preprocessor_discretizeEntropy |

3 | |

4 | .. autoclass:: Preprocessor_removeContinuous |

5 | |

6 | .. autoclass:: Preprocessor_continuize |

7 | |

8 | .. autoclass:: Preprocessor_removeDiscrete |

9 | |

10 | .. autoclass:: Preprocessor_impute |

11 | |

12 | .. autoclass:: Preprocessor_featureSelection |

13 | |

14 | .. autofunction:: bestP |

15 | |

16 | .. autofunction:: bestN |

17 | |

18 | .. autofunction:: selectNRandom |

19 | |

20 | .. autofunction:: selectPRandom |

21 | |

22 | .. autoclass:: Preprocessor_RFE |

23 | |

24 | .. autoclass:: Preprocessor_sample |

25 | |

26 | .. autoclass:: Preprocessor_preprocessorList |

27 | |

28 | .. class:: RemoveUnusedValues(variable, data, remove_one_valued=False) |

29 | |

30 | Removes unused values and reduces the variable, if a variable |

31 | declares values that do not appear in the data. |

32 | |

33 | :param variable: :class:`Orange.feature.Descriptor` |

34 | :param data: :class:`Orange.data.Table` |

35 | :param remove_one_valued: Decides whether to remove or to retain |

36 | the attributes with only one value defined (default: False). |

37 | |

38 | Example: |

39 | |

40 | .. literalinclude:: code/unusedValues.py |

41 | |

42 | There are four possible outcomes: |

43 | |

44 | 1. The variable does not have any used values in the data - value |

45 | of this variable is undefined for all examples. The variable is |

46 | thus useless and the class returns None. |

47 | |

48 | 2. The variable has only one used value (or, possibly, only one |

49 | value at all). Such a variable is in fact useless, and can |

50 | probably be removed without harm. Nevertheless, its fate is |

51 | decided by the flag remove_one_valued which is False by default, |

52 | so such variables are retained unless explicitly specified |

53 | otherwise. |

54 | |

55 | 3. All variable's values occur in the data (and the variable has more |

56 | than one value; otherwise the above case applies). The original variable |

57 | is returned. |

58 | |

59 | 4. There are some unused values. A new variable is constructed and the |

60 | unused values are omitted. The value of the new variable is computed |

61 | automatically from the value of the original variable; |

62 | :class:`Orange.classification.lookup.ClassifierByLookupTable` is used |

63 | for mapping. |

64 | |

65 | Results of example: |

66 | |

67 | .. literalinclude:: code/unusedValues.res |

68 | |

69 | Variables a and y are OK and are left alone. In b, value 1 is not used |

70 | and is removed (not in the original variable, of course; a new variable |

71 | is created). c is useless and is removed altogether. d is retained since |

72 | remove_one_valued was left at False; if we set it to True, this variable |

73 | would be removed as well. |

74 | |

75 | """ |

76 | |

77 | from orange import \ |

78 | DomainContinuizer, \ |

79 | VariableFilterMap, \ |

80 | ValueFilter, \ |

81 | ValueFilter_continuous, \ |

82 | ValueFilter_discrete, \ |

83 | ValueFilter_string, \ |

84 | ValueFilter_stringList, \ |

85 | ValueFilterList, \ |

86 | TransformValue, \ |

87 | Discrete2Continuous, \ |

88 | Discretizer, \ |

89 | BiModalDiscretizer, \ |

90 | EquiDistDiscretizer, \ |

91 | IntervalDiscretizer, \ |

92 | ThresholdDiscretizer, \ |

93 | MapIntValue, \ |

94 | NormalizeContinuous, \ |

95 | Ordinal2Continuous, \ |

96 | TransformValue_IsDefined, \ |

97 | TableAverager, \ |

98 | Preprocessor, \ |

99 | Preprocessor_addCensorWeight, \ |

100 | Preprocessor_addClassNoise, \ |

101 | Preprocessor_addClassWeight, \ |

102 | Preprocessor_addGaussianClassNoise, \ |

103 | Preprocessor_addGaussianNoise, \ |

104 | Preprocessor_addMissing, \ |

105 | Preprocessor_addMissingClasses, \ |

106 | Preprocessor_addNoise, \ |

107 | Preprocessor_discretize, \ |

108 | Preprocessor_drop, \ |

109 | Preprocessor_dropMissing, \ |

110 | Preprocessor_dropMissingClasses, \ |

111 | Preprocessor_filter, \ |

112 | Preprocessor_ignore, \ |

113 | Preprocessor_imputeByLearner, \ |

114 | Preprocessor_removeDuplicates, \ |

115 | Preprocessor_select, \ |

116 | Preprocessor_shuffle, \ |

117 | Preprocessor_take, \ |

118 | Preprocessor_takeMissing, \ |

119 | Preprocessor_takeMissingClasses, \ |

120 | Imputer, \ |

121 | Imputer_asValue, \ |

122 | Imputer_defaults, \ |

123 | Imputer_model, \ |

124 | Imputer_random, \ |

125 | ImputerConstructor, \ |

126 | ImputerConstructor_asValue, \ |

127 | ImputerConstructor_average, \ |

128 | ImputerConstructor_maximal, \ |

129 | ImputerConstructor_minimal, \ |

130 | ImputerConstructor_model, \ |

131 | ImputerConstructor_random, \ |

132 | FilterList, \ |

133 | Filter, \ |

134 | Filter_conjunction, \ |

135 | Filter_disjunction, \ |

136 | Filter_hasClassValue, \ |

137 | Filter_hasMeta, \ |

138 | Filter_hasSpecial, \ |

139 | Filter_isDefined, \ |

140 | Filter_random, \ |

141 | Filter_sameValue, \ |

142 | Filter_values, \ |

143 | Discretization, \ |

144 | BiModalDiscretization, \ |

145 | EntropyDiscretization, \ |

146 | EquiDistDiscretization, \ |

147 | EquiNDiscretization, \ |

148 | DomainTransformerConstructor, \ |

149 | RemoveRedundant, \ |

150 | RemoveRedundantByInduction, \ |

151 | RemoveRedundantByQuality, \ |

152 | RemoveRedundantOneValue, \ |

153 | RemoveUnusedValues |

154 | |

155 | import outliers |

156 | |

157 | |

158 | import math |

159 | |

160 | import orange |

161 | from Orange.misc import _orange__new__, _orange__reduce__ |

162 | |

class Preprocessor_discretizeEntropy(Preprocessor_discretize):
    """ A discretizer that uses the orange.EntropyDiscretization method but,
    unlike the Preprocessor_discretize class, also removes unused attributes
    from the domain: a continuous attribute whose discretization produces
    no cut-off points is left out of the resulting domain.

    :param method: the discretization to apply; must be an instance of
        orange.EntropyDiscretization (default: a new
        orange.EntropyDiscretization instance).

    """

    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __init__(self, method=None):
        # Build the default lazily so that instances do not all share one
        # discretization object created when the class was defined.
        if method is None:
            method = orange.EntropyDiscretization()
        # Raise explicitly: an ``assert`` disappears under ``python -O``.
        if not isinstance(method, orange.EntropyDiscretization):
            raise TypeError("method must be an instance of "
                            "orange.EntropyDiscretization")
        self.method = method

    def __call__(self, data, wightId=0):
        """ Return a new table in which the continuous attributes of `data`
        are replaced by their discretized versions.

        Non-continuous attributes are kept as they are. The class variable
        and the meta attributes of the original domain are carried over.

        :param data: the data to discretize.
        :param wightId: not used by this method. The spelling differs from
            the ``weightId`` used by the other preprocessors in this module
            and is kept so that existing keyword calls keep working.

        """
        newattr_list = []
        for attr in data.domain.attributes:
            if attr.varType != orange.VarTypes.Continuous:
                newattr_list.append(attr)
                continue
            newattr = self.method(attr, data)
            # No cut-off points were found for this attribute, so it is
            # dropped from the new domain.
            if newattr.getValueFrom.transformer.points:
                newattr_list.append(newattr)
        newdomain = orange.Domain(newattr_list, data.domain.classVar)
        newdomain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(newdomain, data)

189 | |

class Preprocessor_removeContinuous(Preprocessor_discretize):
    """ A preprocessor that keeps only the attributes whose type is
    discrete, thereby removing all continuous features.
    """
    __new__ = _orange__new__(Preprocessor_discretize)
    __reduce__ = _orange__reduce__

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its discrete attributes.

        The class variable and the meta attributes of the original domain
        are retained. `weightId` is accepted but not used here.

        """
        source = data.domain
        kept = []
        for feature in source.attributes:
            if feature.varType == orange.VarTypes.Discrete:
                kept.append(feature)
        reduced = orange.Domain(kept, source.classVar)
        reduced.addmetas(source.getmetas())
        return orange.ExampleTable(reduced, data)

201 | |

class Preprocessor_continuize(orange.Preprocessor):
    """ A preprocessor that continuizes a discrete domain (and optionally
    normalizes it). The settings are handed to
    :obj:`Orange.feature.continuization.DomainContinuizer`; see that class
    for the list of accepted arguments.

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, zeroBased=True,
                 multinomialTreatment=orange.DomainContinuizer.NValues,
                 continuousTreatment=orange.DomainContinuizer.Leave,
                 classTreatment=orange.DomainContinuizer.Ignore,
                 **kwargs):
        # Extra keyword arguments are accepted but not used in this method.
        self.zeroBased = zeroBased
        self.multinomialTreatment = multinomialTreatment
        self.continuousTreatment = continuousTreatment
        self.classTreatment = classTreatment

    def __call__(self, data, weightId=0):
        """ Build a continuized domain for `data` with the stored settings
        and return `data` translated into that domain.
        """
        settings = dict(
            zeroBased=self.zeroBased,
            multinomialTreatment=self.multinomialTreatment,
            continuousTreatment=self.continuousTreatment,
            classTreatment=self.classTreatment,
        )
        make_domain = orange.DomainContinuizer(**settings)
        return data.translate(make_domain(data, weightId))

226 | |

class Preprocessor_removeDiscrete(Preprocessor_continuize):
    """ A preprocessor that keeps only the attributes whose type is
    continuous, thereby removing all discrete attributes from the domain.
    """
    __new__ = _orange__new__(Preprocessor_continuize)

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to its continuous attributes.

        The class variable and the meta attributes of the original domain
        are retained. `weightId` is accepted but not used here.

        """
        source = data.domain
        kept = []
        for feature in source.attributes:
            if feature.varType == orange.VarTypes.Continuous:
                kept.append(feature)
        reduced = orange.Domain(kept, source.classVar)
        reduced.addmetas(source.getmetas())
        return orange.ExampleTable(reduced, data)

237 | |

class Preprocessor_impute(orange.Preprocessor):
    """ A preprocessor that fills in unknown values with the help of a
    learner.

    :param model: the learner to use (default: a new
        orange.MajorityLearner instance).

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, model=None, **kwargs):
        # Extra keyword arguments are accepted but not used in this method.
        if model is None:
            model = orange.MajorityLearner()
        self.model = model

    def __call__(self, data, weightId=0):
        """ Delegate to orange.Preprocessor_imputeByLearner, passing the
        stored model as its learner. `weightId` is not used here.
        """
        learner = self.model
        return orange.Preprocessor_imputeByLearner(data, learner=learner)

252 | |

def bestN(attrMeasures, N=10):
    """ Return the best N attributes, i.e. the last N items of
    `attrMeasures`.

    :param attrMeasures: a sequence ordered so that the best items come
        last (the order in which
        Preprocessor_featureSelection.attrScores returns its scores).
    :param N: the number of items to return (default 10). If the sequence
        is shorter than N, all of it is returned; if N is zero or negative,
        an empty slice is returned.

    """
    if N <= 0:
        # -0 == 0, so attrMeasures[-N:] would return the whole sequence
        # when no items were requested.
        return attrMeasures[:0]
    return attrMeasures[-N:]

257 | |

def bestP(attrMeasures, P=10):
    """ Return the best P percent of attributes, i.e. the tail of
    `attrMeasures`.

    The number of items is P percent of the length, rounded up, and never
    less than one.

    """
    total = len(attrMeasures)
    keep = int(math.ceil(total * P / 100.0))
    if keep < 1:
        keep = 1
    return attrMeasures[-keep:]

263 | |

class Preprocessor_featureSelection(orange.Preprocessor):
    """ A preprocessor that runs feature selection using a feature scoring
    function.

    :param measure: a scoring function (default: a new
        orange.MeasureAttribute_relief instance)
    :param filter: a filter function to use for selection (default
        Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    bestN = staticmethod(bestN)
    bestP = staticmethod(bestP)

    def __init__(self, measure=None, filter=None, limit=10):
        # Create the default measure per instance rather than once when the
        # class is defined, so that instances do not share one object.
        if measure is None:
            measure = orange.MeasureAttribute_relief()
        self.measure = measure
        self.filter = filter if filter is not None else self.bestN
        self.limit = limit

    def attrScores(self, data):
        """ Return a list of ``(score, attribute)`` pairs for all attributes
        in `data`, sorted by ascending score.
        """
        measures = [(self.measure(attr, data), attr)
                    for attr in data.domain.attributes]
        # Order by the score alone; sorting the raw tuples would compare
        # the feature descriptors themselves whenever two scores are equal.
        measures.sort(key=lambda pair: pair[0])
        return measures

    def __call__(self, data, weightId=None):
        """ Return a copy of `data` restricted to the attributes that the
        filter function selects from the computed scores.

        The class variable and the meta attributes of the original domain
        are retained. `weightId` is accepted but not used here.

        """
        measures = self.attrScores(data)
        attrs = [attr for _, attr in self.filter(measures, self.limit)]
        domain = orange.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(domain, data)

295 | |

class Preprocessor_RFE(Preprocessor_featureSelection):
    """ A preprocessor that runs RFE (Recursive Feature Elimination) using
    linear SVM derived attribute weights.

    :param filter: a filter function to use for selection (default
        Preprocessor_featureSelection.bestN)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(Preprocessor_featureSelection)
    __reduce__ = _orange__reduce__

    def __init__(self, filter=None, limit=10):
        self.limit = limit
        self.filter = filter if filter is not None else self.bestN

    def __call__(self, data, weightId=None):
        """ Run RFE on `data`, keeping as many features as the filter
        function selects for the stored limit.

        `weightId` is accepted but not used here.

        """
        from Orange.classification.svm import RFE
        rfe = RFE()
        # The filter is only used to work out how many features to keep, so
        # it must be applied to the attributes, not to the examples.
        candidates = range(len(data.domain.attributes))
        filtered = self.filter(candidates, self.limit)
        return rfe(data, len(filtered))

316 | |

def selectNRandom(examples, N=10):
    """ Select N random examples.

    :param examples: the sequence to sample from.
    :param N: the number of examples to select (default 10). If fewer
        than N examples are available, all of them are returned (in random
        order).

    A negative N raises ValueError (from random.sample).

    """
    import random
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the population size.
    return random.sample(examples, min(N, len(examples)))

322 | |

def selectPRandom(examples, P=10):
    """ Select P percent random examples.

    :param examples: the sequence to sample from.
    :param P: the percentage of examples to select (default 10). The
        number of examples is rounded up and is at least one, but never
        more than the number of examples available; an empty input
        therefore yields an empty list.

    """
    import random
    count = len(examples)
    wanted = max(int(math.ceil(count * P / 100.0)), 1)
    # random.sample raises ValueError when asked for more items than the
    # population holds (empty input, or P above 100), so cap the request.
    return random.sample(examples, min(wanted, count))

329 | |

class Preprocessor_sample(orange.Preprocessor):
    """ A preprocessor that returns a sampled subset of the data.

    :param filter: a filter function to use for selection (default
        Preprocessor_sample.selectNRandom)
    :param limit: the limit for the filter function (default 10)

    """
    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    selectNRandom = staticmethod(selectNRandom)
    selectPRandom = staticmethod(selectPRandom)

    def __init__(self, filter=None, limit=10):
        if filter is None:
            filter = self.selectNRandom
        self.filter = filter
        self.limit = limit

    def __call__(self, data, weightId=None):
        """ Return a new table over the domain of `data` that holds the
        examples chosen by the filter function. `weightId` is not used here.
        """
        domain = data.domain
        chosen = self.filter(data, self.limit)
        return orange.ExampleTable(domain, chosen)

350 | |

351 | |

class Preprocessor_preprocessorList(orange.Preprocessor):
    """ A preprocessor wrapping a sequence of other preprocessors.

    :param preprocessors: a list of :obj:`Preprocessor` instances
        (default: a new empty list)

    """

    __new__ = _orange__new__(orange.Preprocessor)
    __reduce__ = _orange__reduce__

    def __init__(self, preprocessors=None):
        # A literal ``[]`` default would be one list shared by every
        # instance created without arguments; create a fresh one instead.
        # A list supplied by the caller is stored as is (not copied).
        self.preprocessors = [] if preprocessors is None else preprocessors

    def __call__(self, data, weightId=None):
        """ Apply the wrapped preprocessors to `data` one after another.

        A preprocessor is called with a weight id once one is available:
        either the `weightId` given here or one returned by an earlier
        preprocessor as the second item of a ``(data, weightId)`` tuple.

        :return: ``(data, weightId)`` if a `weightId` was given, otherwise
            just the preprocessed data.

        """
        hadWeight = hasWeight = weightId is not None
        for preprocessor in self.preprocessors:
            if hasWeight:
                t = preprocessor(data, weightId)
            else:
                t = preprocessor(data)
            if isinstance(t, tuple):
                # The preprocessor produced a weight id; pass it on to the
                # preprocessors that follow.
                data, weightId = t
                hasWeight = True
            else:
                data = t
        if hadWeight:
            return data, weightId
        else:
            return data

379 |

**Note:** See TracBrowser for help on using the repository browser.