[8042] | 1 | import math |

2 | ||

3 | from collections import defaultdict | |

[10618] | 4 | from operator import add |

[8042] | 5 | |

6 | import Orange.core | |

7 | import Orange.data | |

[8990] | 8 | import Orange.misc |

[9011] | 9 | import Orange.feature |

[8990] | 10 | |

[8042] | 11 | import kernels |

12 | import warnings | |

13 | ||

14 | from Orange.core import SVMLearner as _SVMLearner | |

15 | from Orange.core import SVMLearnerSparse as _SVMLearnerSparse | |

16 | from Orange.core import LinearClassifier, \ | |

17 | LinearLearner, \ | |

[10618] | 18 | SVMClassifier as _SVMClassifier, \ |

19 | SVMClassifierSparse as _SVMClassifierSparse | |

[10300] | 20 | |

[10542] | 21 | from Orange.data import preprocess |

[11397] | 22 | from Orange.data.preprocess import DomainContinuizer |

[8042] | 23 | |

[10030] | 24 | from Orange import feature as variable |

[9229] | 25 | |

[10581] | 26 | from Orange.utils import _orange__new__ |

[8042] | 27 | |

[10955] | 28 | |

def max_nu(data):
    """
    Return the maximum nu parameter for the given data table for
    Nu_SVC learning.

    The result is the minimum of ``2 * min(n1, n2) / (n1 + n2)`` over all
    pairs of class frequencies ``n1``, ``n2`` that are both non-zero, and
    is never greater than 1.0 (1.0 is returned when no such pair exists).

    :param data: Data with discrete class variable
    :type data: Orange.data.Table

    :rtype: float

    """
    # Function-scope import: keeps the module level import block untouched.
    from itertools import combinations

    dist = list(Orange.core.Distribution(data.domain.class_var, data))

    # 1.0 is both the upper limit and the fallback for an empty pair list.
    bounds = [1.0]
    bounds.extend(2.0 * min(n1, n2) / (n1 + n2)
                  for n1, n2 in combinations(dist, 2)
                  if n1 != 0 and n2 != 0)
    return min(bounds)

# Backwards compatible (camelCase) alias.
maxNu = max_nu

[10300] | 49 | |

[10954] | 50 | |

def is_discrete(feature):
    """Return `True` if `feature` is an :class:`Orange.feature.Discrete`
    instance, `False` otherwise.

    """
    discrete_type = Orange.feature.Discrete
    return isinstance(feature, discrete_type)

53 | ||

54 | ||

def is_continuous(feature):
    """Return `True` if `feature` is an :class:`Orange.feature.Continuous`
    instance, `False` otherwise.

    """
    continuous_type = Orange.feature.Continuous
    return isinstance(feature, continuous_type)

57 | ||

58 | ||

class SVMLearner(_SVMLearner):
    """
    :param svm_type: the SVM type
    :type svm_type: SVMLearner.SVMType
    :param kernel_type: the kernel type
    :type kernel_type: SVMLearner.Kernel
    :param degree: kernel parameter (only for ``Polynomial``)
    :type degree: int
    :param gamma: kernel parameter; if 0, it is set to 1.0/#features
        (for ``Polynomial``, ``RBF`` and ``Sigmoid``)
    :type gamma: float
    :param coef0: kernel parameter (for ``Polynomial`` and ``Sigmoid``)
    :type coef0: int
    :param kernel_func: kernel function if ``kernel_type`` is
        ``kernels.Custom``
    :type kernel_func: callable object
    :param C: C parameter (for ``C_SVC``, ``Epsilon_SVR`` and ``Nu_SVR``)
    :type C: float
    :param nu: Nu parameter (for ``Nu_SVC``, ``Nu_SVR`` and ``OneClass``)
    :type nu: float
    :param p: epsilon parameter (for ``Epsilon_SVR``)
    :type p: float
    :param cache_size: cache memory size in MB
    :type cache_size: int
    :param eps: tolerance of termination criterion
    :type eps: float
    :param probability: build a probability model
    :type probability: bool
    :param shrinking: use shrinking heuristics
    :type shrinking: bool
    :param normalization: normalize the input data prior to learning into
        range [0..1] and replace discrete features with indicator columns
        one for each value of the feature using
        :class:`~Orange.data.continuization.DomainContinuizer` class
        (default ``True``)
    :type normalization: bool
    :param weight: a list of class weights (default: an empty list)
    :type weight: list
    :param verbose: If `True` show training progress (default is `False`).
    :type verbose: bool

    Example:

        >>> import Orange
        >>> from Orange.classification import svm
        >>> from Orange.evaluation import testing, scoring
        >>> data = Orange.data.Table("vehicle.tab")
        >>> learner = svm.SVMLearner()
        >>> results = testing.cross_validation([learner], data, folds=5)
        >>> print "CA:  %.4f" % scoring.CA(results)[0]
        CA:  0.7908
        >>> print "AUC: %.4f" % scoring.AUC(results)[0]
        AUC: 0.9565

    """
    __new__ = _orange__new__(_SVMLearner)

    # Re-export the SVM type constants of the native learner.
    C_SVC = _SVMLearner.C_SVC
    Nu_SVC = _SVMLearner.Nu_SVC
    OneClass = _SVMLearner.OneClass
    Nu_SVR = _SVMLearner.Nu_SVR
    Epsilon_SVR = _SVMLearner.Epsilon_SVR

    @Orange.utils.deprecated_keywords({"kernelFunc": "kernel_func"})
    def __init__(self, svm_type=Nu_SVC, kernel_type=kernels.RBF,
                 kernel_func=None, C=1.0, nu=0.5, p=0.1, gamma=0.0, degree=3,
                 coef0=0, shrinking=True, probability=True, verbose=False,
                 cache_size=200, eps=0.001, normalization=True,
                 weight=None, **kwargs):
        """Store the learning parameters and create the native learner.

        Any extra keyword arguments are set as attributes on this
        instance and are also passed to the ``Orange.core.SVMLearner``
        constructor.

        """
        self.svm_type = svm_type
        self.kernel_type = kernel_type
        self.kernel_func = kernel_func
        self.C = C
        self.nu = nu
        self.p = p
        self.gamma = gamma
        self.degree = degree
        self.coef0 = coef0
        self.shrinking = shrinking
        self.probability = probability
        self.verbose = verbose
        self.cache_size = cache_size
        self.eps = eps
        self.normalization = normalization
        for key, val in kwargs.items():
            setattr(self, key, val)
        self.learner = Orange.core.SVMLearner(**kwargs)
        # A ``None`` default (instead of a literal ``[]``) gives every
        # instance its own list; a list default would be shared by all
        # learners constructed without an explicit ``weight``.
        self.weight = [] if weight is None else weight

    max_nu = staticmethod(max_nu)

    def __call__(self, data, weight=0):
        """Construct a SVM classifier

        Instances with a missing class value are dropped first. If
        ``svm_type`` does not match the type of the class variable it is
        switched in place (``self.svm_type`` is modified), and for
        ``Nu_SVC`` an infeasible ``nu`` is lowered for this call only.

        :param data: data with continuous features
        :type data: Orange.data.Table

        :param weight: ignored (required due to base class signature);

        :raises ValueError: if no instance has a defined class or if a
            custom kernel is selected without ``kernel_func``.

        """

        examples = Orange.core.Preprocessor_dropMissingClasses(data)
        class_var = examples.domain.class_var
        if len(examples) == 0:
            raise ValueError("Example table is without any defined classes")

        # Fix the svm_type parameter if we have a class_var/svm_type mismatch
        # NOTE(review): the literals presumably follow the libsvm numbering
        # (0/1 = C_SVC/Nu_SVC, 3/4 = Epsilon_SVR/Nu_SVR) - confirm.
        if self.svm_type in [0, 1] and \
                isinstance(class_var, Orange.feature.Continuous):
            self.svm_type += 3

        if self.svm_type in [3, 4] and \
                isinstance(class_var, Orange.feature.Discrete):
            self.svm_type -= 3

        if self.kernel_type == kernels.Custom and not self.kernel_func:
            raise ValueError("Custom kernel function not supplied")

        nu = self.nu
        if self.svm_type == SVMLearner.Nu_SVC:
            # Check if nu is feasible
            nu_limit = self.max_nu(examples)
            if self.nu > nu_limit:
                if getattr(self, "verbose", 0):
                    # Adjacent literals instead of a backslash continuation
                    # inside the string, which would embed the source
                    # indentation in the message.
                    warnings.warn("Specified nu %.3f is infeasible. "
                                  "Setting nu to %.3f" % (self.nu, nu_limit))
                nu = max(nu_limit - 1e-7, 0.0)

        # Copy the current parameter values to the native learner.
        for name in ["svm_type", "kernel_type", "kernel_func", "C", "nu", "p",
                     "gamma", "degree", "coef0", "shrinking", "probability",
                     "verbose", "cache_size", "eps"]:
            setattr(self.learner, name, getattr(self, name))

        # Override with the (possibly lowered) feasible nu.
        self.learner.nu = nu
        self.learner.set_weights(self.weight)

        if self.svm_type == SVMLearner.OneClass and self.probability:
            self.learner.probability = False
            warnings.warn("One-class SVM probability output not supported.")
        return self.learn_classifier(examples)

    def learn_classifier(self, data):
        """Train the native learner on ``data`` (normalized first when
        ``self.normalization`` is set) and return the resulting model
        wrapped in a :class:`SVMClassifier`.

        """
        if self.normalization:
            data = self._normalize(data)
        svm = self.learner(data)
        return SVMClassifier(svm)

    @Orange.utils.deprecated_keywords({"progressCallback": "progress_callback"})
    def tune_parameters(self, data, parameters=None, folds=5, verbose=0,
                        progress_callback=None):
        """Tune the ``parameters`` on the given ``data`` using
        internal cross validation.

        ``nu`` is searched only for ``Nu_SVC``/``Nu_SVR``; ``C`` is
        searched only when the ``nu`` search does not apply; ``gamma`` is
        searched only when ``kernel_type`` equals 2.

        :param data: data for parameter tuning
        :type data: Orange.data.Table
        :param parameters: names of parameters to tune
            (default: ["nu", "C", "gamma"])
        :type parameters: list of strings
        :param folds: number of folds for internal cross validation
        :type folds: int
        :param verbose: set verbose output
        :type verbose: bool
        :param progress_callback: callback function for reporting progress
        :type progress_callback: callback function

        Here is example of tuning the `gamma` parameter using
        3-fold cross validation. ::

            svm = Orange.classification.svm.SVMLearner()
            svm.tune_parameters(table, parameters=["gamma"], folds=3)

        """

        import orngWrap

        if parameters is None:
            parameters = ["nu", "C", "gamma"]

        searchParams = []
        normalization = self.normalization
        if normalization:
            # Normalize once up front and switch normalization off so it
            # is not repeated while the tuner trains this learner.
            data = self._normalize(data)
            self.normalization = False
        try:
            if self.svm_type in [SVMLearner.Nu_SVC, SVMLearner.Nu_SVR] \
                    and "nu" in parameters:
                if isinstance(data.domain.class_var, variable.Discrete):
                    max_nu = max(self.max_nu(data) - 1e-7, 0.0)
                else:
                    max_nu = 1.0
                searchParams.append(("nu", [i / 10.0 for i in range(1, 9) if \
                                            i / 10.0 < max_nu] + [max_nu]))
            elif "C" in parameters:
                searchParams.append(("C", [2 ** a for a in range(-5, 15, 2)]))

            if self.kernel_type == 2 and "gamma" in parameters:
                searchParams.append(("gamma",
                                     [2 ** a for a in range(-5, 5, 2)] + [0])
                                    )
            # NOTE(review): the tuner presumably stores the best values on
            # ``self`` (``object=self``) - confirm in orngWrap.
            tunedLearner = orngWrap.TuneMParameters(object=self,
                                parameters=searchParams,
                                folds=folds,
                                returnWhat=orngWrap.TuneMParameters.returnLearner,
                                progressCallback=progress_callback
                                if progress_callback else lambda i: None)
            tunedLearner(data, verbose=verbose)
        finally:
            # Restore the flag even if tuning fails.
            if normalization:
                self.normalization = normalization

    def _normalize(self, data):
        """Return ``data`` translated to a continuized domain: continuous
        features normalized by span, one column per value for multinomial
        features, class variable left untouched.

        """
        dc = preprocess.DomainContinuizer()
        dc.class_treatment = preprocess.DomainContinuizer.Ignore
        dc.continuous_treatment = preprocess.DomainContinuizer.NormalizeBySpan
        dc.multinomial_treatment = preprocess.DomainContinuizer.NValues
        newdomain = dc(data)
        return data.translate(newdomain)


# Keep the old camelCase member names working.
SVMLearner = Orange.utils.deprecated_members({
    "learnClassifier": "learn_classifier",
    "tuneParameters": "tune_parameters",
    "kernelFunc": "kernel_func",
    },
    wrap_methods=["__init__", "tune_parameters"])(SVMLearner)

281 | ||

[10954] | 282 | |

class SVMClassifier(_SVMClassifier):
    """Wrapper around a native ``_SVMClassifier``/``_SVMClassifierSparse``
    instance.

    libsvm orders classes as listed on the ``label`` line of its model;
    this wrapper exposes support vectors, coefficients, rho, probability
    parameters and decision values reordered to follow the order of
    ``class_var.values``.

    """

    def __new__(cls, *args, **kwargs):
        if args and isinstance(args[0], _SVMClassifier):
            # Will wrap a C++ object
            return _SVMClassifier.__new__(cls, name=args[0].name)
        elif args and isinstance(args[0], variable.Descriptor):
            # The constructor call for the C++ object.
            # This is a hack to support loading of old pickled classifiers
            return _SVMClassifier.__new__(_SVMClassifier, *args, **kwargs)
        else:
            raise ValueError("Expected a native SVM classifier or a "
                             "feature descriptor as the first argument")

    def __init__(self, wrapped):
        """
        :param wrapped: the native classifier to wrap; its type must be
            exactly ``_SVMClassifier`` or ``_SVMClassifierSparse``.

        """
        self.class_var = wrapped.class_var
        self.domain = wrapped.domain
        self.computes_probabilities = wrapped.computes_probabilities
        self.examples = wrapped.examples
        self.svm_type = wrapped.svm_type
        self.kernel_func = wrapped.kernel_func
        self.kernel_type = wrapped.kernel_type
        self.__wrapped = wrapped

        assert(type(wrapped) in [_SVMClassifier, _SVMClassifierSparse])

        if self.svm_type in [SVMLearner.C_SVC, SVMLearner.Nu_SVC] \
                and len(wrapped.support_vectors) > 0:
            # Reorder the support vectors of the binary classifiers
            label_map = self._get_libsvm_labels_map()
            start = 0
            support_vectors = []
            # Split the flat support vector list into per class groups
            # (in libsvm label order).
            for n in wrapped.n_SV:
                support_vectors.append(
                    wrapped.support_vectors[start: start + n]
                )
                start += n
            # Re-concatenate the groups in `class_var.values` order,
            # skipping classes that are missing from the model.
            support_vectors = [support_vectors[i] for i in label_map \
                               if i is not None]
            support_vectors = reduce(add, support_vectors)
            self.support_vectors = Orange.data.Table(support_vectors)
        else:
            self.support_vectors = wrapped.support_vectors

    @property
    def coef(self):
        """Coefficients of the underlying svm model.

        If this is a classification model then this is a list of
        coefficients for each binary 1vs1 classifiers, i.e.
        #Classes * (#Classes - 1) / 2 list of lists (for the classes
        present in the model) where each sublist contains tuples of
        (coef, support_vector_index)

        For regression models it is still a list of lists (for consistency)
        but of length 1 e.g. [[(coef, support_vector_index), ... ]]

        """
        if is_discrete(self.class_var):
            # We need to reorder the coef values
            # see http://www.csie.ntu.edu.tw/~cjlin/libsvm/faq.html#f804
            # for more information on how the coefs are stored by libsvm
            # internally.
            import numpy as np
            c_map = self._get_libsvm_bin_classifier_map()
            label_map = self._get_libsvm_labels_map()
            coef = []
            n_class = len(label_map)
            n_SV = self.__wrapped.n_SV
            coef_array = np.array(self.__wrapped.coef)
            # Index of the current (non-skipped) class pair in `c_map`.
            p = 0
            # Start offsets of each class' support vectors, in libsvm
            # order and in `class_var.values` order respectively.
            libsvm_class_indices = np.cumsum([0] + list(n_SV), dtype=int)
            class_indices = np.cumsum([0] + list(self.n_SV), dtype=int)
            for i in range(n_class - 1):
                for j in range(i + 1, n_class):
                    ni = label_map[i]
                    nj = label_map[j]

                    if ni is None or nj is None:
                        # One of the classes is missing from the model.
                        continue

                    bc_index, mult = c_map[p]

                    if ni > nj:
                        # The order in libsvm model is switched.
                        ni, nj = nj, ni

                    # Original class indices
                    c1_range = range(libsvm_class_indices[ni],
                                     libsvm_class_indices[ni + 1])
                    c2_range = range(libsvm_class_indices[nj],
                                     libsvm_class_indices[nj + 1])

                    coef1 = mult * coef_array[nj - 1, c1_range]
                    coef2 = mult * coef_array[ni, c2_range]

                    # Mapped class indices
                    c1_range = range(class_indices[i],
                                     class_indices[i + 1])
                    c2_range = range(class_indices[j],
                                     class_indices[j + 1])
                    if mult == -1.0:
                        c1_range, c2_range = c2_range, c1_range

                    # Keep only the support vectors with a non zero
                    # coefficient in this binary classifier.
                    nonzero1 = np.abs(coef1) > 0.0
                    nonzero2 = np.abs(coef2) > 0.0

                    coef1 = coef1[nonzero1]
                    coef2 = coef2[nonzero2]

                    c1_range = [sv_i for sv_i, nz in zip(c1_range, nonzero1)
                                if nz]
                    c2_range = [sv_i for sv_i, nz in zip(c2_range, nonzero2)
                                if nz]

                    coef.append(list(zip(coef1, c1_range)) + \
                                list(zip(coef2, c2_range)))

                    p += 1
        else:
            coef = [zip(self.__wrapped.coef[0],
                        range(len(self.support_vectors)))]

        return coef

    @property
    def rho(self):
        """Constant (bias) terms of the svm model.

        For classification models this is a list of bias terms
        for each binary 1vs1 classifier.

        For regression models it is a list with a single value.

        """
        rho = self.__wrapped.rho
        if is_discrete(self.class_var):
            c_map = self._get_libsvm_bin_classifier_map()
            return [rho[i] * m for i, m in c_map]
        else:
            return list(rho)

    @property
    def n_SV(self):
        """Number of support vectors for each class.
        For regression models this is `None`.

        """
        n_SV = self.__wrapped.n_SV
        if n_SV is not None:
            labels_map = self._get_libsvm_labels_map()
            # Classes missing from the model have no support vectors.
            return [n_SV[i] if i is not None else 0 for i in labels_map]
        else:
            return None

    # Pairwise probability is expressed as:
    #   1.0 / (1.0 + exp(dec_val[i] * prob_a[i] + prob_b[i]))
    # Since dec_val already changes signs if we switch the
    # classifier direction only prob_b must change signs
    @property
    def prob_a(self):
        """The ``prob_a`` values of the wrapped model, reordered to
        follow the binary classifier order used by this class, or
        `None` if the wrapped model has none.

        """
        if self.__wrapped.prob_a is not None:
            if isinstance(self.class_var, variable.Discrete):
                c_map = self._get_libsvm_bin_classifier_map()
                prob_a = self.__wrapped.prob_a
                return [prob_a[i] for i, _ in c_map]
            else:
                # A single value for regression
                return list(self.__wrapped.prob_a)
        else:
            return None

    @property
    def prob_b(self):
        """The ``prob_b`` values of the wrapped model, reordered (and
        negated where the classifier direction is switched), or `None`
        if the wrapped model has none.

        """
        if self.__wrapped.prob_b is not None:
            c_map = self._get_libsvm_bin_classifier_map()
            prob_b = self.__wrapped.prob_b
            # Change sign when changing the classifier direction
            return [prob_b[i] * m for i, m in c_map]
        else:
            return None

    def __call__(self, instance, what=Orange.core.GetValue):
        """Classify a new ``instance``
        """
        instance = Orange.data.Instance(self.domain, instance)
        return self.__wrapped(instance, what)

    def class_distribution(self, instance):
        """Return a class distribution for the ``instance``
        """
        instance = Orange.data.Instance(self.domain, instance)
        return self.__wrapped.class_distribution(instance)

    def get_decision_values(self, instance):
        """Return the decision values of the binary 1vs1
        classifiers for the ``instance`` (:class:`~Orange.data.Instance`).

        """
        instance = Orange.data.Instance(self.domain, instance)
        dec_values = self.__wrapped.get_decision_values(instance)
        if isinstance(self.class_var, variable.Discrete):
            # decision values are ordered by libsvm internal class values
            # i.e. the order of labels in the data
            c_map = self._get_libsvm_bin_classifier_map()
            return [dec_values[i] * m for i, m in c_map]
        else:
            return list(dec_values)

    def get_model(self):
        """Return a string representing the model in the libsvm model format.
        """
        return self.__wrapped.get_model()

    def _get_libsvm_labels_map(self):
        """Get the mapping from indices in `class_var.values` to
        internal libsvm labels. If a class value is missing from the libsvm
        model the returned corresponding entry is `None`.

        """
        if is_discrete(self.class_var):
            n_classes = len(self.class_var.values)
        else:
            # OneClass/Regression models
            n_classes = 1
        model_string = self.__wrapped.get_model()
        # Get the labels definition line from the model string
        # (the labels, if present, are always integer strings
        # indexing self.class_var.values)
        labels = [line for line in model_string.splitlines() \
                  if line.startswith("label")]
        labels = labels[0].split(" ")[1:] if labels else ["0"]
        labels = [int(label) for label in labels]
        labels_map = dict((cls_index, i) for i, cls_index in enumerate(labels))
        return [labels_map.get(i) for i in range(n_classes)]

    def _get_libsvm_bin_classifier_map(self):
        """Return the libsvm binary classifier mapping (due to label ordering).

        The result has one ``(libsvm_classifier_index, mult)`` tuple for
        every pair of classes present in the model, in `class_var.values`
        order; ``mult`` is -1 when libsvm stores the pair in the opposite
        order and 1 otherwise.

        :raises TypeError: if the class variable is not discrete.

        """
        if not is_discrete(self.class_var):
            raise TypeError("SVM classification model expected")

        label_map = self._get_libsvm_labels_map()
        bin_c_map = []
        n_class_values = len(self.class_var.values)
        nr_class = len([i for i in label_map if i is not None])
        for i in range(n_class_values - 1):
            for j in range(i + 1, n_class_values):
                ni = label_map[i]
                nj = label_map[j]
                mult = 1

                if ni is None or nj is None:
                    # One or both classes are missing from the libsvm model.
                    continue

                if ni > nj:
                    # The order in libsvm is switched
                    ni, nj = nj, ni
                    mult = -1

                # classifier index (explicit floor division keeps it an
                # integer; both halved products are always even)
                cls_index = nr_class * (nr_class - 1) // 2 - \
                            (nr_class - ni - 1) * (nr_class - ni - 2) // 2 - \
                            (nr_class - nj)
                bin_c_map.append((cls_index, mult))
        return bin_c_map

    def __reduce__(self):
        return SVMClassifier, (self.__wrapped,), dict(self.__dict__)

    def get_binary_classifier(self, c1, c2):
        """Return a binary classifier for classes `c1` and `c2`.

        :raises TypeError: if this is not a ``C_SVC``/``Nu_SVC`` model.
        :raises ValueError: if `c1` and `c2` are the same class.

        """
        if self.svm_type not in [SVMLearner.C_SVC, SVMLearner.Nu_SVC]:
            raise TypeError("SVM classification model expected.")

        c1 = int(self.class_var(c1))
        c2 = int(self.class_var(c2))

        n_class = len(self.class_var.values)

        if c1 == c2:
            raise ValueError("Different classes expected.")

        bin_class_var = Orange.feature.Discrete("%s vs %s" % \
                        (self.class_var.values[c1], self.class_var.values[c2]),
                        values=["0", "1"])

        mult = 1.0
        if c1 > c2:
            c1, c2 = c2, c1
            mult = -1.0

        # NOTE(review): this index counts all values of `class_var`,
        # while `self.coef`/`self.rho` skip pairs with a class missing
        # from the model - confirm the behaviour for such models.
        classifier_i = n_class * (n_class - 1) // 2 - \
                       (n_class - c1 - 1) * (n_class - c1 - 2) // 2 - \
                       (n_class - c2)

        coef = self.coef[classifier_i]

        coef1 = [(mult * alpha, sv_i) for alpha, sv_i in coef \
                 if int(self.support_vectors[sv_i].get_class()) == c1]
        coef2 = [(mult * alpha, sv_i) for alpha, sv_i in coef \
                 if int(self.support_vectors[sv_i].get_class()) == c2]

        rho = mult * self.rho[classifier_i]

        model = self._binary_libsvm_model_string(bin_class_var,
                                                 [coef1, coef2],
                                                 [rho])

        all_sv = [self.support_vectors[sv_i] \
                  for c, sv_i in coef1 + coef2]

        all_sv = Orange.data.Table(all_sv)

        svm_classifier_type = type(self.__wrapped)

        # Build args for svm_classifier_type constructor
        args = (bin_class_var, self.examples, all_sv, model)

        if issubclass(svm_classifier_type, _SVMClassifierSparse):
            args = args + (int(self.__wrapped.use_non_meta),)

        if self.kernel_type == kernels.Custom:
            args = args + (self.kernel_func,)

        native_classifier = svm_classifier_type(*args)
        return SVMClassifier(native_classifier)

    def _binary_libsvm_model_string(self, class_var, coef, rho):
        """Return a libsvm formatted model string for binary classifier

        :param class_var: class variable of the binary classifier
        :param coef: list of per class lists of
            (coef, support_vector_index) tuples
        :param rho: list of bias terms

        :raises TypeError: if the class variable is not discrete.

        """
        import itertools

        if not isinstance(self.class_var, variable.Discrete):
            raise TypeError("SVM classification model expected")

        model = []

        # Take the model up to nr_classes
        libsvm_model = self.__wrapped.get_model()
        for line in libsvm_model.splitlines():
            if line.startswith("nr_class"):
                break
            else:
                model.append(line.rstrip())
        nr_class = len(class_var.values)
        model.append("nr_class %i" % nr_class)
        model.append("total_sv %i" % sum(len(c) for c in coef))
        model.append("rho " + " ".join(str(r) for r in rho))
        model.append("label " + " ".join(str(i) for i in range(nr_class)))
        # No probA and probB

        model.append("nr_sv " + " ".join(str(len(c)) for c in coef))
        model.append("SV")

        def instance_to_svm(inst):
            # "index:value" pairs (1 based) for defined, non zero features.
            values = [(i, float(inst[v])) \
                      for i, v in enumerate(inst.domain.attributes) \
                      if not inst[v].is_special() and float(inst[v]) != 0.0]
            return " ".join("%i:%f" % (i + 1, v) for i, v in values)

        def sparse_instance_to_svm(inst):
            non_meta = []
            base = 1
            if self.__wrapped.use_non_meta:
                non_meta = [instance_to_svm(inst)]
                base += len(inst.domain)
            metas = []
            for m_id, value in sorted(inst.get_metas().items(), reverse=True):
                if not value.isSpecial() and float(value) != 0:
                    metas.append("%i:%f" % (base - m_id, float(value)))
            return " ".join(non_meta + metas)

        if isinstance(self.__wrapped, _SVMClassifierSparse):
            converter = sparse_instance_to_svm
        else:
            converter = instance_to_svm

        if self.kernel_type == kernels.Custom:
            SV = libsvm_model.split("SV\n", 1)[1]
            # Get the sv indices (the last entry in the SV lines)
            indices = [int(s.split(":")[-1]) for s in SV.splitlines() \
                       if s.strip()]

            # Reorder the indices
            label_map = self._get_libsvm_labels_map()
            start = 0
            reordered_indices = []
            for n in self.__wrapped.n_SV:
                reordered_indices.append(indices[start: start + n])
                start += n
            # Skip classes missing from the model (`None` entries), the
            # same way `__init__` reorders `support_vectors`; indexing
            # the list with `None` would raise a TypeError.
            reordered_indices = [reordered_indices[i] for i in label_map
                                 if i is not None]
            indices = reduce(add, reordered_indices)

            for (c, sv_i) in itertools.chain(*coef):
                model.append("%f 0:%i" % (c, indices[sv_i]))
        else:
            for (c, sv_i) in itertools.chain(*coef):
                model.append(
                    "%f %s" % (c, converter(self.support_vectors[sv_i]))
                )

        model.append("")
        return "\n".join(model)


# Keep the old camelCase member names working.
SVMClassifier = Orange.utils.deprecated_members({
    "classDistribution": "class_distribution",
    "getDecisionValues": "get_decision_values",
    "getModel" : "get_model",
    }, wrap_methods=[])(SVMClassifier)


# Backwards compatibility (pickling)
SVMClassifierWrapper = SVMClassifier

[10300] | 699 | |

[10955] | 700 | |

class SVMLearnerSparse(SVMLearner):

    """
    A :class:`SVMLearner` that learns from data stored in meta
    attributes. Meta attributes do not need to be registered with the
    data set domain, or present in all data instances.
    """

    @Orange.utils.deprecated_keywords({"useNonMeta": "use_non_meta"})
    def __init__(self, **kwds):
        """Initialize as :class:`SVMLearner` and replace the native
        learner with an ``Orange.core.SVMLearnerSparse`` built from the
        same keyword arguments. ``use_non_meta`` defaults to `False`.

        """
        SVMLearner.__init__(self, **kwds)
        self.use_non_meta = kwds.get("use_non_meta", False)
        self.learner = Orange.core.SVMLearnerSparse(**kwds)

    def _normalize(self, data):
        """Return ``data`` translated to a continuized domain when
        ``use_non_meta`` is set; otherwise return ``data`` unchanged.

        """
        if not self.use_non_meta:
            return data

        continuizer = preprocess.DomainContinuizer()
        continuizer.class_treatment = continuizer.Ignore
        continuizer.continuous_treatment = continuizer.NormalizeBySpan
        continuizer.multinomial_treatment = continuizer.NValues
        continuized_domain = continuizer(data)
        return data.translate(continuized_domain)

[8042] | 724 | |

[10955] | 725 | |

class SVMLearnerEasy(SVMLearner):
    """A class derived from :obj:`SVMLearner` that automatically
    scales the data and performs parameter optimization using
    :func:`SVMLearner.tune_parameters`. The procedure is similar to
    that implemented in easy.py script from the LibSVM package.

    """

    def __init__(self, folds=4, verbose=0, **kwargs):
        """
        :param folds: the number of folds to use in cross validation
        :type folds: int

        :param verbose: verbosity of the tuning procedure.
        :type verbose: int

        ``kwargs`` is passed to :class:`SVMLearner`

        """
        SVMLearner.__init__(self, **kwargs)
        self.folds = folds
        self.verbose = verbose

        # The learner whose parameters are tuned in `learn_classifier`.
        self.learner = SVMLearner(**kwargs)

    def learn_classifier(self, data):
        """Continuize and scale `data`, then tune and train ``self.learner``.

        The parameter grid depends on ``self.svm_type`` (``nu`` or ``C``)
        and on ``self.kernel_type`` (``gamma`` is added for type ``2``).

        :param data: training data
        :type data: Orange.data.Table

        :return: the result of calling ``orngWrap.TuneMParameters`` on the
            transformed data

        """
        # Use the snake_case attribute names, as the other learners in this
        # module do, instead of the legacy camelCase aliases.
        transformer = preprocess.DomainContinuizer()
        transformer.multinomial_treatment = transformer.NValues
        transformer.continuous_treatment = transformer.NormalizeBySpan
        transformer.class_treatment = transformer.Ignore
        newdomain = transformer(data)
        newexamples = data.translate(newdomain)

        parameters = []
        self.learner.normalization = False  # Normalization already done

        # NOTE(review): 1 and 4 are presumably the nu-parameterised svm
        # types (Nu_SVC and Nu_SVR) -- confirm against SVMLearner constants.
        if self.svm_type in [1, 4]:
            if self.svm_type == SVMLearner.Nu_SVC:
                # Stay strictly below the largest nu reported for the data.
                nu_limit = max(self.max_nu(newexamples) - 1e-7, 0.0)
            else:
                nu_limit = 1.0
            nu_grid = [i / 10.0 for i in range(1, 9) if i / 10.0 < nu_limit]
            parameters.append(("nu", nu_grid + [nu_limit]))
        else:
            parameters.append(("C", [2 ** a for a in range(-5, 15, 2)]))

        # NOTE(review): kernel type 2 is presumably the RBF kernel -- confirm.
        if self.kernel_type == 2:
            parameters.append(
                ("gamma", [2 ** a for a in range(-5, 5, 2)] + [0])
            )

        import orngWrap
        tunedLearner = orngWrap.TuneMParameters(learner=self.learner,
                                                parameters=parameters,
                                                folds=self.folds)

        return tunedLearner(newexamples, verbose=self.verbose)

[8042] | 783 | |

[10955] | 784 | |

class SVMLearnerSparseEasy(SVMLearnerEasy):
    """A :class:`SVMLearnerEasy` whose tuned learner is a
    :class:`SVMLearnerSparse`.
    """

    def __init__(self, folds=4, verbose=0, **kwargs):
        SVMLearnerEasy.__init__(
            self, folds=folds, verbose=verbose, **kwargs
        )
        # Replace the learner set by SVMLearnerEasy.__init__ with a sparse
        # one built from the same keyword arguments.
        sparse_learner = SVMLearnerSparse(**kwargs)
        self.learner = sparse_learner

[8042] | 790 | |

[9054] | 791 | |

[10676] | 792 | """ |

793 | LIBLINEAR learners interface | |

794 | """ | |

[10694] | 795 | |

[10955] | 796 | |

class LinearSVMLearner(Orange.core.LinearLearner):
    """Train a linear SVM model."""

    L2R_L2LOSS_DUAL = Orange.core.LinearLearner.L2R_L2Loss_SVC_Dual
    L2R_L2LOSS = Orange.core.LinearLearner.L2R_L2Loss_SVC
    L2R_L1LOSS_DUAL = Orange.core.LinearLearner.L2R_L1Loss_SVC_Dual
    L1R_L2LOSS = Orange.core.LinearLearner.L1R_L2Loss_SVC

    __new__ = _orange__new__(base=Orange.core.LinearLearner)

    def __init__(self, solver_type=L2R_L2LOSS_DUAL, C=1.0, eps=0.01,
                 bias=1.0, normalization=True,
                 multinomial_treatment=DomainContinuizer.NValues, **kwargs):
        """
        :param solver_type: One of the following class constants:
            ``L2R_L2LOSS_DUAL``, ``L2R_L2LOSS``,
            ``L2R_L1LOSS_DUAL``, ``L1R_L2LOSS``

            The first part (``L2R`` or ``L1R``) is the regularization term
            on the weight vector (squared or absolute norm respectively),
            the ``L1LOSS`` or ``L2LOSS`` indicate absolute or squared
            loss function. ``DUAL`` means the optimization problem is
            solved in the dual space (for more information see the
            documentation on `LIBLINEAR`_).

        :param C: Regularization parameter (default 1.0)
        :type C: float

        :param eps: Stopping criteria (default 0.01)
        :type eps: float

        :param bias: If non negative then each instance is appended a constant
            bias term (default 1.0).

        :type bias: float

        :param normalization: Normalize the input data into range [0..1] prior
            to learning (default ``True``)
        :type normalization: bool

        :param multinomial_treatment: Defines how to handle multinomial
            features for learning. It can be one of the
            :class:`~.DomainContinuizer` `multinomial_treatment`
            constants (default: `DomainContinuizer.NValues`).

        :type multinomial_treatment: int

        .. versionadded:: 2.6.1
            Added `multinomial_treatment`

        .. note:: By default if the training data contains discrete features
            they are replaced by indicator columns one for each value of the
            feature regardless of the value of `normalization`. This is
            different from :class:`SVMLearner` where this is done only if
            `normalization` is ``True``.

        Any additional keyword arguments are set as attributes on the
        learner.

        Example

            >>> linear_svm = LinearSVMLearner(
            ...     solver_type=LinearSVMLearner.L1R_L2LOSS,
            ...     C=2.0)
            ...

        """
        self.solver_type = solver_type
        self.eps = eps
        self.C = C
        self.bias = bias
        self.normalization = normalization
        self.multinomial_treatment = multinomial_treatment

        # Extra keywords are applied last, so they can override the
        # attributes set above.
        for name, val in kwargs.items():
            setattr(self, name, val)

        if self.solver_type not in [self.L2R_L2LOSS_DUAL, self.L2R_L2LOSS,
                                    self.L2R_L1LOSS_DUAL, self.L1R_L2LOSS]:
            # The adjacent literals are concatenated, so each piece must
            # carry its own separating space.
            warnings.warn(
                "Deprecated 'solver_type', use "
                "'Orange.classification.logreg.LibLinearLogRegLearner' "
                "to build logistic regression models using LIBLINEAR.",
                DeprecationWarning
            )

    def __call__(self, data, weight_id=None):
        """Train on `data` and return the result of the base learner.

        The data is first translated to a continuized domain if it has
        discrete attributes or if ``normalization`` is set; continuous
        features are scaled by span only when ``normalization`` is set.

        :raises TypeError: if the class variable is not discrete.

        """
        if not isinstance(data.domain.class_var, variable.Discrete):
            raise TypeError("Can only learn a discrete class.")

        if data.domain.has_discrete_attributes(False) or self.normalization:
            dc = DomainContinuizer()
            dc.multinomial_treatment = self.multinomial_treatment
            dc.class_treatment = dc.Ignore
            dc.continuous_treatment = \
                dc.NormalizeBySpan if self.normalization else dc.Leave
            c_domain = dc(data)
            data = data.translate(c_domain)

        return super(LinearSVMLearner, self).__call__(data, weight_id)

LinearLearner = LinearSVMLearner

896 | ||

[10955] | 897 | |

class MultiClassSVMLearner(Orange.core.LinearLearner):
    """ Multi-class SVM (Crammer and Singer) from the `LIBLINEAR`_ library.
    """
    __new__ = _orange__new__(base=Orange.core.LinearLearner)

    def __init__(self, C=1.0, eps=0.01, bias=1.0,
                 normalization=True,
                 multinomial_treatment=DomainContinuizer.NValues,
                 **kwargs):
        """\
        :param C: Regularization parameter (default 1.0)
        :type C: float

        :param eps: Stopping criteria (default 0.01)
        :type eps: float

        :param bias: If non negative then each instance is appended a constant
            bias term (default 1.0).

        :type bias: float

        :param normalization: Normalize the input data prior to learning
            (default True)
        :type normalization: bool

        :param multinomial_treatment: Defines how to handle multinomial
            features for learning. It can be one of the
            :class:`~.DomainContinuizer` `multinomial_treatment`
            constants (default: `DomainContinuizer.NValues`).

        :type multinomial_treatment: int

        .. versionadded:: 2.6.1
            Added `multinomial_treatment`

        """
        self.C = C
        self.eps = eps
        self.bias = bias
        self.normalization = normalization
        self.multinomial_treatment = multinomial_treatment

        # Additional keywords become attributes of the learner.
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])

        # Assigned after the keyword loop, so the solver is always MCSVM_CS.
        self.solver_type = self.MCSVM_CS

    def __call__(self, data, weight_id=None):
        """Train on `data`, continuizing it first when it has discrete
        attributes or when ``normalization`` is set.

        :raises TypeError: if the class variable is not discrete.

        """
        class_is_discrete = isinstance(data.domain.class_var,
                                       variable.Discrete)
        if not class_is_discrete:
            raise TypeError("Can only learn a discrete class.")

        if data.domain.has_discrete_attributes(False) or self.normalization:
            continuizer = DomainContinuizer()
            continuizer.multinomial_treatment = self.multinomial_treatment
            continuizer.class_treatment = continuizer.Ignore
            if self.normalization:
                continuizer.continuous_treatment = continuizer.NormalizeBySpan
            else:
                continuizer.continuous_treatment = continuizer.Leave
            data = data.translate(continuizer(data))

        return super(MultiClassSVMLearner, self).__call__(data, weight_id)

[9054] | 958 | |

959 | #TODO: Unified way to get attr weights for linear SVMs. | |

[8042] | 960 | |

[10955] | 961 | |

def get_linear_svm_weights(classifier, sum=True):
    """Extract attribute weights from the linear SVM classifier.

    For multi class classification, the result depends on the argument
    :obj:`sum`. If ``True`` (default) the function returns a single
    dictionary holding, for each attribute, the square root of the sum of
    its squared weights over all binary one vs. one classifiers. If
    :obj:`sum` is ``False`` it returns a list of weight dictionaries,
    one for each individual binary classifier (in the order of
    [class1 vs class2, class1 vs class3 ... class2 vs class3 ...]).

    For classifiers whose ``svm_type`` is neither ``C_SVC`` nor ``Nu_SVC``
    a single dictionary of weights is returned and :obj:`sum` is ignored.

    Only attributes (including meta attributes of the support vectors)
    of continuous type contribute; unknown values count as ``0.0``.

    """

    def to_float(val):
        # Unknown (special) values do not contribute to the weight.
        return float(val) if not val.isSpecial() else 0.0

    def weights_from(coefs, support_vectors):
        # Accumulate ``coef * value`` per continuous attribute over all
        # (coef, support vector index) pairs.
        w = {}
        for coef, sv_ind in coefs:
            sv = support_vectors[sv_ind]
            attributes = support_vectors.domain.attributes + \
                sv.getmetas(False, Orange.feature.Descriptor).keys()
            for attr in attributes:
                if attr.varType == Orange.feature.Type.Continuous:
                    contribution = coef * to_float(sv[attr])
                    if attr in w:
                        w[attr] += contribution
                    else:
                        w[attr] = contribution
        return w

    if classifier.svm_type in [SVMLearner.C_SVC, SVMLearner.Nu_SVC]:
        weights = []
        n_classes = len(classifier.class_var.values)
        for i in range(n_classes - 1):
            for j in range(i + 1, n_classes):
                # Get the coef values from the binary sub-classifier.
                # Easier than using the full coef matrix (due to libsvm
                # internal class reordering)
                bin_classifier = classifier.get_binary_classifier(i, j)
                weights.append(
                    weights_from(bin_classifier.coef[0],
                                 bin_classifier.support_vectors)
                )

        if sum:
            scores = defaultdict(float)
            for w in weights:
                for attr, w_attr in w.items():
                    scores[attr] += w_attr ** 2
            weights = dict((attr, math.sqrt(score))
                           for attr, score in scores.items())
    else:
        weights = weights_from(classifier.coef[0],
                               classifier.support_vectors)

    return weights

getLinearSVMWeights = get_linear_svm_weights

1029 | ||

[10955] | 1030 | |

def example_weighted_sum(example, weights):
    """Return the weighted sum of the values of `example`.

    :param example: object indexable by the keys of `weights`; every
        looked up value is converted with ``float``
    :param weights: mapping from attribute to its weight

    :return: the sum of ``float(example[attr]) * weight`` over all items
        of `weights` (``0`` when `weights` is empty)

    """
    # Named `total` so the builtin ``sum`` is not shadowed.
    total = 0
    for attr, weight in weights.items():
        total += float(example[attr]) * weight
    return total

exampleWeightedSum = example_weighted_sum

1038 | ||

[10955] | 1039 | |

class ScoreSVMWeights(Orange.feature.scoring.Score):
    """
    Score a feature using squared weights of a linear SVM model.

    Example:

        >>> table = Orange.data.Table("vehicle.tab")
        >>> score = Orange.classification.svm.ScoreSVMWeights()
        >>> svm_scores = [(score(f, table), f) for f in table.domain.features]
        >>> for feature_score, feature in sorted(svm_scores, reverse=True):
        ...     print "%-35s: %.3f" % (feature.name, feature_score)
        pr.axis aspect ratio               : 44.263
        kurtosis about major axis          : 42.593
        max.length rectangularity          : 39.377
        radius ratio                       : 28.741
        skewness about major axis          : 26.682
        hollows ratio                      : 20.993
        compactness                        : 20.085
        elongatedness                      : 17.410
        distance circularity               : 14.542
        scaled radius of gyration          : 12.584
        max.length aspect ratio            : 10.686
        scatter ratio                      : 10.574
        scaled variance along minor axis   : 10.049
        circularity                        : 8.360
        pr.axis rectangularity             : 7.473
        scaled variance along major axis   : 5.731
        skewness about minor axis          : 1.368
        kurtosis about minor axis          : 0.690


    """

    handles_discrete = True
    handles_continuous = True
    computes_thresholds = False
    needs = Orange.feature.scoring.Score.Generator

    def __new__(cls, attr=None, data=None, weight_id=None, **kwargs):
        # Called with both a feature and data, the class acts as a function
        # and returns the score directly instead of a scorer instance.
        self = Orange.feature.scoring.Score.__new__(cls, **kwargs)
        if data is None or attr is None:
            return self
        self.__init__(**kwargs)
        return self.__call__(attr, data, weight_id)

    def __reduce__(self):
        return ScoreSVMWeights, (), dict(self.__dict__)

    def __init__(self, learner=None, **kwargs):
        """
        :param learner: Learner used for weight estimation
            (by default
            ``LinearSVMLearner(solver_type=L2R_L2LOSS_DUAL, C=1.0)``
            will be used for classification problems and
            ``SVMLearner(svm_type=Epsilon_SVR, kernel_type=Linear, C=1.0, p=0.25)``
            for regression problems).

        :type learner: Orange.core.LinearLearner

        """
        self.learner = learner
        # State of the most recent scoring run; reused by `__call__` while
        # the same data object with the same checksum is scored again.
        self._cached_data = None
        self._cached_data_crc = None
        self._cached_weights = None
        self._cached_classifier = None

    def __call__(self, attr, data, weight_id=None):
        """Return the score of feature `attr` on `data`.

        :raises ValueError: if `attr` is not among ``data.domain.attributes``
        :raises TypeError: if no learner was given and the class variable
            is neither discrete nor continuous

        """
        if attr not in data.domain.attributes:
            raise ValueError("Feature %r is not from the domain." % attr)

        learner = self.learner
        if learner is None:
            # Pick a default learner matching the class variable type.
            class_var = data.domain.class_var
            if isinstance(class_var, variable.Discrete):
                learner = LinearSVMLearner(
                    solver_type=LinearSVMLearner.L2R_L2LOSS_DUAL,
                    C=1.0)
            elif isinstance(class_var, variable.Continuous):
                learner = SVMLearner(svm_type=SVMLearner.Epsilon_SVR,
                                     kernel_type=kernels.Linear,
                                     C=1.0, p=0.25)
            else:
                raise TypeError("Cannot handle the class variable type %r" % \
                                type(class_var))

        crc = data.checksum()
        cache_is_valid = data is self._cached_data and \
            crc == self._cached_data_crc
        if not cache_is_valid:
            # Train once and remember the weights of all features.
            classifier = learner(data, weight_id)
            self._cached_data = data
            self._cached_data_crc = data.checksum()
            self._cached_classifier = classifier
            self._cached_weights = self._extract_weights(
                classifier, data.domain.attributes)

        return self._cached_weights.get(attr, 0.0)

    def _extract_weights(self, classifier, original_features):
        """Extract weights from an svm classifier (``SVMClassifier`` or a
        ``LinearClassifier`` instance) and map them back onto
        `original_features`.

        :raises TypeError: for any other classifier type
        :raises ValueError: for an original feature with no known weight

        """
        import numpy as np
        if isinstance(classifier, SVMClassifier):
            weights = get_linear_svm_weights(classifier, sum=True)
            if isinstance(classifier.class_var, variable.Continuous):
                # The weights are in the original non squared form
                weights = dict((f, w ** 2) for f, w in weights.items())
        elif isinstance(classifier, Orange.core.LinearClassifier):
            squared = np.sum(np.array(classifier.weights) ** 2, axis=0)
            weights = dict(zip(classifier.domain.attributes, squared))
        else:
            raise TypeError("Don't know how to use classifier type %r" % \
                            type(classifier))

        # collect dummy variables that were created for discrete features
        sources = self._collect_source(weights.keys())
        source_weights = dict.fromkeys(original_features, 0.0)
        for f in original_features:
            if f in weights:
                source_weights[f] = weights[f]
            elif f in sources:
                # Use the average weight of the derived dummy variables
                dummy_weights = [weights[d] for d in sources[f]]
                source_weights[f] = np.average(dummy_weights)
            else:
                raise ValueError(f)

        return source_weights

    def _collect_source(self, vars):
        """ Given a list of variables ``vars``, return a mapping from source
        variables (``source_variable`` or ``get_value_from.variable`` members)
        back to the variables in ``vars``.

        """
        source = defaultdict(list)
        for var in vars:
            if var.source_variable:
                origin = var.source_variable
            elif isinstance(var.get_value_from, Orange.core.ClassifierFromVar):
                origin = var.get_value_from.variable
            elif isinstance(var.get_value_from, Orange.core.ImputeClassifier):
                origin = var.get_value_from.classifier_from_var.variable
            else:
                # No known source: the variable stands for itself.
                origin = var
            source[origin].append(var)
        return dict(source)

MeasureAttribute_SVMWeights = ScoreSVMWeights

[9011] | 1192 | |

[10955] | 1193 | |

class RFE(object):
    """
    Iterative feature elimination based on weights computed by a
    linear SVM.

    Example:

        >>> table = Orange.data.Table("promoters.tab")
        >>> svm_l = Orange.classification.svm.SVMLearner(
        ...     kernel_type=Orange.classification.svm.kernels.Linear)
        ...
        >>> rfe = Orange.classification.svm.RFE(learner=svm_l)
        >>> data_with_subset_of_features = rfe(table, 10)
        >>> data_with_subset_of_features.domain
        [p-45, p-36, p-35, p-34, p-33, p-31, p-18, p-12, p-10, p-04, y]

    """

    def __init__(self, learner=None):
        """
        :param learner: A linear svm learner for use for scoring (this
            learner is passed to :class:`ScoreSVMWeights`)

        :type learner: :class:`LinearSVMLearner` or :class:`SVMLearner` with
            linear kernel

        .. seealso:: :class:`ScoreSVMWeights`

        """
        self.learner = learner

    @Orange.utils.deprecated_keywords({"progressCallback": "progress_callback",
                                       "stopAt": "stop_at"})
    def get_attr_scores(self, data, stop_at=0, progress_callback=None):
        """Return a dictionary mapping attributes to scores.
        The score of an attribute is its position in the order of removal
        (``0`` for the attribute removed first), so attributes eliminated
        later get higher scores. Attributes still present when only
        `stop_at` attributes remain are not included.

        :param data: Data
        :type data: Orange.data.Table

        :param stop_at: stop eliminating once no more than this many
            attributes remain
        :type stop_at: int

        :param progress_callback: optional callable invoked with a float
            once per elimination round

        """
        step = 1
        attrs = data.domain.attributes
        attr_scores = {}
        scorer = ScoreSVMWeights(learner=self.learner)

        while len(attrs) > stop_at:
            scores = [(scorer(attr, data), attr) for attr in attrs]
            if progress_callback:
                progress_callback(100. * step / (len(attrs) - stop_at))
            scores = sorted(scores)
            # Remove a shrinking fraction of the remaining attributes each
            # round (1/2, 1/3, ...), but always at least one.
            num_to_remove = max(int(len(attrs) * 1.0 / (step + 1)), 1)
            for s, attr in scores[:num_to_remove]:
                attr_scores[attr] = len(attr_scores)
            attrs = [attr for s, attr in scores[num_to_remove:]]
            if attrs:
                data = data.select(attrs + [data.domain.class_var])
            step += 1
        return attr_scores

    @Orange.utils.deprecated_keywords(
        {"numSelected": "num_selected",
         "progressCallback": "progress_callback"})
    def __call__(self, data, num_selected=20, progress_callback=None):
        """Return a new dataset with only `num_selected` best scoring
        attributes.

        :param data: Data
        :type data: Orange.data.Table

        :param num_selected: number of features to preserve; values of
            ``0`` or less preserve no features, values larger than the
            number of features preserve all of them
        :type num_selected: int

        :param progress_callback: passed on to :meth:`get_attr_scores`

        """
        scores = self.get_attr_scores(data, progress_callback=progress_callback)
        ranked = sorted(scores.items(), key=lambda item: item[1])

        # ``ranked[-num_selected:]`` would return *all* items for
        # ``num_selected == 0``; compute the start index explicitly.
        first_kept = max(len(ranked) - num_selected, 0)
        selected = dict(ranked[first_kept:])

        # Keep the original attribute order of the domain.
        attrs = [attr for attr in data.domain.attributes if attr in selected]
        domain = Orange.data.Domain(attrs, data.domain.class_var)
        domain.addmetas(data.domain.getmetas())
        data = Orange.data.Table(domain, data)
        return data


RFE = Orange.utils.deprecated_members({
    "getAttrScores": "get_attr_scores"},
    wrap_methods=["get_attr_scores", "__call__"])(RFE)

[8042] | 1280 | |

[10955] | 1281 | |

def example_table_to_svm_format(table, file):
    """Deprecated: issue a ``DeprecationWarning`` and delegate to
    :func:`table_to_svm_format` with the same arguments.
    """
    deprecation_message = "Deprecated. Use table_to_svm_format"
    warnings.warn(deprecation_message, DeprecationWarning)
    table_to_svm_format(table, file)

exampleTableToSVMFormat = example_table_to_svm_format

1287 | ||

[10955] | 1288 | |

def table_to_svm_format(data, file):
    """Save :obj:`Orange.data.Table` to a format used by LibSVM.

    Each instance is written as one line: the class value followed by
    ``index:value`` pairs for its known continuous or discrete attribute
    and meta attribute values (indices start at 1).

    :param data: Data
    :type data: Orange.data.Table
    :param file: file pointer
    :type file: file

    """
    exported_types = [Orange.feature.Type.Continuous,
                      Orange.feature.Type.Discrete]
    candidates = data.domain.attributes + data.domain.getmetas().values()
    attrs = [attr for attr in candidates if attr.varType in exported_types]
    cv = data.domain.classVar

    for ex in data:
        # Discrete classes are written as integers, all others as floats.
        if cv.varType == Orange.feature.Type.Discrete:
            label = int(ex[cv])
        else:
            label = float(ex[cv])
        file.write(str(label))

        for i, attr in enumerate(attrs):
            value = ex[attr]
            if value.isSpecial():
                # Unknown values are left out of the sparse line.
                continue
            file.write(" %d:%s" % (i + 1, float(value)))
        file.write("\n")

tableToSVMFormat = table_to_svm_format

