| 1 | """ |
|---|
| 2 | .. index:: svm |
|---|
| 3 | |
|---|
| 4 | ======================= |
|---|
| 5 | Support Vector Machines |
|---|
| 6 | ======================= |
|---|
| 7 | |
|---|
| 8 | .. index:: Support Vector Machines Classification |
|---|
| 9 | |
|---|
| 10 | Interface to the LibSVM library (LIBSVM: a library for support vector machines, |
|---|
| 11 | http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.ps.gz). |
|---|
| 12 | |
|---|
| 13 | .. note:: On some data-sets SVM can perform very badly. It is a known fact that |
|---|
| 14 | SVMs can be very sensitive to the proper choice of the parameters. |
|---|
| 15 | If you are having problems with the learner's accuracy, try scaling the |
|---|
| 16 | data and using different parameters or choose an easier approach |
|---|
| 17 | and use the `SVMLearnerEasy` class which does this automatically |
|---|
| 18 | (it is similar to the easy.py script in the LibSVM distribution). |
|---|
| 19 | |
|---|
| 20 | .. autoclass:: Orange.classification.svm.SVMLearner |
|---|
| 21 | :members: |
|---|
| 22 | |
|---|
| 23 | .. autoclass:: Orange.classification.svm.SVMLearnerSparse |
|---|
| 24 | :members: |
|---|
| 25 | |
|---|
| 26 | .. autoclass:: Orange.classification.svm.SVMLearnerEasy |
|---|
| 27 | :members: |
|---|
| 28 | |
|---|
| 29 | Useful functions |
|---|
| 30 | ================= |
|---|
| 31 | |
|---|
| 32 | .. automethod:: Orange.classification.svm.maxNu |
|---|
| 33 | |
|---|
| 34 | .. automethod:: Orange.classification.svm.getLinearSVMWeights |
|---|
| 35 | |
|---|
| 36 | .. automethod:: Orange.classification.svm.exampleTableToSVMFormat |
|---|
| 37 | |
|---|
| 38 | |
|---|
| 39 | Kernel Wrappers |
|---|
| 40 | --------------- |
|---|
| 41 | |
|---|
| 42 | .. autoclass:: Orange.classification.svm.kernels.KernelWrapper |
|---|
| 43 | :members: |
|---|
| 44 | |
|---|
| 45 | .. autoclass:: Orange.classification.svm.kernels.DualKernelWrapper |
|---|
| 46 | :members: |
|---|
| 47 | |
|---|
| 48 | .. autoclass:: Orange.classification.svm.kernels.RBFKernelWrapper |
|---|
| 49 | :members: |
|---|
| 50 | |
|---|
| 51 | .. autoclass:: Orange.classification.svm.kernels.PolyKernelWrapper |
|---|
| 52 | :members: |
|---|
| 53 | |
|---|
| 54 | .. autoclass:: Orange.classification.svm.kernels.AdditionKernelWrapper |
|---|
| 55 | :members: |
|---|
| 56 | |
|---|
| 57 | .. autoclass:: Orange.classification.svm.kernels.MultiplicationKernelWrapper |
|---|
| 58 | :members: |
|---|
| 59 | |
|---|
| 60 | .. autoclass:: Orange.classification.svm.kernels.CompositeKernelWrapper |
|---|
| 61 | :members: |
|---|
| 62 | |
|---|
| 63 | .. autoclass:: Orange.classification.svm.kernels.SparseLinKernel |
|---|
| 64 | :members: |
|---|
| 65 | |
|---|
| 66 | .. autoclass:: Orange.classification.svm.kernels.BagOfWords |
|---|
| 67 | :members: |
|---|
| 68 | |
|---|
| 69 | Example (`svm-custom-kernel.py`_ uses: `iris.tab`_) |
|---|
| 70 | |
|---|
| 71 | .. literalinclude:: code/svm-custom-kernel.py |
|---|
| 72 | |
|---|
| 73 | |
|---|
| 74 | SVM derived feature weights |
|---|
| 75 | --------------------------- |
|---|
| 76 | |
|---|
| 77 | .. autoclass:: Orange.classification.svm.MeasureAttribute_SVMWeights |
|---|
| 78 | :members: |
|---|
| 79 | |
|---|
| 80 | |
|---|
| 81 | SVM based Recursive Feature Elimination |
|---|
| 82 | --------------------------------------- |
|---|
| 83 | |
|---|
| 84 | .. autoclass:: Orange.classification.svm.RFE |
|---|
| 85 | :members: |
|---|
| 86 | |
|---|
| 87 | |
|---|
| 88 | .. _svm-linear-weights.py: code/svm-linear-weights.py |
|---|
| 89 | .. _svm-custom-kernel.py: code/svm-custom-kernel.py |
|---|
| 90 | .. _svm-easy.py: code/svm-easy.py |
|---|
| 91 | .. _brown-selected.tab: code/brown-selected.tab |
|---|
| 92 | .. _iris.tab: code/iris.tab |
|---|
| 93 | .. _vehicle.tab: code/vehicle.tab |
|---|
| 94 | |
|---|
| 95 | """ |
|---|
| 96 | import math |
|---|
| 97 | |
|---|
| 98 | from collections import defaultdict |
|---|
| 99 | |
|---|
| 100 | import Orange.core |
|---|
| 101 | import kernels |
|---|
| 102 | |
|---|
| 103 | from Orange.core import SVMLearner as _SVMLearner |
|---|
| 104 | from Orange.core import SVMLearnerSparse as _SVMLearnerSparse |
|---|
| 105 | from Orange.core import LinearClassifier, \ |
|---|
| 106 | LinearLearner, \ |
|---|
| 107 | SVMClassifier, \ |
|---|
| 108 | SVMClassifierSparse |
|---|
| 109 | |
|---|
| 110 | # ORANGE Support Vector Machines |
|---|
| 111 | # This module was written by Ales Erjavec |
|---|
| 112 | # and supersedes an earlier one written by Alex Jakulin (jakulin@acm.org), |
|---|
| 113 | # based on: Chih-Chung Chang and Chih-Jen Lin's |
|---|
| 114 | # LIBSVM : a library for support vector machines |
|---|
| 115 | # (http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.ps.gz) |
|---|
| 116 | |
|---|
| 117 | #from Orange.misc import _orange__new__ |
|---|
| 118 | |
|---|
def _orange__new__(base=Orange.core.Learner):
    """Return a dual-purpose ('schizophrenic') ``__new__`` for orange classes.

    The returned function allocates the instance through ``base.__new__``.
    Called without data it returns that instance, like a normal constructor.
    Called with data it initialises the instance, calls it on the data and
    returns the result of that call instead of the instance.

    :param base: base orange class (default Orange.core.Learner)
    :type base: type

    Example::

        class NewOrangeLearner(Orange.core.Learner):
            __new__ = _orange__new__(Orange.core.Learner)

    """
    from functools import wraps

    @wraps(base.__new__)
    def _orange__new_wrapped(cls, data=None, **kwargs):
        instance = base.__new__(cls, **kwargs)
        if not data:
            return instance
        # Python skips the automatic __init__ when __new__ returns something
        # that is not an instance of cls, so initialise explicitly first.
        instance.__init__(**kwargs)
        return instance.__call__(data)

    return _orange__new_wrapped
| 140 | |
|---|
def maxNu(examples):
    """Return the maximum nu parameter for Nu_SVC support vector learning
    for the given example table.

    For every pair of class counts (n1, n2) that are both non-zero the bound
    ``2 * min(n1, n2) / (n1 + n2)`` is computed; the smallest bound is
    returned, and the result is never larger than 1.0.

    """
    counts = list(Orange.core.Distribution(examples.domain.classVar,
                                           examples))
    # 1.0 is the upper limit and also the answer when no pair qualifies.
    bounds = [1.0]
    for position, n1 in enumerate(counts):
        if n1 == 0:
            continue
        for n2 in counts[position + 1:]:
            if n2 != 0:
                bounds.append(2.0 * min(n1, n2) / (n1 + n2))
    return min(bounds)
| 154 | |
|---|
class SVMLearner(_SVMLearner):
    """Support vector machine learner wrapping the core LibSVM learner.

    The constructor arguments are stored on the instance and copied onto an
    internal core learner (``self.learner``) every time the learner is
    called on data.

    """

    __new__ = _orange__new__(_SVMLearner)

    # Re-export the SVM type constants of the core learner.
    C_SVC = _SVMLearner.C_SVC
    Nu_SVC = _SVMLearner.Nu_SVC
    OneClass = _SVMLearner.OneClass
    Nu_SVR = _SVMLearner.Nu_SVR
    Epsilon_SVR = _SVMLearner.Epsilon_SVR

    def __init__(self, svm_type=Nu_SVC, kernel_type=kernels.RBF,
                 kernelFunc=None, C=1.0, nu=0.5, p=0.1, gamma=0.0, degree=3,
                 coef0=0, shrinking=True, probability=True, verbose=False,
                 cache_size=200, eps=0.001, normalization=True,
                 weight=None, **kwargs):
        """Store the learning parameters and create the core learner.

        :param svm_type: Defines the type of SVM (can be C_SVC,
            Nu_SVC (default), OneClass, Epsilon_SVR, Nu_SVR)
        :type svm_type: SVMLearner.SVMType
        :param kernel_type: Defines the type of a kernel to use for learning
            (can be kernels.RBF (default), kernels.Linear,
            kernels.Polynomial, kernels.Sigmoid, kernels.Custom)
        :type kernel_type: SVMLearner.Kernel
        :param degree: Kernel parameter (for Polynomial) (default 3)
        :type degree: int
        :param gamma: Kernel parameter (Polynomial/RBF/Sigmoid)
            (default 0.0; earlier documentation states the effective default
            is 1/number_of_examples -- TODO confirm against the core learner)
        :type gamma: float
        :param coef0: Kernel parameter (Polynomial/Sigmoid) (default 0)
        :type coef0: int
        :param kernelFunc: Function that will be called if `kernel_type` is
            `Custom`. It must accept two `Orange.data.Example` arguments and
            return a float (the distance between the examples).
        :type kernelFunc: callable function
        :param C: C parameter for C_SVC, Epsilon_SVR, Nu_SVR (default 1.0)
        :type C: float
        :param nu: Nu parameter for Nu_SVC, Nu_SVR and OneClass (default 0.5)
        :type nu: float
        :param p: Epsilon in loss-function for Epsilon_SVR (default 0.1)
        :type p: float
        :param cache_size: Cache memory size in MB (default 200)
        :type cache_size: int
        :param eps: Tolerance of termination criterion (default 0.001)
        :type eps: float
        :param probability: Determines if a probability model should be build
            (default True)
        :type probability: bool
        :param shrinking: Determines whether to use shrinking heuristics
            (default True)
        :type shrinking: bool
        :param verbose: Copied to the core learner; when true, a warning is
            also issued if `nu` has to be lowered (default False)
        :type verbose: bool
        :param normalization: If true, the examples are passed through
            `_normalize` before learning (default True)
        :type normalization: bool
        :param weight: a list of class weights (default: empty list)
        :type weight: list
        :param kwargs: any other keyword is set as an attribute on this
            learner and passed to the core learner's constructor

        """
        self.svm_type = svm_type
        self.kernel_type = kernel_type
        self.kernelFunc = kernelFunc
        self.C = C
        self.nu = nu
        self.p = p
        self.gamma = gamma
        self.degree = degree
        self.coef0 = coef0
        self.shrinking = shrinking
        self.probability = probability
        self.verbose = verbose
        self.cache_size = cache_size
        self.eps = eps
        self.normalization = normalization
        for key, val in kwargs.items():
            setattr(self, key, val)
        self.learner = Orange.core.SVMLearner(**kwargs)
        # Use a fresh list per instance rather than a shared mutable default.
        self.weight = [] if weight is None else weight

    maxNu = staticmethod(maxNu)

    def __call__(self, examples, weight=0):
        """Learn a classifier from `examples`.

        Examples with a missing class are dropped first. The stored
        parameters are then copied to the core learner and
        `learnClassifier` is called.

        :param examples: table of training examples
        :param weight: accepted but not used by this method
        :raises ValueError: if no example has a defined class
        :raises AttributeError: if `kernel_type` is 4 and no `kernelFunc`
            was supplied

        """
        examples = Orange.core.Preprocessor_dropMissingClasses(examples)
        if len(examples) == 0:
            raise ValueError("Example table is without any defined classes")
        # The SVM type is switched (and stored back on self) so that it
        # matches the class variable. NOTE(review): 0/1 and 3/4 are
        # presumably C_SVC/Nu_SVC and Epsilon_SVR/Nu_SVR in LibSVM's
        # numbering -- confirm against the core constants.
        if self.svm_type in [0, 1] and \
                examples.domain.classVar.varType != \
                Orange.core.VarTypes.Discrete:
            self.svm_type += 3
        if self.svm_type in [3, 4] and \
                examples.domain.classVar.varType == \
                Orange.core.VarTypes.Discrete:
            self.svm_type -= 3
        # NOTE(review): 4 is presumably the custom kernel type -- confirm.
        if self.kernel_type == 4 and not self.kernelFunc:
            raise AttributeError("Custom kernel function not supplied")

        nu = self.nu
        if self.svm_type == SVMLearner.Nu_SVC:
            # Lower nu to just below the feasible maximum; self.nu itself
            # is left unchanged.
            max_nu = self.maxNu(examples)
            if self.nu > max_nu:
                if getattr(self, "verbose", 0):
                    import warnings
                    warnings.warn("Specified nu %.3f is infeasible. "
                                  "Setting nu to %.3f" % (self.nu, max_nu))
                nu = max(max_nu - 1e-7, 0.0)

        for name in ["svm_type", "kernel_type", "kernelFunc", "C", "nu", "p",
                     "gamma", "degree", "coef0", "shrinking", "probability",
                     "verbose", "cache_size", "eps"]:
            setattr(self.learner, name, getattr(self, name))
        # Override the copied value with the (possibly lowered) nu.
        self.learner.nu = nu
        self.learner.setWeights(self.weight)
        return self.learnClassifier(examples)

    def learnClassifier(self, examples):
        """Train the core learner and return the wrapped classifier.

        The examples are normalised first when `self.normalization` is true.

        """
        if self.normalization:
            examples = self._normalize(examples)
        svm = self.learner(examples)
        return SVMClassifierWrapper(svm)

    def tuneParameters(self, examples, parameters=None, folds=5, verbose=0,
                       progressCallback=None):
        """Tune the parameters of the SVMLearner on given examples using
        cross validation.

        :param examples: ExampleTable on which to tune the parameters
        :param parameters: if not set defaults to ["nu", "C", "gamma"]
        :param folds: number of folds used for cross validation
        :param verbose: passed to the tuner when it is run
        :param progressCallback: a callback function to report progress

        Example::

            >>> svm = SVMLearner()
            >>> svm.tuneParameters(examples, parameters=["gamma"], folds=3)

        This code tunes the `gamma` parameter on `examples` using 3-fold
        cross validation

        """
        import orngWrap
        if parameters is None:
            parameters = ["nu", "C", "gamma"]
        searchParams = []
        normalization = self.normalization
        if normalization:
            # Normalise once up front and switch normalisation off while
            # tuning, so it is not repeated for every candidate.
            examples = self._normalize(examples)
            self.normalization = False
        try:
            if self.svm_type == SVMLearner.Nu_SVC and "nu" in parameters:
                max_nu = max(self.maxNu(examples) - 1e-7, 0.0)
                searchParams.append(("nu", [i / 10.0 for i in range(1, 9)
                                            if i / 10.0 < max_nu] + [max_nu]))
            elif "C" in parameters:
                searchParams.append(("C",
                                     [2 ** a for a in range(-5, 15, 2)]))
            # NOTE(review): 2 is presumably the RBF kernel type -- confirm.
            if self.kernel_type == 2 and "gamma" in parameters:
                searchParams.append(
                    ("gamma", [2 ** a for a in range(-5, 5, 2)] + [0]))
            # NOTE(review): the tuner's return value is discarded;
            # presumably it stores the best values on self -- confirm
            # against orngWrap.TuneMParameters.
            tunedLearner = orngWrap.TuneMParameters(
                object=self,
                parameters=searchParams,
                folds=folds,
                returnWhat=orngWrap.TuneMParameters.returnLearner,
                progressCallback=progressCallback or (lambda i: None))
            tunedLearner(examples, verbose=verbose)
        finally:
            # Restore the setting even when tuning fails.
            if normalization:
                self.normalization = normalization

    def _normalize(self, examples):
        """Return `examples` translated into a continuized domain.

        The domain continuizer is configured with class treatment `Ignore`,
        continuous treatment `NormalizeBySpan` and multinomial treatment
        `NValues`.

        """
        dc = Orange.core.DomainContinuizer()
        dc.classTreatment = Orange.core.DomainContinuizer.Ignore
        dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan
        dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues
        newdomain = dc(examples)
        return examples.translate(newdomain)
| 328 | |
|---|
class SVMClassifierWrapper(Orange.core.SVMClassifier):
    """Classifier that converts examples to the wrapped classifier's domain.

    Every prediction method rebuilds the incoming example in
    ``wrapped.domain`` and then delegates to the wrapped classifier.

    """

    def __new__(cls, wrapped):
        return Orange.core.SVMClassifier.__new__(cls, name=wrapped.name)

    def __init__(self, wrapped):
        self.wrapped = wrapped
        # Mirror the wrapped classifier's instance attributes on the wrapper.
        for key, value in wrapped.__dict__.items():
            self.__dict__[key] = value

    def __call__(self, example, what=Orange.core.GetValue):
        converted = Orange.core.Example(self.wrapped.domain, example)
        return self.wrapped(converted, what)

    def classDistribution(self, example):
        converted = Orange.core.Example(self.wrapped.domain, example)
        return self.wrapped.classDistribution(converted)

    def getDecisionValues(self, example):
        converted = Orange.core.Example(self.wrapped.domain, example)
        return self.wrapped.getDecisionValues(converted)

    def getModel(self):
        return self.wrapped.getModel()

    def __reduce__(self):
        # Only attributes that are not already held by the wrapped
        # classifier go into the pickled state.
        own_state = dict((key, value)
                         for key, value in self.__dict__.items()
                         if key not in self.wrapped.__dict__)
        return SVMClassifierWrapper, (self.wrapped,), own_state
| 357 | |
|---|
class SVMLearnerSparse(SVMLearner):

    """Variant of SVMLearner that learns from the examples' meta attributes.

    .. note:: Note that meta attributes don't need to be registered with
        the data-set domain, or present in all the examples. Use this if you
        are learning from large sparse data-sets.

    """

    def __init__(self, **kwds):
        # Initialise exactly like SVMLearner, then replace the core learner
        # with the sparse one built from the same keywords.
        SVMLearner.__init__(self, **kwds)
        self.learner = Orange.core.SVMLearnerSparse(**kwds)
| 372 | |
|---|
class SVMLearnerEasy(SVMLearner):

    """Same as `SVMLearner` except that it will automatically scale the data
    and perform parameter optimization similar to the easy.py script in the
    LibSVM package. Use this if the SVMLearner performs badly.

    Example (`svm-easy.py`_ uses: `vehicle.tab`_)

    .. literalinclude:: code/svm-easy.py

    """

    def __init__(self, **kwds):
        # folds: number of cross-validation folds used while tuning.
        self.folds = 4
        # SVMLearner.__init__ assigns self.verbose again from its own
        # `verbose` argument.
        self.verbose = 0
        SVMLearner.__init__(self, **kwds)
        # The learner being tuned is itself an SVMLearner wrapper.
        self.learner = SVMLearner(**kwds)

    def learnClassifier(self, examples):
        """Scale the data, tune the parameters and return the classifier.

        A grid of `nu` values (svm types 1 and 4) or `C` values (all other
        types) is searched, plus a grid of `gamma` values when
        `kernel_type` is 2, using `self.folds`-fold cross validation.

        """
        import orngWrap

        newexamples = self._normalize(examples)
        # Normalization is already done; the inner learner must not repeat
        # it.
        self.learner.normalization = False

        parameters = []
        # NOTE(review): 1 and 4 are presumably Nu_SVC and Nu_SVR -- confirm.
        if self.svm_type in [1, 4]:
            if self.svm_type == SVMLearner.Nu_SVC:
                maxNu = max(self.maxNu(newexamples) - 1e-7, 0.0)
            else:
                maxNu = 1.0
            parameters.append(("nu", [i / 10.0 for i in range(1, 9)
                                      if i / 10.0 < maxNu] + [maxNu]))
        else:
            parameters.append(("C", [2 ** a for a in range(-5, 15, 2)]))
        # NOTE(review): 2 is presumably the RBF kernel type -- confirm.
        if self.kernel_type == 2:
            parameters.append(
                ("gamma", [2 ** a for a in range(-5, 5, 2)] + [0]))

        tunedLearner = orngWrap.TuneMParameters(object=self.learner,
                                                parameters=parameters,
                                                folds=self.folds)

        return SVMClassifierWrapper(tunedLearner(newexamples,
                                                 verbose=self.verbose))
| 423 | |
|---|
class SVMLearnerSparseClassEasy(SVMLearnerEasy, SVMLearnerSparse):
    """Combine the parameter tuning of `SVMLearnerEasy` with the sparse
    core learner of `SVMLearnerSparse`.

    """

    def __init__(self, **kwds):
        # SVMLearnerEasy.__init__ is not called here, but the inherited
        # SVMLearnerEasy.learnClassifier reads self.folds. Set the same
        # default before the keywords are applied so that a `folds`
        # keyword can still override it.
        self.folds = 4
        SVMLearnerSparse.__init__(self, **kwds)
| 427 | |
|---|
class LinearLearner(Orange.core.LinearLearner):

    """A wrapper around Orange.core.LinearLearner whose default
    solver_type is L2Loss_SVM_Dual.

    A missing ``solver_type`` and an explicit ``L2_LR`` are both replaced by
    ``L2Loss_SVM_Dual``.

    .. note:: The default in Orange.core.LinearLearner is L2_LR

    """

    def __new__(cls, data=None, weightId=0, **kwargs):
        learner = Orange.core.LinearLearner.__new__(cls, **kwargs)
        if not data:
            return learner
        # With data: initialise, train, and return the result of the call
        # instead of the learner.
        learner.__init__(**kwargs)
        return learner.__call__(data, weightId)

    def __init__(self, **kwargs):
        requested = kwargs.get("solver_type", None)
        if requested in [Orange.core.LinearLearner.L2_LR, None]:
            kwargs = dict(kwargs)
            kwargs["solver_type"] = Orange.core.LinearLearner.L2Loss_SVM_Dual
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)
| 452 | |
|---|
def getLinearSVMWeights(classifier, sum=True):
    """Extract attribute weights from the linear svm classifier.

    For every pair of classes (i, j) with i < j, the weight of a continuous
    attribute is the sum, over the support vectors of both classes, of the
    support vector's coefficient times its attribute value. Missing values
    count as 0.0.

    .. note:: For multi class classification the returned score of an
        attribute is the square root of the sum of its squared weights over
        all binary one vs. one classifiers. If you want weights for
        each binary classifier pass `sum=False` flag (In this case the order
        of reported weights are for class1 vs class2, class1 vs class3 ...
        class2 vs class3 ... classifiers).

    Example (`svm-linear-weights.py`_, uses: `brown-selected.tab`_)

    .. literalinclude:: code/svm-linear-weights.py

    :param classifier: trained SVM classifier providing `supportVectors`,
        `nSV` and `coef`
    :param sum: if true return a single dict of combined scores, otherwise
        a list with one weight dict per binary classifier. (The name
        shadows the builtin but is part of the public signature.)

    """
    def to_float(val):
        return float(val) if not val.isSpecial() else 0.0

    def accumulate(w, svRange, coefInd):
        # Add coef * value to w for each continuous attribute (regular or
        # meta) of the support vectors in the half-open index range.
        for svInd in range(*svRange):
            attributes = SVs.domain.attributes + \
                SVs[svInd].getmetas(False, Orange.core.Variable).keys()
            for attr in attributes:
                if attr.varType == Orange.core.VarTypes.Continuous:
                    term = classifier.coef[coefInd][svInd] * \
                        to_float(SVs[svInd][attr])
                    if attr in w:
                        w[attr] += term
                    else:
                        w[attr] = term

    SVs = classifier.supportVectors
    classes = SVs.domain.classVar.values

    # Index range of each class's support vectors, built from the
    # consecutive per-class counts in nSV.
    svRanges = [(0, classifier.nSV[0])]
    for n in classifier.nSV[1:]:
        start = svRanges[-1][1]
        svRanges.append((start, start + n))

    weights = []
    for i in range(len(classes) - 1):
        for j in range(i + 1, len(classes)):
            w = {}
            # Class i's vectors use coefficient row j-1, class j's use
            # row i. NOTE(review): presumably LibSVM's sv_coef layout --
            # confirm.
            accumulate(w, svRanges[i], j - 1)
            accumulate(w, svRanges[j], i)
            weights.append(w)

    if not sum:
        return weights

    scores = defaultdict(float)
    for w in weights:
        for attr, wAttr in w.items():
            scores[attr] += wAttr ** 2
    for key in scores:
        scores[key] = math.sqrt(scores[key])
    return scores
| 516 | |
|---|
def exampleWeightedSum(example, weights):
    """Return the weighted sum of an example's attribute values.

    :param example: object indexable by the keys of `weights`; each looked
        up value must be convertible with ``float()``
    :param weights: dict mapping attributes to their weights
    :returns: sum of ``float(example[attr]) * weight`` over all entries of
        `weights` (0 when `weights` is empty)

    """
    # An explicit accumulator avoids shadowing the builtin ``sum`` and
    # keeps plain left-to-right float addition.
    total = 0
    for attr, w in weights.items():
        total += float(example[attr]) * w
    return total
| 522 | |
|---|
class MeasureAttribute_SVMWeights(Orange.core.MeasureAttribute):

    """Measure attribute relevance by training a linear SVM classifier on
    the provided examples and returning, for the attribute, the sum of its
    squared weights taken along the first axis of the classifier's weights.

    Example::

        >>> measure = MeasureAttribute_SVMWeights()
        >>> for attr in data.domain.attributes:
        ...     print "%15s: %.3f" % (attr.name, measure(attr, data))

    """

    def __new__(cls, attr=None, examples=None, weightId=None, **kwargs):
        measure = Orange.core.MeasureAttribute.__new__(cls, **kwargs)
        if examples is None or attr is None:
            return measure
        # Both an attribute and examples were given: score immediately and
        # return the score instead of the measure object.
        measure.__init__(**kwargs)
        return measure.__call__(attr, examples, weightId)

    def __reduce__(self):
        return MeasureAttribute_SVMWeights, (), {"learner": self.learner}

    def __init__(self, learner=None, **kwargs):
        """:param learner: Learner used for weight estimation
            (default LinearLearner(solver_type=L2Loss_SVM_Dual))
        :type learner: Orange.core.Learner

        """
        if not learner:
            learner = LinearLearner(solver_type=LinearLearner.L2Loss_SVM_Dual)
        self.learner = learner

        self._cached_examples = None

    def __call__(self, attr, examples, weightId=None):
        # Weights are cached per example table (compared by identity), so
        # scoring many attributes of one table trains only once.
        if examples is not self._cached_examples:
            classifier = self.learner(examples, weightId)
            self._cached_examples = examples
            import numpy
            # NOTE(review): assumes classifier.weights holds rows whose
            # columns line up with examples.domain.attributes -- confirm.
            squared = numpy.array(classifier.weights) ** 2
            totals = numpy.sum(squared, axis=0)
            self._cached_weights = dict(zip(examples.domain.attributes,
                                            totals))
        return self._cached_weights.get(attr, 0.0)
| 572 | |
|---|
class RFE(object):

    """Recursive feature elimination using linear svm derived attribute
    weights.

    Example::

        >>> rfe = RFE(SVMLearner(kernel_type=kernels.Linear,
        ...                      normalization=False))
        >>> data_with_removed_features = rfe(data, 5)

    ``normalization=False`` is used so that the SVM learner does not change
    the domain; the call returns an example table with only the 5 best
    attributes.

    """

    def __init__(self, learner=None):
        """:param learner: learner used to obtain the weights (default
            SVMLearner(kernel_type=kernels.Linear, normalization=False))

        """
        self.learner = learner or SVMLearner(kernel_type=kernels.Linear,
                                             normalization=False)

    def getAttrScores(self, data, stopAt=0, progressCallback=None):
        """Return a dict mapping attributes to scores.

        The scores are not scores in a general meaning: a score is the
        number of attributes that had already been eliminated when the
        attribute itself was removed (0 for the first one removed).
        Attributes that are never removed, because the loop stops once
        `stopAt` attributes remain, do not appear in the result.

        :param data: example table to run the elimination on
        :param stopAt: stop when no more than this many attributes remain
        :param progressCallback: optional function called once per
            iteration with ``100. * iteration / (remaining - stopAt)``

        """
        step = 1
        attrs = data.domain.attributes
        attrScores = {}

        while len(attrs) > stopAt:
            weights = getLinearSVMWeights(self.learner(data), sum=False)
            if progressCallback:
                progressCallback(100. * step / (len(attrs) - stopAt))
            score = dict.fromkeys(attrs, 0)
            for w in weights:
                for attr, wAttr in w.items():
                    score[attr] += wAttr ** 2
            # Weakest attributes first.
            ranked = sorted(score.items(), key=lambda item: item[1])
            # The fraction removed shrinks with every iteration, but at
            # least one attribute is always removed.
            numToRemove = max(int(len(attrs) * 1.0 / (step + 1)), 1)
            for attr, _ in ranked[:numToRemove]:
                attrScores[attr] = len(attrScores)
            attrs = [attr for attr, _ in ranked[numToRemove:]]
            if attrs:
                data = data.select(attrs + [data.domain.classVar])
            step += 1
        return attrScores

    def __call__(self, data, numSelected=20, progressCallback=None):
        """Return a new dataset with only `numSelected` best scoring
        attributes.

        :param data: Data
        :type data: Orange.core.ExampleTable
        :param numSelected: number of features to preserve; a value of zero
            or less preserves none
        :type numSelected: int
        :param progressCallback: passed on to `getAttrScores`

        """
        scores = self.getAttrScores(data, progressCallback=progressCallback)
        ranked = sorted(scores.items(), key=lambda item: item[1])

        # Slice from an explicit start index: ``ranked[-numSelected:]``
        # would keep everything when numSelected is 0.
        firstKept = max(len(ranked) - numSelected, 0)
        selected = dict(ranked[firstKept:])
        attrs = [attr for attr in data.domain.attributes if attr in selected]
        domain = Orange.core.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        data = Orange.core.ExampleTable(domain, data)
        return data
| 639 | |
|---|
def exampleTableToSVMFormat(examples, file):
    """Save an example table in svm format as used by LibSVM.

    One line is written per example: the class value (as an int for a
    discrete class, otherwise as a float) followed by ``index:value`` for
    every continuous or discrete attribute (regular and meta) whose value
    is not special. Indices start at 1.

    :param examples: example table to save
    :param file: object with a ``write`` method

    """
    accepted = [Orange.core.VarTypes.Continuous,
                Orange.core.VarTypes.Discrete]
    candidates = examples.domain.attributes + \
        examples.domain.getmetas().values()
    attrs = [attr for attr in candidates if attr.varType in accepted]
    cv = examples.domain.classVar

    for ex in examples:
        if cv.varType == Orange.core.VarTypes.Discrete:
            label = str(int(ex[cv]))
        else:
            label = str(float(ex[cv]))
        file.write(label)

        for position, attr in enumerate(attrs):
            value = ex[attr]
            if value.isSpecial():
                continue
            # NOTE(review): values are written with str(); for discrete
            # attributes this may not be a number -- confirm that this is
            # the intended output.
            file.write(" " + str(position + 1) + ":" + str(value))
        file.write("\n")
| 658 | |
|---|