source:orange/orange/Orange/classification/logreg.py@7583:c9b17506b1c3

Revision 7583:c9b17506b1c3, 42.7 KB checked in by blaz <blaz.zupan@…>, 3 years ago (diff)

some editing, also, needs further work (statistical functions at the end need to be moved)

Line
1"""
2.. index: logistic regression
3.. index:
4   single: classification; logistic regression
5
6*******************
7Logistic regression
8*******************
9
10Implements `logistic regression <http://en.wikipedia.org/wiki/Logistic_regression>`_
11with an extension for proper treatment of discrete features.
12The algorithm can handle various anomalies in features, such as constant variables
13and singularities, that could make fitting of logistic regression almost
14impossible. Stepwise logistic regression, which iteratively selects the most informative features,
15is also supported.
16
17
18.. autofunction:: LogRegLearner
19.. autofunction:: StepWiseFSS
20.. autofunction:: dump
21
22.. autoclass:: StepWiseFSS_class
23   :members:
24
25Examples
26========
27
28The first example shows a very simple induction of a logistic regression
29classifier (`logreg-run.py`_, uses `titanic.tab`_).
30
31.. literalinclude:: code/logreg-run.py
32
33Result::
34
35    Classification accuracy: 0.778282598819
36
37    class attribute = survived
38    class values = <no, yes>
39
40        Attribute       beta  st. error     wald Z          P OR=exp(beta)
41
42        Intercept      -1.23       0.08     -15.15      -0.00
43     status=first       0.86       0.16       5.39       0.00       2.36
44    status=second      -0.16       0.18      -0.91       0.36       0.85
45     status=third      -0.92       0.15      -6.12       0.00       0.40
46        age=child       1.06       0.25       4.30       0.00       2.89
47       sex=female       2.42       0.14      17.04       0.00      11.25
48
The next example shows how to handle singularities in data sets
51
52.. literalinclude:: code/logreg-singularities.py
53
54The first few lines of the output of this script are::
55
56    <=50K <=50K
57    <=50K <=50K
58    <=50K <=50K
59    >50K >50K
60    <=50K >50K
61
62    class attribute = y
63    class values = <>50K, <=50K>
64
65                               Attribute       beta  st. error     wald Z          P OR=exp(beta)
66
67                               Intercept       6.62      -0.00       -inf       0.00
68                                     age      -0.04       0.00       -inf       0.00       0.96
69                                  fnlwgt      -0.00       0.00       -inf       0.00       1.00
70                           education-num      -0.28       0.00       -inf       0.00       0.76
71                 marital-status=Divorced       4.29       0.00        inf       0.00      72.62
72            marital-status=Never-married       3.79       0.00        inf       0.00      44.45
73                marital-status=Separated       3.46       0.00        inf       0.00      31.95
74                  marital-status=Widowed       3.85       0.00        inf       0.00      46.96
75    marital-status=Married-spouse-absent       3.98       0.00        inf       0.00      53.63
76        marital-status=Married-AF-spouse       4.01       0.00        inf       0.00      55.19
77                 occupation=Tech-support      -0.32       0.00       -inf       0.00       0.72
78
79If :obj:`removeSingular` is set to 0, inducing a logistic regression
80classifier would return an error::
81
82    Traceback (most recent call last):
83      File "logreg-singularities.py", line 4, in <module>
84        lr = classification.logreg.LogRegLearner(table, removeSingular=0)
85      File "/home/jure/devel/orange/Orange/classification/logreg.py", line 255, in LogRegLearner
86        return lr(examples, weightID)
87      File "/home/jure/devel/orange/Orange/classification/logreg.py", line 291, in __call__
88        lr = learner(examples, weight)
89    orange.KernelException: 'orange.LogRegLearner': singularity in workclass=Never-worked
90
91We can see that the attribute workclass is causing a singularity.
92
The example below shows how the use of stepwise logistic regression can help to
94gain in classification performance (`logreg-stepwise.py`_, uses `ionosphere.tab`_):
95
96.. literalinclude:: code/logreg-stepwise.py
97
98The output of this script is::
99
100    Learner      CA
101    logistic     0.841
102    filtered     0.846
103
104    Number of times attributes were used in cross-validation:
105     1 x a21
106    10 x a22
107     8 x a23
108     7 x a24
109     1 x a25
110    10 x a26
111    10 x a27
112     3 x a28
113     7 x a29
114     9 x a31
115     2 x a16
116     7 x a12
117     1 x a32
118     8 x a15
119    10 x a14
120     4 x a17
121     7 x a30
122    10 x a11
123     1 x a10
124     1 x a13
125    10 x a34
126     2 x a19
127     1 x a18
128    10 x a3
129    10 x a5
130     4 x a4
131     4 x a7
132     8 x a6
133    10 x a9
134    10 x a8
135
136.. _logreg-run.py: code/logreg-run.py
137.. _logreg-singularities.py: code/logreg-singularities.py
138.. _logreg-stepwise.py: code/logreg-stepwise.py
139
140.. _ionosphere.tab: code/ionosphere.tab
142.. _titanic.tab: code/titanic.tab
143
144"""
145
146from Orange.core import LogRegLearner, LogRegClassifier, LogRegFitter, LogRegFitter_Cholesky
147
148import Orange
149import math, os
150import warnings
151from numpy import *
152from numpy.linalg import *
153
154
155##########################################################################
156## Print out methods
157
def dump(classifier):
    """ Formatted print to console of all major features in logistic
    regression classifier.

    Prints the class attribute name and its values, then one row per
    feature of the continuized domain with the fitted beta, its standard
    error, the Wald Z statistic, the P value and the odds ratio
    exp(beta).  The intercept row omits the odds-ratio column.

    :param classifier: logistic regression classifier
    """

    # print out class values
    print
    print "class attribute = " + classifier.domain.classVar.name
    print "class values = " + str(classifier.domain.classVar.values)
    print

    # get the longest attribute name so the columns line up
    longest=0
    for at in classifier.continuizedDomain.attributes:
        if len(at.name)>longest:
            longest=len(at.name);

    # print out the head
    formatstr = "%"+str(longest)+"s %10s %10s %10s %10s %10s"
    print formatstr % ("Feature", "beta", "st. error", "wald Z", "P", "OR=exp(beta)")
    print
    # intercept row: no odds-ratio column
    formatstr = "%"+str(longest)+"s %10.2f %10.2f %10.2f %10.2f"
    print formatstr % ("Intercept", classifier.beta[0], classifier.beta_se[0], classifier.wald_Z[0], classifier.P[0])
    # one row per continuized attribute; abs() keeps the printed P non-negative
    formatstr = "%"+str(longest)+"s %10.2f %10.2f %10.2f %10.2f %10.2f"
    for i in range(len(classifier.continuizedDomain.attributes)):
        print formatstr % (classifier.continuizedDomain.attributes[i].name, classifier.beta[i+1], classifier.beta_se[i+1], classifier.wald_Z[i+1], abs(classifier.P[i+1]), math.exp(classifier.beta[i+1]))
186
187
def hasDiscreteValues(domain):
    """Return 1 if any attribute of `domain` is discrete, 0 otherwise."""
    discrete = Orange.core.VarTypes.Discrete
    for attribute in domain.attributes:
        if attribute.varType == discrete:
            return 1
    return 0
193
def LogRegLearner(table=None, weightID=0, **kwds):
    """ Logistic regression learner.

    Implements logistic regression. If data instances are provided to
    the constructor, the learning algorithm is called and the resulting
    classifier is returned instead of the learner.

    :param table: data table with either discrete or continuous features
    :type table: Orange.data.Table
    :param weightID: the ID of the weight meta attribute
    :type weightID: int
    :param removeSingular: set to 1 if you want automatic removal of disturbing features, such as constants and singularities
    :type removeSingular: bool
    :param fitter: the fitting algorithm (by default the Newton-Raphson fitting algorithm is used)
    :type fitter: type???
    :param stepwiseLR: set to 1 if you wish to use stepwise logistic regression
    :type stepwiseLR: bool
    :param addCrit: parameter for stepwise feature selection
    :type addCrit: float
    :param deleteCrit: parameter for stepwise feature selection
    :type deleteCrit: float
    :param numFeatures: parameter for stepwise feature selection
    :type numFeatures: int
    :rtype: :obj:`LogRegLearnerClass` or :obj:`LogRegClassifier`

    """
    # extra keywords (stepwiseLR, fitter, ...) are stored on the learner instance
    lr = LogRegLearnerClass(**kwds)
    if table:
        return lr(table, weightID)
    else:
        return lr
225
class LogRegLearnerClass(Orange.core.Learner):
    """Logistic regression learner.

    Optional attributes read via getattr in :obj:`__call__`:
    `removeMissing`, `stepwiseLR`, `addCrit`, `removeCrit`, `numFeatures`,
    `imputer` -- all may be supplied as keywords to :obj:`__init__`.
    """

    def __init__(self, removeSingular=0, fitter=None, **kwds):
        self.__dict__.update(kwds)
        self.removeSingular = removeSingular
        # BUG FIX: the original assigned self.fitter = None, silently
        # discarding the `fitter` argument; keep it so custom fitters work.
        self.fitter = fitter

    def __call__(self, examples, weight=0):
        """Train on `examples` and return a logistic regression classifier
        (or None on an empty table)."""
        imputer = getattr(self, "imputer", None) or None
        if getattr(self, "removeMissing", 0):
            examples = Orange.core.Preprocessor_dropMissing(examples)
        if not len(examples):
            return None
        if getattr(self, "stepwiseLR", 0):
            # restored line (dropped in this revision's export): addCrit is
            # used below and must default like removeCrit/numFeatures do
            addCrit = getattr(self, "addCrit", 0.2)
            removeCrit = getattr(self, "removeCrit", 0.3)
            numFeatures = getattr(self, "numFeatures", -1)
            attributes = StepWiseFSS(examples, addCrit=addCrit, deleteCrit=removeCrit, imputer=imputer, numFeatures=numFeatures)
            tmpDomain = Orange.core.Domain(attributes, examples.domain.classVar)
            # keep meta attributes (e.g. the weight) through the selection
            tmpDomain.addmetas(examples.domain.getmetas())
            examples = examples.select(tmpDomain)
        learner = Orange.core.LogRegLearner()
        learner.imputerConstructor = imputer
        if imputer:
            examples = self.imputer(examples)(examples)
        examples = Orange.core.Preprocessor_dropMissing(examples)
        if self.fitter:
            learner.fitter = self.fitter
        if self.removeSingular:
            lr = learner.fitModel(examples, weight)
        else:
            lr = learner(examples, weight)
        # fitModel returns the offending Variable instead of a classifier
        # when it hits a singularity: drop that variable and refit until a
        # proper classifier comes back.
        while isinstance(lr, Orange.core.Variable):
            if isinstance(lr.getValueFrom, Orange.core.ClassifierFromVar) and isinstance(lr.getValueFrom.transformer, Orange.core.Discrete2Continuous):
                lr = lr.getValueFrom.variable
            attributes = examples.domain.attributes[:]
            if lr in attributes:
                attributes.remove(lr)
            else:
                attributes.remove(lr.getValueFrom.variable)
            newDomain = Orange.core.Domain(attributes, examples.domain.classVar)
            newDomain.addmetas(examples.domain.getmetas())
            examples = examples.select(newDomain)
            lr = learner.fitModel(examples, weight)
        return lr
272
273
274
def Univariate_LogRegLearner(examples=None, **kwds):
    """Factory for univariate logistic regression.

    Builds a :obj:`Univariate_LogRegLearner_Class` from the keyword
    arguments; if `examples` are given, trains immediately and returns
    the classifier instead of the learner.
    """
    # apply() is a Python-2-only builtin (removed in Python 3); calling the
    # class directly is equivalent and portable.
    learner = Univariate_LogRegLearner_Class(**kwds)
    if examples:
        return learner(examples)
    else:
        return learner
281
class Univariate_LogRegLearner_Class(Orange.core.Learner):
    """Fits one single-attribute logistic regression per attribute and
    collects all coefficients and their statistics into a single
    :obj:`Univariate_LogRegClassifier`."""

    def __init__(self, **kwds):
        self.__dict__.update(kwds)

    def __call__(self, examples):
        # NOTE(review): createFullNoDiscTable is not defined in the visible
        # part of this module -- confirm it exists elsewhere in the file.
        examples = createFullNoDiscTable(examples)
        # one model per attribute, each trained on a (attribute, class) domain
        # NOTE(review): under Python 3 map() returns a one-shot iterator, so
        # the repeated comprehensions below would see it empty after the
        # first pass; fine under Python 2 where map() returns a list.
        classifiers = map(lambda x: LogRegLearner(Orange.core.Preprocessor_dropMissing(examples.select(Orange.core.Domain(x, examples.domain.classVar)))), examples.domain.attributes)
        # intercept-only (majority) model supplies the baseline row
        maj_classifier = LogRegLearner(Orange.core.Preprocessor_dropMissing(examples.select(Orange.core.Domain(examples.domain.classVar))))
        beta = [maj_classifier.beta[0]] + [x.beta[1] for x in classifiers]
        beta_se = [maj_classifier.beta_se[0]] + [x.beta_se[1] for x in classifiers]
        P = [maj_classifier.P[0]] + [x.P[1] for x in classifiers]
        wald_Z = [maj_classifier.wald_Z[0]] + [x.wald_Z[1] for x in classifiers]
        domain = examples.domain

        return Univariate_LogRegClassifier(beta = beta, beta_se = beta_se, P = P, wald_Z = wald_Z, domain = domain)
297
class Univariate_LogRegClassifier(Orange.core.Classifier):
    """Container for univariate logistic regression results (beta,
    beta_se, P, wald_Z, domain); does not classify yet."""

    def __init__(self, **kwds):
        self.__dict__.update(kwds)

    def __call__(self, example, resultType = Orange.core.GetValue):
        # classification not implemented yet. For now its use is only to
        # provide regression coefficients and their statistics
        pass
305
306
def LogRegLearner_getPriors(examples = None, weightID=0, **kwds):
    """Factory for :obj:`LogRegLearnerClass_getPriors`.

    With `examples` supplied, trains at once and returns the result;
    otherwise returns the configured learner.
    """
    prior_learner = LogRegLearnerClass_getPriors(**kwds)
    if examples:
        return prior_learner(examples, weightID)
    return prior_learner
313
314class LogRegLearnerClass_getPriors(object):
315    def __init__(self, removeSingular=0, **kwds):
316        self.__dict__.update(kwds)
317        self.removeSingular = removeSingular
318    def __call__(self, examples, weight=0):
319        # next function changes data set to a extended with unknown values
320        def createLogRegExampleTable(data, weightID):
321            setsOfData = []
322            for at in data.domain.attributes:
323                # za vsak atribut kreiraj nov newExampleTable newData
324                # v dataOrig, dataFinal in newData dodaj nov atribut -- continuous variable
325                if at.varType == Orange.core.VarTypes.Continuous:
326                    atDisc = Orange.core.FloatVariable(at.name + "Disc")
327                    newDomain = Orange.core.Domain(data.domain.attributes+[atDisc,data.domain.classVar])
329                    newData = Orange.core.ExampleTable(newDomain,data)
330                    altData = Orange.core.ExampleTable(newDomain,data)
331                    for i,d in enumerate(newData):
332                        d[atDisc] = 0
333                        d[weightID] = 1*data[i][weightID]
334                    for i,d in enumerate(altData):
335                        d[atDisc] = 1
336                        d[at] = 0
337                        d[weightID] = 0.000001*data[i][weightID]
338                elif at.varType == Orange.core.VarTypes.Discrete:
339                # v dataOrig, dataFinal in newData atributu "at" dodaj ee  eno  vreednost, ki ima vrednost kar  ime atributa +  "X"
340                    atNew = Orange.core.EnumVariable(at.name, values = at.values + [at.name+"X"])
341                    newDomain = Orange.core.Domain(filter(lambda x: x!=at, data.domain.attributes)+[atNew,data.domain.classVar])
343                    newData = Orange.core.ExampleTable(newDomain,data)
344                    altData = Orange.core.ExampleTable(newDomain,data)
345                    for i,d in enumerate(newData):
346                        d[atNew] = data[i][at]
347                        d[weightID] = 1*data[i][weightID]
348                    for i,d in enumerate(altData):
349                        d[atNew] = at.name+"X"
350                        d[weightID] = 0.000001*data[i][weightID]
351                newData.extend(altData)
352                setsOfData.append(newData)
353            return setsOfData
354
355        learner = LogRegLearner(imputer = Orange.core.ImputerConstructor_average(), removeSingular = self.removeSingular)
356        # get Original Model
357        orig_model = learner(examples,weight)
358        if orig_model.fit_status:
359            print "Warning: model did not converge"
360
361        # get extended Model (you should not change data)
362        if weight == 0:
363            weight = Orange.core.newmetaid()
365        extended_set_of_examples = createLogRegExampleTable(examples, weight)
366        extended_models = [learner(extended_examples, weight) \
367                           for extended_examples in extended_set_of_examples]
368
369##        print examples[0]
370##        printOUT(orig_model)
371##        print orig_model.domain
372##        print orig_model.beta
373##        print orig_model.beta[orig_model.continuizedDomain.attributes[-1]]
374##        for i,m in enumerate(extended_models):
375##            print examples.domain.attributes[i]
376##            printOUT(m)
377
378
379        # izracunas odstopanja
380        # get sum of all betas
381        beta = 0
382        betas_ap = []
383        for m in extended_models:
386            beta = beta + beta_add
387
388        # substract it from intercept
389        #print "beta", beta
390        logistic_prior = orig_model.beta[0]+beta
391
392        # compare it to bayes prior
393        bayes = Orange.core.BayesLearner(examples)
394        bayes_prior = math.log(bayes.distribution[1]/bayes.distribution[0])
395
396        # normalize errors
397##        print "bayes", bayes_prior
398##        print "lr", orig_model.beta[0]
399##        print "lr2", logistic_prior
400##        print "dist", Orange.core.Distribution(examples.domain.classVar,examples)
401##        print "prej", betas_ap
402
403        # error normalization - to avoid errors due to assumption of independence of unknown values
404        dif = bayes_prior - logistic_prior
405        positives = sum(filter(lambda x: x>=0, betas_ap))
406        negatives = -sum(filter(lambda x: x<0, betas_ap))
407        if not negatives == 0:
408            kPN = positives/negatives
409            diffNegatives = dif/(1+kPN)
410            diffPositives = kPN*diffNegatives
411            kNegatives = (negatives-diffNegatives)/negatives
412            kPositives = positives/(positives-diffPositives)
413    ##        print kNegatives
414    ##        print kPositives
415
416            for i,b in enumerate(betas_ap):
417                if b<0: betas_ap[i]*=kNegatives
418                else: betas_ap[i]*=kPositives
419        #print "potem", betas_ap
420
421        # vrni originalni model in pripadajoce apriorne niclele
422        return (orig_model, betas_ap)
423        #return (bayes_prior,orig_model.beta[examples.domain.classVar],logistic_prior)
424
class LogRegLearnerClass_getPriors_OneTable:
    """Variant of :obj:`LogRegLearnerClass_getPriors` that builds one
    combined extended example table instead of one table per attribute.

    NOTE(review): this class appears unfinished/broken -- see the notes
    in :obj:`__call__` before relying on it.
    """

    def __init__(self, removeSingular=0, **kwds):
        self.__dict__.update(kwds)
        self.removeSingular = removeSingular

    def __call__(self, examples, weight=0):
        # Extend the data set with "unknown value" placeholders: for each
        # continuous attribute an indicator column, for each discrete
        # attribute an extra value named "<attribute>X".
        def createLogRegExampleTable(data, weightID):
            finalData = Orange.core.ExampleTable(data)
            origData = Orange.core.ExampleTable(data)
            for at in data.domain.attributes:
                # create a new extended table for every attribute, adding a
                # new continuous indicator variable
                if at.varType == Orange.core.VarTypes.Continuous:
                    atDisc = Orange.core.FloatVariable(at.name + "Disc")
                    newDomain = Orange.core.Domain(origData.domain.attributes+[atDisc,data.domain.classVar])
                    finalData = Orange.core.ExampleTable(newDomain,finalData)
                    newData = Orange.core.ExampleTable(newDomain,origData)
                    origData = Orange.core.ExampleTable(newDomain,origData)
                    for d in origData:
                        d[atDisc] = 0
                    for d in finalData:
                        d[atDisc] = 0
                    for i,d in enumerate(newData):
                        d[atDisc] = 1
                        d[at] = 0
                        d[weightID] = 100*data[i][weightID]

                elif at.varType == Orange.core.VarTypes.Discrete:
                    # add one more value to attribute "at", named after the
                    # attribute with an "X" suffix
                    atNew = Orange.core.EnumVariable(at.name, values = at.values + [at.name+"X"])
                    newDomain = Orange.core.Domain(filter(lambda x: x!=at, origData.domain.attributes)+[atNew,origData.domain.classVar])
                    temp_finalData = Orange.core.ExampleTable(finalData)
                    finalData = Orange.core.ExampleTable(newDomain,finalData)
                    newData = Orange.core.ExampleTable(newDomain,origData)
                    temp_origData = Orange.core.ExampleTable(origData)
                    origData = Orange.core.ExampleTable(newDomain,origData)
                    for i,d in enumerate(origData):
                        d[atNew] = temp_origData[i][at]
                    for i,d in enumerate(finalData):
                        d[atNew] = temp_finalData[i][at]
                    for i,d in enumerate(newData):
                        d[atNew] = at.name+"X"
                        d[weightID] = 10*data[i][weightID]
                finalData.extend(newData)
            return finalData

        learner = LogRegLearner(imputer = Orange.core.ImputerConstructor_average(), removeSingular = self.removeSingular)
        # get the original model
        orig_model = learner(examples,weight)

        # get the extended model (the data itself must not change)
        if weight == 0:
            weight = Orange.core.newmetaid()
        extended_examples = createLogRegExampleTable(examples, weight)
        extended_model = learner(extended_examples, weight)

        # compute the deviations: sum of all betas
        beta = 0
        betas_ap = []
        # NOTE(review): `extended_models` is never defined in this method
        # (only the single `extended_model` above exists) and `beta_add` is
        # never assigned, so this loop raises NameError when it runs with a
        # non-empty attribute set.  It looks like a copy-paste from
        # LogRegLearnerClass_getPriors -- confirm intent before using.
        for m in extended_models:
            beta = beta + beta_add

        # subtract it from the intercept
        logistic_prior = orig_model.beta[0]+beta

        # compare it to the Bayes prior
        bayes = Orange.core.BayesLearner(examples)
        bayes_prior = math.log(bayes.distribution[1]/bayes.distribution[0])

        # normalize errors - to avoid errors due to the assumption of
        # independence of unknown values
        k = (bayes_prior-orig_model.beta[0])/(logistic_prior-orig_model.beta[0])
        betas_ap = [k*x for x in betas_ap]

        # return the original model and the corresponding apriori betas
        return (orig_model, betas_ap)
520
521
522######################################
523#### Fitters for logistic regression (logreg) learner ####
524######################################
525
def Pr(x, betas):
    """Return the logistic probability sigmoid(x . betas).

    :param x: feature vector (leading 1 for the intercept)
    :param betas: coefficient vector of the same length
    """
    bx = dot(x, betas)
    # Guard against math.exp overflow for very large |x . betas|: beyond
    # |bx| = 35 the sigmoid already equals 1.0 (or 0.0) to double
    # precision.  Mirrors the saturation in Pr_bx below.
    if bx > 35:
        return 1.0
    if bx < -35:
        return 0.0
    k = math.exp(bx)
    return k / (1+k)
529
def lh(x,y,betas):
    """Log-likelihood of labels `y` for rows `x` under coefficients
    `betas`; probabilities are clipped away from 0 and 1 before log."""
    llh = 0.0
    for i, row in enumerate(x):
        pr = Pr(row, betas)
        pos = max(pr, 1e-6)       # clip to avoid log(0)
        neg = max(1 - pr, 1e-6)
        llh += y[i] * log(pos) + (1 - y[i]) * log(neg)
    return llh
536
537
def diag(vector):
    """Return a square float matrix with `vector` on the main diagonal.

    Note: numpy.diag provides the same functionality; this helper is kept
    because this module's `from numpy import *` would otherwise shadow it.
    """
    # BUG FIX: `Float` is a Numeric-era dtype name that numpy no longer
    # exports, so identity(..., Float) raised NameError; use builtin float.
    mat = identity(len(vector), float)
    for i, v in enumerate(vector):
        mat[i][i] = v
    return mat
543
class simpleFitter(Orange.core.LogRegFitter):
    """Iteratively reweighted least squares (IRLS) logistic fitter with an
    optional ridge penalty.

    NOTE(review): relies on Numeric-era names (`Float`, `inverse`,
    `matrixmultiply`) that modern numpy does not export via
    ``from numpy import *`` -- confirm the numpy version this targets.
    """

    def __init__(self, penalty=0, se_penalty = False):
        # penalty: ridge coefficient added to the Hessian diagonal
        # se_penalty: if True, scale each penalty term by the coefficient's
        # standard error estimated from the unpenalized information matrix
        self.penalty = penalty
        self.se_penalty = se_penalty

    def __call__(self, data, weight=0):
        """Fit betas on `data`; return (OK, beta, beta_se, 0)."""
        ml = data.native(0)
        # encode discrete attribute values and the class as value indices
        for i in range(len(data.domain.attributes)):
          a = data.domain.attributes[i]
          if a.varType == Orange.core.VarTypes.Discrete:
            for m in ml:
              m[i] = a.values.index(m[i])
        for m in ml:
          m[-1] = data.domain.classVar.values.index(m[-1])
        Xtmp = array(ml)
        y = Xtmp[:,-1]   # true probabilities (1's or 0's)
        one = reshape(array([1]*len(data)), (len(data),1)) # intercept column
        X=concatenate((one, Xtmp[:,:-1]),1)  # intercept first, then data

        betas = array([0.0] * (len(data.domain.attributes)+1))
        oldBetas = array([1.0] * (len(data.domain.attributes)+1))
        N = len(data)

        pen_matrix = array([self.penalty] * (len(data.domain.attributes)+1))
        if self.se_penalty:
            # scale the penalties by the standard errors of the null model
            p = array([Pr(X[i], betas) for i in range(len(data))])
            W = identity(len(data), Float)
            pp = p * (1.0-p)
            for i in range(N):
                W[i,i] = pp[i]
            se = sqrt(diagonal(inverse(matrixmultiply(transpose(X), matrixmultiply(W, X)))))
            for i,p in enumerate(pen_matrix):
                pen_matrix[i] *= se[i]
        # Newton-Raphson / IRLS iterations until the penalized
        # log-likelihood stabilizes
        likelihood = 0.
        likelihood_new = 1.
        while abs(likelihood - likelihood_new)>1e-5:
            likelihood = likelihood_new
            oldBetas = betas
            p = array([Pr(X[i], betas) for i in range(len(data))])

            # W = diag(p * (1-p)), the IRLS weight matrix
            W = identity(len(data), Float)
            pp = p * (1.0-p)
            for i in range(N):
                W[i,i] = pp[i]

            WI = inverse(W)
            z = matrixmultiply(X, betas) + matrixmultiply(WI, y - p)

            # Newton step: (X'WX + diag(pen))^-1 X'(y - p)
            tmpA = inverse(matrixmultiply(transpose(X), matrixmultiply(W, X))+diag(pen_matrix))
            tmpB = matrixmultiply(transpose(X), y-p)
            betas = oldBetas + matrixmultiply(tmpA,tmpB)
            likelihood_new = lh(X,y,betas)-self.penalty*sum([b*b for b in betas])
            print likelihood_new

        # standard errors from the diagonal of the inverse information matrix
        p = array([Pr(X[i], betas) for i in range(len(data))])
        W = identity(len(data), Float)
        pp = p * (1.0-p)
        for i in range(N):
            W[i,i] = pp[i]
        diXWX = sqrt(diagonal(inverse(matrixmultiply(transpose(X), matrixmultiply(W, X)))))
        xTemp = matrixmultiply(matrixmultiply(inverse(matrixmultiply(transpose(X), matrixmultiply(W, X))),transpose(X)),y)
        beta = []
        beta_se = []
        print "likelihood ridge", likelihood
        for i in range(len(betas)):
            beta.append(betas[i])
            beta_se.append(diXWX[i])
        return (self.OK, beta, beta_se, 0)
623
def Pr_bx(bx):
    """Numerically safe sigmoid of a scalar linear score `bx`."""
    # saturate far in the tails to avoid exp() overflow
    if bx > 35:
        return 1
    if bx < -35:
        return 0
    e = exp(bx)
    return e / (1 + e)
630
class bayesianFitter(Orange.core.LogRegFitter):
    """Coordinate-wise (one beta at a time) logistic regression fitter
    with optional anchor examples and an L1-style `tau` penalty on the
    non-intercept coefficients."""

    def __init__(self, penalty=0, anch_examples=[], tau = 0):
        # NOTE(review): the mutable default for anch_examples is shared
        # across instances; it is only iterated here, but confirm before
        # ever mutating it.
        self.penalty = penalty
        self.anch_examples = anch_examples
        self.tau = tau

    def createArrayData(self,data):
        """Convert an example table to (X, y): X has an intercept column
        of ones followed by the attribute columns (discrete values as
        their value index), y is the class value index."""
        if not len(data):
            return (array([]),array([]))
        # convert data to numeric
        ml = data.native(0)
        for i,a in enumerate(data.domain.attributes):
          if a.varType == Orange.core.VarTypes.Discrete:
            for m in ml:
              m[i] = a.values.index(m[i])
        for m in ml:
          m[-1] = data.domain.classVar.values.index(m[-1])
        Xtmp = array(ml)
        y = Xtmp[:,-1]   # true probabilities (1's or 0's)
        one = reshape(array([1]*len(data)), (len(data),1)) # intercept column
        X=concatenate((one, Xtmp[:,:-1]),1)  # intercept first, then data
        return (X,y)

    def __call__(self, data, weight=0):
        """Fit betas on `data`; return (OK, beta, beta_se, 0) with all
        standard errors set to 0 (not estimated by this fitter)."""
        (X,y)=self.createArrayData(data)

        # collect the anchor examples into a single table
        exTable = Orange.core.ExampleTable(data.domain)
        for id,ex in self.anch_examples:
            exTable.extend(Orange.core.ExampleTable(ex,data.domain))
        (X_anch,y_anch)=self.createArrayData(exTable)

        betas = array([0.0] * (len(data.domain.attributes)+1))

        likelihood,betas = self.estimateBeta(X,y,betas,[0]*(len(betas)),X_anch,y_anch)

        # get attribute groups atGroup = [(startIndex, number of values), ...)
        # (a continuized discrete attribute expands into several adjacent
        # columns that share one original variable via getValueFrom)
        ats = data.domain.attributes
        atVec=reduce(lambda x,y: x+[(y,not y==x[-1][0])], [a.getValueFrom and a.getValueFrom.whichVar or a for a in ats],[(ats[0].getValueFrom and ats[0].getValueFrom.whichVar or ats[0],0)])[1:]
        atGroup=[[0,0]]
        for v_i,v in enumerate(atVec):
            if v[1]==0: atGroup[-1][1]+=1
            else:       atGroup.append([v_i,1])

        # compute zero values for attributes: refit with each attribute
        # group removed (its betas frozen out) and accumulate the shift of
        # the intercept
        sumB = 0.
        for ag in atGroup:
            X_temp = concatenate((X[:,:ag[0]+1],X[:,ag[0]+1+ag[1]:]),1)
            # NOTE(review): truth-testing a numpy array is ambiguous for
            # arrays with more than one element -- confirm X_anch is always
            # empty or 1-element when this branch is taken.
            if X_anch:
                X_anch_temp = concatenate((X_anch[:,:ag[0]+1],X_anch[:,ag[0]+1+ag[1]:]),1)
            else: X_anch_temp = X_anch
            likelihood_temp,betas_temp=self.estimateBeta(X_temp,y,concatenate((betas[:ag[0]+1],betas[ag[0]+ag[1]+1:])),[0]+[1]*(len(betas)-1-ag[1]),X_anch_temp,y_anch)
            print "finBetas", betas, betas_temp
            print "betas", betas[0], betas_temp[0]
            sumB += betas[0]-betas_temp[0]
        apriori = Orange.core.Distribution(data.domain.classVar, data)
        aprioriProb = apriori[0]/apriori.abs

        print "koncni rezultat", sumB, math.log((1-aprioriProb)/aprioriProb), betas[0]

        beta = []
        beta_se = []
        print "likelihood2", likelihood
        for i in range(len(betas)):
            beta.append(betas[i])
            # standard errors are not estimated by this fitter
            beta_se.append(0.0)
        return (self.OK, beta, beta_se, 0)

    def estimateBeta(self,X,y,betas,const_betas,X_anch,y_anch):
        """One-coefficient-at-a-time ascent on the penalized
        log-likelihood; const_betas[j] == 1 freezes coefficient j.
        Returns (likelihood, betas)."""
        N,N_anch = len(y),len(y_anch)
        r,r_anch = array([dot(X[i], betas) for i in range(N)]),\
                   array([dot(X_anch[i], betas) for i in range(N_anch)])
        p    = array([Pr_bx(ri) for ri in r])
        X_sq = X*X

        # per-coefficient trust-region radii
        max_delta      = [1.]*len(const_betas)
        likelihood     = -1.e+10
        likelihood_new = -1.e+9
        while abs(likelihood - likelihood_new)>0.01 and max(max_delta)>0.01:
            likelihood = likelihood_new
            print likelihood
            betas_temp = [b for b in betas]
            for j in range(len(betas)):
                if const_betas[j]: continue
                # first derivative of the log-likelihood w.r.t. beta_j,
                # including the anchor-example contribution
                dl = matrixmultiply(X[:,j],transpose(y-p))
                for xi,x in enumerate(X_anch):
                    dl += self.penalty*x[j]*(y_anch[xi] - Pr_bx(r_anch[xi]*self.penalty))

                # second derivative (information) for beta_j
                ddl = matrixmultiply(X_sq[:,j],transpose(p*(1-p)))
                for xi,x in enumerate(X_anch):
                    ddl += self.penalty*x[j]*Pr_bx(r[xi]*self.penalty)*(1-Pr_bx(r[xi]*self.penalty))

                if j==0:
                    # the intercept is never tau-penalized
                    dv = dl/max(ddl,1e-6)
                elif betas[j] == 0: # special handling due to non-defined first and second derivatives
                    dv = (dl-self.tau)/max(ddl,1e-6)
                    if dv < 0:
                        dv = (dl+self.tau)/max(ddl,1e-6)
                        if dv > 0:
                            dv = 0
                else:
                    dl -= sign(betas[j])*self.tau
                    dv = dl/max(ddl,1e-6)
                    # do not let the L1 step push the coefficient past 0
                    if not sign(betas[j] + dv) == sign(betas[j]):
                        dv = -betas[j]
                # clamp the step to the trust region, then update the
                # linear scores and probabilities in place
                dv = min(max(dv,-max_delta[j]),max_delta[j])
                r+= X[:,j]*dv
                p = array([Pr_bx(ri) for ri in r])
                if N_anch:
                    r_anch+=X_anch[:,j]*dv
                betas[j] += dv
                max_delta[j] = max(2*abs(dv),max_delta[j]/2)
            likelihood_new = lh(X,y,betas)
            for xi,x in enumerate(X_anch):
                try:
                    likelihood_new += y_anch[xi]*r_anch[xi]*self.penalty-log(1+exp(r_anch[xi]*self.penalty))
                except:
                    # exp overflow: fall back to the linear tail of the
                    # log-sigmoid
                    likelihood_new += r_anch[xi]*self.penalty*(y_anch[xi]-1)
            likelihood_new -= sum([abs(b) for b in betas[1:]])*self.tau
            if likelihood_new < likelihood:
                # the sweep made things worse: shrink the trust region and
                # revert to the previous betas
                max_delta = [md/4 for md in max_delta]
                likelihood_new = likelihood
                likelihood = likelihood_new + 1.
                betas = [b for b in betas_temp]
        print "betas", betas
        print "init_like", likelihood_new
        print "pure_like", lh(X,y,betas)
        return (likelihood,betas)
762
763############################################################
764#  Feature subset selection for logistic regression
765
766
def StepWiseFSS(table=None, **kwds):
    """Implementation of algorithm described in [Hosmer and Lemeshow, Applied Logistic Regression, 2000].

    If :obj:`table` is specified, stepwise logistic regression implemented
    in :obj:`StepWiseFSS_class` is performed and a list of chosen features
    is returned. If :obj:`table` is not specified an instance of
    :obj:`StepWiseFSS_class` with all parameters set is returned.

    :param table: data set
    :type table: Orange.data.Table

    :param addCrit: "Alpha" level to judge if variable has enough importance to be added in the new set. (e.g. if addCrit is 0.2, then features is added if its P is lower than 0.2)
    :type addCrit: float

    :param deleteCrit: Similar to addCrit, just that it is used at backward elimination. It should be higher than addCrit!
    :type deleteCrit: float

    :param numFeatures: maximum number of selected features, use -1 for infinity.
    :type numFeatures: int
    :rtype: :obj:`StepWiseFSS_class` or list of features

    """
    # construct the configured selector; apply() is obsolete, call directly
    fss = StepWiseFSS_class(**kwds)
    if table is not None:
        return fss(table)
    else:
        return fss
807
def getLikelihood(fitter, examples):
    """Fit *examples* with *fitter* and return the model's log-likelihood.

    A large negative surrogate (-100 per example) is returned when the fit
    did not finish with status ``fitter.OK`` (Infinity/Divergence statuses
    are deliberately not accepted) or when the coefficients are dominated
    by their standard errors, i.e. the fit is unreliable.
    """
    fallback = -100 * len(examples)
    result = fitter(examples)
    if result[0] not in [fitter.OK]:  # could also accept Infinity/Divergence
        return fallback
    status, beta, beta_se, likelihood = result
    coef_magnitude = sum(abs(b) for b in beta)
    error_magnitude = sum(abs(e) for e in beta_se)
    if coef_magnitude < error_magnitude:
        return fallback
    return likelihood
817
818
819
class StepWiseFSS_class(Orange.core.Learner):
  """ Perform stepwise logistic regression and return a list of the
  most "informative" features. Each step of the algorithm is composed
  of two parts. The first is backward elimination, where each already
  chosen feature is tested for a significant contribution to the overall
  model. If the worst among all tested features has higher significance
  than is specified in :obj:`deleteCrit`, the feature is removed from
  the model. The second step is forward selection, which is similar to
  backward elimination. It loops through all the features that are not
  in the model and tests whether they contribute to the common model
  with significance lower that :obj:`addCrit`. The algorithm stops when
  no feature in the model is to be removed and no feature not in the
  model is to be added. By setting :obj:`numFeatures` larger than -1,
  the algorithm will stop its execution when the number of features in model
  exceeds that number.

  Significances are assesed via the likelihood ration chi-square
  test. Normal F test is not appropriate, because errors are assumed to
  follow a binomial distribution.

  """

  def __init__(self, addCrit=0.2, deleteCrit=0.3, numFeatures = -1, **kwds):
    self.__dict__.update(kwds)
    self.addCrit = addCrit          # "alpha" level for forward selection
    self.deleteCrit = deleteCrit    # "alpha" level for backward elimination
    self.numFeatures = numFeatures  # upper bound on model size; -1 = unlimited

  def __call__(self, examples):
    """Run stepwise selection on :obj:`examples`; return the chosen features."""
    if getattr(self, "imputer", 0):
        examples = self.imputer(examples)(examples)
    if getattr(self, "removeMissing", 0):
        examples = Orange.core.Preprocessor_dropMissing(examples)
    continuizer = Orange.core.DomainContinuizer(zeroBased=1,continuousTreatment=Orange.core.DomainContinuizer.Leave,
                                           multinomialTreatment = Orange.core.DomainContinuizer.FrequentIsBase,
                                           classTreatment = Orange.core.DomainContinuizer.Ignore)
    attr = []                                    # features currently in the model
    remain_attr = examples.domain.attributes[:]  # candidate features

    # baseline: log-likelihood of the empty model (majority learner)
    tempDomain = Orange.core.Domain(attr,examples.domain.classVar)
    tempData  = Orange.core.Preprocessor_dropMissing(examples.select(tempDomain))

    ll_Old = getLikelihood(Orange.core.LogRegFitter_Cholesky(), tempData)
    ll_Best = -1000000
    length_Old = float(len(tempData))

    stop = 0
    while not stop:
        # LOOP until all variables are added or no further deletion nor addition of attribute is possible
        worstAt = None
        # backward elimination requires at least two attributes in the model
        if len(attr) >= 2:
            minG = 1000
            worstAt = attr[0]
            ll_Best = ll_Old
            length_Best = length_Old
            for at in attr:
                # check each attribute: does its presence increase LL enough?
                tempAttr = filter(lambda x: x!=at, attr)
                tempDomain = Orange.core.Domain(tempAttr,examples.domain.classVar)
                # continuize the reduced domain and refit; the LL drop gives
                # the significance of the removed attribute
                tempDomain  = continuizer(Orange.core.Preprocessor_dropMissing(examples.select(tempDomain)))
                tempData = Orange.core.Preprocessor_dropMissing(examples.select(tempDomain))

                ll_Delete = getLikelihood(Orange.core.LogRegFitter_Cholesky(), tempData)
                length_Delete = float(len(tempData))
                length_Avg = (length_Delete + length_Old)/2.0

                # likelihood-ratio statistic, normalized per example
                G=-2*length_Avg*(ll_Delete/length_Delete-ll_Old/length_Old)

                # track the least significant (worst) attribute
                if G<minG:
                    worstAt = at
                    minG=G
                    ll_Best = ll_Delete
                    length_Best = length_Delete

            # degrees of freedom: 1 for continuous, #values-1 for discrete
            if worstAt.varType==Orange.core.VarTypes.Continuous:
                P=lchisqprob(minG,1)
            else:
                P=lchisqprob(minG,len(worstAt.values)-1)
            if P>=self.deleteCrit:
                attr.remove(worstAt)
                remain_attr.append(worstAt)
                nodeletion=0
                ll_Old = ll_Best
                length_Old = length_Best
            else:
                nodeletion=1
        else:
            nodeletion = 1
            # END OF DELETION PART

        # if enough attributes have been chosen, disable forward selection
        if self.numFeatures>-1 and len(attr)>=self.numFeatures:
            remain_attr=[]

        # forward selection: try each remaining attribute
        maxG=-1
        ll_Best = ll_Old
        length_Best = length_Old
        bestAt = None
        for at in remain_attr:
            tempAttr = attr + [at]
            tempDomain = Orange.core.Domain(tempAttr,examples.domain.classVar)
            # continuize the extended domain and refit; the LL gain gives
            # the significance of the added attribute
            tempDomain  = continuizer(Orange.core.Preprocessor_dropMissing(examples.select(tempDomain)))
            tempData = Orange.core.Preprocessor_dropMissing(examples.select(tempDomain))
            ll_New = getLikelihood(Orange.core.LogRegFitter_Cholesky(), tempData)

            length_New = float(len(tempData)) # get number of examples in tempData to normalize likelihood

            # P=PR(CHI^2>G), G=-2(L(0)-L(1))=2(E(0)-E(1))
            length_avg = (length_New + length_Old)/2
            G=-2*length_avg*(ll_Old/length_Old-ll_New/length_New)
            if G>maxG:
                bestAt = at
                maxG=G
                ll_Best = ll_New
                length_Best = length_New
        if not bestAt:
            stop = 1
            continue

        if bestAt.varType==Orange.core.VarTypes.Continuous:
            P=lchisqprob(maxG,1)
        else:
            P=lchisqprob(maxG,len(bestAt.values)-1)
        # add the attribute only if it is significant at the addCrit level
        if P<=self.addCrit:
            attr.append(bestAt)
            remain_attr.remove(bestAt)
            ll_Old = ll_Best
            length_Old = length_Best

        # stop when nothing was removed and nothing significant can be added
        if (P>self.addCrit and nodeletion) or (bestAt == worstAt):
            stop = 1

    return attr
965
966
def StepWiseFSS_Filter(examples = None, **kwds):
    """
    Convenience wrapper around :obj:`StepWiseFSS_Filter_class`; see
    function :obj:`StepWiseFSS` for the meaning of the keyword arguments.

    If :obj:`examples` is given, the filter is applied immediately and the
    reduced data set is returned; otherwise the configured filter instance
    is returned.
    """
    # apply() is obsolete; also avoid shadowing the builtin 'filter'
    fss_filter = StepWiseFSS_Filter_class(**kwds)
    if examples is not None:
        return fss_filter(examples)
    else:
        return fss_filter
977
978
class StepWiseFSS_Filter_class(object):
    """Filter that reduces a data set to the class variable plus the
    features chosen by stepwise logistic regression (:obj:`StepWiseFSS`)."""

    def __init__(self, addCrit=0.2, deleteCrit=0.3, numFeatures = -1):
        # significance threshold for adding a feature (forward step)
        self.addCrit = addCrit
        # significance threshold for removing a feature (backward step)
        self.deleteCrit = deleteCrit
        # maximum number of selected features; -1 means no limit
        self.numFeatures = numFeatures

    def __call__(self, examples):
        # run the stepwise selection, then project the data onto the
        # selected features and the class variable
        attr = StepWiseFSS(examples, addCrit=self.addCrit,
                           deleteCrit=self.deleteCrit,
                           numFeatures=self.numFeatures)
        return examples.select(Orange.core.Domain(attr, examples.domain.classVar))
987
988
989####################################
990##  PROBABILITY CALCULATIONS
991
992def lchisqprob(chisq,df):
993    """
994    Return the (1-tailed) probability value associated with the provided
995    chi-square value and df.  Adapted from chisq.c in Gary Perlman's |Stat.
996    """
997    BIG = 20.0
998    def ex(x):
999        BIG = 20.0
1000        if x < -BIG:
1001            return 0.0
1002        else:
1003            return math.exp(x)
1004    if chisq <=0 or df < 1:
1005        return 1.0
1006    a = 0.5 * chisq
1007    if df%2 == 0:
1008        even = 1
1009    else:
1010        even = 0
1011    if df > 1:
1012        y = ex(-a)
1013    if even:
1014        s = y
1015    else:
1016        s = 2.0 * zprob(-math.sqrt(chisq))
1017    if (df > 2):
1018        chisq = 0.5 * (df - 1.0)
1019        if even:
1020            z = 1.0
1021        else:
1022            z = 0.5
1023        if a > BIG:
1024            if even:
1025                e = 0.0
1026            else:
1027                e = math.log(math.sqrt(math.pi))
1028            c = math.log(a)
1029            while (z <= chisq):
1030                e = math.log(z) + e
1031                s = s + ex(c*z-a-e)
1032                z = z + 1.0
1033            return s
1034        else:
1035            if even:
1036                e = 1.0
1037            else:
1038                e = 1.0 / math.sqrt(math.pi) / math.sqrt(a)
1039            c = 0.0
1040            while (z <= chisq):
1041                e = e * (a/float(z))
1042                c = c + e
1043                z = z + 1.0
1044            return (c*y+s)
1045    else:
1046        return s
1047
1048
def zprob(z):
    """
    Returns the area under the normal curve 'to the left of' the given z value.
    Thus::

    for z<0, zprob(z) = 1-tail probability
    for z>0, 1.0-zprob(z) = 1-tail probability
    for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability

    Adapted from z.c in Gary Perlman's |Stat.
    """
    # Fix: the branch bodies below had lost their indentation, which made
    # this function a syntax error; nesting restored per the z.c original.
    Z_MAX = 6.0    # maximum meaningful z-value
    if z == 0.0:
        x = 0.0
    else:
        y = 0.5 * math.fabs(z)
        if y >= (Z_MAX*0.5):
            # beyond +/-6 sigma the tail is treated as zero
            x = 1.0
        elif (y < 1.0):
            # polynomial approximation for |z| < 2
            w = y*y
            x = ((((((((0.000124818987 * w
                -0.001075204047) * w +0.005198775019) * w
                  -0.019198292004) * w +0.059054035642) * w
                -0.151968751364) * w +0.319152932694) * w
              -0.531923007300) * w +0.797884560593) * y * 2.0
        else:
            # polynomial approximation for 2 <= |z| < 6
            y = y - 2.0
            x = (((((((((((((-0.000045255659 * y
                     +0.000152529290) * y -0.000019538132) * y
                   -0.000676904986) * y +0.001390604284) * y
                 -0.000794620820) * y -0.002034254874) * y
                   +0.006549791214) * y -0.010557625006) * y
                 +0.011630447319) * y -0.009279453341) * y
               +0.005353579108) * y -0.002141268741) * y
             +0.000535310849) * y +0.999936657524
    # map the symmetric-interval area x onto the left-tail probability
    if z > 0.0:
        prob = ((x+1.0)*0.5)
    else:
        prob = ((1.0-x)*0.5)
    return prob
1089
1090
# Note: See TracBrowser for help on using the repository browser.