source: orange-bioinformatics/_bioinformatics/obiGsea.py @ 1711:643d37885055

Revision 1711:643d37885055, 32.7 KB checked in by markotoplak, 20 months ago

obiGeneSetSig: speedup of ASSESS.

[1632]1from __future__ import absolute_import
2
3from collections import defaultdict
[188]4import random
5import time
[1632]6
7import numpy
8
9import orange
[1705]10import Orange
[1632]11
12from . import obiGeneSets
13from .obiExpression import *
[188]14
15"""
16Gene set enrichment analysis.
17
18Author: Marko Toplak
19"""
20
[411]21def iset(data):
22    """
23    Is data orange.ExampleTable?
24    """
25    return isinstance(data, orange.ExampleTable)
26
27def issequencens(x):
28    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and is not string."
29    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
[395]30
[188]31def mean(l):
32    return float(sum(l))/len(l)
33
34def rankingFromOrangeMeas(meas):
35    """
36    Creates a function that sequentially ranks all attributes and returns
37    the results in a list. The ranking function is built from an
38    orange.MeasureAttribute.
39    """
40    return lambda d: [ meas(i,d) for i in range(len(d.domain.attributes)) ]
41
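# --- Illustrative sketch (not part of the original file) ---------------------
# How a ranking function is typically built and applied: MA_signalToNoise from
# obiExpression (imported above with *) scores each attribute against the
# class; the wrapper turns it into a "data -> list of scores" function.
def _example_ranking(data):
    rankingf = rankingFromOrangeMeas(MA_signalToNoise())
    return rankingf(data)  # one score per attribute of the orange.ExampleTable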
42def orderedPointersCorr(lcor):
43    """
44    Return a list of integers: indexes in original
45    lcor. Elements in the list are ordered by
46    their lcor[i] value. Higher correlations first.
47    """
48    ordered = [ (i,a) for i,a in enumerate(lcor) ] #original pos + correlation
[1711]49    ordered.sort(key=lambda x: -x[1]) #sort by correlation, descending
[188]50    ordered = nth(ordered, 0) #contains positions in the original list
51    return ordered
52
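# Illustrative sketch (not part of the original file): with made-up
# correlations the attribute with the highest value comes first in the
# returned index list.
def _example_ordered_pointers():
    lcor = [0.2, -0.5, 0.9]            # hypothetical per-attribute correlations
    return orderedPointersCorr(lcor)   # -> [2, 0, 1]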
53def enrichmentScoreRanked(subset, lcor, ordered, p=1.0, rev2=None):
54    """
55    Compute the enrichment score of one gene set on ranked input data.
56   
57    subset: list of attribute indices of the input data belonging
58        to the same set.
59    lcor: correlations with class for each attribute in a list.
60
61    Returns enrichment score on given data.
[414]62
63    This implementation efficiently handles "sparse" genesets (that
64    cover only a small subset of all genes in the dataset).
[188]65    """
66
[414]67    #print lcor
68
[188]69    subset = set(subset)
70
71    if rev2 == None:
72        def rev(l):
73            return numpy.argsort(l)
74        rev2 = rev(ordered)
75
76    #amount added when a gene is not in the subset
77    notInA = -(1. / (len(lcor)-len(subset)))
78    #base for the amount added when a gene is in the subset
[1711]79
80    cors = [ abs(lcor[i])**p for i in subset ] #below in numpy
[188]81    sumcors = sum(cors)
82
83    #this should not happen
84    if sumcors == 0.0:
85        return (0.0, None)
86   
87    inAb = 1./sumcors
88
89    ess = [0.0]
90   
91    map = {}
92    for i in subset:
93        orderedpos = rev2[i]
94        map[orderedpos] = inAb*abs(lcor[i]**p)
95       
96    last = 0
97
98    maxSum = minSum = csum = 0.0
99
100    for a,b in sorted(map.items()):
101        diff = a-last
102        csum += notInA*diff
103        last = a+1
104       
105        if csum < minSum:
106            minSum = csum
107       
108        csum += b
109
110        if csum > maxSum:
111            maxSum = csum
112
113    #finish it
114    diff = (len(ordered))-last
115    csum += notInA*diff
[485]116
[188]117    if csum < minSum:
118        minSum = csum
119
120    #print "MY", (maxSum if abs(maxSum) > abs(minSum) else minSum)
121
122    """
[411]123    #BY DEFINITION
[188]124    print "subset", subset
125
126    for i in ordered:
127        ess.append(ess[-1] + \
128            (inAb*abs(lcor[i]**p) if i in subset else notInA)
129        )
130        if i in subset:
131            print ess[-2], ess[-1]
132            print i, (inAb*abs(lcor[i]**p))
133
134    maxEs = max(ess)
135    minEs = min(ess)
136   
137    print "REAL", (maxEs if abs(maxEs) > abs(minEs) else minEs, ess[1:])
138
139    """
140    return (maxSum if abs(maxSum) > abs(minSum) else minSum, [])
141
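# Illustrative sketch (not part of the original file): the ordered pointers and
# their reverse permutation are computed once from the ranking and then reused
# for every gene set; the correlations and the gene set below are made up.
def _example_enrichment_score():
    lcor = [0.9, -0.2, 0.5, -0.7, 0.1]    # hypothetical correlations
    ordered = orderedPointersCorr(lcor)   # attribute indices, best ranked first
    rev2 = numpy.argsort(ordered)         # position of each index in `ordered`
    es, _ = enrichmentScoreRanked([0, 2], lcor, ordered, rev2=rev2)
    return es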
142#from mOrngData
143def shuffleAttribute(data, attribute, locations):
144    """
145    Destructive!
146    """
147    attribute = data.domain[attribute]
148    l = [None]*len(data)
149    for i in range(len(data)):
150        l[locations[i]] = data[i][attribute]
151    for i in range(len(data)):
152        data[i][attribute] = l[i]
153
[411]154def shuffleClass(datai, rands=0):
[188]155    """
156    Returns a dataset with the values of the class attribute randomly shuffled.
[411]157    If multiple datasets are given, shuffle them all with the same random seed.
[188]158    """
[411]159    def shuffleOne(data):
160        rand = random.Random(rands)
161        d2 = orange.ExampleTable(data.domain, data)
162        locations = range(len(data))
163        rand.shuffle(locations)
164        shuffleAttribute(d2, d2.domain.classVar, locations)
165        return d2
166
167    if iset(datai):
168        return shuffleOne(datai)
169    else:
170        return [ shuffleOne(data) for data in datai ]
[188]171
172def shuffleList(l, rand=random.Random(0)):
173    """
174    Returns a shuffled copy of the input list.
175    """
176    import copy
177    l2 = copy.copy(l)
178    rand.shuffle(l2)
179    return l2
180
181def shuffleAttributes(data, rand=random.Random(0)):
182    """
183    Returns a dataset with a new attribute order.
184    """
185    natts = shuffleList(list(data.domain.attributes), rand)
186    dom2 = orange.Domain(natts, data.domain.classVar)
187    d2 = orange.ExampleTable(dom2, data)
188    return d2
189
190def gseapval(es, esnull):
191    """
192    From article (PNAS):
193    estimate nominal p-value for S from esnull by using the positive
194    or negative portion of the distribution corresponding to the sign
195    of the observed ES(S).
196    """
197   
198    try:
199        if es < 0:
200            return float(len([ a for a in esnull if a <= es ]))/ \
201                len([ a for a in esnull if a < 0])   
202        else: 
203            return float(len([ a for a in esnull if a >= es ]))/ \
[414]204                len([ a for a in esnull if a >= 0])
[188]205    except:
206        return 1.0
207
208
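# Illustrative sketch (not part of the original file): a worked toy example of
# the nominal p-value. For a positive observed ES only the positive part of the
# null distribution is used; the null values below are made up.
def _example_gseapval():
    esnull = [-0.4, -0.1, 0.2, 0.3, 0.5, 0.6]
    return gseapval(0.45, esnull)   # 2 of the 4 non-negative nulls are >= 0.45 -> 0.5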
209def enrichmentScore(data, subset, rankingf):
210    """
211    Returns enrichment score and running enrichment score.
212    """
213    lcor = rankingf(data)
214    ordered = orderedPointersCorr(lcor)
215    es,l = enrichmentScoreRanked(subset, lcor, ordered)
216    return es,l
217
218def gseaE(data, subsets, rankingf=None, \
219        n=100, permutation="class", **kwargs):
220    """
221    Run GSEA algorithm on an example table.
222
223    data: orange example table.
224    subsets: list of gene sets, each given as a list of attribute indices.
225    rankingf: function that returns the correlation with the class for each
226        variable.
227    n: number of random permutations used to sample the null distribution.
228    permutation: "class" to permute class labels, otherwise permute the
229        attribute order.
230
231    """
232
233    if not rankingf:
234        rankingf=rankingFromOrangeMeas(MA_signalToNoise())
235
236    enrichmentScores = []
237 
238    lcor = rankingf(data)
[414]239    #print lcor
240
[188]241    ordered = orderedPointersCorr(lcor)
242
243    def rev(l):
244        return numpy.argsort(l)
245
246    rev2 = rev(ordered)
247
248    for subset in subsets:
249        es = enrichmentScoreRanked(subset, lcor, ordered, rev2=rev2)[0]
250        enrichmentScores.append(es)
251
252    runOptCallbacks(kwargs)
253
254    #print "PERMUTATION", permutation
255
256    enrichmentNulls = [ [] for a in range(len(subsets)) ]
257
258    for i in range(n):
259
260        if permutation == "class":
[411]261            d2 = shuffleClass(data, 2000+i) #fixed permutation
[188]262            r2 = rankingf(d2)
263
264        else:
265            r2 = shuffleList(lcor, random.Random(2000+i))
266
267        ordered2 = orderedPointersCorr(r2)
268        rev22 = rev(ordered2)
269        for si,subset in enumerate(subsets):
270            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
271            enrichmentNulls[si].append(esn)
272
273        runOptCallbacks(kwargs)
274
275    return gseaSignificance(enrichmentScores, enrichmentNulls)
276
277
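# Illustrative sketch (not part of the original file) of driving gseaE
# directly. The file name is a placeholder for any expression table with a
# discrete class; the index sets are made-up gene sets.
def _example_gseaE():
    data = orange.ExampleTable("expression_two_class.tab")  # hypothetical file
    subsets = [[0, 1, 2], [3, 4, 5, 6]]                     # attribute indices per gene set
    # one (ES, NES, p, FDR) tuple per gene set, in the same order as `subsets`
    return gseaE(data, subsets, n=100, permutation="class")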
278def runOptCallbacks(rargs):
279    if "callback" in rargs:
280        try:
281            [ a() for a in rargs["callback"] ]
282        except:
283            rargs["callback"]()
284           
285
286def gseaR(rankings, subsets, n=100, **kwargs):
287    """
288    """
289
290    if "permutation" in kwargs:
[395]291        if kwargs["permutation"] == "class":
292            raise Exception("Only gene permutation possible")
[188]293
294    enrichmentScores = []
295 
296    ordered = orderedPointersCorr(rankings)
297   
298    def rev(l):
299        return numpy.argsort(l)
300
301    rev2 = rev(ordered)
302
303    for subset in subsets:
304
305        es = enrichmentScoreRanked(subset, rankings, ordered, rev2=rev2)[0]
306        enrichmentScores.append(es)
307   
308    runOptCallbacks(kwargs)
309
310    enrichmentNulls = [ [] for a in range(len(subsets)) ]
311
312    for i in range(n):
313       
314        r2 = shuffleList(rankings, random.Random(2000+i))
315        ordered2 = orderedPointersCorr(r2)
316        rev22 = rev(ordered2)
317
318        for si,subset in enumerate(subsets):
319
320            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
321            enrichmentNulls[si].append(esn)
322
323        runOptCallbacks(kwargs)
324
325    return gseaSignificance(enrichmentScores, enrichmentNulls)
326
327
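# Illustrative sketch (not part of the original file): when only per-gene
# scores are available (e.g. a single log-ratio column), gseaR works on the
# ranking list directly and permutes genes; the numbers below are made up.
def _example_gseaR():
    rankings = [0.8, -1.2, 0.3, 2.1, -0.4]   # hypothetical differential scores
    subsets = [[0, 3], [1, 2, 4]]            # hypothetical gene sets (indices)
    return gseaR(rankings, subsets, n=50)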
328def gseaSignificance(enrichmentScores, enrichmentNulls):
329
[414]330    #print enrichmentScores
331
[580]332    import time
333
334    tb1 = time.time()
335
[188]336    enrichmentPVals = []
337    nEnrichmentScores = []
338    nEnrichmentNulls = []
339
340    for i in range(len(enrichmentScores)):
341        es = enrichmentScores[i]
342        enrNull = enrichmentNulls[i]
[414]343        #print es, enrNull
[188]344
345        enrichmentPVals.append(gseapval(es, enrNull))
346
347    #normalize the ES(S,pi) and the observed ES(S), separately rescaling
348    #the positive and negative scores by dividing by the mean of the
349        #ES(S,pi)
350
351        #print es, enrNull
352
353        def normalize(s):
354            try:
355                if s == 0:
356                    return 0.0
357                if s >= 0:
358                    meanPos = mean([a for a in enrNull if a >= 0])
359                    #print s, meanPos
360                    return s/meanPos
361                else:
362                    meanNeg = mean([a for a in enrNull if a < 0])
363                    #print s, meanNeg
364                    return -s/meanNeg
365            except:
366                return 0.0 #returned when the corresponding mean cannot be computed
367
368
369        nes = normalize(es)
370        nEnrichmentScores.append(nes)
371       
372        nenrNull = [ normalize(s) for s in enrNull ]
373        nEnrichmentNulls.append(nenrNull)
374 
[411]375
[580]376    #print "First part", time.time() - tb1
377
[411]378    #FDR computation
[188]379    #create a histogram of all NES(S,pi) over all S and pi
380    vals = reduce(lambda x,y: x+y, nEnrichmentNulls, [])
[411]381
[580]382
383    def shorten(l, p=10000):
384        """
385        Take every (len(l)/p)-th element, if len(l)/p >= 2.
386        """
387        e = len(l)/p
388        if e <= 1:
389            return l
390        else:
391            return [ l[i] for i in xrange(0, len(l), e) ]
392
393    #vals = shorten(vals) #TODO: this could speed up the second part; is it worthwhile?
394
[188]395    """
396    Use this null distribution to compute an FDR q value, for a given NES(S) =
397    NES* >= 0. The FDR is the ratio of the percantage of all (S,pi) with
398    NES(S,pi) >= 0, whose NES(S,pi) >= NES*, divided by the percentage of
399    observed S wih NES(S) >= 0, whose NES(S) >= NES*, and similarly if NES(S)
400    = NES* <= 0.
401    """
402
[411]403    nvals = numpy.array(sorted(vals))
404    nnes = numpy.array(sorted(nEnrichmentScores))
405
[580]406    #print "LEN VALS", len(vals), len(nEnrichmentScores)
407
[188]408    fdrs = []
409
410    import operator
411
412    for i in range(len(enrichmentScores)):
413
414        nes = nEnrichmentScores[i]
[411]415
416        """
417        #Straightforward but slow implementation follows in comments.
418        #Useful as a description of the code.
419       
[188]420        if nes >= 0:
421            op0 = operator.ge
422            opn = operator.ge
423        else:
424            op0 = operator.lt
425            opn = operator.le
426
427        allPos = [a for a in vals if op0(a,0)]
428        allHigherAndPos = [a for a in allPos if opn(a,nes) ]
429
430        nesPos = [a for a in nEnrichmentScores if op0(a,0) ]
431        nesHigherAndPos = [a for a in nesPos if opn(a,nes) ]
[411]432
433        top = len(allHigherAndPos)/float(len(allPos)) #p value
[188]434        down = len(nesHigherAndPos)/float(len(nesPos))
[411]435       
436        l1 = [ len(allPos), len(allHigherAndPos), len(nesPos), len(nesHigherAndPos)]
[188]437
[411]438        allPos = allHigherAndPos = nesPos =  nesHigherAndPos = 1
439
440        """
441
[580]442        #this could be sped up by a factor of two with the same accuracy!
[411]443        if nes >= 0:
444            allPos = int(len(vals) - numpy.searchsorted(nvals, 0, side="left"))
445            allHigherAndPos = int(len(vals) - numpy.searchsorted(nvals, nes, side="left"))
446            nesPos = len(nnes) - int(numpy.searchsorted(nnes, 0, side="left"))
447            nesHigherAndPos = len(nnes) - int(numpy.searchsorted(nnes, nes, side="left"))
448        else:
449            allPos = int(numpy.searchsorted(nvals, 0, side="left"))
450            allHigherAndPos = int(numpy.searchsorted(nvals, nes, side="right"))
451            nesPos = int(numpy.searchsorted(nnes, 0, side="left"))
452            nesHigherAndPos = int(numpy.searchsorted(nnes, nes, side="right"))
453           
454        """
455        #Comparing results
456        l2 = [ allPos, allHigherAndPos, nesPos, nesHigherAndPos ]
457        diffs = [ l1[i]-l2[i] for i in range(len(l1)) ]
458        sumd = sum( [ abs(a) for a in diffs ] )
459        if sumd > 0:
460            print nes > 0
461            print "orig", l1
462            print "modi", l2
463        """
464
465        try:
466            top = allHigherAndPos/float(allPos) #p value
467            down = nesHigherAndPos/float(nesPos)
468
469            fdrs.append(top/down)
470        except:
471            fdrs.append(1000000000.0)
[188]472   
[580]473    #print "Whole part", time.time() - tb1
474
[188]475    return zip(enrichmentScores, nEnrichmentScores, enrichmentPVals, fdrs)
476
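# Illustrative sketch (not part of the original file): gseaE/gseaR return one
# tuple per input gene set, in the input order:
# (ES, normalized ES, nominal p-value, FDR q-value).
def _example_print_significance(results):
    for es, nes, p, fdr in results:
        print "ES=%.3f NES=%.3f p=%.3f FDR=%.3f" % (es, nes, p, fdr)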
[1632]477from . import obiGene
[188]478
479def nth(l,n): return [ a[n] for a in l ]
480
[411]481def itOrFirst(data):
[721]482    """ Returns input if input is of type ExampleTable, else returns first
483    element of the input list """
[411]484    if iset(data):
485        return data
486    else:
487        return data[0]
488
[721]489def wrap_in_list(data):
490    """ Wraps orange.ExampleTable in a list """
491    if iset(data):
492        return [ data ]
493    else:
494        return data
495
[1706]496class transform_class(object):
497   
498    def __init__(self, cv, mapval, class_values, nclass):
499        self.cv = cv
500        self.mapval = mapval
501        self.class_values = class_values
502        self.nclass = nclass
503   
504    def __call__(self, ex, *args, **kwargs):
505        """
506        Removes unnecessary class values and joins them according
507        to function input.
508        """
509        if ex[self.cv] in self.class_values:
510            return self.nclass(self.mapval[str(ex[self.cv].value)])
511        else:
512            return "?"
513
[680]514def takeClasses(datai, classValues=None):
515    """
516    Joins the class groups specified in the input pair
517    classValues. Each element of the pair is a list of class
518    values to be joined into the first or the second class, grouping
[925]519    the classes into two new class values.
520    If class values are not specified, all the classes are taken.
[680]521
522    Input data can be a single data set or a list of data sets
523    with the same domain.
524
525    Returns the transformed data set / data sets.
526    """
527
528    cv = itOrFirst(datai).domain.classVar
529    nclassvalues = None
530
531    if cv and len(itOrFirst(datai)) > 1:
532        oldcvals = [ a for a in cv.values ]
533       
534        if not classValues:
[925]535            classValues = oldcvals
[680]536
537        toJoin = []
538
539        for vals in classValues:
540            if issequencens(vals):
541                toJoin.append(list(vals))
542            else:
543                toJoin.append([vals])
544
545        classValues = reduce(lambda x,y: x+y, toJoin)
546        classValues = [ str(a) for a in classValues ] # ok class values
547
548        #dictionary of old class -> new class
549        mapval = {}
550        nclassvalues = [] # need to preserve order
551
552        for joinvals in toJoin:
553            joinvalsn = "+".join([ str(val) for val in sorted(joinvals) ])
554            nclassvalues.append(joinvalsn)
555
556            for val in joinvals:
557                mapval[str(val)] = joinvalsn
558
559        #take only examples with classValues classes
560        nclass = orange.EnumVariable(cv.name, values=nclassvalues)
561
[1706]562        tco = transform_class(cv=cv,mapval=mapval,class_values=classValues, nclass=nclass)
[680]563
[1706]564        nclass.get_value_from = tco
[1705]565
566        ndom = orange.Domain(itOrFirst(datai).domain.attributes, nclass)
567
568        def removeAndTransformClasses(data):
569            ndata = Orange.data.Table(ndom, data)
570            ndata = Orange.data.Table(ndom, [ex for ex in ndata if ex[-1].value != "?"])
571            return ndata
[680]572
573        if iset(datai):
574            datai = removeAndTransformClasses(datai)
575        else:
576            datai = [ removeAndTransformClasses(data) for data in datai ]
577
578    return datai
579
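# Illustrative sketch (not part of the original file): class values "aml1" and
# "aml2" (made-up names) are merged into a single "aml1+aml2" class while "all"
# stays separate, yielding a two-class problem for the ranking measure.
def _example_take_classes(data):
    return takeClasses(data, classValues=[("aml1", "aml2"), "all"])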
580def removeBadAttributes(datai, atLeast=3):
581    """
[699]582    Removes attributes which would obscure GSEA analysis.
583
[680]584    Attributes need to be continuous and need to have
585    at least one known value; other attributes are removed.
586
587    For the attribute to be valid, it needs to have at least
588    [atLeast] values for every class value.
589
590    Return transformed data set / data sets and ignored attributes.
591    """
592
593    def attrOk(a, data):
[699]594        """
595        An attribute is ok if it is continuous and contains
596        at least atLeast known (non-missing) values.
597        """
[680]598
599        a = data.domain.attributes.index(a)
600
601        #non-continuous attributes can't be used
602        if data.domain.attributes[a].varType != orange.VarTypes.Continuous:
603            return False
604
[699]605        if len(data) == 1:
[680]606
607            vals = [ex[a].value for ex in data if not ex[a].isSpecial()]
608            if len(vals) < 1:
609                return False 
610       
611        if len(data) > 1 and data.domain.classVar and atLeast > 0:
[941]612           
613            dc = dict( (v, 0) for v in data.domain.classVar.values )
[680]614
[941]615            for ex in data:
616                if not ex[a].isSpecial():
617                    dc[ex[-1].value] += 1
618
619            minl = min(dc.values())
620
[680]621            if minl < atLeast:
622                #print "Less than atLeast"
623                return False
624
625        return True
626   
627
628    def notOkAttributes(data):
629        ignored = []
630        for a in data.domain.attributes:
631            if not attrOk(a, data):
632                #print "Removing", a
633                ignored.append(a)
634        return ignored
635   
636    ignored = []
637    if iset(datai):
638        ignored = set(notOkAttributes(datai))
639    else:
640        #ignore any attribute which has less than atLeast values for each class
641        #ignored = set(reduce(lambda x,y: x+y, [ notOkAttributes(data) for data in datai ]))
642
643        #remove any attribute which is ok in less than half of the datasets
644        ignored = []
[721]645        for a in itOrFirst(datai).domain.attributes:
[680]646            attrOks = sum([ attrOk(a, data) for data in datai ])
647            if attrOks < len(datai)/2:
648                ignored.append(a)
649
650
651    natts = [ a for a in itOrFirst(datai).domain.attributes if a not in ignored ]
652    #print ignored, natts, set(ignored) & set(natts)
653
654    ndom = orange.Domain(natts, itOrFirst(datai).domain.classVar)
655
656    datao = None
657    if iset(datai):
658        datao = orange.ExampleTable(ndom, datai)
659    else:
660        datao = [ orange.ExampleTable(ndom, data) for data in datai ]
661
662    return datao, ignored
663
664def keepOnlyMeanAttrs(datai, atLeast=3, classValues=None):
665    """
666    Attributes need to be continuous and need to have
667    at least one known value.
668
669    For an attribute to be valid, it needs to have at least
670    [atLeast] values for every class value.
671
672    Keeps only the specified classes and groups them into two class values.
[699]673    """   
[680]674    datai = takeClasses(datai, classValues=classValues)
675    return removeBadAttributes(datai, atLeast=atLeast)
676
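# Illustrative sketch (not part of the original file): the usual pre-filtering
# step before GSEA - keep continuous attributes with at least `atLeast` known
# values per class; the returned `ignored` holds the removed attributes.
def _example_prefilter(data):
    filtered, ignored = keepOnlyMeanAttrs(data, atLeast=3)
    return filtered, ignored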
[705]677def data_single_meas_column(data):
678    """
679    Returns true if data seems to be in one column
680    (float variables) only. This column should contain
681    the rankings.
682    """
683    columns = [a for a in data.domain] +  [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
684    floatvars = [ a for a in columns if a.varType == orange.VarTypes.Continuous ]
685    if len(floatvars) == 1:
686        return True
687    else:
688        return False
689
[934]690def transform_data(data, phenVar, geneVar):
[705]691    """
692    If we have log2 ratios in a single value column, transpose the matrix,
693    i.e. we have a single column with a continuous variable; the first
694    string variable then becomes the gene name.
[934]695
696    The goal is to have the different phenotypes annotated with a class
697    and the gene names as attribute names.
698
[936]699    If phenVar is False, the input already consists of scores of
700    differential expression and no phenotype is used.
701
[934]702    If we have a single column, transpose it.
703    If phenVar is one of the groups, transpose the matrix.
[705]704    """
705
[936]706    def prepare_data(data, phenVar=None, geneVar=None):
[705]707
[936]708        def rorq(a, name):
709            """ Group annatation or question mark. """
710            try: 
711                return a.attributes[name]
712            except: 
713                return '?'
714   
715        #use class as phenotype by default, if it is present,
716        #if not, do not use any phenotype!
717        if phenVar == None: 
718            if not data.domain.classVar:
719                phenVar = False
720            else:
721                phenVar = data.domain.classVar
[721]722
[934]723
[936]724        #TODO validate phenVar and geneVar?
725        #TODO autodetection of groups?
726
727        #transpose is not needed if phenVar is classVar or phenVar is False
728        #and there is only one sample
729        if phenVar == data.domain.classVar or \
730            (phenVar == False and len(data) == 1):
731
732            if geneVar == None: #if not specified, set as true in this stage
733                geneVar = True
734
735            floatvars = [ a for a in data.domain.attributes \
736                if a.varType == orange.VarTypes.Continuous ]
737
738            #rename attributes without touching the original variable
739            if geneVar != True:
740                fl2 = []
741
742                for a in floatvars:
743                    na = orange.FloatVariable(name=rorq(a, geneVar))
744                    na.getValueFrom = lambda e, rw: e[a]
745                    fl2.append(na)
746
747                floatvars = fl2
748
749            dom = orange.Domain(floatvars, phenVar)
750            return orange.ExampleTable(dom, data)
751
752        elif phenVar == False or phenVar != data.domain.classVar:
753
754            cands = allgroups(data)
755            pv = False
756            if phenVar != False:
757                pv = orange.EnumVariable(name="phenotype", 
758                    values=list(cands[phenVar]))
759
760            #take the only string attribute as a gene name
761            gc = gene_cands(data, False)
762            if geneVar == None:
763                if len(gc) == 1:
764                    geneVar = gc[0]
765                else:
766                    geneNamesUnspecifiedError()
767           
768            latts = [ orange.FloatVariable(name=ex[geneVar].value) \
769                for ex in data ]
770
771            domain = orange.Domain(latts, pv)
772
773            examples = []
774            for at in data.domain.attributes:
775                if at.varType == orange.VarTypes.Continuous:
776                    vals = [ ex[at].value for ex in data ]
777                    if pv != False: #add class value
778                        vals.append(rorq(at, phenVar))
779                    examples.append(orange.Example(domain, vals))
780
781            return orange.ExampleTable(domain, examples)
782        else:
783            wrongInputsError()
784
785    #transform all example tables
[721]786    single = iset(data)
[936]787    transposed = [ prepare_data(d, phenVar, geneVar) for d in wrap_in_list(data) ]
[721]788
789    if single:
790        return transposed[0]
791    else:
792        return transposed
793
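# Illustrative sketch (not part of the original file): when samples are stored
# as columns annotated with a group label and genes as rows named by a string
# variable, transform_data transposes the table so that genes become attributes
# and the group becomes the class. The descriptor names "group" and "gene" are
# assumptions about the input annotations, not fixed names.
def _example_transform(data):
    return transform_data(data, phenVar="group", geneVar="gene")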
[705]794
[934]795def allgroups(data):
796    """
797    Return all phenotype descriptors of attributes with their values.
798    """
799    sd = defaultdict(set)
800    for attr in data.domain.attributes:
801        for key, value in attr.attributes.items():
802            sd[key].add(value)
803    return sd
804
[940]805def already_have_correlations(data):
806    return len(data) == 1 or data_single_meas_column(data)
807
808def need_to_transpose_single(data):
809    if len(data) == 1:
810        return False
811    else:
812        return True
813
[934]814def phenotype_cands(data):
815    """
816    Return all phenotype candidate descriptors in a list of tuples
817    (variable, values). Candidates are the class variable, if it exists, and
818    the keys of the attributes dictionaries of the attributes.
819    Phenotype candidates must contain at least two different values.
820    """
[940]821    if already_have_correlations(data):
822        return [ (False, set()) ]
823    else:
824        cv = []
825        if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
826            cv.append((data.domain.classVar, set(data.domain.classVar.values)))
827        cands = cv + sorted(allgroups(data).items())
828        return filter(lambda x: len(x[1]) >= 2, cands)
[934]829
[936]830def gene_cands(data, correct):
[934]831    """
832    Returns all valid gene descriptors with regard to the chosen
833    phenotype variable.
834    Returns a variable descriptor for variables, the name of the group for
835    descriptions in attr.attributes, and True for the use
836    of attribute names.
[936]837    Correct is True if the example table has genes as attributes.
[934]838    """
[936]839    if correct:
[934]840        #gene names could be in attributes or as gene names (marker True)
841        return [True] + nth(sorted(allgroups(data)),0)
842    else:
843        #gene names are values of some string attribute
844        columns = [a for a in data.domain] +  \
845            [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
846        stringvars = [ a for a in columns if a.varType == 6 ]
847        return stringvars
848
849def is_variable(phenVar):
850    return isinstance(phenVar, orange.Variable)
[705]851
[188]852class GSEA(object):
853
[934]854    def __init__(self, data, organism=None, matcher=None, classValues=None, 
855        atLeast=3, caseSensitive=False, phenVar=None, geneVar=None):
[933]856        """
857        If the data set contains multiple measurements for a single gene,
858        all are considered. Individual contributions of such measurements
859        are not weighted down - each measurement is as important as if it
860        measured a different gene.
[934]861
862        phenVar and geneVar can either be an orange attribute or a string.
863        If they are strings, then they describe a group.
[933]864        """
865
[188]866        self.genesets = {}
867        self.organism = organism
[927]868
869        if organism != None:
[933]870            print "WARNING: obiGsea - organism and caseSensitive parameters are deprecated. Use matcher instead."
[927]871
[699]872        self.gsweights = {}
873        self.namesToIndices = None
[927]874        self.gm = matcher
[188]875
[934]876        data = transform_data(data, phenVar, geneVar)
877
[680]878        data, info = keepOnlyMeanAttrs(data, classValues=classValues, atLeast=atLeast)
[188]879
880        self.data = data
[933]881
882        #init attrnames
[411]883        attrnames = [ a.name for a in itOrFirst(self.data).domain.attributes ]
[927]884
885        if self.gm == None: #build a gene matcher if it does not exist
886            self.gm = obiGene.matcher([obiGene.GMKEGG(self.organism, ignore_case=not caseSensitive)], 
887                ignore_case=not caseSensitive, direct=True)
888            print "WARNING: gene matcher build automatically for organism: " + self.organism
889
[864]890        self.gm.set_targets(attrnames)
[933]891
[188]892 
893    def addGeneset(self, genesetname, genes):
[699]894        """
895        Add a single gene set. See addGenesets function.
896        Solely for backwards compatibility.
897        """
898        self.addGenesets({ genesetname: genes })
[188]899
[1108]900    def addGenesets(self, genesets):
[699]901        """
[1108]902        Adds genesets from input dictionary. Performs gene matching. Adds
[699]903        to self.genesets: the key is the gene set, its values are the individual
904        genes and their match results.
905        """
[1108]906        for g in obiGeneSets.GeneSets(genesets):
907            genes = g.genes
908            datamatch = filter(lambda x: x[1] != None, 
909                [ (gene, self.gm.umatch(gene)) for gene in genes])
910            self.genesets[g] = datamatch
[188]911
[485]912    def selectGenesets(self, minSize=3, maxSize=1000, minPart=0.1):
[699]913        """ Returns a list of gene sets that have sizes in limits """
[188]914
915        def okSizes(orig, transl):
[699]916            """compares sizes of genesets to limitations"""
[188]917            if len(transl) >= minSize and len(transl) <= maxSize \
918                and float(len(transl))/len(orig) >= minPart:
919                return True
920            return False
921
[1108]922        return  dict( (a,c) for a,c in self.genesets.iteritems() if okSizes(a.genes,c) )
[188]923
[699]924    def genesIndices(self, genes):
925        """
926        Returns the attribute indices of the given genes.
927        Caches the locations dictionary.
928        """
929        if not self.namesToIndices:
[933]930            self.namesToIndices = defaultdict(list)
931            for i,at in enumerate(itOrFirst(self.data).domain.attributes):
932                self.namesToIndices[at.name].append(i)
933        return reduce(lambda x,y:x+y, [ self.namesToIndices[gname] for gname in genes ], [])
[188]934
[699]935    def compute_gene_weights(self, gsweights, gsetsnum, nattributes):
936        """
937        Computes gene set weights for all specified weights.
938        Expects gene sets in form { name: [ num_attributes ] }
939        GSWeights are
940        """
941        pass
[188]942
[1108]943    def to_gsetsnum(self, gsets):
[699]944        """
[1108]945        Returns a dictionary of the given gene sets in gsetsnum format.
[699]946        """
[1108]947        return dict( (gs,self.genesIndices(nth(self.genesets[gs],1))) for gs in gsets)
[485]948
949    def compute(self, minSize=3, maxSize=1000, minPart=0.1, n=100, **kwargs):
950
951        subsetsok = self.selectGenesets(minSize=minSize, maxSize=maxSize, minPart=minPart)
952
[699]953        geneweights = None
[485]954
[699]955        gsetsnum = self.to_gsetsnum(subsetsok.keys())
956        gsetsnumit = gsetsnum.items() #to fix order
957
[934]958        #gsetsnumit = gsetsnumit[:1]
[933]959        #print gsetsnumit
960
[699]961        if len(gsetsnum) == 0:
[933]962            return {} # quick return if no genesets
[395]963
[699]964        if len(self.gsweights) > 0:
965            #set geneset
966            geneweights = [1]*len(itOrFirst(self.data).domain.attributes)
967
[485]968        if len(itOrFirst(self.data)) > 1:
[699]969            gseal = gseaE(self.data, nth(gsetsnumit,1), n=n, geneweights=geneweights, **kwargs)
[188]970        else:
[699]971            rankings = [ self.data[0][at].native() for at in self.data.domain.attributes ]
[705]972            gseal = gseaR(rankings, nth(gsetsnumit,1), n=n, **kwargs)
[188]973
974        res = {}
975
[1108]976        for gs, gseale in zip(nth(gsetsnumit,0),gseal):
[663]977            rdict = {}
[699]978            rdict['es'] = gseale[0]
979            rdict['nes'] = gseale[1]
980            rdict['p'] = gseale[2]
981            rdict['fdr'] = gseale[3]
[1108]982            rdict['size'] = len(gs.genes)
983            rdict['matched_size'] = len(self.genesets[gs])
984            rdict['genes'] = nth(self.genesets[gs],1)
985            res[gs] = rdict
[188]986
987        return res
988
[933]989def runGSEA(data, organism=None, classValues=None, geneSets=None, n=100, 
990        permutation="class", minSize=3, maxSize=1000, minPart=0.1, atLeast=3, 
991        matcher=None, geneVar=None, phenVar=None, caseSensitive=False, 
992        **kwargs):
993    """
994    phenVar and geneVar specify the phenotype and gene variable.
995    """
996    gso = GSEA(data, organism=organism, matcher=matcher, 
[936]997        classValues=classValues, atLeast=atLeast, caseSensitive=caseSensitive,
998        geneVar=geneVar, phenVar=phenVar)
[933]999    gso.addGenesets(geneSets)
1000    res1 = gso.compute(n=n, permutation=permutation, minSize=minSize,
[936]1001        maxSize=maxSize, minPart=minPart, **kwargs)
[188]1002    return res1
1003
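# Illustrative sketch (not part of the original file) of the high-level entry
# point, following the examples in __main__ below: a KEGG gene matcher, KEGG
# gene sets for human, and an expression table with a discrete phenotype. The
# file name is a placeholder.
def _example_run_gsea():
    data = orange.ExampleTable("expression_two_class.tab")      # hypothetical file
    gene_sets = obiGeneSets.collections((("KEGG",), "9606"))
    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
    res = runGSEA(data, geneSets=gene_sets, matcher=matcher, n=100,
                  permutation="class", minSize=3, maxSize=1000, minPart=0.1)
    for gs, r in sorted(res.items(), key=lambda x: x[1]["fdr"]):
        print gs, r["nes"], r["p"], r["fdr"]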
[485]1004def etForAttribute(datal,a):
[395]1005    """
[485]1006    Builds an example table for a single attribute across multiple
1007    example tables.
[395]1008    """
1009
[414]1010    tables = len(datal)
1011
1012    def getAttrVals(data, attr):
1013        dom2 = orange.Domain([data.domain[attr]], False)
1014        dataa = orange.ExampleTable(dom2, data)
1015        return [ a[0].native() for a in dataa ]
1016
1017    domainl = []
1018    valuesl = []
1019
1020    for id, data in enumerate(datal):
1021        v = getAttrVals(data,a)
1022        valuesl.append(v)
1023        domainl.append(orange.FloatVariable(name=("v"+str(id))))
1024
1025    classvals = getAttrVals(data, datal[0].domain.classVar)
1026    valuesl += [ classvals ]
1027
1028    dom = orange.Domain(domainl, datal[0].domain.classVar)
1029    examples = [ list(a) for a in zip(*valuesl) ]
1030
1031    datat = orange.ExampleTable(dom, examples)
1032
1033    return datat
1034
1035
[428]1036def evaluateEtWith(fn, *args, **kwargs):
[414]1037    """
[428]1038    fn - a function that evaluates an example table given the
1039    following arguments.
[414]1040    """
1041
1042    def newf(datal):
1043        res = []
1044        for a in datal[0].domain.attributes:
1045            et = etForAttribute(datal, a)
[428]1046            res.append(fn(et, *args, **kwargs))
[414]1047        return res
1048
1049    return newf
1050
1051
[665]1052def hierarchyOutput(results, limitGenes=50):
1053    """
1054    Transforms results for use by hierarchy output from GO.
1055
1056    limitGenes - maximum number of genes on output.
1057    """
1058    trans = []
1059   
1060    for name, res in results.items():
1061        try:
1062            second = name.split(' ')[2]
1063            name = second if second[:2] == 'GO' else name
1064        except:
1065            pass
1066       
1067        trans.append((name, abs(res["nes"]), res["matched_size"], res["size"], res["p"], min(res["fdr"], 1.0), res["genes"][:limitGenes]))
1068
1069    return trans
1070
[188]1071if  __name__=="__main__":
[680]1072
[1108]1073    """
1074    Old example with another measure function
[934]1075    data = orange.ExampleTable("gene_three_lines_log.tab")
[1108]1076    data = orange.ExampleTable("sterolTalkHepa.tab")
[864]1077    gen1 = collections(['steroltalk.gmt', ':kegg:hsa'], default=False)
[933]1078    rankingf = rankingFromOrangeMeas(MA_anova())
[927]1079    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
[936]1080    geneVar = gene_cands(data, False)[1]
[941]1081    phenVar = "group"
1082    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, rankingf=rankingf, phenVar=phenVar, geneVar=geneVar)
[1108]1083    """
[1173]1084    """
[1108]1085    data = orange.ExampleTable("sterolTalkHepa.tab")
1086    gen1 = obiGeneSets.collections('steroltalk.gmt', (("KEGG",), "9606"))
1087    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1088    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher)
[1173]1089    """
1090    matcher = obiGene.matcher([[obiGene.GMKEGG('ddi'), obiGene.GMDicty()]])
1091    data = orange.ExampleTable("/home/marko/t_gsea1.tab")
1092    gen1 = obiGeneSets.collections((("KEGG",), "352472"))
1093    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, phenVar="growth")
1094    print out
[699]1095    print "\n".join(map(str,sorted(out.items())))
[411]1096   