source: orange-bioinformatics/_bioinformatics/obiGsea.py @ 1636:10d234fdadb9

Revision 1636:10d234fdadb9, 32.3 KB checked in by mitar, 2 years ago (diff)

Restructuring because we will not be using namespaces.

Line 
1from __future__ import absolute_import
2
3from collections import defaultdict
4import random
5import time
6
7import numpy
8
9import orange
10
11from . import obiGeneSets
12from .obiExpression import *
13
14"""
15Gene set enrichment analysis.
16
17Author: Marko Toplak
18"""
19
20def iset(data):
21    """
22    Is data orange.ExampleTable?
23    """
24    return isinstance(data, orange.ExampleTable)
25
26def issequencens(x):
27    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and is not string."
28    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
29
30def mean(l):
31    return float(sum(l))/len(l)
32
33def rankingFromOrangeMeas(meas):
34    """
35    Creates a function that sequentally ranks all attributes and returns
36    results in a list. Ranking function is build out of
37    orange.MeasureAttribute.
38    """
39    return lambda d: [ meas(i,d) for i in range(len(d.domain.attributes)) ]
40
41def orderedPointersCorr(lcor):
42    """
43    Return a list of integers: indexes in original
44    lcor. Elements in the list are ordered by
45    their lcor[i] value. Higher correlations first.
46    """
47    ordered = [ (i,a) for i,a in enumerate(lcor) ] #original pos + correlation
48    ordered.sort(lambda x,y: cmp(y[1],x[1])) #sort by correlation, descending
49    ordered = nth(ordered, 0) #contains positions in the original list
50    return ordered
51
52def enrichmentScoreRanked(subset, lcor, ordered, p=1.0, rev2=None):
53    """
54    Input data and subset.
55   
56    subset: list of attribute indices of the input data belonging
57        to the same set.
58    lcor: correlations with class for each attribute in a list.
59
60    Returns enrichment score on given data.
61
62    This implementation efficiently handles "sparse" genesets (that
63    cover only a small subset of all genes in the dataset).
64    """
65
66    #print lcor
67
68    subset = set(subset)
69
70    if rev2 == None:
71        def rev(l):
72            return numpy.argsort(l)
73        rev2 = rev(ordered)
74
75    #add if gene is not in the subset
76    notInA = -(1. / (len(lcor)-len(subset)))
77    #base for addition if gene is in the subset
78    cors = [ abs(lcor[i])**p for i in subset ]
79    sumcors = sum(cors)
80
81    #this should not happen
82    if sumcors == 0.0:
83        return (0.0, None)
84   
85    inAb = 1./sumcors
86
87    ess = [0.0]
88   
89    map = {}
90    for i in subset:
91        orderedpos = rev2[i]
92        map[orderedpos] = inAb*abs(lcor[i]**p)
93       
94    last = 0
95
96    maxSum = minSum = csum = 0.0
97
98    for a,b in sorted(map.items()):
99        diff = a-last
100        csum += notInA*diff
101        last = a+1
102       
103        if csum < minSum:
104            minSum = csum
105       
106        csum += b
107
108        if csum > maxSum:
109            maxSum = csum
110
111    #finish it
112    diff = (len(ordered))-last
113    csum += notInA*diff
114
115    if csum < minSum:
116        minSum = csum
117
118    #print "MY", (maxSum if abs(maxSum) > abs(minSum) else minSum)
119
120    """
121    #BY DEFINITION
122    print "subset", subset
123
124    for i in ordered:
125        ess.append(ess[-1] + \
126            (inAb*abs(lcor[i]**p) if i in subset else notInA)
127        )
128        if i in subset:
129            print ess[-2], ess[-1]
130            print i, (inAb*abs(lcor[i]**p))
131
132    maxEs = max(ess)
133    minEs = min(ess)
134   
135    print "REAL", (maxEs if abs(maxEs) > abs(minEs) else minEs, ess[1:])
136
137    """
138    return (maxSum if abs(maxSum) > abs(minSum) else minSum, [])
139
140#from mOrngData
141def shuffleAttribute(data, attribute, locations):
142    """
143    Destructive!
144    """
145    attribute = data.domain[attribute]
146    l = [None]*len(data)
147    for i in range(len(data)):
148        l[locations[i]] = data[i][attribute]
149    for i in range(len(data)):
150        data[i][attribute] = l[i]
151
152def shuffleClass(datai, rands=0):
153    """
154    Returns a dataset with values of class attribute randomly shuffled.
155    If multiple dataset are on input shuffle them all with the same random seed.
156    """
157    def shuffleOne(data):
158        rand = random.Random(rands)
159        d2 = orange.ExampleTable(data.domain, data)
160        locations = range(len(data))
161        rand.shuffle(locations)
162        shuffleAttribute(d2, d2.domain.classVar, locations)
163        return d2
164
165    if iset(datai):
166        return shuffleOne(datai)
167    else:
168        return [ shuffleOne(data) for data in datai ]
169
170def shuffleList(l, rand=random.Random(0)):
171    """
172    Returns a copy of a shuffled input list.
173    """
174    import copy
175    l2 = copy.copy(l)
176    rand.shuffle(l2)
177    return l2
178
179def shuffleAttributes(data, rand=random.Random(0)):
180    """
181    Returns a dataset with a new attribute order.
182    """
183    natts = shuffleList(list(data.domain.attributes), rand)
184    dom2 = orange.Domain(natts, data.domain.classVar)
185    d2 = orange.ExampleTable(dom2, data)
186    return d2
187
188def gseapval(es, esnull):
189    """
190    From article (PNAS):
191    estimate nominal p-value for S from esnull by using the positive
192    or negative portion of the distribution corresponding to the sign
193    of the observed ES(S).
194    """
195   
196    try:
197        if es < 0:
198            return float(len([ a for a in esnull if a <= es ]))/ \
199                len([ a for a in esnull if a < 0])   
200        else: 
201            return float(len([ a for a in esnull if a >= es ]))/ \
202                len([ a for a in esnull if a >= 0])
203    except:
204        return 1.0
205
206
207def enrichmentScore(data, subset, rankingf):
208    """
209    Returns enrichment score and running enrichment score.
210    """
211    lcor = rankingf(data)
212    ordered = orderedPointersCorr(lcor)
213    es,l = enrichmentScoreRanked(subset, lcor, ordered)
214    return es,l
215
216def gseaE(data, subsets, rankingf=None, \
217        n=100, permutation="class", **kwargs):
218    """
219    Run GSEA algorithm on an example table.
220
221    data: orange example table.
222    subsets: list of distinct subsets of data.
223    rankingf: function that returns correlation to class of each
224        variable.
225    n: number of random permutations to sample null distribution.
226    permutation: "class" for permutating class, else permutate attribute
227        order.
228
229    """
230
231    if not rankingf:
232        rankingf=rankingFromOrangeMeas(MA_signalToNoise())
233
234    enrichmentScores = []
235 
236    lcor = rankingf(data)
237    #print lcor
238
239    ordered = orderedPointersCorr(lcor)
240
241    def rev(l):
242        return numpy.argsort(l)
243
244    rev2 = rev(ordered)
245
246    for subset in subsets:
247        es = enrichmentScoreRanked(subset, lcor, ordered, rev2=rev2)[0]
248        enrichmentScores.append(es)
249
250    runOptCallbacks(kwargs)
251
252    #print "PERMUTATION", permutation
253
254    enrichmentNulls = [ [] for a in range(len(subsets)) ]
255
256    for i in range(n):
257
258        if permutation == "class":
259            d2 = shuffleClass(data, 2000+i) #fixed permutation
260            r2 = rankingf(d2)
261
262        else:
263            r2 = shuffleList(lcor, random.Random(2000+i))
264
265        ordered2 = orderedPointersCorr(r2)
266        rev22 = rev(ordered2)
267        for si,subset in enumerate(subsets):
268            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
269            enrichmentNulls[si].append(esn)
270
271        runOptCallbacks(kwargs)
272
273    return gseaSignificance(enrichmentScores, enrichmentNulls)
274
275
276def runOptCallbacks(rargs):
277    if "callback" in rargs:
278        try:
279            [ a() for a in rargs["callback"] ]
280        except:
281            rargs["callback"]()
282           
283
284def gseaR(rankings, subsets, n=100, **kwargs):
285    """
286    """
287
288    if "permutation" in kwargs:
289        if kwargs["permutation"] == "class":
290            raise Exception("Only gene permutation possible")
291
292    enrichmentScores = []
293 
294    ordered = orderedPointersCorr(rankings)
295   
296    def rev(l):
297        return numpy.argsort(l)
298
299    rev2 = rev(ordered)
300
301    for subset in subsets:
302
303        es = enrichmentScoreRanked(subset, rankings, ordered, rev2=rev2)[0]
304        enrichmentScores.append(es)
305   
306    runOptCallbacks(kwargs)
307
308    enrichmentNulls = [ [] for a in range(len(subsets)) ]
309
310    for i in range(n):
311       
312        r2 = shuffleList(rankings, random.Random(2000+i))
313        ordered2 = orderedPointersCorr(r2)
314        rev22 = rev(ordered2)
315
316        for si,subset in enumerate(subsets):
317
318            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
319            enrichmentNulls[si].append(esn)
320
321        runOptCallbacks(kwargs)
322
323    return gseaSignificance(enrichmentScores, enrichmentNulls)
324
325
326def gseaSignificance(enrichmentScores, enrichmentNulls):
327
328    #print enrichmentScores
329
330    import time
331
332    tb1 = time.time()
333
334    enrichmentPVals = []
335    nEnrichmentScores = []
336    nEnrichmentNulls = []
337
338    for i in range(len(enrichmentScores)):
339        es = enrichmentScores[i]
340        enrNull = enrichmentNulls[i]
341        #print es, enrNull
342
343        enrichmentPVals.append(gseapval(es, enrNull))
344
345        #normalize the ES(S,pi) and the observed ES(S), separetely rescaling
346        #the positive and negative scores by divident by the mean of the
347        #ES(S,pi)
348
349        #print es, enrNull
350
351        def normalize(s):
352            try:
353                if s == 0:
354                    return 0.0
355                if s >= 0:
356                    meanPos = mean([a for a in enrNull if a >= 0])
357                    #print s, meanPos
358                    return s/meanPos
359                else:
360                    meanNeg = mean([a for a in enrNull if a < 0])
361                    #print s, meanNeg
362                    return -s/meanNeg
363            except:
364                return 0.0 #return if according mean value is uncalculable
365
366
367        nes = normalize(es)
368        nEnrichmentScores.append(nes)
369       
370        nenrNull = [ normalize(s) for s in enrNull ]
371        nEnrichmentNulls.append(nenrNull)
372 
373
374    #print "First part", time.time() - tb1
375
376    #FDR computation
377    #create a histogram of all NES(S,pi) over all S and pi
378    vals = reduce(lambda x,y: x+y, nEnrichmentNulls, [])
379
380
381    def shorten(l, p=10000):
382        """
383        Take each len(l)/p element, if len(l)/p >= 2.
384        """
385        e = len(l)/p
386        if e <= 1:
387            return l
388        else:
389            return [ l[i] for i in xrange(0, len(l), e) ]
390
391    #vals = shorten(vals) -> this can speed up second part. is it relevant TODO?
392
393    """
394    Use this null distribution to compute an FDR q value, for a given NES(S) =
395    NES* >= 0. The FDR is the ratio of the percantage of all (S,pi) with
396    NES(S,pi) >= 0, whose NES(S,pi) >= NES*, divided by the percentage of
397    observed S wih NES(S) >= 0, whose NES(S) >= NES*, and similarly if NES(S)
398    = NES* <= 0.
399    """
400
401    nvals = numpy.array(sorted(vals))
402    nnes = numpy.array(sorted(nEnrichmentScores))
403
404    #print "LEN VALS", len(vals), len(nEnrichmentScores)
405
406    fdrs = []
407
408    import operator
409
410    for i in range(len(enrichmentScores)):
411
412        nes = nEnrichmentScores[i]
413
414        """
415        #Strighfoward but slow implementation follows in comments.
416        #Useful as code description.
417       
418        if nes >= 0:
419            op0 = operator.ge
420            opn = operator.ge
421        else:
422            op0 = operator.lt
423            opn = operator.le
424
425        allPos = [a for a in vals if op0(a,0)]
426        allHigherAndPos = [a for a in allPos if opn(a,nes) ]
427
428        nesPos = [a for a in nEnrichmentScores if op0(a,0) ]
429        nesHigherAndPos = [a for a in nesPos if opn(a,nes) ]
430
431        top = len(allHigherAndPos)/float(len(allPos)) #p value
432        down = len(nesHigherAndPos)/float(len(nesPos))
433       
434        l1 = [ len(allPos), len(allHigherAndPos), len(nesPos), len(nesHigherAndPos)]
435
436        allPos = allHigherAndPos = nesPos =  nesHigherAndPos = 1
437
438        """
439
440        #this could be speed up twice with the same accuracy!
441        if nes >= 0:
442            allPos = int(len(vals) - numpy.searchsorted(nvals, 0, side="left"))
443            allHigherAndPos = int(len(vals) - numpy.searchsorted(nvals, nes, side="left"))
444            nesPos = len(nnes) - int(numpy.searchsorted(nnes, 0, side="left"))
445            nesHigherAndPos = len(nnes) - int(numpy.searchsorted(nnes, nes, side="left"))
446        else:
447            allPos = int(numpy.searchsorted(nvals, 0, side="left"))
448            allHigherAndPos = int(numpy.searchsorted(nvals, nes, side="right"))
449            nesPos = int(numpy.searchsorted(nnes, 0, side="left"))
450            nesHigherAndPos = int(numpy.searchsorted(nnes, nes, side="right"))
451           
452        """
453        #Comparing results
454        l2 = [ allPos, allHigherAndPos, nesPos, nesHigherAndPos ]
455        diffs = [ l1[i]-l2[i] for i in range(len(l1)) ]
456        sumd = sum( [ abs(a) for a in diffs ] )
457        if sumd > 0:
458            print nes > 0
459            print "orig", l1
460            print "modi", l2
461        """
462
463        try:
464            top = allHigherAndPos/float(allPos) #p value
465            down = nesHigherAndPos/float(nesPos)
466
467            fdrs.append(top/down)
468        except:
469            fdrs.append(1000000000.0)
470   
471    #print "Whole part", time.time() - tb1
472
473    return zip(enrichmentScores, nEnrichmentScores, enrichmentPVals, fdrs)
474
475from . import obiGene
476
477def nth(l,n): return [ a[n] for a in l ]
478
479def itOrFirst(data):
480    """ Returns input if input is of type ExampleTable, else returns first
481    element of the input list """
482    if iset(data):
483        return data
484    else:
485        return data[0]
486
487def wrap_in_list(data):
488    """ Wraps orange.ExampleTable in a list """
489    if iset(data):
490        return [ data ]
491    else:
492        return data
493
494def takeClasses(datai, classValues=None):
495    """
496    Function joins class groups specified in an input pair
497    classValues. Each element of the pair is a list of class
498    values to be joined to first or second class. Group
499    classes in two new class values.
500    If class values are not specified, take all the classes.
501
502    Input data can be a single data set or a list of data sets
503    with the same domain.
504
505    Returns transformed data sets / data sets.
506    """
507
508    cv = itOrFirst(datai).domain.classVar
509    nclassvalues = None
510
511    if cv and len(itOrFirst(datai)) > 1:
512        oldcvals = [ a for a in cv.values ]
513       
514        if not classValues:
515            classValues = oldcvals
516
517        toJoin = []
518
519        for vals in classValues:
520            if issequencens(vals):
521                toJoin.append(list(vals))
522            else:
523                toJoin.append([vals])
524
525        classValues = reduce(lambda x,y: x+y, toJoin)
526        classValues = [ str(a) for a in classValues ] # ok class values
527
528        #dictionary of old class -> new class
529        mapval = {}
530        nclassvalues = [] # need to preserver order
531
532        for joinvals in toJoin:
533            joinvalsn = "+".join([ str(val) for val in sorted(joinvals) ])
534            nclassvalues.append(joinvalsn)
535
536            for val in joinvals:
537                mapval[str(val)] = joinvalsn
538
539        #take only examples with classValues classes
540        nclass = orange.EnumVariable(cv.name, values=nclassvalues)
541        ndom = orange.Domain(itOrFirst(datai).domain.attributes, nclass)
542
543        def removeAndTransformClasses(data):
544            """
545            Removes unnecessary class values and joins them according
546            to function input.
547            """
548            examples = []
549            for ex in data:
550                if ex[cv] in classValues:
551                    nex = orange.Example(ndom, ex)
552                    nex[-1] = mapval[str(ex[cv].value)]
553                    examples.append(nex)
554
555            return orange.ExampleTable(ndom, examples)
556
557        if iset(datai):
558            datai = removeAndTransformClasses(datai)
559        else:
560            datai = [ removeAndTransformClasses(data) for data in datai ]
561
562    return datai
563
564def removeBadAttributes(datai, atLeast=3):
565    """
566    Removes attributes which would obscure GSEA analysis.
567
568    Attributes need to be continuous, they need to have
569    at least one value. Remove other attributes.
570
571    For the attribute to be valid, it needs to have at least
572    [atLeast] values for every class value.
573
574    Return transformed data set / data sets and ignored attributes.
575    """
576
577    def attrOk(a, data):
578        """
579        Attribute is ok if it is continouous and if containg
580        at least atLest not unknown values.
581        """
582
583        a = data.domain.attributes.index(a)
584
585        #can't
586        if data.domain.attributes[a].varType != orange.VarTypes.Continuous:
587            return False
588
589        if len(data) == 1:
590
591            vals = [ex[a].value for ex in data if not ex[a].isSpecial()]
592            if len(vals) < 1:
593                return False 
594       
595        if len(data) > 1 and data.domain.classVar and atLeast > 0:
596           
597            dc = dict( (v, 0) for v in data.domain.classVar.values )
598
599            for ex in data:
600                if not ex[a].isSpecial():
601                    dc[ex[-1].value] += 1
602
603            minl = min(dc.values())
604
605            if minl < atLeast:
606                #print "Less than atLeast"
607                return False
608
609        return True
610   
611
612    def notOkAttributes(data):
613        ignored = []
614        for a in data.domain.attributes:
615            if not attrOk(a, data):
616                #print "Removing", a
617                ignored.append(a)
618        return ignored
619   
620    ignored = []
621    if iset(datai):
622        ignored = set(notOkAttributes(datai))
623    else:
624        #ignore any attribute which is has less than atLeast values for each class
625        #ignored = set(reduce(lambda x,y: x+y, [ notOkAttributes(data) for data in datai ]))
626
627        #remove any attribute, which is ok in less than half of the dataset
628        ignored = []
629        for a in itOrFirst(datai).domain.attributes:
630            attrOks = sum([ attrOk(a, data) for data in datai ])
631            if attrOks < len(datai)/2:
632                ignored.append(a)
633
634
635    natts = [ a for a in itOrFirst(datai).domain.attributes if a not in ignored ]
636    #print ignored, natts, set(ignored) & set(natts)
637
638    ndom = orange.Domain(natts, itOrFirst(datai).domain.classVar)
639
640    datao = None
641    if iset(datai):
642        datao = orange.ExampleTable(ndom, datai)
643    else:
644        datao = [ orange.ExampleTable(ndom, data) for data in datai ]
645
646    return datao, ignored
647
648def keepOnlyMeanAttrs(datai, atLeast=3, classValues=None):
649    """
650    Attributes need to be continuous, they need to have
651    at least one value.
652
653    In order of attribute to be valid, it needs to have at least
654    [atLeast] values for every class value.
655
656    Keep only specified classes - group them in two values.
657    """   
658    datai = takeClasses(datai, classValues=classValues)
659    return removeBadAttributes(datai, atLeast=atLeast)
660
661def data_single_meas_column(data):
662    """
663    Returns true if data seems to be in one column
664    (float variables) only. This column should contain
665    the rankings
666    """
667    columns = [a for a in data.domain] +  [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
668    floatvars = [ a for a in columns if a.varType == orange.VarTypes.Continuous ]
669    if len(floatvars) == 1:
670        return True
671    else:
672        return False
673
674def transform_data(data, phenVar, geneVar):
675    """
676    if we have log2ratio in a single value column, transpose the matrix
677    i.e. we have a single column with a continous variable. first
678    string variable then becomes the gene name
679
680    The goal is to have different phenotypes annotated with a class,
681    and names of genes as attribute names.
682
683    If phenVar is False, then we can work, then the input already
684    consists of scores of differential expressions
685
686    If we have a single column, transpose it.
687    If phenVar is one of the groups, transpose the matrix.
688    """
689
690    def prepare_data(data, phenVar=None, geneVar=None):
691
692        def rorq(a, name):
693            """ Group annatation or question mark. """
694            try: 
695                return a.attributes[name]
696            except: 
697                return '?'
698   
699        #use class as phenotype by default, if it is present,
700        #if not, do not use any phenotype!
701        if phenVar == None: 
702            if not data.domain.classVar:
703                phenVar = False
704            else:
705                phenVar = data.domain.classVar
706
707
708        #TODO validate phenVar and geneVar?
709        #TODO autodetection of groups?
710
711        #transpose is not needed if phenVar is classVar or phenVar is False
712        #and there is only one sample
713        if phenVar == data.domain.classVar or \
714            (phenVar == False and len(data) == 1):
715
716            if geneVar == None: #if not specified, set as true in this stage
717                geneVar = True
718
719            floatvars = [ a for a in data.domain.attributes \
720                if a.varType == orange.VarTypes.Continuous ]
721
722            #rename attributes without touching the original variable
723            if geneVar != True:
724                fl2 = []
725
726                for a in floatvars:
727                    na = orange.FloatVariable(name=rorq(a, geneVar))
728                    na.getValueFrom = lambda e, rw: e[a]
729                    fl2.append(na)
730
731                floatvars = fl2
732
733            dom = orange.Domain(floatvars, phenVar)
734            return orange.ExampleTable(dom, data)
735
736        elif phenVar == False or phenVar != data.domain.classVar:
737
738            cands = allgroups(data)
739            pv = False
740            if phenVar != False:
741                pv = orange.EnumVariable(name="phenotype", 
742                    values=list(cands[phenVar]))
743
744            #take the only string attribute as a gene name
745            gc = gene_cands(data, False)
746            if geneVar == None:
747                if len(gc) == 1:
748                    geneVar = gc[0]
749                else:
750                    geneNamesUnspecifiedError()
751           
752            latts = [ orange.FloatVariable(name=ex[geneVar].value) \
753                for ex in data ]
754
755            domain = orange.Domain(latts, pv)
756
757            examples = []
758            for at in data.domain.attributes:
759                if at.varType == orange.VarTypes.Continuous:
760                    vals = [ ex[at].value for ex in data ]
761                    if pv != False: #add class value
762                        vals.append(rorq(at, phenVar))
763                    examples.append(orange.Example(domain, vals))
764
765            return orange.ExampleTable(domain, examples)
766        else:
767            wrongInputsError()
768
769    #transform all example tables
770    single = iset(data)
771    transposed = [ prepare_data(d, phenVar, geneVar) for d in wrap_in_list(data) ]
772
773    if single:
774        return transposed[0]
775    else:
776        return transposed
777
778
779def allgroups(data):
780    """
781    Return all phenotype descriptors of attributes with their values.
782    """
783    sd = defaultdict(set)
784    for attr in data.domain.attributes:
785        for key, value in attr.attributes.items():
786            sd[key].add(value)
787    return sd
788
789def already_have_correlations(data):
790    return len(data) == 1 or data_single_meas_column(data)
791
792def need_to_transpose_single(data):
793    if len(data) == 1:
794        return False
795    else:
796        return True
797
798def phenotype_cands(data):
799    """
800    Return all phenotype candidate descriptors in a list of tuples
801    (variable, values). Candidates are class variable, if it exists and
802    attributes dictionaries of attributes.
803    Phenotype candidates must contain at least two differend values.
804    """
805    if already_have_correlations(data):
806        return [ (False, set()) ]
807    else:
808        cv = []
809        if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
810            cv.append((data.domain.classVar, set(data.domain.classVar.values)))
811        cands = cv + sorted(allgroups(data).items())
812        return filter(lambda x: len(x[1]) >= 2, cands)
813
814def gene_cands(data, correct):
815    """
816    Returns all valid gene descriptors with regards to the choosen
817    phenotype variable.
818    Return variable descriptor for variables, name of the group for
819    descriptions in attr.attributes and True for the usage
820    of attribute names.
821    Correct is True, if the example table has genes as attributes.
822    """
823    if correct:
824        #gene names could be in attributes or as gene names (marker True)
825        return [True] + nth(sorted(allgroups(data)),0)
826    else:
827        #gene names are values of some string attribute
828        columns = [a for a in data.domain] +  \
829            [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
830        stringvars = [ a for a in columns if a.varType == 6 ]
831        return stringvars
832
833def is_variable(phenVar):
834    return isinstance(phenVar, orange.Variable)
835
836class GSEA(object):
837
838    def __init__(self, data, organism=None, matcher=None, classValues=None, 
839        atLeast=3, caseSensitive=False, phenVar=None, geneVar=None):
840        """
841        If the data set constains multiple measurements for a single gene,
842        all are considered. Individual constributions of such measurements
843        are not weighted down - each measurement is as important as they
844        would measure different genes.
845
846        phenVar and geneVar can ether be an orange attribute or a string.
847        If they are strings, then they describe a group.
848        """
849
850        self.genesets = {}
851        self.organism = organism
852
853        if organism != None:
854            print "WARNING: obiGsea - organism and caseSensitive parameters are deprecated. Use matcher instead."
855
856        self.gsweights = {}
857        self.namesToIndices = None
858        self.gm = matcher
859
860        data = transform_data(data, phenVar, geneVar)
861
862        data, info = keepOnlyMeanAttrs(data, classValues=classValues, atLeast=atLeast)
863
864        self.data = data
865
866        #init attrnames
867        attrnames = [ a.name for a in itOrFirst(self.data).domain.attributes ]
868
869        if self.gm == None: #build a gene matcher, if if does not exists
870            self.gm = obiGene.matcher([obiGene.GMKEGG(self.organism, ignore_case=not caseSensitive)], 
871                ignore_case=not caseSensitive, direct=True)
872            print "WARNING: gene matcher build automatically for organism: " + self.organism
873
874        self.gm.set_targets(attrnames)
875
876 
877    def addGeneset(self, genesetname, genes):
878        """
879        Add a single gene set. See addGenesets function.
880        Solely for backwards compatibility.
881        """
882        self.addGenesets({ genesetname: genes })
883
884    def addGenesets(self, genesets):
885        """
886        Adds genesets from input dictionary. Performs gene matching. Adds
887        to a self.genesets: key is genesetname, it's values are individual
888        genes and match results.
889        """
890        for g in obiGeneSets.GeneSets(genesets):
891            genes = g.genes
892            datamatch = filter(lambda x: x[1] != None, 
893                [ (gene, self.gm.umatch(gene)) for gene in genes])
894            self.genesets[g] = datamatch
895
896    def selectGenesets(self, minSize=3, maxSize=1000, minPart=0.1):
897        """ Returns a list of gene sets that have sizes in limits """
898
899        def okSizes(orig, transl):
900            """compares sizes of genesets to limitations"""
901            if len(transl) >= minSize and len(transl) <= maxSize \
902                and float(len(transl))/len(orig) >= minPart:
903                return True
904            return False
905
906        return  dict( (a,c) for a,c in self.genesets.iteritems() if okSizes(a.genes,c) )
907
908    def genesIndices(self, genes):
909        """
910        Returns in attribute indices of given genes.
911        Buffers locations dictionary.
912        """
913        if not self.namesToIndices:
914            self.namesToIndices = defaultdict(list)
915            for i,at in enumerate(itOrFirst(self.data).domain.attributes):
916                self.namesToIndices[at.name].append(i)
917        return reduce(lambda x,y:x+y, [ self.namesToIndices[gname] for gname in genes ], [])
918
919    def compute_gene_weights(self, gsweights, gsetsnum, nattributes):
920        """
921        Computes gene set weights for all specified weights.
922        Expects gene sets in form { name: [ num_attributes ] }
923        GSWeights are
924        """
925        pass
926
927    def to_gsetsnum(self, gsets):
928        """
929        Returns a dictionary of given  gene sets in gsetnums format.
930        """
931        return dict( (gs,self.genesIndices(nth(self.genesets[gs],1))) for gs in gsets)
932
933    def compute(self, minSize=3, maxSize=1000, minPart=0.1, n=100, **kwargs):
934
935        subsetsok = self.selectGenesets(minSize=minSize, maxSize=maxSize, minPart=minPart)
936
937        geneweights = None
938
939        gsetsnum = self.to_gsetsnum(subsetsok.keys())
940        gsetsnumit = gsetsnum.items() #to fix order
941
942        #gsetsnumit = gsetsnumit[:1]
943        #print gsetsnumit
944
945        if len(gsetsnum) == 0:
946            return {} # quick return if no genesets
947
948        if len(self.gsweights) > 0:
949            #set geneset
950            geneweights = [1]*len(data.domain.attributes)
951
952        if len(itOrFirst(self.data)) > 1:
953            gseal = gseaE(self.data, nth(gsetsnumit,1), n=n, geneweights=geneweights, **kwargs)
954        else:
955            rankings = [ self.data[0][at].native() for at in self.data.domain.attributes ]
956            gseal = gseaR(rankings, nth(gsetsnumit,1), n=n, **kwargs)
957
958        res = {}
959
960        for gs, gseale in zip(nth(gsetsnumit,0),gseal):
961            rdict = {}
962            rdict['es'] = gseale[0]
963            rdict['nes'] = gseale[1]
964            rdict['p'] = gseale[2]
965            rdict['fdr'] = gseale[3]
966            rdict['size'] = len(gs.genes)
967            rdict['matched_size'] = len(self.genesets[gs])
968            rdict['genes'] = nth(self.genesets[gs],1)
969            res[gs] = rdict
970
971        return res
972
973def runGSEA(data, organism=None, classValues=None, geneSets=None, n=100, 
974        permutation="class", minSize=3, maxSize=1000, minPart=0.1, atLeast=3, 
975        matcher=None, geneVar=None, phenVar=None, caseSensitive=False, 
976        **kwargs):
977    """
978    phenVar and geneVar specify the phenotype and gene variable.
979    """
980    gso = GSEA(data, organism=organism, matcher=matcher, 
981        classValues=classValues, atLeast=atLeast, caseSensitive=caseSensitive,
982        geneVar=geneVar, phenVar=phenVar)
983    gso.addGenesets(geneSets)
984    res1 = gso.compute(n=n, permutation=permutation, minSize=minSize,
985        maxSize=maxSize, minPart=minPart, **kwargs)
986    return res1
987
988def etForAttribute(datal,a):
989    """
990    Builds an example table for a single attribute across multiple
991    example tables.
992    """
993
994    tables = len(datal)
995
996    def getAttrVals(data, attr):
997        dom2 = orange.Domain([data.domain[attr]], False)
998        dataa = orange.ExampleTable(dom2, data)
999        return [ a[0].native() for a in dataa ]
1000
1001    domainl = []
1002    valuesl = []
1003
1004    for id, data in enumerate(datal):
1005        v = getAttrVals(data,a)
1006        valuesl.append(v)
1007        domainl.append(orange.FloatVariable(name=("v"+str(id))))
1008
1009    classvals = getAttrVals(data, datal[0].domain.classVar)
1010    valuesl += [ classvals ]
1011
1012    dom = orange.Domain(domainl, datal[0].domain.classVar)
1013    examples = [ list(a) for a in zip(*valuesl) ]
1014
1015    datat = orange.ExampleTable(dom, examples)
1016
1017    return datat
1018
1019
1020def evaluateEtWith(fn, *args, **kwargs):
1021    """
1022    fn - evaluates example table given
1023    following arguments.
1024    """
1025
1026    def newf(datal):
1027        res = []
1028        for a in datal[0].domain.attributes:
1029            et = etForAttribute(datal, a)
1030            res.append(fn(et, *args, **kwargs))
1031        return res
1032
1033    return newf
1034
1035
1036def hierarchyOutput(results, limitGenes=50):
1037    """
1038    Transforms results for use by hierarchy output from GO.
1039
1040    limitGenes - maximum number of genes on output.
1041    """
1042    trans = []
1043   
1044    for name, res in results.items():
1045        try:
1046            second = name.split(' ')[2]
1047            name = second if second[:2] == 'GO' else name
1048        except:
1049            pass
1050       
1051        trans.append((name, abs(res["nes"]), res["matched_size"], res["size"], res["p"], min(res["fdr"], 1.0), res["genes"][:limitGenes]))
1052
1053    return trans
1054
1055if  __name__=="__main__":
1056
1057    """
1058    Old example with another measure function
1059    data = orange.ExampleTable("gene_three_lines_log.tab")
1060    data = orange.ExampleTable("sterolTalkHepa.tab")
1061    gen1 = collections(['steroltalk.gmt', ':kegg:hsa'], default=False)
1062    rankingf = rankingFromOrangeMeas(MA_anova())
1063    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1064    geneVar = gene_cands(data, False)[1]
1065    phenVar = "group"
1066    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, rankingf=rankingf, phenVar=phenVar, geneVar=geneVar)
1067    """
1068    """
1069    data = orange.ExampleTable("sterolTalkHepa.tab")
1070    gen1 = obiGeneSets.collections('steroltalk.gmt', (("KEGG",), "9606"))
1071    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1072    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher)
1073    """
1074    matcher = obiGene.matcher([[obiGene.GMKEGG('ddi'), obiGene.GMDicty()]])
1075    data = orange.ExampleTable("/home/marko/t_gsea1.tab")
1076    gen1 = obiGeneSets.collections((("KEGG",), "352472"))
1077    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, phenVar="growth")
1078    print out
1079    print "\n".join(map(str,sorted(out.items())))
1080   
Note: See TracBrowser for help on using the repository browser.