source: orange-bioinformatics/_bioinformatics/obiGsea.py @ 1705:0b6781662f02

Revision 1705:0b6781662f02, 32.5 KB checked in by markotoplak, 21 months ago (diff)

obiGsea.takeClasses: now builds a class value with get_value_from.

1from __future__ import absolute_import
2
3from collections import defaultdict
4import random
5import time
6
7import numpy
8
9import orange
10import Orange
11
12from . import obiGeneSets
13from .obiExpression import *
14
15"""
16Gene set enrichment analysis.
17
18Author: Marko Toplak
19"""
20
21def iset(data):
22    """
23    Is data orange.ExampleTable?
24    """
25    return isinstance(data, orange.ExampleTable)
26
27def issequencens(x):
28    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and is not string."
29    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
30
31def mean(l):
32    return float(sum(l))/len(l)
33
34def rankingFromOrangeMeas(meas):
35    """
36    Creates a function that sequentially ranks all attributes and returns
37    the results in a list. The ranking function is built from an
38    orange.MeasureAttribute.
39    """
40    return lambda d: [ meas(i,d) for i in range(len(d.domain.attributes)) ]
41
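#A small illustration (not part of the original module): a ranking function
#based on the signal-to-noise measure from obiExpression could be built as
#    rankingf = rankingFromOrangeMeas(MA_signalToNoise())
#and then called as rankingf(data) to score every attribute of an ExampleTable.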
42def orderedPointersCorr(lcor):
43    """
44    Return a list of integers: indices into the original
45    lcor. Elements in the list are ordered by
46    their lcor[i] value, higher correlations first.
47    """
48    ordered = [ (i,a) for i,a in enumerate(lcor) ] #original pos + correlation
49    ordered.sort(lambda x,y: cmp(y[1],x[1])) #sort by correlation, descending
50    ordered = nth(ordered, 0) #contains positions in the original list
51    return ordered
52
53def enrichmentScoreRanked(subset, lcor, ordered, p=1.0, rev2=None):
54    """
55    Compute the enrichment score of a gene set on ranked data.
56
57    subset: list of attribute indices of the input data belonging
58        to the same gene set.
59    lcor: list of correlations with the class, one for each attribute.
60
61    Returns the enrichment score for the given data.
62
63    This implementation efficiently handles "sparse" gene sets (those
64    that cover only a small subset of all genes in the dataset).
65    """
66
67    #print lcor
68
69    subset = set(subset)
70
71    if rev2 == None:
72        def rev(l):
73            return numpy.argsort(l)
74        rev2 = rev(ordered)
75
76    #amount added for each gene that is not in the subset
77    notInA = -(1. / (len(lcor)-len(subset)))
78    #base for the amount added when a gene is in the subset
79    cors = [ abs(lcor[i])**p for i in subset ]
80    sumcors = sum(cors)
81
82    #this should not happen
83    if sumcors == 0.0:
84        return (0.0, None)
85   
86    inAb = 1./sumcors
87
88    ess = [0.0]
89   
90    hits = {} #ranked-list positions of genes from the subset
91    for i in subset:
92        orderedpos = rev2[i]
93        hits[orderedpos] = inAb*abs(lcor[i]**p)
94
95    last = 0
96
97    maxSum = minSum = csum = 0.0
98
99    for a,b in sorted(hits.items()):
100        diff = a-last
101        csum += notInA*diff
102        last = a+1
103       
104        if csum < minSum:
105            minSum = csum
106       
107        csum += b
108
109        if csum > maxSum:
110            maxSum = csum
111
112    #finish it
113    diff = (len(ordered))-last
114    csum += notInA*diff
115
116    if csum < minSum:
117        minSum = csum
118
119    #print "MY", (maxSum if abs(maxSum) > abs(minSum) else minSum)
120
121    """
122    #BY DEFINITION
123    print "subset", subset
124
125    for i in ordered:
126        ess.append(ess[-1] + \
127            (inAb*abs(lcor[i]**p) if i in subset else notInA)
128        )
129        if i in subset:
130            print ess[-2], ess[-1]
131            print i, (inAb*abs(lcor[i]**p))
132
133    maxEs = max(ess)
134    minEs = min(ess)
135   
136    print "REAL", (maxEs if abs(maxEs) > abs(minEs) else minEs, ess[1:])
137
138    """
139    return (maxSum if abs(maxSum) > abs(minSum) else minSum, [])
140
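#A tiny worked example of the running-sum statistic above (illustration only,
#the values are made up): with correlations lcor = [0.9, -0.2, 0.5, -0.8] and
#the gene set {0, 2}, the descending order of genes is [0, 2, 1, 3]; hits add
#|lcor[i]|/1.4 and misses subtract 1/2, so the running sum peaks at 1.0 after
#the first two genes and that is the returned enrichment score:
#    lcor = [0.9, -0.2, 0.5, -0.8]
#    ordered = orderedPointersCorr(lcor)                  # -> [0, 2, 1, 3]
#    es, _ = enrichmentScoreRanked([0, 2], lcor, ordered)
#    print es                                             # -> 1.0 (up to rounding)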
141#from mOrngData
142def shuffleAttribute(data, attribute, locations):
143    """
144    Destructive! Reorders the values of the given attribute in place according to locations.
145    """
146    attribute = data.domain[attribute]
147    l = [None]*len(data)
148    for i in range(len(data)):
149        l[locations[i]] = data[i][attribute]
150    for i in range(len(data)):
151        data[i][attribute] = l[i]
152
153def shuffleClass(datai, rands=0):
154    """
155    Returns a dataset with the values of the class attribute randomly shuffled.
156    If multiple data sets are given, shuffle them all with the same random seed.
157    """
158    def shuffleOne(data):
159        rand = random.Random(rands)
160        d2 = orange.ExampleTable(data.domain, data)
161        locations = range(len(data))
162        rand.shuffle(locations)
163        shuffleAttribute(d2, d2.domain.classVar, locations)
164        return d2
165
166    if iset(datai):
167        return shuffleOne(datai)
168    else:
169        return [ shuffleOne(data) for data in datai ]
170
171def shuffleList(l, rand=random.Random(0)):
172    """
173    Returns a shuffled copy of the input list.
174    """
175    import copy
176    l2 = copy.copy(l)
177    rand.shuffle(l2)
178    return l2
179
180def shuffleAttributes(data, rand=random.Random(0)):
181    """
182    Returns a dataset with a new attribute order.
183    """
184    natts = shuffleList(list(data.domain.attributes), rand)
185    dom2 = orange.Domain(natts, data.domain.classVar)
186    d2 = orange.ExampleTable(dom2, data)
187    return d2
188
189def gseapval(es, esnull):
190    """
191    From article (PNAS):
192    estimate nominal p-value for S from esnull by using the positive
193    or negative portion of the distribution corresponding to the sign
194    of the observed ES(S).
195    """
196   
197    try:
198        if es < 0:
199            return float(len([ a for a in esnull if a <= es ]))/ \
200                len([ a for a in esnull if a < 0])   
201        else: 
202            return float(len([ a for a in esnull if a >= es ]))/ \
203                len([ a for a in esnull if a >= 0])
204    except: #no null scores with the matching sign (division by zero)
205        return 1.0
206
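#Example (illustration only): for es = 0.5 and esnull = [-0.2, 0.1, 0.3, 0.6, -0.5]
#only the non-negative part of the null distribution is used, so the nominal
#p-value is 1/3 - one of the three non-negative null scores is >= 0.5.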
207
208def enrichmentScore(data, subset, rankingf):
209    """
210    Returns enrichment score and running enrichment score.
211    """
212    lcor = rankingf(data)
213    ordered = orderedPointersCorr(lcor)
214    es,l = enrichmentScoreRanked(subset, lcor, ordered)
215    return es,l
216
217def gseaE(data, subsets, rankingf=None, \
218        n=100, permutation="class", **kwargs):
219    """
220    Run the GSEA algorithm on an example table.
221
222    data: orange example table.
223    subsets: list of gene sets, each given as a list of attribute indices.
224    rankingf: function that returns the correlation with the class for
225        each variable.
226    n: number of random permutations used to sample the null distribution.
227    permutation: "class" to permute the class values; any other value
228        permutes the gene ranking instead.
229
230    """
231
232    if not rankingf:
233        rankingf=rankingFromOrangeMeas(MA_signalToNoise())
234
235    enrichmentScores = []
236 
237    lcor = rankingf(data)
238    #print lcor
239
240    ordered = orderedPointersCorr(lcor)
241
242    def rev(l):
243        return numpy.argsort(l)
244
245    rev2 = rev(ordered)
246
247    for subset in subsets:
248        es = enrichmentScoreRanked(subset, lcor, ordered, rev2=rev2)[0]
249        enrichmentScores.append(es)
250
251    runOptCallbacks(kwargs)
252
253    #print "PERMUTATION", permutation
254
255    enrichmentNulls = [ [] for a in range(len(subsets)) ]
256
257    for i in range(n):
258
259        if permutation == "class":
260            d2 = shuffleClass(data, 2000+i) #fixed permutation
261            r2 = rankingf(d2)
262
263        else:
264            r2 = shuffleList(lcor, random.Random(2000+i))
265
266        ordered2 = orderedPointersCorr(r2)
267        rev22 = rev(ordered2)
268        for si,subset in enumerate(subsets):
269            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
270            enrichmentNulls[si].append(esn)
271
272        runOptCallbacks(kwargs)
273
274    return gseaSignificance(enrichmentScores, enrichmentNulls)
275
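#Hedged usage sketch (the file name and indices are made up): given an
#ExampleTable with a discrete two-valued class and gene sets expressed as
#lists of attribute indices, gseaE returns one (ES, NES, p, FDR) tuple per set:
#    data = orange.ExampleTable("expression.tab")
#    subsets = [[0, 3, 5], [1, 2, 7, 9]]
#    for (es, nes, p, fdr) in gseaE(data, subsets, n=100, permutation="class"):
#        print es, nes, p, fdr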
276
277def runOptCallbacks(rargs):
278    if "callback" in rargs:
279        try:
280            [ a() for a in rargs["callback"] ]
281        except:
282            rargs["callback"]()
283           
284
285def gseaR(rankings, subsets, n=100, **kwargs):
286    """
287    """
288
289    if "permutation" in kwargs:
290        if kwargs["permutation"] == "class":
291            raise Exception("Only gene permutation possible")
292
293    enrichmentScores = []
294 
295    ordered = orderedPointersCorr(rankings)
296   
297    def rev(l):
298        return numpy.argsort(l)
299
300    rev2 = rev(ordered)
301
302    for subset in subsets:
303
304        es = enrichmentScoreRanked(subset, rankings, ordered, rev2=rev2)[0]
305        enrichmentScores.append(es)
306   
307    runOptCallbacks(kwargs)
308
309    enrichmentNulls = [ [] for a in range(len(subsets)) ]
310
311    for i in range(n):
312       
313        r2 = shuffleList(rankings, random.Random(2000+i))
314        ordered2 = orderedPointersCorr(r2)
315        rev22 = rev(ordered2)
316
317        for si,subset in enumerate(subsets):
318
319            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
320            enrichmentNulls[si].append(esn)
321
322        runOptCallbacks(kwargs)
323
324    return gseaSignificance(enrichmentScores, enrichmentNulls)
325
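#Hedged usage sketch for the pre-ranked variant (scores are made up): gseaR
#takes one correlation/score per gene instead of an ExampleTable and only
#supports gene permutation:
#    rankings = [0.9, -0.2, 0.5, -0.8]
#    print gseaR(rankings, [[0, 2], [1, 3]], n=100)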
326
327def gseaSignificance(enrichmentScores, enrichmentNulls):
328
329    #print enrichmentScores
330
331    import time
332
333    tb1 = time.time()
334
335    enrichmentPVals = []
336    nEnrichmentScores = []
337    nEnrichmentNulls = []
338
339    for i in range(len(enrichmentScores)):
340        es = enrichmentScores[i]
341        enrNull = enrichmentNulls[i]
342        #print es, enrNull
343
344        enrichmentPVals.append(gseapval(es, enrNull))
345
346        #normalize the ES(S,pi) and the observed ES(S), separately rescaling
347        #the positive and negative scores by dividing them by the mean of the
348        #ES(S,pi)
349
350        #print es, enrNull
351
352        def normalize(s):
353            try:
354                if s == 0:
355                    return 0.0
356                if s >= 0:
357                    meanPos = mean([a for a in enrNull if a >= 0])
358                    #print s, meanPos
359                    return s/meanPos
360                else:
361                    meanNeg = mean([a for a in enrNull if a < 0])
362                    #print s, meanNeg
363                    return -s/meanNeg
364            except:
365                return 0.0 #return 0 if the corresponding mean cannot be computed
366
367
368        nes = normalize(es)
369        nEnrichmentScores.append(nes)
370       
371        nenrNull = [ normalize(s) for s in enrNull ]
372        nEnrichmentNulls.append(nenrNull)
373 
374
375    #print "First part", time.time() - tb1
376
377    #FDR computation
378    #create a histogram of all NES(S,pi) over all S and pi
379    vals = reduce(lambda x,y: x+y, nEnrichmentNulls, [])
380
381
382    def shorten(l, p=10000):
383        """
384        Take every (len(l)/p)-th element, if len(l)/p >= 2.
385        """
386        e = len(l)/p
387        if e <= 1:
388            return l
389        else:
390            return [ l[i] for i in xrange(0, len(l), e) ]
391
392    #vals = shorten(vals) -> this can speed up second part. is it relevant TODO?
393
394    """
395    Use this null distribution to compute an FDR q value, for a given NES(S) =
396    NES* >= 0. The FDR is the percentage of all (S,pi) with
397    NES(S,pi) >= 0 whose NES(S,pi) >= NES*, divided by the percentage of
398    observed S with NES(S) >= 0 whose NES(S) >= NES*, and similarly if NES(S)
399    = NES* <= 0.
400    """
401
402    nvals = numpy.array(sorted(vals))
403    nnes = numpy.array(sorted(nEnrichmentScores))
404
405    #print "LEN VALS", len(vals), len(nEnrichmentScores)
406
407    fdrs = []
408
409    import operator
410
411    for i in range(len(enrichmentScores)):
412
413        nes = nEnrichmentScores[i]
414
415        """
416        #Straightforward but slow implementation follows in comments.
417        #Useful as a description of the code.
418       
419        if nes >= 0:
420            op0 = operator.ge
421            opn = operator.ge
422        else:
423            op0 = operator.lt
424            opn = operator.le
425
426        allPos = [a for a in vals if op0(a,0)]
427        allHigherAndPos = [a for a in allPos if opn(a,nes) ]
428
429        nesPos = [a for a in nEnrichmentScores if op0(a,0) ]
430        nesHigherAndPos = [a for a in nesPos if opn(a,nes) ]
431
432        top = len(allHigherAndPos)/float(len(allPos)) #p value
433        down = len(nesHigherAndPos)/float(len(nesPos))
434       
435        l1 = [ len(allPos), len(allHigherAndPos), len(nesPos), len(nesHigherAndPos)]
436
437        allPos = allHigherAndPos = nesPos =  nesHigherAndPos = 1
438
439        """
440
441        #this could be sped up by a factor of two with the same accuracy!
442        if nes >= 0:
443            allPos = int(len(vals) - numpy.searchsorted(nvals, 0, side="left"))
444            allHigherAndPos = int(len(vals) - numpy.searchsorted(nvals, nes, side="left"))
445            nesPos = len(nnes) - int(numpy.searchsorted(nnes, 0, side="left"))
446            nesHigherAndPos = len(nnes) - int(numpy.searchsorted(nnes, nes, side="left"))
447        else:
448            allPos = int(numpy.searchsorted(nvals, 0, side="left"))
449            allHigherAndPos = int(numpy.searchsorted(nvals, nes, side="right"))
450            nesPos = int(numpy.searchsorted(nnes, 0, side="left"))
451            nesHigherAndPos = int(numpy.searchsorted(nnes, nes, side="right"))
452           
453        """
454        #Comparing results
455        l2 = [ allPos, allHigherAndPos, nesPos, nesHigherAndPos ]
456        diffs = [ l1[i]-l2[i] for i in range(len(l1)) ]
457        sumd = sum( [ abs(a) for a in diffs ] )
458        if sumd > 0:
459            print nes > 0
460            print "orig", l1
461            print "modi", l2
462        """
463
464        try:
465            top = allHigherAndPos/float(allPos) #p value
466            down = nesHigherAndPos/float(nesPos)
467
468            fdrs.append(top/down)
469        except: #division by zero; store a very large FDR instead
470            fdrs.append(1000000000.0)
471   
472    #print "Whole part", time.time() - tb1
473
474    return zip(enrichmentScores, nEnrichmentScores, enrichmentPVals, fdrs)
475
476from . import obiGene
477
478def nth(l,n): return [ a[n] for a in l ]
479
480def itOrFirst(data):
481    """ Returns input if input is of type ExampleTable, else returns first
482    element of the input list """
483    if iset(data):
484        return data
485    else:
486        return data[0]
487
488def wrap_in_list(data):
489    """ Wraps orange.ExampleTable in a list """
490    if iset(data):
491        return [ data ]
492    else:
493        return data
494
495def takeClasses(datai, classValues=None):
496    """
497    Joins class groups as specified by the input pair
498    classValues. Each element of the pair is a list of class
499    values to be merged into the first or the second new class,
500    grouping the classes into two new class values.
501    If classValues is not specified, all classes are kept as they are.
502
503    Input data can be a single data set or a list of data sets
504    with the same domain.
505
506    Returns the transformed data set(s).
507    """
508
509    cv = itOrFirst(datai).domain.classVar
510    nclassvalues = None
511
512    if cv and len(itOrFirst(datai)) > 1:
513        oldcvals = [ a for a in cv.values ]
514       
515        if not classValues:
516            classValues = oldcvals
517
518        toJoin = []
519
520        for vals in classValues:
521            if issequencens(vals):
522                toJoin.append(list(vals))
523            else:
524                toJoin.append([vals])
525
526        classValues = reduce(lambda x,y: x+y, toJoin)
527        classValues = [ str(a) for a in classValues ] # ok class values
528
529        #dictionary of old class -> new class
530        mapval = {}
531        nclassvalues = [] # need to preserve order
532
533        for joinvals in toJoin:
534            joinvalsn = "+".join([ str(val) for val in sorted(joinvals) ])
535            nclassvalues.append(joinvalsn)
536
537            for val in joinvals:
538                mapval[str(val)] = joinvalsn
539
540        #take only examples with classValues classes
541        nclass = orange.EnumVariable(cv.name, values=nclassvalues)
542
543        def transform_class(ex,cv,mapval,classValues):
544            """
545            Removes unnecessary class values and joins them according
546            to function input.
547            """
548            if ex[cv] in classValues:
549                nex = orange.Example(ndom, ex)
550                return nclass(mapval[str(ex[cv].value)])
551            else:
552                return "?"
553
554        nclass.get_value_from = lambda ex,_: transform_class(ex,cv=cv,mapval=mapval,classValues=classValues)
555
556        ndom = orange.Domain(itOrFirst(datai).domain.attributes, nclass)
557
558        def removeAndTransformClasses(data):
559            ndata = Orange.data.Table(ndom, data)
560            ndata = Orange.data.Table(ndom, [ex for ex in ndata if ex[-1].value != "?"])
561            return ndata
562
563        if iset(datai):
564            datai = removeAndTransformClasses(datai)
565        else:
566            datai = [ removeAndTransformClasses(data) for data in datai ]
567
568    return datai
569
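#Illustration (the class names are made up): with classValues = (("c1", "c2"), "c3")
#examples labelled "c1" or "c2" are mapped to a new class value "c1+c2",
#examples labelled "c3" keep the value "c3", and examples with any other
#class value are dropped:
#    joined = takeClasses(data, classValues=(("c1", "c2"), "c3"))
#    print joined.domain.classVar.values                  # -> <c1+c2, c3>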
570def removeBadAttributes(datai, atLeast=3):
571    """
572    Removes attributes which would obscure the GSEA analysis.
573
574    Attributes need to be continuous and need to have
575    at least one known value; other attributes are removed.
576
577    For an attribute to be valid, it also needs to have at least
578    [atLeast] known values for every class value.
579
580    Returns the transformed data set(s) and the ignored attributes.
581    """
582
583    def attrOk(a, data):
584        """
585        An attribute is ok if it is continuous and contains
586        at least atLeast known (not unknown) values.
587        """
588
589        a = data.domain.attributes.index(a)
590
591        #only continuous attributes can be used
592        if data.domain.attributes[a].varType != orange.VarTypes.Continuous:
593            return False
594
595        if len(data) == 1:
596
597            vals = [ex[a].value for ex in data if not ex[a].isSpecial()]
598            if len(vals) < 1:
599                return False 
600       
601        if len(data) > 1 and data.domain.classVar and atLeast > 0:
602           
603            dc = dict( (v, 0) for v in data.domain.classVar.values )
604
605            for ex in data:
606                if not ex[a].isSpecial():
607                    dc[ex[-1].value] += 1
608
609            minl = min(dc.values())
610
611            if minl < atLeast:
612                #print "Less than atLeast"
613                return False
614
615        return True
616   
617
618    def notOkAttributes(data):
619        ignored = []
620        for a in data.domain.attributes:
621            if not attrOk(a, data):
622                #print "Removing", a
623                ignored.append(a)
624        return ignored
625   
626    ignored = []
627    if iset(datai):
628        ignored = set(notOkAttributes(datai))
629    else:
630        #ignore any attribute which has less than atLeast values for some class
631        #ignored = set(reduce(lambda x,y: x+y, [ notOkAttributes(data) for data in datai ]))
632
633        #remove any attribute which is ok in less than half of the data sets
634        ignored = []
635        for a in itOrFirst(datai).domain.attributes:
636            attrOks = sum([ attrOk(a, data) for data in datai ])
637            if attrOks < len(datai)/2:
638                ignored.append(a)
639
640
641    natts = [ a for a in itOrFirst(datai).domain.attributes if a not in ignored ]
642    #print ignored, natts, set(ignored) & set(natts)
643
644    ndom = orange.Domain(natts, itOrFirst(datai).domain.classVar)
645
646    datao = None
647    if iset(datai):
648        datao = orange.ExampleTable(ndom, datai)
649    else:
650        datao = [ orange.ExampleTable(ndom, data) for data in datai ]
651
652    return datao, ignored
653
654def keepOnlyMeanAttrs(datai, atLeast=3, classValues=None):
655    """
656    Attributes need to be continuous and need to have
657    at least one known value.
658
659    For an attribute to be valid, it needs to have at least
660    [atLeast] known values for every class value.
661
662    Keeps only the specified classes and groups them into two values.
663    """   
664    datai = takeClasses(datai, classValues=classValues)
665    return removeBadAttributes(datai, atLeast=atLeast)
666
667def data_single_meas_column(data):
668    """
669    Returns true if data seems to be in one column
670    (float variables) only. This column should contain
671    the rankings.
672    """
673    columns = [a for a in data.domain] +  [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
674    floatvars = [ a for a in columns if a.varType == orange.VarTypes.Continuous ]
675    if len(floatvars) == 1:
676        return True
677    else:
678        return False
679
680def transform_data(data, phenVar, geneVar):
681    """
682    If we have log2 ratios in a single value column, transpose the matrix,
683    i.e. we have a single column with a continuous variable and the first
684    string variable then becomes the gene name.
685
686    The goal is to have the different phenotypes annotated with a class,
687    and the names of genes as attribute names.
688
689    If phenVar is False, no phenotype is used: the input already
690    consists of scores of differential expression.
691
692    If we have a single column, transpose it.
693    If phenVar is one of the groups, transpose the matrix.
694    """
695
696    def prepare_data(data, phenVar=None, geneVar=None):
697
698        def rorq(a, name):
699            """ Group annatation or question mark. """
700            try: 
701                return a.attributes[name]
702            except: 
703                return '?'
704   
705        #use class as phenotype by default, if it is present,
706        #if not, do not use any phenotype!
707        if phenVar == None: 
708            if not data.domain.classVar:
709                phenVar = False
710            else:
711                phenVar = data.domain.classVar
712
713
714        #TODO validate phenVar and geneVar?
715        #TODO autodetection of groups?
716
717        #transpose is not needed if phenVar is classVar or phenVar is False
718        #and there is only one sample
719        if phenVar == data.domain.classVar or \
720            (phenVar == False and len(data) == 1):
721
722            if geneVar == None: #if not specified, set as true in this stage
723                geneVar = True
724
725            floatvars = [ a for a in data.domain.attributes \
726                if a.varType == orange.VarTypes.Continuous ]
727
728            #rename attributes without touching the original variable
729            if geneVar != True:
730                fl2 = []
731
732                for a in floatvars:
733                    na = orange.FloatVariable(name=rorq(a, geneVar))
734                    na.getValueFrom = lambda e, rw: e[a]
735                    fl2.append(na)
736
737                floatvars = fl2
738
739            dom = orange.Domain(floatvars, phenVar)
740            return orange.ExampleTable(dom, data)
741
742        elif phenVar == False or phenVar != data.domain.classVar:
743
744            cands = allgroups(data)
745            pv = False
746            if phenVar != False:
747                pv = orange.EnumVariable(name="phenotype", 
748                    values=list(cands[phenVar]))
749
750            #take the only string attribute as a gene name
751            gc = gene_cands(data, False)
752            if geneVar == None:
753                if len(gc) == 1:
754                    geneVar = gc[0]
755                else:
756                    geneNamesUnspecifiedError()
757           
758            latts = [ orange.FloatVariable(name=ex[geneVar].value) \
759                for ex in data ]
760
761            domain = orange.Domain(latts, pv)
762
763            examples = []
764            for at in data.domain.attributes:
765                if at.varType == orange.VarTypes.Continuous:
766                    vals = [ ex[at].value for ex in data ]
767                    if pv != False: #add class value
768                        vals.append(rorq(at, phenVar))
769                    examples.append(orange.Example(domain, vals))
770
771            return orange.ExampleTable(domain, examples)
772        else:
773            wrongInputsError()
774
775    #transform all example tables
776    single = iset(data)
777    transposed = [ prepare_data(d, phenVar, geneVar) for d in wrap_in_list(data) ]
778
779    if single:
780        return transposed[0]
781    else:
782        return transposed
783
784
785def allgroups(data):
786    """
787    Return all phenotype descriptors of attributes with their values.
788    """
789    sd = defaultdict(set)
790    for attr in data.domain.attributes:
791        for key, value in attr.attributes.items():
792            sd[key].add(value)
793    return sd
794
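#Example (the annotation names are made up): if the attributes carry
#annotations such as {"group": "treated"} or {"group": "control", "time": "t1"},
#then allgroups(data) returns
#    {"group": set(["treated", "control"]), "time": set(["t1"])}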
795def already_have_correlations(data):
796    return len(data) == 1 or data_single_meas_column(data)
797
798def need_to_transpose_single(data):
799    if len(data) == 1:
800        return False
801    else:
802        return True
803
804def phenotype_cands(data):
805    """
806    Return all phenotype candidate descriptors as a list of tuples
807    (variable, values). Candidates are the class variable, if it exists,
808    and the annotation dictionaries of the attributes.
809    Phenotype candidates must contain at least two different values.
810    """
811    if already_have_correlations(data):
812        return [ (False, set()) ]
813    else:
814        cv = []
815        if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
816            cv.append((data.domain.classVar, set(data.domain.classVar.values)))
817        cands = cv + sorted(allgroups(data).items())
818        return filter(lambda x: len(x[1]) >= 2, cands)
819
820def gene_cands(data, correct):
821    """
822    Returns all valid gene descriptors with regard to the chosen
823    phenotype variable.
824    Returns a variable descriptor for variables, the name of the group for
825    descriptions in attr.attributes, and True for the use
826    of attribute names.
827    Correct is True if the example table has genes as attributes.
828    """
829    if correct:
830        #gene names could be in attributes or as gene names (marker True)
831        return [True] + nth(sorted(allgroups(data)),0)
832    else:
833        #gene names are values of some string attribute
834        columns = [a for a in data.domain] +  \
835            [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
836        stringvars = [ a for a in columns if a.varType == 6 ]
837        return stringvars
838
839def is_variable(phenVar):
840    return isinstance(phenVar, orange.Variable)
841
842class GSEA(object):
843
844    def __init__(self, data, organism=None, matcher=None, classValues=None, 
845        atLeast=3, caseSensitive=False, phenVar=None, geneVar=None):
846        """
847        If the data set contains multiple measurements for a single gene,
848        all are considered. Individual contributions of such measurements
849        are not weighted down - each measurement counts as much as if it
850        measured a different gene.
851
852        phenVar and geneVar can either be an orange attribute or a string.
853        If they are strings, they describe a group.
854        """
855
856        self.genesets = {}
857        self.organism = organism
858
859        if organism != None:
860            print "WARNING: obiGsea - organism and caseSensitive parameters are deprecated. Use matcher instead."
861
862        self.gsweights = {}
863        self.namesToIndices = None
864        self.gm = matcher
865
866        data = transform_data(data, phenVar, geneVar)
867
868        data, info = keepOnlyMeanAttrs(data, classValues=classValues, atLeast=atLeast)
869
870        self.data = data
871
872        #init attrnames
873        attrnames = [ a.name for a in itOrFirst(self.data).domain.attributes ]
874
875        if self.gm == None: #build a gene matcher if it does not exist
876            self.gm = obiGene.matcher([obiGene.GMKEGG(self.organism, ignore_case=not caseSensitive)], 
877                ignore_case=not caseSensitive, direct=True)
878            print "WARNING: gene matcher built automatically for organism: " + self.organism
879
880        self.gm.set_targets(attrnames)
881
882 
883    def addGeneset(self, genesetname, genes):
884        """
885        Add a single gene set. See addGenesets function.
886        Solely for backwards compatibility.
887        """
888        self.addGenesets({ genesetname: genes })
889
890    def addGenesets(self, genesets):
891        """
892        Adds gene sets from the input dictionary. Performs gene matching and
893        adds entries to self.genesets: the key is the gene set and its value
894        holds the individual genes with their match results.
895        """
896        for g in obiGeneSets.GeneSets(genesets):
897            genes = g.genes
898            datamatch = filter(lambda x: x[1] != None, 
899                [ (gene, self.gm.umatch(gene)) for gene in genes])
900            self.genesets[g] = datamatch
901
902    def selectGenesets(self, minSize=3, maxSize=1000, minPart=0.1):
903        """ Returns a list of gene sets that have sizes in limits """
904
905        def okSizes(orig, transl):
906            """compares sizes of genesets to limitations"""
907            if len(transl) >= minSize and len(transl) <= maxSize \
908                and float(len(transl))/len(orig) >= minPart:
909                return True
910            return False
911
912        return  dict( (a,c) for a,c in self.genesets.iteritems() if okSizes(a.genes,c) )
913
914    def genesIndices(self, genes):
915        """
916        Returns the attribute indices of the given genes.
917        Caches the name-to-indices dictionary.
918        """
919        if not self.namesToIndices:
920            self.namesToIndices = defaultdict(list)
921            for i,at in enumerate(itOrFirst(self.data).domain.attributes):
922                self.namesToIndices[at.name].append(i)
923        return reduce(lambda x,y:x+y, [ self.namesToIndices[gname] for gname in genes ], [])
924
925    def compute_gene_weights(self, gsweights, gsetsnum, nattributes):
926        """
927        Computes gene set weights for all specified weights.
928        Expects gene sets in form { name: [ num_attributes ] }
929        GSWeights are
930        """
931        pass
932
933    def to_gsetsnum(self, gsets):
934        """
935        Returns a dictionary of the given gene sets in gsetsnum format.
936        """
937        return dict( (gs,self.genesIndices(nth(self.genesets[gs],1))) for gs in gsets)
938
939    def compute(self, minSize=3, maxSize=1000, minPart=0.1, n=100, **kwargs):
940
941        subsetsok = self.selectGenesets(minSize=minSize, maxSize=maxSize, minPart=minPart)
942
943        geneweights = None
944
945        gsetsnum = self.to_gsetsnum(subsetsok.keys())
946        gsetsnumit = gsetsnum.items() #to fix order
947
948        #gsetsnumit = gsetsnumit[:1]
949        #print gsetsnumit
950
951        if len(gsetsnum) == 0:
952            return {} # quick return if no genesets
953
954        if len(self.gsweights) > 0:
955            #set gene set weights
956            geneweights = [1]*len(itOrFirst(self.data).domain.attributes)
957
958        if len(itOrFirst(self.data)) > 1:
959            gseal = gseaE(self.data, nth(gsetsnumit,1), n=n, geneweights=geneweights, **kwargs)
960        else:
961            rankings = [ self.data[0][at].native() for at in self.data.domain.attributes ]
962            gseal = gseaR(rankings, nth(gsetsnumit,1), n=n, **kwargs)
963
964        res = {}
965
966        for gs, gseale in zip(nth(gsetsnumit,0),gseal):
967            rdict = {}
968            rdict['es'] = gseale[0]
969            rdict['nes'] = gseale[1]
970            rdict['p'] = gseale[2]
971            rdict['fdr'] = gseale[3]
972            rdict['size'] = len(gs.genes)
973            rdict['matched_size'] = len(self.genesets[gs])
974            rdict['genes'] = nth(self.genesets[gs],1)
975            res[gs] = rdict
976
977        return res
978
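#Hedged sketch of the class-based interface (the matcher, gene set collection
#and class values are assumptions, mirroring the example at the bottom of the
#module):
#    matcher = obiGene.matcher([obiGene.GMKEGG("hsa")])
#    gso = GSEA(data, matcher=matcher, classValues=["c1", "c2"])
#    gso.addGenesets(obiGeneSets.collections((("KEGG",), "9606")))
#    results = gso.compute(n=100, minSize=3, maxSize=1000, minPart=0.1)
#    for gs, r in results.items():
#        print gs, r["nes"], r["p"], r["fdr"]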
979def runGSEA(data, organism=None, classValues=None, geneSets=None, n=100, 
980        permutation="class", minSize=3, maxSize=1000, minPart=0.1, atLeast=3, 
981        matcher=None, geneVar=None, phenVar=None, caseSensitive=False, 
982        **kwargs):
983    """
984    Convenience wrapper around the GSEA class. phenVar and geneVar specify the phenotype and gene variable.
985    """
986    gso = GSEA(data, organism=organism, matcher=matcher, 
987        classValues=classValues, atLeast=atLeast, caseSensitive=caseSensitive,
988        geneVar=geneVar, phenVar=phenVar)
989    gso.addGenesets(geneSets)
990    res1 = gso.compute(n=n, permutation=permutation, minSize=minSize,
991        maxSize=maxSize, minPart=minPart, **kwargs)
992    return res1
993
994def etForAttribute(datal,a):
995    """
996    Builds an example table for a single attribute across multiple
997    example tables.
998    """
999
1000    tables = len(datal)
1001
1002    def getAttrVals(data, attr):
1003        dom2 = orange.Domain([data.domain[attr]], False)
1004        dataa = orange.ExampleTable(dom2, data)
1005        return [ a[0].native() for a in dataa ]
1006
1007    domainl = []
1008    valuesl = []
1009
1010    for id, data in enumerate(datal):
1011        v = getAttrVals(data,a)
1012        valuesl.append(v)
1013        domainl.append(orange.FloatVariable(name=("v"+str(id))))
1014
1015    classvals = getAttrVals(data, datal[0].domain.classVar)
1016    valuesl += [ classvals ]
1017
1018    dom = orange.Domain(domainl, datal[0].domain.classVar)
1019    examples = [ list(a) for a in zip(*valuesl) ]
1020
1021    datat = orange.ExampleTable(dom, examples)
1022
1023    return datat
1024
1025
1026def evaluateEtWith(fn, *args, **kwargs):
1027    """
1028    fn - a function that evaluates an example table given the
1029    following arguments.
1030    """
1031
1032    def newf(datal):
1033        res = []
1034        for a in datal[0].domain.attributes:
1035            et = etForAttribute(datal, a)
1036            res.append(fn(et, *args, **kwargs))
1037        return res
1038
1039    return newf
1040
1041
1042def hierarchyOutput(results, limitGenes=50):
1043    """
1044    Transforms results for use by hierarchy output from GO.
1045
1046    limitGenes - maximum number of genes on output.
1047    """
1048    trans = []
1049   
1050    for name, res in results.items():
1051        try:
1052            second = name.split(' ')[2]
1053            name = second if second[:2] == 'GO' else name
1054        except:
1055            pass
1056       
1057        trans.append((name, abs(res["nes"]), res["matched_size"], res["size"], res["p"], min(res["fdr"], 1.0), res["genes"][:limitGenes]))
1058
1059    return trans
1060
1061if  __name__=="__main__":
1062
1063    """
1064    Old example with another measure function
1065    data = orange.ExampleTable("gene_three_lines_log.tab")
1066    data = orange.ExampleTable("sterolTalkHepa.tab")
1067    gen1 = collections(['steroltalk.gmt', ':kegg:hsa'], default=False)
1068    rankingf = rankingFromOrangeMeas(MA_anova())
1069    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1070    geneVar = gene_cands(data, False)[1]
1071    phenVar = "group"
1072    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, rankingf=rankingf, phenVar=phenVar, geneVar=geneVar)
1073    """
1074    """
1075    data = orange.ExampleTable("sterolTalkHepa.tab")
1076    gen1 = obiGeneSets.collections('steroltalk.gmt', (("KEGG",), "9606"))
1077    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1078    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher)
1079    """
1080    matcher = obiGene.matcher([[obiGene.GMKEGG('ddi'), obiGene.GMDicty()]])
1081    data = orange.ExampleTable("/home/marko/t_gsea1.tab")
1082    gen1 = obiGeneSets.collections((("KEGG",), "352472"))
1083    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, phenVar="growth")
1084    print out
1085    print "\n".join(map(str,sorted(out.items())))
1086   