# source:orange-bioinformatics/Orange/bioinformatics/obiGsea.py@1625:cefeb35cbfc9

Revision 1625:cefeb35cbfc9, 32.2 KB checked in by mitar, 2 years ago (diff)

Moving files around.

Line
1import orange
2import numpy
3import random
4import time
5from obiExpression import *
6import obiGeneSets
7from collections import defaultdict
8
9"""
10Gene set enrichment analysis.
11
12Author: Marko Toplak
13"""
14
15def iset(data):
16    """
17    Is data orange.ExampleTable?
18    """
19    return isinstance(data, orange.ExampleTable)
20
21def issequencens(x):
22    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and is not string."
23    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
24
25def mean(l):
26    return float(sum(l))/len(l)
27
28def rankingFromOrangeMeas(meas):
29    """
30    Creates a function that sequentally ranks all attributes and returns
31    results in a list. Ranking function is build out of
32    orange.MeasureAttribute.
33    """
34    return lambda d: [ meas(i,d) for i in range(len(d.domain.attributes)) ]
35
36def orderedPointersCorr(lcor):
37    """
38    Return a list of integers: indexes in original
39    lcor. Elements in the list are ordered by
40    their lcor[i] value. Higher correlations first.
41    """
42    ordered = [ (i,a) for i,a in enumerate(lcor) ] #original pos + correlation
43    ordered.sort(lambda x,y: cmp(y[1],x[1])) #sort by correlation, descending
44    ordered = nth(ordered, 0) #contains positions in the original list
45    return ordered
46
47def enrichmentScoreRanked(subset, lcor, ordered, p=1.0, rev2=None):
48    """
49    Input data and subset.
50
51    subset: list of attribute indices of the input data belonging
52        to the same set.
53    lcor: correlations with class for each attribute in a list.
54
55    Returns enrichment score on given data.
56
57    This implementation efficiently handles "sparse" genesets (that
58    cover only a small subset of all genes in the dataset).
59    """
60
61    #print lcor
62
63    subset = set(subset)
64
65    if rev2 == None:
66        def rev(l):
67            return numpy.argsort(l)
68        rev2 = rev(ordered)
69
70    #add if gene is not in the subset
71    notInA = -(1. / (len(lcor)-len(subset)))
72    #base for addition if gene is in the subset
73    cors = [ abs(lcor[i])**p for i in subset ]
74    sumcors = sum(cors)
75
76    #this should not happen
77    if sumcors == 0.0:
78        return (0.0, None)
79
80    inAb = 1./sumcors
81
82    ess = [0.0]
83
84    map = {}
85    for i in subset:
86        orderedpos = rev2[i]
87        map[orderedpos] = inAb*abs(lcor[i]**p)
88
89    last = 0
90
91    maxSum = minSum = csum = 0.0
92
93    for a,b in sorted(map.items()):
94        diff = a-last
95        csum += notInA*diff
96        last = a+1
97
98        if csum < minSum:
99            minSum = csum
100
101        csum += b
102
103        if csum > maxSum:
104            maxSum = csum
105
106    #finish it
107    diff = (len(ordered))-last
108    csum += notInA*diff
109
110    if csum < minSum:
111        minSum = csum
112
113    #print "MY", (maxSum if abs(maxSum) > abs(minSum) else minSum)
114
115    """
116    #BY DEFINITION
117    print "subset", subset
118
119    for i in ordered:
120        ess.append(ess[-1] + \
121            (inAb*abs(lcor[i]**p) if i in subset else notInA)
122        )
123        if i in subset:
124            print ess[-2], ess[-1]
125            print i, (inAb*abs(lcor[i]**p))
126
127    maxEs = max(ess)
128    minEs = min(ess)
129
130    print "REAL", (maxEs if abs(maxEs) > abs(minEs) else minEs, ess[1:])
131
132    """
133    return (maxSum if abs(maxSum) > abs(minSum) else minSum, [])
134
135#from mOrngData
136def shuffleAttribute(data, attribute, locations):
137    """
138    Destructive!
139    """
140    attribute = data.domain[attribute]
141    l = [None]*len(data)
142    for i in range(len(data)):
143        l[locations[i]] = data[i][attribute]
144    for i in range(len(data)):
145        data[i][attribute] = l[i]
146
147def shuffleClass(datai, rands=0):
148    """
149    Returns a dataset with values of class attribute randomly shuffled.
150    If multiple dataset are on input shuffle them all with the same random seed.
151    """
152    def shuffleOne(data):
153        rand = random.Random(rands)
154        d2 = orange.ExampleTable(data.domain, data)
155        locations = range(len(data))
156        rand.shuffle(locations)
157        shuffleAttribute(d2, d2.domain.classVar, locations)
158        return d2
159
160    if iset(datai):
161        return shuffleOne(datai)
162    else:
163        return [ shuffleOne(data) for data in datai ]
164
165def shuffleList(l, rand=random.Random(0)):
166    """
167    Returns a copy of a shuffled input list.
168    """
169    import copy
170    l2 = copy.copy(l)
171    rand.shuffle(l2)
172    return l2
173
174def shuffleAttributes(data, rand=random.Random(0)):
175    """
176    Returns a dataset with a new attribute order.
177    """
178    natts = shuffleList(list(data.domain.attributes), rand)
179    dom2 = orange.Domain(natts, data.domain.classVar)
180    d2 = orange.ExampleTable(dom2, data)
181    return d2
182
183def gseapval(es, esnull):
184    """
185    From article (PNAS):
186    estimate nominal p-value for S from esnull by using the positive
187    or negative portion of the distribution corresponding to the sign
188    of the observed ES(S).
189    """
190
191    try:
192        if es < 0:
193            return float(len([ a for a in esnull if a <= es ]))/ \
194                len([ a for a in esnull if a < 0])
195        else:
196            return float(len([ a for a in esnull if a >= es ]))/ \
197                len([ a for a in esnull if a >= 0])
198    except:
199        return 1.0
200
201
202def enrichmentScore(data, subset, rankingf):
203    """
204    Returns enrichment score and running enrichment score.
205    """
206    lcor = rankingf(data)
207    ordered = orderedPointersCorr(lcor)
208    es,l = enrichmentScoreRanked(subset, lcor, ordered)
209    return es,l
210
211def gseaE(data, subsets, rankingf=None, \
212        n=100, permutation="class", **kwargs):
213    """
214    Run GSEA algorithm on an example table.
215
216    data: orange example table.
217    subsets: list of distinct subsets of data.
218    rankingf: function that returns correlation to class of each
219        variable.
220    n: number of random permutations to sample null distribution.
221    permutation: "class" for permutating class, else permutate attribute
222        order.
223
224    """
225
226    if not rankingf:
227        rankingf=rankingFromOrangeMeas(MA_signalToNoise())
228
229    enrichmentScores = []
230
231    lcor = rankingf(data)
232    #print lcor
233
234    ordered = orderedPointersCorr(lcor)
235
236    def rev(l):
237        return numpy.argsort(l)
238
239    rev2 = rev(ordered)
240
241    for subset in subsets:
242        es = enrichmentScoreRanked(subset, lcor, ordered, rev2=rev2)[0]
243        enrichmentScores.append(es)
244
245    runOptCallbacks(kwargs)
246
247    #print "PERMUTATION", permutation
248
249    enrichmentNulls = [ [] for a in range(len(subsets)) ]
250
251    for i in range(n):
252
253        if permutation == "class":
254            d2 = shuffleClass(data, 2000+i) #fixed permutation
255            r2 = rankingf(d2)
256
257        else:
258            r2 = shuffleList(lcor, random.Random(2000+i))
259
260        ordered2 = orderedPointersCorr(r2)
261        rev22 = rev(ordered2)
262        for si,subset in enumerate(subsets):
263            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
264            enrichmentNulls[si].append(esn)
265
266        runOptCallbacks(kwargs)
267
268    return gseaSignificance(enrichmentScores, enrichmentNulls)
269
270
271def runOptCallbacks(rargs):
272    if "callback" in rargs:
273        try:
274            [ a() for a in rargs["callback"] ]
275        except:
276            rargs["callback"]()
277
278
279def gseaR(rankings, subsets, n=100, **kwargs):
280    """
281    """
282
283    if "permutation" in kwargs:
284        if kwargs["permutation"] == "class":
285            raise Exception("Only gene permutation possible")
286
287    enrichmentScores = []
288
289    ordered = orderedPointersCorr(rankings)
290
291    def rev(l):
292        return numpy.argsort(l)
293
294    rev2 = rev(ordered)
295
296    for subset in subsets:
297
298        es = enrichmentScoreRanked(subset, rankings, ordered, rev2=rev2)[0]
299        enrichmentScores.append(es)
300
301    runOptCallbacks(kwargs)
302
303    enrichmentNulls = [ [] for a in range(len(subsets)) ]
304
305    for i in range(n):
306
307        r2 = shuffleList(rankings, random.Random(2000+i))
308        ordered2 = orderedPointersCorr(r2)
309        rev22 = rev(ordered2)
310
311        for si,subset in enumerate(subsets):
312
313            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
314            enrichmentNulls[si].append(esn)
315
316        runOptCallbacks(kwargs)
317
318    return gseaSignificance(enrichmentScores, enrichmentNulls)
319
320
321def gseaSignificance(enrichmentScores, enrichmentNulls):
322
323    #print enrichmentScores
324
325    import time
326
327    tb1 = time.time()
328
329    enrichmentPVals = []
330    nEnrichmentScores = []
331    nEnrichmentNulls = []
332
333    for i in range(len(enrichmentScores)):
334        es = enrichmentScores[i]
335        enrNull = enrichmentNulls[i]
336        #print es, enrNull
337
338        enrichmentPVals.append(gseapval(es, enrNull))
339
340        #normalize the ES(S,pi) and the observed ES(S), separetely rescaling
341        #the positive and negative scores by divident by the mean of the
342        #ES(S,pi)
343
344        #print es, enrNull
345
346        def normalize(s):
347            try:
348                if s == 0:
349                    return 0.0
350                if s >= 0:
351                    meanPos = mean([a for a in enrNull if a >= 0])
352                    #print s, meanPos
353                    return s/meanPos
354                else:
355                    meanNeg = mean([a for a in enrNull if a < 0])
356                    #print s, meanNeg
357                    return -s/meanNeg
358            except:
359                return 0.0 #return if according mean value is uncalculable
360
361
362        nes = normalize(es)
363        nEnrichmentScores.append(nes)
364
365        nenrNull = [ normalize(s) for s in enrNull ]
366        nEnrichmentNulls.append(nenrNull)
367
368
369    #print "First part", time.time() - tb1
370
371    #FDR computation
372    #create a histogram of all NES(S,pi) over all S and pi
373    vals = reduce(lambda x,y: x+y, nEnrichmentNulls, [])
374
375
376    def shorten(l, p=10000):
377        """
378        Take each len(l)/p element, if len(l)/p >= 2.
379        """
380        e = len(l)/p
381        if e <= 1:
382            return l
383        else:
384            return [ l[i] for i in xrange(0, len(l), e) ]
385
386    #vals = shorten(vals) -> this can speed up second part. is it relevant TODO?
387
388    """
389    Use this null distribution to compute an FDR q value, for a given NES(S) =
390    NES* >= 0. The FDR is the ratio of the percantage of all (S,pi) with
391    NES(S,pi) >= 0, whose NES(S,pi) >= NES*, divided by the percentage of
392    observed S wih NES(S) >= 0, whose NES(S) >= NES*, and similarly if NES(S)
393    = NES* <= 0.
394    """
395
396    nvals = numpy.array(sorted(vals))
397    nnes = numpy.array(sorted(nEnrichmentScores))
398
399    #print "LEN VALS", len(vals), len(nEnrichmentScores)
400
401    fdrs = []
402
403    import operator
404
405    for i in range(len(enrichmentScores)):
406
407        nes = nEnrichmentScores[i]
408
409        """
410        #Strighfoward but slow implementation follows in comments.
411        #Useful as code description.
412
413        if nes >= 0:
414            op0 = operator.ge
415            opn = operator.ge
416        else:
417            op0 = operator.lt
418            opn = operator.le
419
420        allPos = [a for a in vals if op0(a,0)]
421        allHigherAndPos = [a for a in allPos if opn(a,nes) ]
422
423        nesPos = [a for a in nEnrichmentScores if op0(a,0) ]
424        nesHigherAndPos = [a for a in nesPos if opn(a,nes) ]
425
426        top = len(allHigherAndPos)/float(len(allPos)) #p value
427        down = len(nesHigherAndPos)/float(len(nesPos))
428
429        l1 = [ len(allPos), len(allHigherAndPos), len(nesPos), len(nesHigherAndPos)]
430
431        allPos = allHigherAndPos = nesPos =  nesHigherAndPos = 1
432
433        """
434
435        #this could be speed up twice with the same accuracy!
436        if nes >= 0:
437            allPos = int(len(vals) - numpy.searchsorted(nvals, 0, side="left"))
438            allHigherAndPos = int(len(vals) - numpy.searchsorted(nvals, nes, side="left"))
439            nesPos = len(nnes) - int(numpy.searchsorted(nnes, 0, side="left"))
440            nesHigherAndPos = len(nnes) - int(numpy.searchsorted(nnes, nes, side="left"))
441        else:
442            allPos = int(numpy.searchsorted(nvals, 0, side="left"))
443            allHigherAndPos = int(numpy.searchsorted(nvals, nes, side="right"))
444            nesPos = int(numpy.searchsorted(nnes, 0, side="left"))
445            nesHigherAndPos = int(numpy.searchsorted(nnes, nes, side="right"))
446
447        """
448        #Comparing results
449        l2 = [ allPos, allHigherAndPos, nesPos, nesHigherAndPos ]
450        diffs = [ l1[i]-l2[i] for i in range(len(l1)) ]
451        sumd = sum( [ abs(a) for a in diffs ] )
452        if sumd > 0:
453            print nes > 0
454            print "orig", l1
455            print "modi", l2
456        """
457
458        try:
459            top = allHigherAndPos/float(allPos) #p value
460            down = nesHigherAndPos/float(nesPos)
461
462            fdrs.append(top/down)
463        except:
464            fdrs.append(1000000000.0)
465
466    #print "Whole part", time.time() - tb1
467
468    return zip(enrichmentScores, nEnrichmentScores, enrichmentPVals, fdrs)
469
470import obiGene
471
472def nth(l,n): return [ a[n] for a in l ]
473
474def itOrFirst(data):
475    """ Returns input if input is of type ExampleTable, else returns first
476    element of the input list """
477    if iset(data):
478        return data
479    else:
480        return data[0]
481
482def wrap_in_list(data):
483    """ Wraps orange.ExampleTable in a list """
484    if iset(data):
485        return [ data ]
486    else:
487        return data
488
489def takeClasses(datai, classValues=None):
490    """
491    Function joins class groups specified in an input pair
492    classValues. Each element of the pair is a list of class
493    values to be joined to first or second class. Group
494    classes in two new class values.
495    If class values are not specified, take all the classes.
496
497    Input data can be a single data set or a list of data sets
498    with the same domain.
499
500    Returns transformed data sets / data sets.
501    """
502
503    cv = itOrFirst(datai).domain.classVar
504    nclassvalues = None
505
506    if cv and len(itOrFirst(datai)) > 1:
507        oldcvals = [ a for a in cv.values ]
508
509        if not classValues:
510            classValues = oldcvals
511
512        toJoin = []
513
514        for vals in classValues:
515            if issequencens(vals):
516                toJoin.append(list(vals))
517            else:
518                toJoin.append([vals])
519
520        classValues = reduce(lambda x,y: x+y, toJoin)
521        classValues = [ str(a) for a in classValues ] # ok class values
522
523        #dictionary of old class -> new class
524        mapval = {}
525        nclassvalues = [] # need to preserver order
526
527        for joinvals in toJoin:
528            joinvalsn = "+".join([ str(val) for val in sorted(joinvals) ])
529            nclassvalues.append(joinvalsn)
530
531            for val in joinvals:
532                mapval[str(val)] = joinvalsn
533
534        #take only examples with classValues classes
535        nclass = orange.EnumVariable(cv.name, values=nclassvalues)
536        ndom = orange.Domain(itOrFirst(datai).domain.attributes, nclass)
537
538        def removeAndTransformClasses(data):
539            """
540            Removes unnecessary class values and joins them according
541            to function input.
542            """
543            examples = []
544            for ex in data:
545                if ex[cv] in classValues:
546                    nex = orange.Example(ndom, ex)
547                    nex[-1] = mapval[str(ex[cv].value)]
548                    examples.append(nex)
549
550            return orange.ExampleTable(ndom, examples)
551
552        if iset(datai):
553            datai = removeAndTransformClasses(datai)
554        else:
555            datai = [ removeAndTransformClasses(data) for data in datai ]
556
557    return datai
558
559def removeBadAttributes(datai, atLeast=3):
560    """
561    Removes attributes which would obscure GSEA analysis.
562
563    Attributes need to be continuous, they need to have
564    at least one value. Remove other attributes.
565
566    For the attribute to be valid, it needs to have at least
567    [atLeast] values for every class value.
568
569    Return transformed data set / data sets and ignored attributes.
570    """
571
572    def attrOk(a, data):
573        """
574        Attribute is ok if it is continouous and if containg
575        at least atLest not unknown values.
576        """
577
578        a = data.domain.attributes.index(a)
579
580        #can't
581        if data.domain.attributes[a].varType != orange.VarTypes.Continuous:
582            return False
583
584        if len(data) == 1:
585
586            vals = [ex[a].value for ex in data if not ex[a].isSpecial()]
587            if len(vals) < 1:
588                return False
589
590        if len(data) > 1 and data.domain.classVar and atLeast > 0:
591
592            dc = dict( (v, 0) for v in data.domain.classVar.values )
593
594            for ex in data:
595                if not ex[a].isSpecial():
596                    dc[ex[-1].value] += 1
597
598            minl = min(dc.values())
599
600            if minl < atLeast:
601                #print "Less than atLeast"
602                return False
603
604        return True
605
606
607    def notOkAttributes(data):
608        ignored = []
609        for a in data.domain.attributes:
610            if not attrOk(a, data):
611                #print "Removing", a
612                ignored.append(a)
613        return ignored
614
615    ignored = []
616    if iset(datai):
617        ignored = set(notOkAttributes(datai))
618    else:
619        #ignore any attribute which is has less than atLeast values for each class
620        #ignored = set(reduce(lambda x,y: x+y, [ notOkAttributes(data) for data in datai ]))
621
622        #remove any attribute, which is ok in less than half of the dataset
623        ignored = []
624        for a in itOrFirst(datai).domain.attributes:
625            attrOks = sum([ attrOk(a, data) for data in datai ])
626            if attrOks < len(datai)/2:
627                ignored.append(a)
628
629
630    natts = [ a for a in itOrFirst(datai).domain.attributes if a not in ignored ]
631    #print ignored, natts, set(ignored) & set(natts)
632
633    ndom = orange.Domain(natts, itOrFirst(datai).domain.classVar)
634
635    datao = None
636    if iset(datai):
637        datao = orange.ExampleTable(ndom, datai)
638    else:
639        datao = [ orange.ExampleTable(ndom, data) for data in datai ]
640
641    return datao, ignored
642
643def keepOnlyMeanAttrs(datai, atLeast=3, classValues=None):
644    """
645    Attributes need to be continuous, they need to have
646    at least one value.
647
648    In order of attribute to be valid, it needs to have at least
649    [atLeast] values for every class value.
650
651    Keep only specified classes - group them in two values.
652    """
653    datai = takeClasses(datai, classValues=classValues)
654    return removeBadAttributes(datai, atLeast=atLeast)
655
656def data_single_meas_column(data):
657    """
658    Returns true if data seems to be in one column
659    (float variables) only. This column should contain
660    the rankings
661    """
662    columns = [a for a in data.domain] +  [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
663    floatvars = [ a for a in columns if a.varType == orange.VarTypes.Continuous ]
664    if len(floatvars) == 1:
665        return True
666    else:
667        return False
668
669def transform_data(data, phenVar, geneVar):
670    """
671    if we have log2ratio in a single value column, transpose the matrix
672    i.e. we have a single column with a continous variable. first
673    string variable then becomes the gene name
674
675    The goal is to have different phenotypes annotated with a class,
676    and names of genes as attribute names.
677
678    If phenVar is False, then we can work, then the input already
679    consists of scores of differential expressions
680
681    If we have a single column, transpose it.
682    If phenVar is one of the groups, transpose the matrix.
683    """
684
685    def prepare_data(data, phenVar=None, geneVar=None):
686
687        def rorq(a, name):
688            """ Group annatation or question mark. """
689            try:
690                return a.attributes[name]
691            except:
692                return '?'
693
694        #use class as phenotype by default, if it is present,
695        #if not, do not use any phenotype!
696        if phenVar == None:
697            if not data.domain.classVar:
698                phenVar = False
699            else:
700                phenVar = data.domain.classVar
701
702
703        #TODO validate phenVar and geneVar?
704        #TODO autodetection of groups?
705
706        #transpose is not needed if phenVar is classVar or phenVar is False
707        #and there is only one sample
708        if phenVar == data.domain.classVar or \
709            (phenVar == False and len(data) == 1):
710
711            if geneVar == None: #if not specified, set as true in this stage
712                geneVar = True
713
714            floatvars = [ a for a in data.domain.attributes \
715                if a.varType == orange.VarTypes.Continuous ]
716
717            #rename attributes without touching the original variable
718            if geneVar != True:
719                fl2 = []
720
721                for a in floatvars:
722                    na = orange.FloatVariable(name=rorq(a, geneVar))
723                    na.getValueFrom = lambda e, rw: e[a]
724                    fl2.append(na)
725
726                floatvars = fl2
727
728            dom = orange.Domain(floatvars, phenVar)
729            return orange.ExampleTable(dom, data)
730
731        elif phenVar == False or phenVar != data.domain.classVar:
732
733            cands = allgroups(data)
734            pv = False
735            if phenVar != False:
736                pv = orange.EnumVariable(name="phenotype",
737                    values=list(cands[phenVar]))
738
739            #take the only string attribute as a gene name
740            gc = gene_cands(data, False)
741            if geneVar == None:
742                if len(gc) == 1:
743                    geneVar = gc[0]
744                else:
745                    geneNamesUnspecifiedError()
746
747            latts = [ orange.FloatVariable(name=ex[geneVar].value) \
748                for ex in data ]
749
750            domain = orange.Domain(latts, pv)
751
752            examples = []
753            for at in data.domain.attributes:
754                if at.varType == orange.VarTypes.Continuous:
755                    vals = [ ex[at].value for ex in data ]
756                    if pv != False: #add class value
757                        vals.append(rorq(at, phenVar))
758                    examples.append(orange.Example(domain, vals))
759
760            return orange.ExampleTable(domain, examples)
761        else:
762            wrongInputsError()
763
764    #transform all example tables
765    single = iset(data)
766    transposed = [ prepare_data(d, phenVar, geneVar) for d in wrap_in_list(data) ]
767
768    if single:
769        return transposed[0]
770    else:
771        return transposed
772
773
774def allgroups(data):
775    """
776    Return all phenotype descriptors of attributes with their values.
777    """
778    sd = defaultdict(set)
779    for attr in data.domain.attributes:
780        for key, value in attr.attributes.items():
781            sd[key].add(value)
782    return sd
783
784def already_have_correlations(data):
785    return len(data) == 1 or data_single_meas_column(data)
786
787def need_to_transpose_single(data):
788    if len(data) == 1:
789        return False
790    else:
791        return True
792
793def phenotype_cands(data):
794    """
795    Return all phenotype candidate descriptors in a list of tuples
796    (variable, values). Candidates are class variable, if it exists and
797    attributes dictionaries of attributes.
798    Phenotype candidates must contain at least two differend values.
799    """
800    if already_have_correlations(data):
801        return [ (False, set()) ]
802    else:
803        cv = []
804        if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
805            cv.append((data.domain.classVar, set(data.domain.classVar.values)))
806        cands = cv + sorted(allgroups(data).items())
807        return filter(lambda x: len(x[1]) >= 2, cands)
808
809def gene_cands(data, correct):
810    """
811    Returns all valid gene descriptors with regards to the choosen
812    phenotype variable.
813    Return variable descriptor for variables, name of the group for
814    descriptions in attr.attributes and True for the usage
815    of attribute names.
816    Correct is True, if the example table has genes as attributes.
817    """
818    if correct:
819        #gene names could be in attributes or as gene names (marker True)
820        return [True] + nth(sorted(allgroups(data)),0)
821    else:
822        #gene names are values of some string attribute
823        columns = [a for a in data.domain] +  \
824            [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
825        stringvars = [ a for a in columns if a.varType == 6 ]
826        return stringvars
827
828def is_variable(phenVar):
829    return isinstance(phenVar, orange.Variable)
830
831class GSEA(object):
832
833    def __init__(self, data, organism=None, matcher=None, classValues=None,
834        atLeast=3, caseSensitive=False, phenVar=None, geneVar=None):
835        """
836        If the data set constains multiple measurements for a single gene,
837        all are considered. Individual constributions of such measurements
838        are not weighted down - each measurement is as important as they
839        would measure different genes.
840
841        phenVar and geneVar can ether be an orange attribute or a string.
842        If they are strings, then they describe a group.
843        """
844
845        self.genesets = {}
846        self.organism = organism
847
848        if organism != None:
849            print "WARNING: obiGsea - organism and caseSensitive parameters are deprecated. Use matcher instead."
850
851        self.gsweights = {}
852        self.namesToIndices = None
853        self.gm = matcher
854
855        data = transform_data(data, phenVar, geneVar)
856
857        data, info = keepOnlyMeanAttrs(data, classValues=classValues, atLeast=atLeast)
858
859        self.data = data
860
861        #init attrnames
862        attrnames = [ a.name for a in itOrFirst(self.data).domain.attributes ]
863
864        if self.gm == None: #build a gene matcher, if if does not exists
865            self.gm = obiGene.matcher([obiGene.GMKEGG(self.organism, ignore_case=not caseSensitive)],
866                ignore_case=not caseSensitive, direct=True)
867            print "WARNING: gene matcher build automatically for organism: " + self.organism
868
869        self.gm.set_targets(attrnames)
870
871
872    def addGeneset(self, genesetname, genes):
873        """
874        Add a single gene set. See addGenesets function.
875        Solely for backwards compatibility.
876        """
877        self.addGenesets({ genesetname: genes })
878
879    def addGenesets(self, genesets):
880        """
881        Adds genesets from input dictionary. Performs gene matching. Adds
882        to a self.genesets: key is genesetname, it's values are individual
883        genes and match results.
884        """
885        for g in obiGeneSets.GeneSets(genesets):
886            genes = g.genes
887            datamatch = filter(lambda x: x[1] != None,
888                [ (gene, self.gm.umatch(gene)) for gene in genes])
889            self.genesets[g] = datamatch
890
891    def selectGenesets(self, minSize=3, maxSize=1000, minPart=0.1):
892        """ Returns a list of gene sets that have sizes in limits """
893
894        def okSizes(orig, transl):
895            """compares sizes of genesets to limitations"""
896            if len(transl) >= minSize and len(transl) <= maxSize \
897                and float(len(transl))/len(orig) >= minPart:
898                return True
899            return False
900
901        return  dict( (a,c) for a,c in self.genesets.iteritems() if okSizes(a.genes,c) )
902
903    def genesIndices(self, genes):
904        """
905        Returns in attribute indices of given genes.
906        Buffers locations dictionary.
907        """
908        if not self.namesToIndices:
909            self.namesToIndices = defaultdict(list)
910            for i,at in enumerate(itOrFirst(self.data).domain.attributes):
911                self.namesToIndices[at.name].append(i)
912        return reduce(lambda x,y:x+y, [ self.namesToIndices[gname] for gname in genes ], [])
913
914    def compute_gene_weights(self, gsweights, gsetsnum, nattributes):
915        """
916        Computes gene set weights for all specified weights.
917        Expects gene sets in form { name: [ num_attributes ] }
918        GSWeights are
919        """
920        pass
921
922    def to_gsetsnum(self, gsets):
923        """
924        Returns a dictionary of given  gene sets in gsetnums format.
925        """
926        return dict( (gs,self.genesIndices(nth(self.genesets[gs],1))) for gs in gsets)
927
928    def compute(self, minSize=3, maxSize=1000, minPart=0.1, n=100, **kwargs):
929
930        subsetsok = self.selectGenesets(minSize=minSize, maxSize=maxSize, minPart=minPart)
931
932        geneweights = None
933
934        gsetsnum = self.to_gsetsnum(subsetsok.keys())
935        gsetsnumit = gsetsnum.items() #to fix order
936
937        #gsetsnumit = gsetsnumit[:1]
938        #print gsetsnumit
939
940        if len(gsetsnum) == 0:
941            return {} # quick return if no genesets
942
943        if len(self.gsweights) > 0:
944            #set geneset
945            geneweights = [1]*len(data.domain.attributes)
946
947        if len(itOrFirst(self.data)) > 1:
948            gseal = gseaE(self.data, nth(gsetsnumit,1), n=n, geneweights=geneweights, **kwargs)
949        else:
950            rankings = [ self.data[0][at].native() for at in self.data.domain.attributes ]
951            gseal = gseaR(rankings, nth(gsetsnumit,1), n=n, **kwargs)
952
953        res = {}
954
955        for gs, gseale in zip(nth(gsetsnumit,0),gseal):
956            rdict = {}
957            rdict['es'] = gseale[0]
958            rdict['nes'] = gseale[1]
959            rdict['p'] = gseale[2]
960            rdict['fdr'] = gseale[3]
961            rdict['size'] = len(gs.genes)
962            rdict['matched_size'] = len(self.genesets[gs])
963            rdict['genes'] = nth(self.genesets[gs],1)
964            res[gs] = rdict
965
966        return res
967
968def runGSEA(data, organism=None, classValues=None, geneSets=None, n=100,
969        permutation="class", minSize=3, maxSize=1000, minPart=0.1, atLeast=3,
970        matcher=None, geneVar=None, phenVar=None, caseSensitive=False,
971        **kwargs):
972    """
973    phenVar and geneVar specify the phenotype and gene variable.
974    """
975    gso = GSEA(data, organism=organism, matcher=matcher,
976        classValues=classValues, atLeast=atLeast, caseSensitive=caseSensitive,
977        geneVar=geneVar, phenVar=phenVar)
978    gso.addGenesets(geneSets)
979    res1 = gso.compute(n=n, permutation=permutation, minSize=minSize,
980        maxSize=maxSize, minPart=minPart, **kwargs)
981    return res1
982
983def etForAttribute(datal,a):
984    """
985    Builds an example table for a single attribute across multiple
986    example tables.
987    """
988
989    tables = len(datal)
990
991    def getAttrVals(data, attr):
992        dom2 = orange.Domain([data.domain[attr]], False)
993        dataa = orange.ExampleTable(dom2, data)
994        return [ a[0].native() for a in dataa ]
995
996    domainl = []
997    valuesl = []
998
999    for id, data in enumerate(datal):
1000        v = getAttrVals(data,a)
1001        valuesl.append(v)
1002        domainl.append(orange.FloatVariable(name=("v"+str(id))))
1003
1004    classvals = getAttrVals(data, datal[0].domain.classVar)
1005    valuesl += [ classvals ]
1006
1007    dom = orange.Domain(domainl, datal[0].domain.classVar)
1008    examples = [ list(a) for a in zip(*valuesl) ]
1009
1010    datat = orange.ExampleTable(dom, examples)
1011
1012    return datat
1013
1014
1015def evaluateEtWith(fn, *args, **kwargs):
1016    """
1017    fn - evaluates example table given
1018    following arguments.
1019    """
1020
1021    def newf(datal):
1022        res = []
1023        for a in datal[0].domain.attributes:
1024            et = etForAttribute(datal, a)
1025            res.append(fn(et, *args, **kwargs))
1026        return res
1027
1028    return newf
1029
1030
1031def hierarchyOutput(results, limitGenes=50):
1032    """
1033    Transforms results for use by hierarchy output from GO.
1034
1035    limitGenes - maximum number of genes on output.
1036    """
1037    trans = []
1038
1039    for name, res in results.items():
1040        try:
1041            second = name.split(' ')[2]
1042            name = second if second[:2] == 'GO' else name
1043        except:
1044            pass
1045
1046        trans.append((name, abs(res["nes"]), res["matched_size"], res["size"], res["p"], min(res["fdr"], 1.0), res["genes"][:limitGenes]))
1047
1048    return trans
1049
1050if  __name__=="__main__":
1051
1052    """
1053    Old example with another measure function
1054    data = orange.ExampleTable("gene_three_lines_log.tab")
1055    data = orange.ExampleTable("sterolTalkHepa.tab")
1056    gen1 = collections(['steroltalk.gmt', ':kegg:hsa'], default=False)
1057    rankingf = rankingFromOrangeMeas(MA_anova())
1058    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1059    geneVar = gene_cands(data, False)[1]
1060    phenVar = "group"
1061    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, rankingf=rankingf, phenVar=phenVar, geneVar=geneVar)
1062    """
1063    """
1064    data = orange.ExampleTable("sterolTalkHepa.tab")
1065    gen1 = obiGeneSets.collections('steroltalk.gmt', (("KEGG",), "9606"))
1066    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1067    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher)
1068    """
1069    matcher = obiGene.matcher([[obiGene.GMKEGG('ddi'), obiGene.GMDicty()]])
1070    data = orange.ExampleTable("/home/marko/t_gsea1.tab")
1071    gen1 = obiGeneSets.collections((("KEGG",), "352472"))
1072    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, phenVar="growth")
1073    print out
1074    print "\n".join(map(str,sorted(out.items())))
1075
Note: See TracBrowser for help on using the repository browser.