source: orange-bioinformatics/obiGsea.py @ 934:294df66d8619

Revision 934:294df66d8619, 29.7 KB checked in by markotoplak, 4 years ago (diff)

obiGsea: a step towards using "transposed" datasets.

Line 
1import orange
2import numpy
3import random
4import time
5from obiExpression import *
6from obiGeneSets import *
7from collections import defaultdict
8
9"""
10Gene set enrichment analysis.
11
12Author: Marko Toplak
13"""
14
15def iset(data):
16    """
17    Is data orange.ExampleTable?
18    """
19    return isinstance(data, orange.ExampleTable)
20
21def issequencens(x):
22    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and is not string."
23    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
24
25def mean(l):
26    return float(sum(l))/len(l)
27
28def rankingFromOrangeMeas(meas):
29    """
30    Creates a function that sequentally ranks all attributes and returns
31    results in a list. Ranking function is build out of
32    orange.MeasureAttribute.
33    """
34    return lambda d: [ meas(i,d) for i in range(len(d.domain.attributes)) ]
35
36def orderedPointersCorr(lcor):
37    """
38    Return a list of integers: indexes in original
39    lcor. Elements in the list are ordered by
40    their lcor[i] value. Higher correlations first.
41    """
42    ordered = [ (i,a) for i,a in enumerate(lcor) ] #original pos + correlation
43    ordered.sort(lambda x,y: cmp(y[1],x[1])) #sort by correlation, descending
44    ordered = nth(ordered, 0) #contains positions in the original list
45    return ordered
46
47def enrichmentScoreRanked(subset, lcor, ordered, p=1.0, rev2=None):
48    """
49    Input data and subset.
50   
51    subset: list of attribute indices of the input data belonging
52        to the same set.
53    lcor: correlations with class for each attribute in a list.
54
55    Returns enrichment score on given data.
56
57    This implementation efficiently handles "sparse" genesets (that
58    cover only a small subset of all genes in the dataset).
59    """
60
61    #print lcor
62
63    subset = set(subset)
64
65    if rev2 == None:
66        def rev(l):
67            return numpy.argsort(l)
68        rev2 = rev(ordered)
69
70    #add if gene is not in the subset
71    notInA = -(1. / (len(lcor)-len(subset)))
72    #base for addition if gene is in the subset
73    cors = [ abs(lcor[i])**p for i in subset ]
74    sumcors = sum(cors)
75
76    #this should not happen
77    if sumcors == 0.0:
78        return (0.0, None)
79   
80    inAb = 1./sumcors
81
82    ess = [0.0]
83   
84    map = {}
85    for i in subset:
86        orderedpos = rev2[i]
87        map[orderedpos] = inAb*abs(lcor[i]**p)
88       
89    last = 0
90
91    maxSum = minSum = csum = 0.0
92
93    for a,b in sorted(map.items()):
94        diff = a-last
95        csum += notInA*diff
96        last = a+1
97       
98        if csum < minSum:
99            minSum = csum
100       
101        csum += b
102
103        if csum > maxSum:
104            maxSum = csum
105
106    #finish it
107    diff = (len(ordered))-last
108    csum += notInA*diff
109
110    if csum < minSum:
111        minSum = csum
112
113    #print "MY", (maxSum if abs(maxSum) > abs(minSum) else minSum)
114
115    """
116    #BY DEFINITION
117    print "subset", subset
118
119    for i in ordered:
120        ess.append(ess[-1] + \
121            (inAb*abs(lcor[i]**p) if i in subset else notInA)
122        )
123        if i in subset:
124            print ess[-2], ess[-1]
125            print i, (inAb*abs(lcor[i]**p))
126
127    maxEs = max(ess)
128    minEs = min(ess)
129   
130    print "REAL", (maxEs if abs(maxEs) > abs(minEs) else minEs, ess[1:])
131
132    """
133    return (maxSum if abs(maxSum) > abs(minSum) else minSum, [])
134
135#from mOrngData
136def shuffleAttribute(data, attribute, locations):
137    """
138    Destructive!
139    """
140    attribute = data.domain[attribute]
141    l = [None]*len(data)
142    for i in range(len(data)):
143        l[locations[i]] = data[i][attribute]
144    for i in range(len(data)):
145        data[i][attribute] = l[i]
146
147def shuffleClass(datai, rands=0):
148    """
149    Returns a dataset with values of class attribute randomly shuffled.
150    If multiple dataset are on input shuffle them all with the same random seed.
151    """
152    def shuffleOne(data):
153        rand = random.Random(rands)
154        d2 = orange.ExampleTable(data.domain, data)
155        locations = range(len(data))
156        rand.shuffle(locations)
157        shuffleAttribute(d2, d2.domain.classVar, locations)
158        return d2
159
160    if iset(datai):
161        return shuffleOne(datai)
162    else:
163        return [ shuffleOne(data) for data in datai ]
164
165def shuffleList(l, rand=random.Random(0)):
166    """
167    Returns a copy of a shuffled input list.
168    """
169    import copy
170    l2 = copy.copy(l)
171    rand.shuffle(l2)
172    return l2
173
174def shuffleAttributes(data, rand=random.Random(0)):
175    """
176    Returns a dataset with a new attribute order.
177    """
178    natts = shuffleList(list(data.domain.attributes), rand)
179    dom2 = orange.Domain(natts, data.domain.classVar)
180    d2 = orange.ExampleTable(dom2, data)
181    return d2
182
183def gseapval(es, esnull):
184    """
185    From article (PNAS):
186    estimate nominal p-value for S from esnull by using the positive
187    or negative portion of the distribution corresponding to the sign
188    of the observed ES(S).
189    """
190   
191    try:
192        if es < 0:
193            return float(len([ a for a in esnull if a <= es ]))/ \
194                len([ a for a in esnull if a < 0])   
195        else: 
196            return float(len([ a for a in esnull if a >= es ]))/ \
197                len([ a for a in esnull if a >= 0])
198    except:
199        return 1.0
200
201
202def enrichmentScore(data, subset, rankingf):
203    """
204    Returns enrichment score and running enrichment score.
205    """
206    lcor = rankingf(data)
207    ordered = orderedPointersCorr(lcor)
208    es,l = enrichmentScoreRanked(subset, lcor, ordered)
209    return es,l
210
211def gseaE(data, subsets, rankingf=None, \
212        n=100, permutation="class", **kwargs):
213    """
214    Run GSEA algorithm on an example table.
215
216    data: orange example table.
217    subsets: list of distinct subsets of data.
218    rankingf: function that returns correlation to class of each
219        variable.
220    n: number of random permutations to sample null distribution.
221    permutation: "class" for permutating class, else permutate attribute
222        order.
223
224    """
225
226    if not rankingf:
227        rankingf=rankingFromOrangeMeas(MA_signalToNoise())
228
229    enrichmentScores = []
230 
231    lcor = rankingf(data)
232    #print lcor
233
234    ordered = orderedPointersCorr(lcor)
235
236    def rev(l):
237        return numpy.argsort(l)
238
239    rev2 = rev(ordered)
240
241    for subset in subsets:
242        es = enrichmentScoreRanked(subset, lcor, ordered, rev2=rev2)[0]
243        enrichmentScores.append(es)
244
245    runOptCallbacks(kwargs)
246
247    #print "PERMUTATION", permutation
248
249    enrichmentNulls = [ [] for a in range(len(subsets)) ]
250
251    for i in range(n):
252
253        if permutation == "class":
254            d2 = shuffleClass(data, 2000+i) #fixed permutation
255            r2 = rankingf(d2)
256
257        else:
258            r2 = shuffleList(lcor, random.Random(2000+i))
259
260        ordered2 = orderedPointersCorr(r2)
261        rev22 = rev(ordered2)
262        for si,subset in enumerate(subsets):
263            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
264            enrichmentNulls[si].append(esn)
265
266        runOptCallbacks(kwargs)
267
268    return gseaSignificance(enrichmentScores, enrichmentNulls)
269
270
271def runOptCallbacks(rargs):
272    if "callback" in rargs:
273        try:
274            [ a() for a in rargs["callback"] ]
275        except:
276            rargs["callback"]()
277           
278
279def gseaR(rankings, subsets, n=100, **kwargs):
280    """
281    """
282
283    if "permutation" in kwargs:
284        if kwargs["permutation"] == "class":
285            raise Exception("Only gene permutation possible")
286
287    enrichmentScores = []
288 
289    ordered = orderedPointersCorr(rankings)
290   
291    def rev(l):
292        return numpy.argsort(l)
293
294    rev2 = rev(ordered)
295
296    for subset in subsets:
297
298        es = enrichmentScoreRanked(subset, rankings, ordered, rev2=rev2)[0]
299        enrichmentScores.append(es)
300   
301    runOptCallbacks(kwargs)
302
303    enrichmentNulls = [ [] for a in range(len(subsets)) ]
304
305    for i in range(n):
306       
307        r2 = shuffleList(rankings, random.Random(2000+i))
308        ordered2 = orderedPointersCorr(r2)
309        rev22 = rev(ordered2)
310
311        for si,subset in enumerate(subsets):
312
313            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
314            enrichmentNulls[si].append(esn)
315
316        runOptCallbacks(kwargs)
317
318    return gseaSignificance(enrichmentScores, enrichmentNulls)
319
320
321def gseaSignificance(enrichmentScores, enrichmentNulls):
322
323    #print enrichmentScores
324
325    import time
326
327    tb1 = time.time()
328
329    enrichmentPVals = []
330    nEnrichmentScores = []
331    nEnrichmentNulls = []
332
333    for i in range(len(enrichmentScores)):
334        es = enrichmentScores[i]
335        enrNull = enrichmentNulls[i]
336        #print es, enrNull
337
338        enrichmentPVals.append(gseapval(es, enrNull))
339
340        #normalize the ES(S,pi) and the observed ES(S), separetely rescaling
341        #the positive and negative scores by divident by the mean of the
342        #ES(S,pi)
343
344        #print es, enrNull
345
346        def normalize(s):
347            try:
348                if s == 0:
349                    return 0.0
350                if s >= 0:
351                    meanPos = mean([a for a in enrNull if a >= 0])
352                    #print s, meanPos
353                    return s/meanPos
354                else:
355                    meanNeg = mean([a for a in enrNull if a < 0])
356                    #print s, meanNeg
357                    return -s/meanNeg
358            except:
359                return 0.0 #return if according mean value is uncalculable
360
361
362        nes = normalize(es)
363        nEnrichmentScores.append(nes)
364       
365        nenrNull = [ normalize(s) for s in enrNull ]
366        nEnrichmentNulls.append(nenrNull)
367 
368
369    #print "First part", time.time() - tb1
370
371    #FDR computation
372    #create a histogram of all NES(S,pi) over all S and pi
373    vals = reduce(lambda x,y: x+y, nEnrichmentNulls, [])
374
375
376    def shorten(l, p=10000):
377        """
378        Take each len(l)/p element, if len(l)/p >= 2.
379        """
380        e = len(l)/p
381        if e <= 1:
382            return l
383        else:
384            return [ l[i] for i in xrange(0, len(l), e) ]
385
386    #vals = shorten(vals) -> this can speed up second part. is it relevant TODO?
387
388    """
389    Use this null distribution to compute an FDR q value, for a given NES(S) =
390    NES* >= 0. The FDR is the ratio of the percantage of all (S,pi) with
391    NES(S,pi) >= 0, whose NES(S,pi) >= NES*, divided by the percentage of
392    observed S wih NES(S) >= 0, whose NES(S) >= NES*, and similarly if NES(S)
393    = NES* <= 0.
394    """
395
396    nvals = numpy.array(sorted(vals))
397    nnes = numpy.array(sorted(nEnrichmentScores))
398
399    #print "LEN VALS", len(vals), len(nEnrichmentScores)
400
401    fdrs = []
402
403    import operator
404
405    for i in range(len(enrichmentScores)):
406
407        nes = nEnrichmentScores[i]
408
409        """
410        #Strighfoward but slow implementation follows in comments.
411        #Useful as code description.
412       
413        if nes >= 0:
414            op0 = operator.ge
415            opn = operator.ge
416        else:
417            op0 = operator.lt
418            opn = operator.le
419
420        allPos = [a for a in vals if op0(a,0)]
421        allHigherAndPos = [a for a in allPos if opn(a,nes) ]
422
423        nesPos = [a for a in nEnrichmentScores if op0(a,0) ]
424        nesHigherAndPos = [a for a in nesPos if opn(a,nes) ]
425
426        top = len(allHigherAndPos)/float(len(allPos)) #p value
427        down = len(nesHigherAndPos)/float(len(nesPos))
428       
429        l1 = [ len(allPos), len(allHigherAndPos), len(nesPos), len(nesHigherAndPos)]
430
431        allPos = allHigherAndPos = nesPos =  nesHigherAndPos = 1
432
433        """
434
435        #this could be speed up twice with the same accuracy!
436        if nes >= 0:
437            allPos = int(len(vals) - numpy.searchsorted(nvals, 0, side="left"))
438            allHigherAndPos = int(len(vals) - numpy.searchsorted(nvals, nes, side="left"))
439            nesPos = len(nnes) - int(numpy.searchsorted(nnes, 0, side="left"))
440            nesHigherAndPos = len(nnes) - int(numpy.searchsorted(nnes, nes, side="left"))
441        else:
442            allPos = int(numpy.searchsorted(nvals, 0, side="left"))
443            allHigherAndPos = int(numpy.searchsorted(nvals, nes, side="right"))
444            nesPos = int(numpy.searchsorted(nnes, 0, side="left"))
445            nesHigherAndPos = int(numpy.searchsorted(nnes, nes, side="right"))
446           
447        """
448        #Comparing results
449        l2 = [ allPos, allHigherAndPos, nesPos, nesHigherAndPos ]
450        diffs = [ l1[i]-l2[i] for i in range(len(l1)) ]
451        sumd = sum( [ abs(a) for a in diffs ] )
452        if sumd > 0:
453            print nes > 0
454            print "orig", l1
455            print "modi", l2
456        """
457
458        try:
459            top = allHigherAndPos/float(allPos) #p value
460            down = nesHigherAndPos/float(nesPos)
461
462            fdrs.append(top/down)
463        except:
464            fdrs.append(1000000000.0)
465   
466    #print "Whole part", time.time() - tb1
467
468    return zip(enrichmentScores, nEnrichmentScores, enrichmentPVals, fdrs)
469
470import obiGene
471
472def nth(l,n): return [ a[n] for a in l ]
473
474def itOrFirst(data):
475    """ Returns input if input is of type ExampleTable, else returns first
476    element of the input list """
477    if iset(data):
478        return data
479    else:
480        return data[0]
481
482def wrap_in_list(data):
483    """ Wraps orange.ExampleTable in a list """
484    if iset(data):
485        return [ data ]
486    else:
487        return data
488
489def takeClasses(datai, classValues=None):
490    """
491    Function joins class groups specified in an input pair
492    classValues. Each element of the pair is a list of class
493    values to be joined to first or second class. Group
494    classes in two new class values.
495    If class values are not specified, take all the classes.
496
497    Input data can be a single data set or a list of data sets
498    with the same domain.
499
500    Returns transformed data sets / data sets.
501    """
502
503    cv = itOrFirst(datai).domain.classVar
504    nclassvalues = None
505
506    if cv and len(itOrFirst(datai)) > 1:
507        oldcvals = [ a for a in cv.values ]
508       
509        if not classValues:
510            classValues = oldcvals
511
512        toJoin = []
513
514        for vals in classValues:
515            if issequencens(vals):
516                toJoin.append(list(vals))
517            else:
518                toJoin.append([vals])
519
520        classValues = reduce(lambda x,y: x+y, toJoin)
521        classValues = [ str(a) for a in classValues ] # ok class values
522
523        #dictionary of old class -> new class
524        mapval = {}
525        nclassvalues = [] # need to preserver order
526
527        for joinvals in toJoin:
528            joinvalsn = "+".join([ str(val) for val in sorted(joinvals) ])
529            nclassvalues.append(joinvalsn)
530
531            for val in joinvals:
532                mapval[str(val)] = joinvalsn
533
534        #take only examples with classValues classes
535        nclass = orange.EnumVariable(cv.name, values=nclassvalues)
536        ndom = orange.Domain(itOrFirst(datai).domain.attributes, nclass)
537
538        def removeAndTransformClasses(data):
539            """
540            Removes unnecessary class values and joines them according
541            to function input.
542            """
543            examples = []
544            for ex in data:
545                if ex[cv] in classValues:
546                    vals = [ ex[a] for a in data.domain.attributes ]
547                    vals.append(mapval[str(ex[cv].value)])
548                    examples.append(vals)
549
550            return orange.ExampleTable(ndom, examples)
551
552        if iset(datai):
553            datai = removeAndTransformClasses(datai)
554        else:
555            datai = [ removeAndTransformClasses(data) for data in datai ]
556
557    return datai
558
559def removeBadAttributes(datai, atLeast=3):
560    """
561    Removes attributes which would obscure GSEA analysis.
562
563    Attributes need to be continuous, they need to have
564    at least one value. Remove other attributes.
565
566    For the attribute to be valid, it needs to have at least
567    [atLeast] values for every class value.
568
569    Return transformed data set / data sets and ignored attributes.
570    """
571
572    def attrOk(a, data):
573        """
574        Attribute is ok if it is continouous and if containg
575        at least atLest not unknown values.
576        """
577
578        a = data.domain.attributes.index(a)
579
580        #can't
581        if data.domain.attributes[a].varType != orange.VarTypes.Continuous:
582            return False
583
584        if len(data) == 1:
585
586            vals = [ex[a].value for ex in data if not ex[a].isSpecial()]
587            if len(vals) < 1:
588                return False 
589       
590        if len(data) > 1 and data.domain.classVar and atLeast > 0:
591
592            valc = [ [ex[a].value for ex in data \
593                        if not ex[a].isSpecial() and ex[-1] == data.domain.classVar[i] \
594                   ] for i in range(len(data.domain.classVar.values)) ]
595            minl = min( [ len(a) for a in valc ])
596           
597            if minl < atLeast:
598                #print "Less than atLeast"
599                return False
600
601        return True
602   
603
604    def notOkAttributes(data):
605        ignored = []
606        for a in data.domain.attributes:
607            if not attrOk(a, data):
608                #print "Removing", a
609                ignored.append(a)
610        return ignored
611   
612    ignored = []
613    if iset(datai):
614        ignored = set(notOkAttributes(datai))
615    else:
616        #ignore any attribute which is has less than atLeast values for each class
617        #ignored = set(reduce(lambda x,y: x+y, [ notOkAttributes(data) for data in datai ]))
618
619        #remove any attribute, which is ok in less than half of the dataset
620        ignored = []
621        for a in itOrFirst(datai).domain.attributes:
622            attrOks = sum([ attrOk(a, data) for data in datai ])
623            if attrOks < len(datai)/2:
624                ignored.append(a)
625
626
627    natts = [ a for a in itOrFirst(datai).domain.attributes if a not in ignored ]
628    #print ignored, natts, set(ignored) & set(natts)
629
630    ndom = orange.Domain(natts, itOrFirst(datai).domain.classVar)
631
632    datao = None
633    if iset(datai):
634        datao = orange.ExampleTable(ndom, datai)
635    else:
636        datao = [ orange.ExampleTable(ndom, data) for data in datai ]
637
638    return datao, ignored
639
640def keepOnlyMeanAttrs(datai, atLeast=3, classValues=None):
641    """
642    Attributes need to be continuous, they need to have
643    at least one value.
644
645    In order of attribute to be valid, it needs to have at least
646    [atLeast] values for every class value.
647
648    Keep only specified classes - group them in two values.
649    """   
650    datai = takeClasses(datai, classValues=classValues)
651    return removeBadAttributes(datai, atLeast=atLeast)
652
653def data_single_meas_column(data):
654    """
655    Returns true if data seems to be in one column
656    (float variables) only. This column should contain
657    the rankings
658    """
659    columns = [a for a in data.domain] +  [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
660    floatvars = [ a for a in columns if a.varType == orange.VarTypes.Continuous ]
661    if len(floatvars) == 1:
662        return True
663    else:
664        return False
665
666def transform_data(data, phenVar, geneVar):
667    """
668    if we have log2ratio in a single value column, transpose the matrix
669    i.e. we have a single column with a continous variable. first
670    string variable then becomes the gene name
671
672    The goal is to have different phenotypes annotated with a class,
673    and names of genes as attribute names.
674
675    If we have a single column, transpose it.
676    If phenVar is one of the groups, transpose the matrix.
677    """
678
679    def transpose_data(data):
680        columns = [a for a in data.domain] +  [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
681        floatvars = [ a for a in columns if a.varType == orange.VarTypes.Continuous ]
682        if len(floatvars) == 1:
683            floatvar = floatvars[0]
684            stringvar = [ a for a in columns if a.varType == 6 ][0]
685
686            tup = [ (ex[stringvar].value, ex[floatvar].value) for ex in data ]
687            newdom = orange.Domain([orange.FloatVariable(name=a[0]) for a in tup ], False)
688            example = [ a[1] for a in tup ]
689            ndata = orange.ExampleTable(newdom, [example])
690            return ndata
691        return data
692
693    #transform every example table example tables
694
695    single = iset(data)
696    transposed = [ transpose_data(d) for d in wrap_in_list(data) ]
697
698    if single:
699        return transposed[0]
700    else:
701        return transposed
702
703
704def allgroups(data):
705    """
706    Return all phenotype descriptors of attributes with their values.
707    """
708    sd = defaultdict(set)
709    for attr in data.domain.attributes:
710        for key, value in attr.attributes.items():
711            sd[key].add(value)
712    return sd
713
714def phenotype_cands(data):
715    """
716    Return all phenotype candidate descriptors in a list of tuples
717    (variable, values). Candidates are class variable, if it exists and
718    attributes dictionaries of attributes.
719    Phenotype candidates must contain at least two differend values.
720    """
721    cv = []
722    if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
723        cv.append((data.domain.classVar, set(data.domain.classVar.values)))
724    cands = cv + sorted(allgroups(data).items())
725    return filter(lambda x: len(x[1]) >= 2, cands)
726
727def gene_cands(data, phenVar):
728    """
729    Returns all valid gene descriptors with regards to the choosen
730    phenotype variable.
731    Return variable descriptor for variables, name of the group for
732    descriptions in attr.attributes and True for the usage
733    of attribute names.
734    """
735    if is_variable(phenVar[0]):
736        #gene names could be in attributes or as gene names (marker True)
737        return [True] + nth(sorted(allgroups(data)),0)
738    else:
739        #gene names are values of some string attribute
740        columns = [a for a in data.domain] +  \
741            [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
742        stringvars = [ a for a in columns if a.varType == 6 ]
743        return stringvars
744
745def is_variable(phenVar):
746    return isinstance(phenVar, orange.Variable)
747
748class GSEA(object):
749
750    def __init__(self, data, organism=None, matcher=None, classValues=None, 
751        atLeast=3, caseSensitive=False, phenVar=None, geneVar=None):
752        """
753        If the data set constains multiple measurements for a single gene,
754        all are considered. Individual constributions of such measurements
755        are not weighted down - each measurement is as important as they
756        would measure different genes.
757
758        phenVar and geneVar can ether be an orange attribute or a string.
759        If they are strings, then they describe a group.
760        """
761
762        self.genesets = {}
763        self.organism = organism
764
765        if organism != None:
766            print "WARNING: obiGsea - organism and caseSensitive parameters are deprecated. Use matcher instead."
767
768        self.gsweights = {}
769        self.namesToIndices = None
770        self.gm = matcher
771
772        data = transform_data(data, phenVar, geneVar)
773
774        data, info = keepOnlyMeanAttrs(data, classValues=classValues, atLeast=atLeast)
775
776        self.data = data
777
778        #init attrnames
779        attrnames = [ a.name for a in itOrFirst(self.data).domain.attributes ]
780
781        if self.gm == None: #build a gene matcher, if if does not exists
782            self.gm = obiGene.matcher([obiGene.GMKEGG(self.organism, ignore_case=not caseSensitive)], 
783                ignore_case=not caseSensitive, direct=True)
784            print "WARNING: gene matcher build automatically for organism: " + self.organism
785
786        self.gm.set_targets(attrnames)
787
788 
789    def addGeneset(self, genesetname, genes):
790        """
791        Add a single gene set. See addGenesets function.
792        Solely for backwards compatibility.
793        """
794        self.addGenesets({ genesetname: genes })
795
796    def addGenesets(self, gsdic):
797        """
798        Adds genesets from input dictionary. Also. performs gene matching. Adds
799        to a self.genesets: key is genesetname, it's values are individual
800        genes and match results.
801        """
802        for genesetname, genes in gsdic.iteritems():
803
804            if genesetname in self.genesets:
805                raise Exception("Name " + \
806                    + genesetname + " is already used in genesets.")
807            else:
808                datamatch = filter(lambda x: x[1] != None, [ (gene, self.gm.umatch(gene)) for gene in genes])
809                self.genesets[genesetname] = ( genes, datamatch )
810
811    def selectGenesets(self, minSize=3, maxSize=1000, minPart=0.1):
812        """ Returns a list of gene sets that have sizes in limits """
813
814        def okSizes(orig, transl):
815            """compares sizes of genesets to limitations"""
816            if len(transl) >= minSize and len(transl) <= maxSize \
817                and float(len(transl))/len(orig) >= minPart:
818                return True
819            return False
820
821        return  dict( (a,(b,c)) for a,(b,c) in self.genesets.iteritems() if okSizes(b,c) )
822
823    def genesIndices(self, genes):
824        """
825        Returns in attribute indices of given genes.
826        Buffers locations dictionary.
827        """
828        if not self.namesToIndices:
829            self.namesToIndices = defaultdict(list)
830            for i,at in enumerate(itOrFirst(self.data).domain.attributes):
831                self.namesToIndices[at.name].append(i)
832        return reduce(lambda x,y:x+y, [ self.namesToIndices[gname] for gname in genes ], [])
833
834    def compute_gene_weights(self, gsweights, gsetsnum, nattributes):
835        """
836        Computes gene set weights for all specified weights.
837        Expects gene sets in form { name: [ num_attributes ] }
838        GSWeights are
839        """
840        pass
841
842    def to_gsetsnum(self, names):
843        """
844        Returns a dictionary of gene sets with given names in gsetnums format.
845        """
846        return dict( (name,self.genesIndices(nth(self.genesets[name][1],1))) for name in names)
847
848    def compute(self, minSize=3, maxSize=1000, minPart=0.1, n=100, **kwargs):
849
850        subsetsok = self.selectGenesets(minSize=minSize, maxSize=maxSize, minPart=minPart)
851
852        geneweights = None
853
854        gsetsnum = self.to_gsetsnum(subsetsok.keys())
855        gsetsnumit = gsetsnum.items() #to fix order
856
857        #gsetsnumit = gsetsnumit[:1]
858        #print gsetsnumit
859
860        if len(gsetsnum) == 0:
861            return {} # quick return if no genesets
862
863        if len(self.gsweights) > 0:
864            #set geneset
865            geneweights = [1]*len(data.domain.attributes)
866
867        if len(itOrFirst(self.data)) > 1:
868            gseal = gseaE(self.data, nth(gsetsnumit,1), n=n, geneweights=geneweights, **kwargs)
869        else:
870            rankings = [ self.data[0][at].native() for at in self.data.domain.attributes ]
871            gseal = gseaR(rankings, nth(gsetsnumit,1), n=n, **kwargs)
872
873        res = {}
874
875        for name,gseale in zip(nth(gsetsnumit,0),gseal):
876            rdict = {}
877            rdict['es'] = gseale[0]
878            rdict['nes'] = gseale[1]
879            rdict['p'] = gseale[2]
880            rdict['fdr'] = gseale[3]
881            rdict['size'] = len(self.genesets[name][0])
882            rdict['matched_size'] = len(self.genesets[name][1])
883            rdict['genes'] = nth(self.genesets[name][1],1)
884            res[name] = rdict
885
886        return res
887
888def runGSEA(data, organism=None, classValues=None, geneSets=None, n=100, 
889        permutation="class", minSize=3, maxSize=1000, minPart=0.1, atLeast=3, 
890        matcher=None, geneVar=None, phenVar=None, caseSensitive=False, 
891        **kwargs):
892    """
893    phenVar and geneVar specify the phenotype and gene variable.
894    """
895    gso = GSEA(data, organism=organism, matcher=matcher, 
896        classValues=classValues, atLeast=atLeast, caseSensitive=caseSensitive)
897    if geneSets == None:
898        genesets = collections(default=True)
899    gso.addGenesets(geneSets)
900    res1 = gso.compute(n=n, permutation=permutation, minSize=minSize,
901        maxSize=maxSize, minPart=minPart, geneVar=geneVar, phenVar=phenVar,
902        **kwargs)
903    return res1
904
905def etForAttribute(datal,a):
906    """
907    Builds an example table for a single attribute across multiple
908    example tables.
909    """
910
911    tables = len(datal)
912
913    def getAttrVals(data, attr):
914        dom2 = orange.Domain([data.domain[attr]], False)
915        dataa = orange.ExampleTable(dom2, data)
916        return [ a[0].native() for a in dataa ]
917
918    domainl = []
919    valuesl = []
920
921    for id, data in enumerate(datal):
922        v = getAttrVals(data,a)
923        valuesl.append(v)
924        domainl.append(orange.FloatVariable(name=("v"+str(id))))
925
926    classvals = getAttrVals(data, datal[0].domain.classVar)
927    valuesl += [ classvals ]
928
929    dom = orange.Domain(domainl, datal[0].domain.classVar)
930    examples = [ list(a) for a in zip(*valuesl) ]
931
932    datat = orange.ExampleTable(dom, examples)
933
934    return datat
935
936
937def evaluateEtWith(fn, *args, **kwargs):
938    """
939    fn - evaluates example table given
940    following arguments.
941    """
942
943    def newf(datal):
944        res = []
945        for a in datal[0].domain.attributes:
946            et = etForAttribute(datal, a)
947            res.append(fn(et, *args, **kwargs))
948        return res
949
950    return newf
951
952
953def hierarchyOutput(results, limitGenes=50):
954    """
955    Transforms results for use by hierarchy output from GO.
956
957    limitGenes - maximum number of genes on output.
958    """
959    trans = []
960   
961    for name, res in results.items():
962        try:
963            second = name.split(' ')[2]
964            name = second if second[:2] == 'GO' else name
965        except:
966            pass
967       
968        trans.append((name, abs(res["nes"]), res["matched_size"], res["size"], res["p"], min(res["fdr"], 1.0), res["genes"][:limitGenes]))
969
970    return trans
971
972if  __name__=="__main__":
973
974    data = orange.ExampleTable("sterolTalkHepaM.tab")
975    print phenotype_cands(data)
976    print is_variable(phenotype_cands(data)[0][0])
977
978    """
979    data = orange.ExampleTable("gene_three_lines_log.tab")
980    print phenotype_cands(data)
981    print is_variable(phenotype_cands(data)[0][0])
982    """
983
984    gen1 = collections(['steroltalk.gmt', ':kegg:hsa'], default=False)
985
986    gen1 = dict([ ('[KEGG] Complement and coagulation cascades', gen1['[KEGG] Complement and coagulation cascades'])])
987
988    rankingf = rankingFromOrangeMeas(MA_anova())
989    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
990
991    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, rankingf=rankingf)
992    print "\n".join(map(str,sorted(out.items())))
993   
Note: See TracBrowser for help on using the repository browser.