source: orange-bioinformatics/Orange/bioinformatics/obiGeneAtlas.py @ 1625:cefeb35cbfc9

Revision 1625:cefeb35cbfc9, 23.1 KB checked in by mitar, 2 years ago (diff)

Moving files around.

Line 
1"""
2========================================
3Gene Expression Atlas (``obiGeneAtlas``)
4========================================
5
6Interface to Gene Expression Atlas.
7
8`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
9gene expression experiments in Array Express Archive.
10
11.. autofunction:: gene_expression_atlas
12
13.. autofunction:: default_gene_matcher
14
15.. autofunction:: to_taxid
16
17"""
18
19import os, sys
20from collections import defaultdict, namedtuple
21
22from Orange.utils import serverfiles
23
24GeneResults = namedtuple("GeneResults", "id name synonyms expressions")
25ExpressionResults = namedtuple("ExpressionResults", "ef efv up down experiments")
26ExperimentExpression = namedtuple("ExperimentExpression", "accession expression pvalue")
27
28##
29GeneAtlasResult = GeneResults
30AtlasExpressions = ExpressionResults
31AtlasExperiment = ExperimentExpression
32##
33import shelve
34import orngServerFiles
35import obiGene
36from contextlib import closing
37
38def _cache(name="AtlasGeneResult.shelve"):
39    """ Return a open cache instance (a shelve object).
40    """
41    if not os.path.exists(orngServerFiles.localpath("GeneAtlas")):
42        try:
43            os.makedirs(orngServerFiles.localpath("GeneAtlas"))
44        except OSError:
45            pass
46    return shelve.open(orngServerFiles.localpath("GeneAtlas", name))
47
48SLEEP_TIME_MULTIPLIER = 3.0
49
50def gene_expression_atlas(genes, progress_callback=None):
51    """ Return GeneResults instances for genes (genes must be valid ensembl ids).
52    """
53    import time
54    genes = list(genes)
55    result_dict = {}
56    genes_not_cached = []
57    # See which genes are already cached
58    with closing(_cache()) as cache:
59        for gene in genes:
60            if str(gene) in cache:
61                result_dict[gene] = cache[str(gene)]
62            else:
63                genes_not_cached.append(gene)
64   
65    batch_size = 10
66    start = 0
67    res = []
68    while start < len(genes_not_cached):
69        batch = genes_not_cached[start: start + batch_size]
70        start += batch_size
71        start_time = time.time()
72        batch_res = batch_gene_atlas_expression(batch)
73        # Cache the new results.
74        # TODO: handle genes without any results.
75        genes_with_no_results = set(batch) - set(r.id for r in batch_res) 
76        with closing(_cache()) as cache:
77            for atlas_res in batch_res:
78                cache[str(atlas_res.id)] = atlas_res
79                result_dict[atlas_res.id] = atlas_res
80            for g in genes_with_no_results:
81                cache[str(g)] = None
82        res.extend(batch_res)
83        # Sleep
84        if start % (batch_size * 10) == 0:
85            # every 10 batches wait one minute before continuing.
86            time.sleep(60)
87        else:
88            time.sleep(min(20.0, SLEEP_TIME_MULTIPLIER*(time.time() - start_time)))
89           
90        if progress_callback:
91            progress_callback(100.0 * start / len(genes_not_cached))
92   
93    return [result_dict.get(g, None) for g in genes]
94
95   
96def batch_gene_atlas_expression(genes):
97    cond = GenePropertyCondition("Ensgene", "Is", genes)
98    res = run_query(cond, format="json")
99    results = res["results"]
100    results_genes = []
101    for one_result in results:
102        gene = one_result["gene"]
103        id = gene["id"]
104        name = gene["name"]
105        synonyms = gene.get("synonyms", [])
106        expressions = one_result["expressions"]
107        result_expressions = []
108        for expression in expressions:
109            ef = expression["ef"]
110            efv = expression["efv"]
111            up = expression["upExperiments"]
112            down = expression["downExperiments"]
113            experiments = expression["experiments"]
114            result_experiment = []
115            for exp in experiments:
116                exp_accession = exp["accession"]
117                updown = exp["expression"]
118                pval = exp["pvalue"]
119                result_experiment.append(ExperimentExpression(exp_accession, updown, pval))
120            result_expressions.append(ExpressionResults(ef, efv, up, down, result_experiment))
121        results_genes.append(GeneResults(id, name, synonyms, result_expressions))
122    return results_genes
123
124
125def default_gene_matcher(organism):
126    """ Return a default gene matcher for organism
127    (targeting Ensembl gene ids).
128   
129    """
130    taxid = to_taxid(organism)
131    matcher = obiGene.matcher([obiGene.GMEnsembl(taxid),
132                               obiGene.GMNCBI(taxid)])
133    matcher.set_targets(obiGene.EnsembleGeneInfo(taxid).keys())
134    return matcher
135
136from Orange.utils import lru_cache
137
138@lru_cache(maxsize=3)
139def _cached_default_gene_matcher(organism): 
140    return default_gene_matcher(organism)
141   
142
143def get_atlas_summary(genes, organism, gene_matcher=None,
144                      progress_callback=None):
145    """ Return 3 dictionaries containing a summary of atlas information
146    about three experimental factors:
147   
148        - Organism Part (OP)
149        - Disease State (DS)
150        - Cell type (CT)
151   
152    Each dictionary contains query genes as keys. Values are dictionaries
153    mapping factor values to a 2-tuple containig the count of up regulated
154    and down regulated experiments.
155   
156    Example ::
157   
158        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
159        ({u'RUNX1': ...
160       
161    """
162    if gene_matcher is None:
163        gene_matcher = _cached_default_gene_matcher(organism)
164       
165    matched, unmatched = [], []
166    for gene, match in zip(genes, map(gene_matcher.umatch, genes)):
167        if match:
168            matched.append(match)
169        else:
170            unmatched.append(gene)
171    if unmatched:
172        import warnings
173        warnings.warn("Unmatched genes " + "," .join(["%r" % g for g in unmatched]))
174   
175    results = gene_expression_atlas(matched, progress_callback=progress_callback)
176   
177    def collect_ef_summary(result, ef, summary):
178        for exp in result.expressions:
179            if exp.ef == ef:
180                if any([exp.up, exp.down]):
181                    summary[result.name][exp.efv] = (exp.up, exp.down)
182       
183           
184    op, ds, ct = defaultdict(dict), defaultdict(dict), defaultdict(dict)
185    for res in results:
186        if res:
187            collect_ef_summary(res, "organism_part", op)
188            collect_ef_summary(res, "disease_state", ds)
189            collect_ef_summary(res, "cell_type", ct)
190       
191    return dict(op), dict(ds), dict(ct)
192
193def drop_none(iter):
194    """ Drop all ``None`` from the iterator.
195    """
196    for e in iter:
197        if e is not None:
198            yield e
199           
200def construct_atlas_gene_sets(genes, organism, factors=["organism_part",
201                                    "disease_state", "cell_type"],
202                              max_pvalue=1e-5):
203    """ Construct gene sets for atlas experimental factor values in
204    ``factors``.
205    """
206    results = gene_expression_atlas(genes)
207    sets = defaultdict(list)
208   
209    for res in drop_none(results):
210        for exp in res.expressions:
211            if exp.ef not in factors:
212                continue
213            diff_exp = [e for e in exp.experiments \
214                        if e.pvalue <= max_pvalue]
215            if diff_exp:
216                sets[exp.ef, exp.efv].append(res.id)
217
218    organism = "+".join(organism.lower().split())
219    from obiGeneSets import GeneSets, GeneSet
220   
221    def display_string(name):
222        return name.capitalize().replace("_", " ")
223   
224    gene_sets = []
225    for (ef, efv), genes in sets.items():
226        ef_display = display_string(ef)
227        gs = GeneSet(genes, "Diff. expressed in %s=%s." % (ef_display, efv), id=ef + ":" + efv,
228                     description="Diff. expressed in %s=%s" % (ef_display, efv),
229                     link="http://www.ebi.ac.uk/gxa/qrs?specie_0={organism}&gprop_0=&gnot_0=&gval_0=&fact_1=&fexp_1=UPDOWN&fmex_1=&fval_1=%22{efv}%22+&view=hm".format( \
230                            organism=organism, efv="+".join(efv.lower().split())),
231                     hierarchy=("Tissues", ef_display))
232        gene_sets.append(gs)
233    return GeneSets(gene_sets)
234
235
236# Mapping for common taxids from obiTaxonomy
237TAXID_TO_ORG = {"": "Anopheles gambiae",
238                "3702": "Arabidopsis thaliana",
239                "9913": "Bos taurus",
240                "6239": "Caenorhabditis elegans",
241                "7955": "Danio rerio",
242                "7227": "Drosophila melanogaster",
243                "": "Epstein barr virus",
244                "": "Gallus gallus",
245                "9606": "Homo sapiens",
246                "": "Human cytomegalovirus",
247                "": "Kaposi sarcoma-associated herpesvirus",
248                "10090": "Mus musculus",
249                "10116": "Rattus norvegicus",
250                "4932": "Saccharomyces cerevisiae",
251                "4896": "Schizosaccharomyces pombe",
252                "8355": "Xenopus laevis"
253     }
254
255def to_taxid(name):
256    dd = dict((v, k) for k, v in TAXID_TO_ORG.items())
257    if name in dd:
258        return dd[name]
259    else:
260        import obiTaxonomy as tax
261        ids = tax.to_taxid(name, mapTo=TAXID_TO_ORG.keys())
262        if ids:
263            return ids.pop()
264        else:
265            raise ValueError("Unknown organism.")
266
267
268__doc__ += """\
269Low level REST query interface
270------------------------------
271
272Use `query_atlas_simple` for simple querys.
273
274Example (query human genes for experiments in which they are up regulated) ::
275
276    >>> run_simple_query(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
277    {u'...
278   
279Or use the `AtlasCondition` subclasses in this module to construct a more
280advanced query and use the `run_query` function.
281
282Example (query human genes annotated to the GO term 'transporter activity'
283that are up regulated in the liver in at least three experiments) ::
284
285    >>> go_cond = GenePropertyCondition("Goterm", "Is", "transporter activity")
286    >>> liver_cond = ExperimentalFactorCondition("Organism_part", "up", 3, "liver")
287    >>> org_cond = OrganismCondition("Homo sapiens")
288    >>> cond_list = ConditionList([go_cond, liver_cond, org_cond])
289    >>> run_query(cond_list)
290    {u'...
291
292"""
293
294import urllib2
295from  StringIO import StringIO
296import json
297from xml.etree.ElementTree import ElementTree
298
299parse_json = json.load
300
301
302def parse_xml(stream):
303    """ Parse an xml stream into an instance of xml.etree.ElementTree.ElementTree.
304    """
305    return ElementTree(file=stream)
306
307
308class GeneExpressionAtlasConenction(object):
309    """ A connection to Gene Expression Atlas database.
310    """
311    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"
312    DEFAULT_CACHE = orngServerFiles.localpath("GeneAtlas", "GeneAtlasConnectionCache.shelve")
313    def __init__(self, address=None, timeout=30, cache=None):
314        """ Initialize the conenction.
315       
316        :param address: Address of the server.
317        :param timeout: Socket timeout.
318        :param cache : A dict like object to use as a cache.
319       
320        """
321        self.address = address if address is not None else self.DEFAULT_ADDRESS
322        self.timeout = timeout
323        self.cache = cache if cache is not None else self.DEFAULT_CACHE
324   
325    def query(self, condition, format="json", start=None, rows=None, indent=False):
326        url = self.address + "api/vx?" + condition.rest()
327        if start is not None and rows is not None:
328            url += "&start={0}&rows={1}".format(start, rows)
329        url += "&format={0}".format(format)
330        if indent:
331            url += "&indent"
332#        print url
333        if self.cache is not None:
334            return self._query_cached(url, format)
335        else:
336            return urllib2.urlopen(url)
337        return response
338   
339    def _query_cached(self, url, format):
340        if self.cache is not None:
341            with self.open_cache() as cache:
342                cached = url in cache
343           
344            if not cached:
345                response = urllib2.urlopen(url)
346                contents = response.read()
347                # Test if the contents is a valid json or xml string (some
348                # times the stream just stops in the middle, so this makes
349                # sure we don't cache an invalid response
350                # TODO: what about errors (e.g. 'cannot handle the
351                # query in a timely fashion'
352                if format == "json":
353                    parse_json(StringIO(contents))
354                else:
355                    parse_xml(StringIO(contents))
356                   
357                with self.open_cache() as cache:
358                    cache[url] = contents
359            else:
360                with self.open_cache() as cache:
361                    contents = cache[url]
362            return StringIO(contents)
363        else:
364            return urllib2.urlopen(url)
365       
366    def open_cache(self):
367        if isinstance(self.cache, basestring):
368            return closing(shelve.open(self.cache))
369        elif hasattr(self.cache, "close"):
370            return closing(self.cache)
371        elif self.cache is None:
372            return fake_closing({})
373        else:
374            return fake_closing(self.cache)
375       
376       
377from contextlib import contextmanager
378@contextmanager
379def fake_closing(obj):
380    yield obj
381   
382   
383# Names of all Gene Property filter names
384GENE_FILTERS = \
385    ["Name", # Gene name
386     "Goterm", #Gene Ontology Term
387     "Interproterm", #InterPro Term
388     "Disease", #Gene-Disease Assocation
389     "Keyword", #Gene Keyword
390     "Protein", #Protein
391
392     "Dbxref", #Other Database Cross-Refs
393     "Embl", #EMBL-Bank ID
394     "Ensfamily", #Ensembl Family
395     "Ensgene", #Ensembl Gene ID
396
397     "Ensprotein", #Ensembl Protein ID
398     "Enstranscript", #Ensembl Transcript ID
399     "Goid", #Gene Ontology ID
400     "Image", #IMAGE ID
401     "Interproid", #InterPro ID
402     "Locuslink", #Entrez Gene ID
403
404     "Omimid", #OMIM ID
405     "Orf", #ORF
406     "Refseq", #RefSeq ID
407     "Unigene", #UniGene ID
408     "Uniprot", #UniProt Accession
409
410     "Hmdb", #HMDB ID
411     "Chebi", #ChEBI ID
412     "Cas", #CAS
413     "Uniprotmetenz", #Uniprotmetenz
414     "Gene", #Gene Name or Identifier
415     "Synonym", #Gene Synonym
416     ]
417   
418# Valid Gene Property filter qualifiers
419GENE_FILTER_QUALIFIERS =\
420    ["Is",
421     "IsNot"
422     ]
423
424# Organisms in the Atlas
425ATLAS_ORGANISMS = \
426    ["Anopheles gambiae",
427     "Arabidopsis thaliana",
428     "Bos taurus",
429     "Caenorhabditis elegans",
430     "Danio rerio",
431     "Drosophila melanogaster",
432     "Epstein barr virus",
433     "Gallus gallus",
434     "Homo sapiens",
435     "Human cytomegalovirus",
436     "Kaposi sarcoma-associated herpesvirus",
437     "Mus musculus",
438     "Rattus norvegicus",
439     "Saccharomyces cerevisiae",
440     "Schizosaccharomyces pombe",
441#     "Unknown",
442     "Xenopus laevis"
443     ]
444   
445def ef_ontology():
446    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
447    """
448    import obiOntology
449    import orngServerFiles
450    # Should this be in the OBOFoundry (Ontology) domain
451    try:
452        file = open(orngServerFiles.localpath_download("ArrayExpress", "efo.obo"), "rb")
453    except urllib2.HTTPError:
454        file = urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo")
455    return obiOntology.OBOOntology(file)
456
457
458class Condition(object):
459    """ Base class for Gene Expression Atlas query condition
460    """
461    def validate(self):
462        """ Validate condition in a subclass.
463        """
464        raise NotImplementedError
465   
466    def rest(self):
467        """ Return a REST query part in a subclass.
468        """
469        raise NotImplementedError
470   
471   
472class ConditionList(list, Condition):
473    """ A list of AtlasCondition instances.
474    """ 
475    def validate(self):
476        for item in self:
477            item.validate()
478       
479    def rest(self):
480        return "&".join(cond.rest() for cond in self)
481
482
483class GenePropertyCondition(Condition):
484    """ An atlas gene filter condition.
485   
486    :param property: Property of the gene. If None or "" all properties
487        will be searched.
488    :param qualifier: Qualifier can be 'Is' or 'IsNot'
489    :param value: The value to search for.
490   
491    Example ::
492   
493        >>> # Condition on a gene name
494        >>> condition = GenePropertyCondition("Name", "Is", "AS3MT")
495        >>> # Condition on genes from a GO Term
496        >>> condition = GenePropertyCondition("Goterm", "Is", "p53 binding")
497        >>> # Condition on disease association
498        >>> condition = GenePropertyCondition("Disease", "Is", "cancer")
499       
500    """
501    def __init__(self, property, qualifier, value):
502        self.property = property or ""
503        self.qualifier = qualifier
504        if isinstance(value, basestring):
505            self.value = value.replace(" ", "+")
506        elif isinstance(value, list):
507            self.value = "+".join(value)
508        else:
509            raise ValueError(value)
510       
511        self.validate()
512       
513    def validate(self):
514        assert(self.property in GENE_FILTERS + [""])
515        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
516       
517    def rest(self):
518        return "gene{property}{qualifier}={value}".format(**self.__dict__)
519       
520       
521class ExperimentalFactorCondition(Condition):
522    """ An atlas experimental factor filter condition.
523   
524    :param factor: EFO experiamntal factor
525    :param regulation: "up", "down", "updown", "any" or "none"
526    :param n: Minimum number of of experimants with this condition
527    :param value: Experimantal factor value
528   
529    Example ::
530   
531        >>> # Any genes up regulated in at least 3 experiments involving cancer.
532        >>> condition = ExperimentalFactorCondition("", "up", 3, "cancer")
533        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
534        >>> condition = ExperimentalFactorCondition("Organism_part", "updown", 1, "heart")
535       
536    """
537    def __init__(self, factor, regulation, n, value):
538        self.factor = factor
539        self.regulation = regulation
540        self.n = n
541        self.value = value
542        self.validate()
543       
544    def validate(self):
545        # TODO: validate the factor and value
546#        assert(self.factor in ef_ontology())
547        assert(self.regulation in ["up", "down", "updown"])
548       
549    def rest(self):
550        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
551       
552       
553class OrganismCondition(Condition):
554    """ Condition on organism.
555    """
556    def __init__(self, organism):
557        self.organism = organism
558        self.validate()
559       
560    def validate(self):
561        assert(self.organism in ATLAS_ORGANISMS)
562       
563    def rest(self):
564        return "species={0}".format(self.organism.replace(" ", "+").lower())
565       
566       
567class ExperimentCondition(Condition):
568    """ Condition on experiement
569   
570    :param property: Property of the experiment. If None or "" all properties
571        will be searched.
572    :param qualifier: Qualifier can be 'Has' or 'HasNot'
573    :param value: The value to search for.
574   
575    Example ::
576   
577        >>> # Condition on a experiemnt acession
578        >>> condition = ExperimentCondition("", "", "E-GEOD-24283")
579        >>> # Condition on experiments involving lung
580        >>> condition = ExperimentCondition("Organism_part", "Has", "lung")
581       
582    """
583    EXPERIMENT_FILTER_QUALIFIERS = [
584                "Has",
585                "HasNot"]
586   
587    def __init__(self, property, qualifier, value):
588        self.property = property
589        self.qualifier = qualifier
590        if isinstance(value, basestring):
591            self.value = value.replace(" ", "+")
592        elif isinstance(value, list):
593            self.value = "+".join(value)
594        else:
595            raise ValueError(value)
596       
597        self.validate()
598       
599    def validate(self):
600        # TODO: check to EFO factors
601#        assert(self.property in EXPERIMENT_FILTERS + [""])
602        assert(self.qualifier in self.EXPERIMENT_FILTER_QUALIFIERS + [""])
603       
604    def rest(self):
605        return "experiment{property}{qualifier}={value}".format(**self.__dict__)
606       
607       
608class GeneExpressionAtlasError(Exception):
609    """ An error response from the Atlas server.
610    """
611    pass
612   
613   
614def __check_atlas_error_json(response):
615    if "error" in response:
616        raise GeneExpressionAtlasError(response["error"])
617    return response
618 
619     
620def __check_atlas_error_xml(response):
621    error = response.find("error")
622    if error is not None:
623        raise GeneExpressionAtlasError(error.text)
624    return response
625   
626       
627def run_simple_query(genes=None, regulation=None, organism=None,
628                     condition=None, format="json", start=None,
629                     rows=None):
630    """ A simple Gene Atlas query.
631   
632    :param genes: A list of gene names to search for.
633    :param regulation: Search for experiments in which `genes` are "up",
634        "down", "updown" or "none" regulated. If None all experiments
635        are searched.
636    :param organism: Search experiments for organism. If None all experiments
637        are searched.
638    :param condition: An EFO factor value (e.g. "brain")
639   
640    Example ::
641       
642        >>> run_simple_query(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
643        {u'...
644       
645        >>> run_simple_query(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
646        {u'...
647       
648        >>> run_simple_query(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
649        {u'...
650       
651    """
652    conditions = ConditionList()
653    if genes:
654        conditions.append(GenePropertyCondition("Gene", "Is", genes))
655    if regulation or condition:
656        regulation = "any" if regulation is None else regulation
657        condition = "" if condition is None else condition
658        conditions.append(ExperimentalFactorCondition("", regulation, 1, condition))
659    if organism:
660        conditions.append(OrganismCondition(organism))
661       
662    connection = GeneExpressionAtlasConenction()
663    results = connection.query(conditions, format=format, start=start,
664                               rows=rows)
665    if format == "json":
666        return parse_json(results)
667    else:
668        return parse_xml(results)
669
670"""\
671.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
672      Need a full list of accepted factors
673"""
674
675def run_query(condition, format="json", start=None, rows=None, indent=False, connection=None):
676    """ Query Atlas based on a `condition` (instance of :class:`Condition`)
677   
678    Example ::
679       
680        >>> condition1 = GenePropertyCondition("Goterm", "Is", "p53 binding")
681        >>> condition2 = ExperimentalFactorCondition("Organism_part", "up", 3, "heart")
682        >>> condition = ConditionList([condition1, condition2])
683        >>> run_query(condition)
684        {u'...
685       
686    """
687    if connection is None:
688        connection = GeneExpressionAtlasConenction()
689    results = connection.query(condition, format=format, start=start,
690                               rows=rows, indent=indent)
691    if format == "json":
692        response = parse_json(results)
693        return __check_atlas_error_json(response)
694    else:
695        response = parse_xml(results)
696        return __check_atlas_error_xml(response)
697   
698def test():
699    from pprint import pprint   
700    pprint(get_atlas_summary(['Pou5f1', 'Dppa3'], 'Mus musculus'))
701       
702    pprint(get_atlas_summary(['PDLIM5', 'FGFR2' ], 'Homo sapiens'))
703    import doctest 
704    doctest.testmod(optionflags=doctest.ELLIPSIS)
705   
706if __name__ == "__main__":
707    test()
Note: See TracBrowser for help on using the repository browser.