source: orange-bioinformatics/_bioinformatics/obiGeneAtlas.py @ 1636:10d234fdadb9

Revision 1636:10d234fdadb9, 23.1 KB checked in by mitar, 2 years ago (diff)

Restructuring because we will not be using namespaces.

Line 
1"""
2========================================
3Gene Expression Atlas (``obiGeneAtlas``)
4========================================
5
6Interface to Gene Expression Atlas.
7
8`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
9gene expression experiments in Array Express Archive.
10
11.. autofunction:: gene_expression_atlas
12
13.. autofunction:: default_gene_matcher
14
15.. autofunction:: to_taxid
16
17"""
18
19from __future__ import absolute_import
20
21import os, shelve, sys
22from collections import defaultdict, namedtuple
23from contextlib import closing
24
25from Orange.orng import orngServerFiles
26from Orange.utils import serverfiles
27
28from . import obiGene
29
30GeneResults = namedtuple("GeneResults", "id name synonyms expressions")
31ExpressionResults = namedtuple("ExpressionResults", "ef efv up down experiments")
32ExperimentExpression = namedtuple("ExperimentExpression", "accession expression pvalue")
33
34##
35GeneAtlasResult = GeneResults
36AtlasExpressions = ExpressionResults
37AtlasExperiment = ExperimentExpression
38##
39
40def _cache(name="AtlasGeneResult.shelve"):
41    """ Return a open cache instance (a shelve object).
42    """
43    if not os.path.exists(orngServerFiles.localpath("GeneAtlas")):
44        try:
45            os.makedirs(orngServerFiles.localpath("GeneAtlas"))
46        except OSError:
47            pass
48    return shelve.open(orngServerFiles.localpath("GeneAtlas", name))
49
50SLEEP_TIME_MULTIPLIER = 3.0
51
52def gene_expression_atlas(genes, progress_callback=None):
53    """ Return GeneResults instances for genes (genes must be valid ensembl ids).
54    """
55    import time
56    genes = list(genes)
57    result_dict = {}
58    genes_not_cached = []
59    # See which genes are already cached
60    with closing(_cache()) as cache:
61        for gene in genes:
62            if str(gene) in cache:
63                result_dict[gene] = cache[str(gene)]
64            else:
65                genes_not_cached.append(gene)
66   
67    batch_size = 10
68    start = 0
69    res = []
70    while start < len(genes_not_cached):
71        batch = genes_not_cached[start: start + batch_size]
72        start += batch_size
73        start_time = time.time()
74        batch_res = batch_gene_atlas_expression(batch)
75        # Cache the new results.
76        # TODO: handle genes without any results.
77        genes_with_no_results = set(batch) - set(r.id for r in batch_res) 
78        with closing(_cache()) as cache:
79            for atlas_res in batch_res:
80                cache[str(atlas_res.id)] = atlas_res
81                result_dict[atlas_res.id] = atlas_res
82            for g in genes_with_no_results:
83                cache[str(g)] = None
84        res.extend(batch_res)
85        # Sleep
86        if start % (batch_size * 10) == 0:
87            # every 10 batches wait one minute before continuing.
88            time.sleep(60)
89        else:
90            time.sleep(min(20.0, SLEEP_TIME_MULTIPLIER*(time.time() - start_time)))
91           
92        if progress_callback:
93            progress_callback(100.0 * start / len(genes_not_cached))
94   
95    return [result_dict.get(g, None) for g in genes]
96
97   
98def batch_gene_atlas_expression(genes):
99    cond = GenePropertyCondition("Ensgene", "Is", genes)
100    res = run_query(cond, format="json")
101    results = res["results"]
102    results_genes = []
103    for one_result in results:
104        gene = one_result["gene"]
105        id = gene["id"]
106        name = gene["name"]
107        synonyms = gene.get("synonyms", [])
108        expressions = one_result["expressions"]
109        result_expressions = []
110        for expression in expressions:
111            ef = expression["ef"]
112            efv = expression["efv"]
113            up = expression["upExperiments"]
114            down = expression["downExperiments"]
115            experiments = expression["experiments"]
116            result_experiment = []
117            for exp in experiments:
118                exp_accession = exp["accession"]
119                updown = exp["expression"]
120                pval = exp["pvalue"]
121                result_experiment.append(ExperimentExpression(exp_accession, updown, pval))
122            result_expressions.append(ExpressionResults(ef, efv, up, down, result_experiment))
123        results_genes.append(GeneResults(id, name, synonyms, result_expressions))
124    return results_genes
125
126
127def default_gene_matcher(organism):
128    """ Return a default gene matcher for organism
129    (targeting Ensembl gene ids).
130   
131    """
132    taxid = to_taxid(organism)
133    matcher = obiGene.matcher([obiGene.GMEnsembl(taxid),
134                               obiGene.GMNCBI(taxid)])
135    matcher.set_targets(obiGene.EnsembleGeneInfo(taxid).keys())
136    return matcher
137
138from Orange.utils import lru_cache
139
140@lru_cache(maxsize=3)
141def _cached_default_gene_matcher(organism): 
142    return default_gene_matcher(organism)
143   
144
145def get_atlas_summary(genes, organism, gene_matcher=None,
146                      progress_callback=None):
147    """ Return 3 dictionaries containing a summary of atlas information
148    about three experimental factors:
149   
150        - Organism Part (OP)
151        - Disease State (DS)
152        - Cell type (CT)
153   
154    Each dictionary contains query genes as keys. Values are dictionaries
155    mapping factor values to a 2-tuple containig the count of up regulated
156    and down regulated experiments.
157   
158    Example ::
159   
160        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
161        ({u'RUNX1': ...
162       
163    """
164    if gene_matcher is None:
165        gene_matcher = _cached_default_gene_matcher(organism)
166       
167    matched, unmatched = [], []
168    for gene, match in zip(genes, map(gene_matcher.umatch, genes)):
169        if match:
170            matched.append(match)
171        else:
172            unmatched.append(gene)
173    if unmatched:
174        import warnings
175        warnings.warn("Unmatched genes " + "," .join(["%r" % g for g in unmatched]))
176   
177    results = gene_expression_atlas(matched, progress_callback=progress_callback)
178   
179    def collect_ef_summary(result, ef, summary):
180        for exp in result.expressions:
181            if exp.ef == ef:
182                if any([exp.up, exp.down]):
183                    summary[result.name][exp.efv] = (exp.up, exp.down)
184       
185           
186    op, ds, ct = defaultdict(dict), defaultdict(dict), defaultdict(dict)
187    for res in results:
188        if res:
189            collect_ef_summary(res, "organism_part", op)
190            collect_ef_summary(res, "disease_state", ds)
191            collect_ef_summary(res, "cell_type", ct)
192       
193    return dict(op), dict(ds), dict(ct)
194
195def drop_none(iter):
196    """ Drop all ``None`` from the iterator.
197    """
198    for e in iter:
199        if e is not None:
200            yield e
201           
202def construct_atlas_gene_sets(genes, organism, factors=["organism_part",
203                                    "disease_state", "cell_type"],
204                              max_pvalue=1e-5):
205    """ Construct gene sets for atlas experimental factor values in
206    ``factors``.
207    """
208    results = gene_expression_atlas(genes)
209    sets = defaultdict(list)
210   
211    for res in drop_none(results):
212        for exp in res.expressions:
213            if exp.ef not in factors:
214                continue
215            diff_exp = [e for e in exp.experiments \
216                        if e.pvalue <= max_pvalue]
217            if diff_exp:
218                sets[exp.ef, exp.efv].append(res.id)
219
220    organism = "+".join(organism.lower().split())
221    from .obiGeneSets import GeneSets, GeneSet
222   
223    def display_string(name):
224        return name.capitalize().replace("_", " ")
225   
226    gene_sets = []
227    for (ef, efv), genes in sets.items():
228        ef_display = display_string(ef)
229        gs = GeneSet(genes, "Diff. expressed in %s=%s." % (ef_display, efv), id=ef + ":" + efv,
230                     description="Diff. expressed in %s=%s" % (ef_display, efv),
231                     link="http://www.ebi.ac.uk/gxa/qrs?specie_0={organism}&gprop_0=&gnot_0=&gval_0=&fact_1=&fexp_1=UPDOWN&fmex_1=&fval_1=%22{efv}%22+&view=hm".format( \
232                            organism=organism, efv="+".join(efv.lower().split())),
233                     hierarchy=("Tissues", ef_display))
234        gene_sets.append(gs)
235    return GeneSets(gene_sets)
236
237
238# Mapping for common taxids from obiTaxonomy
239TAXID_TO_ORG = {"": "Anopheles gambiae",
240                "3702": "Arabidopsis thaliana",
241                "9913": "Bos taurus",
242                "6239": "Caenorhabditis elegans",
243                "7955": "Danio rerio",
244                "7227": "Drosophila melanogaster",
245                "": "Epstein barr virus",
246                "": "Gallus gallus",
247                "9606": "Homo sapiens",
248                "": "Human cytomegalovirus",
249                "": "Kaposi sarcoma-associated herpesvirus",
250                "10090": "Mus musculus",
251                "10116": "Rattus norvegicus",
252                "4932": "Saccharomyces cerevisiae",
253                "4896": "Schizosaccharomyces pombe",
254                "8355": "Xenopus laevis"
255     }
256
257def to_taxid(name):
258    dd = dict((v, k) for k, v in TAXID_TO_ORG.items())
259    if name in dd:
260        return dd[name]
261    else:
262        import .obiTaxonomy as tax
263        ids = tax.to_taxid(name, mapTo=TAXID_TO_ORG.keys())
264        if ids:
265            return ids.pop()
266        else:
267            raise ValueError("Unknown organism.")
268
269
270__doc__ += """\
271Low level REST query interface
272------------------------------
273
274Use `query_atlas_simple` for simple querys.
275
276Example (query human genes for experiments in which they are up regulated) ::
277
278    >>> run_simple_query(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
279    {u'...
280   
281Or use the `AtlasCondition` subclasses in this module to construct a more
282advanced query and use the `run_query` function.
283
284Example (query human genes annotated to the GO term 'transporter activity'
285that are up regulated in the liver in at least three experiments) ::
286
287    >>> go_cond = GenePropertyCondition("Goterm", "Is", "transporter activity")
288    >>> liver_cond = ExperimentalFactorCondition("Organism_part", "up", 3, "liver")
289    >>> org_cond = OrganismCondition("Homo sapiens")
290    >>> cond_list = ConditionList([go_cond, liver_cond, org_cond])
291    >>> run_query(cond_list)
292    {u'...
293
294"""
295
296import urllib2
297from  StringIO import StringIO
298import json
299from xml.etree.ElementTree import ElementTree
300
301parse_json = json.load
302
303
304def parse_xml(stream):
305    """ Parse an xml stream into an instance of xml.etree.ElementTree.ElementTree.
306    """
307    return ElementTree(file=stream)
308
309
310class GeneExpressionAtlasConenction(object):
311    """ A connection to Gene Expression Atlas database.
312    """
313    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"
314    DEFAULT_CACHE = orngServerFiles.localpath("GeneAtlas", "GeneAtlasConnectionCache.shelve")
315    def __init__(self, address=None, timeout=30, cache=None):
316        """ Initialize the conenction.
317       
318        :param address: Address of the server.
319        :param timeout: Socket timeout.
320        :param cache : A dict like object to use as a cache.
321       
322        """
323        self.address = address if address is not None else self.DEFAULT_ADDRESS
324        self.timeout = timeout
325        self.cache = cache if cache is not None else self.DEFAULT_CACHE
326   
327    def query(self, condition, format="json", start=None, rows=None, indent=False):
328        url = self.address + "api/vx?" + condition.rest()
329        if start is not None and rows is not None:
330            url += "&start={0}&rows={1}".format(start, rows)
331        url += "&format={0}".format(format)
332        if indent:
333            url += "&indent"
334#        print url
335        if self.cache is not None:
336            return self._query_cached(url, format)
337        else:
338            return urllib2.urlopen(url)
339        return response
340   
341    def _query_cached(self, url, format):
342        if self.cache is not None:
343            with self.open_cache() as cache:
344                cached = url in cache
345           
346            if not cached:
347                response = urllib2.urlopen(url)
348                contents = response.read()
349                # Test if the contents is a valid json or xml string (some
350                # times the stream just stops in the middle, so this makes
351                # sure we don't cache an invalid response
352                # TODO: what about errors (e.g. 'cannot handle the
353                # query in a timely fashion'
354                if format == "json":
355                    parse_json(StringIO(contents))
356                else:
357                    parse_xml(StringIO(contents))
358                   
359                with self.open_cache() as cache:
360                    cache[url] = contents
361            else:
362                with self.open_cache() as cache:
363                    contents = cache[url]
364            return StringIO(contents)
365        else:
366            return urllib2.urlopen(url)
367       
368    def open_cache(self):
369        if isinstance(self.cache, basestring):
370            return closing(shelve.open(self.cache))
371        elif hasattr(self.cache, "close"):
372            return closing(self.cache)
373        elif self.cache is None:
374            return fake_closing({})
375        else:
376            return fake_closing(self.cache)
377       
378       
379from contextlib import contextmanager
380@contextmanager
381def fake_closing(obj):
382    yield obj
383   
384   
385# Names of all Gene Property filter names
386GENE_FILTERS = \
387    ["Name", # Gene name
388     "Goterm", #Gene Ontology Term
389     "Interproterm", #InterPro Term
390     "Disease", #Gene-Disease Assocation
391     "Keyword", #Gene Keyword
392     "Protein", #Protein
393
394     "Dbxref", #Other Database Cross-Refs
395     "Embl", #EMBL-Bank ID
396     "Ensfamily", #Ensembl Family
397     "Ensgene", #Ensembl Gene ID
398
399     "Ensprotein", #Ensembl Protein ID
400     "Enstranscript", #Ensembl Transcript ID
401     "Goid", #Gene Ontology ID
402     "Image", #IMAGE ID
403     "Interproid", #InterPro ID
404     "Locuslink", #Entrez Gene ID
405
406     "Omimid", #OMIM ID
407     "Orf", #ORF
408     "Refseq", #RefSeq ID
409     "Unigene", #UniGene ID
410     "Uniprot", #UniProt Accession
411
412     "Hmdb", #HMDB ID
413     "Chebi", #ChEBI ID
414     "Cas", #CAS
415     "Uniprotmetenz", #Uniprotmetenz
416     "Gene", #Gene Name or Identifier
417     "Synonym", #Gene Synonym
418     ]
419   
420# Valid Gene Property filter qualifiers
421GENE_FILTER_QUALIFIERS =\
422    ["Is",
423     "IsNot"
424     ]
425
426# Organisms in the Atlas
427ATLAS_ORGANISMS = \
428    ["Anopheles gambiae",
429     "Arabidopsis thaliana",
430     "Bos taurus",
431     "Caenorhabditis elegans",
432     "Danio rerio",
433     "Drosophila melanogaster",
434     "Epstein barr virus",
435     "Gallus gallus",
436     "Homo sapiens",
437     "Human cytomegalovirus",
438     "Kaposi sarcoma-associated herpesvirus",
439     "Mus musculus",
440     "Rattus norvegicus",
441     "Saccharomyces cerevisiae",
442     "Schizosaccharomyces pombe",
443#     "Unknown",
444     "Xenopus laevis"
445     ]
446   
447def ef_ontology():
448    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
449    """
450    from . import obiOntology
451    from . import orngServerFiles
452    # Should this be in the OBOFoundry (Ontology) domain
453    try:
454        file = open(orngServerFiles.localpath_download("ArrayExpress", "efo.obo"), "rb")
455    except urllib2.HTTPError:
456        file = urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo")
457    return obiOntology.OBOOntology(file)
458
459
460class Condition(object):
461    """ Base class for Gene Expression Atlas query condition
462    """
463    def validate(self):
464        """ Validate condition in a subclass.
465        """
466        raise NotImplementedError
467   
468    def rest(self):
469        """ Return a REST query part in a subclass.
470        """
471        raise NotImplementedError
472   
473   
474class ConditionList(list, Condition):
475    """ A list of AtlasCondition instances.
476    """ 
477    def validate(self):
478        for item in self:
479            item.validate()
480       
481    def rest(self):
482        return "&".join(cond.rest() for cond in self)
483
484
485class GenePropertyCondition(Condition):
486    """ An atlas gene filter condition.
487   
488    :param property: Property of the gene. If None or "" all properties
489        will be searched.
490    :param qualifier: Qualifier can be 'Is' or 'IsNot'
491    :param value: The value to search for.
492   
493    Example ::
494   
495        >>> # Condition on a gene name
496        >>> condition = GenePropertyCondition("Name", "Is", "AS3MT")
497        >>> # Condition on genes from a GO Term
498        >>> condition = GenePropertyCondition("Goterm", "Is", "p53 binding")
499        >>> # Condition on disease association
500        >>> condition = GenePropertyCondition("Disease", "Is", "cancer")
501       
502    """
503    def __init__(self, property, qualifier, value):
504        self.property = property or ""
505        self.qualifier = qualifier
506        if isinstance(value, basestring):
507            self.value = value.replace(" ", "+")
508        elif isinstance(value, list):
509            self.value = "+".join(value)
510        else:
511            raise ValueError(value)
512       
513        self.validate()
514       
515    def validate(self):
516        assert(self.property in GENE_FILTERS + [""])
517        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
518       
519    def rest(self):
520        return "gene{property}{qualifier}={value}".format(**self.__dict__)
521       
522       
523class ExperimentalFactorCondition(Condition):
524    """ An atlas experimental factor filter condition.
525   
526    :param factor: EFO experiamntal factor
527    :param regulation: "up", "down", "updown", "any" or "none"
528    :param n: Minimum number of of experimants with this condition
529    :param value: Experimantal factor value
530   
531    Example ::
532   
533        >>> # Any genes up regulated in at least 3 experiments involving cancer.
534        >>> condition = ExperimentalFactorCondition("", "up", 3, "cancer")
535        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
536        >>> condition = ExperimentalFactorCondition("Organism_part", "updown", 1, "heart")
537       
538    """
539    def __init__(self, factor, regulation, n, value):
540        self.factor = factor
541        self.regulation = regulation
542        self.n = n
543        self.value = value
544        self.validate()
545       
546    def validate(self):
547        # TODO: validate the factor and value
548#        assert(self.factor in ef_ontology())
549        assert(self.regulation in ["up", "down", "updown"])
550       
551    def rest(self):
552        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
553       
554       
555class OrganismCondition(Condition):
556    """ Condition on organism.
557    """
558    def __init__(self, organism):
559        self.organism = organism
560        self.validate()
561       
562    def validate(self):
563        assert(self.organism in ATLAS_ORGANISMS)
564       
565    def rest(self):
566        return "species={0}".format(self.organism.replace(" ", "+").lower())
567       
568       
569class ExperimentCondition(Condition):
570    """ Condition on experiement
571   
572    :param property: Property of the experiment. If None or "" all properties
573        will be searched.
574    :param qualifier: Qualifier can be 'Has' or 'HasNot'
575    :param value: The value to search for.
576   
577    Example ::
578   
579        >>> # Condition on a experiemnt acession
580        >>> condition = ExperimentCondition("", "", "E-GEOD-24283")
581        >>> # Condition on experiments involving lung
582        >>> condition = ExperimentCondition("Organism_part", "Has", "lung")
583       
584    """
585    EXPERIMENT_FILTER_QUALIFIERS = [
586                "Has",
587                "HasNot"]
588   
589    def __init__(self, property, qualifier, value):
590        self.property = property
591        self.qualifier = qualifier
592        if isinstance(value, basestring):
593            self.value = value.replace(" ", "+")
594        elif isinstance(value, list):
595            self.value = "+".join(value)
596        else:
597            raise ValueError(value)
598       
599        self.validate()
600       
601    def validate(self):
602        # TODO: check to EFO factors
603#        assert(self.property in EXPERIMENT_FILTERS + [""])
604        assert(self.qualifier in self.EXPERIMENT_FILTER_QUALIFIERS + [""])
605       
606    def rest(self):
607        return "experiment{property}{qualifier}={value}".format(**self.__dict__)
608       
609       
610class GeneExpressionAtlasError(Exception):
611    """ An error response from the Atlas server.
612    """
613    pass
614   
615   
616def __check_atlas_error_json(response):
617    if "error" in response:
618        raise GeneExpressionAtlasError(response["error"])
619    return response
620 
621     
622def __check_atlas_error_xml(response):
623    error = response.find("error")
624    if error is not None:
625        raise GeneExpressionAtlasError(error.text)
626    return response
627   
628       
629def run_simple_query(genes=None, regulation=None, organism=None,
630                     condition=None, format="json", start=None,
631                     rows=None):
632    """ A simple Gene Atlas query.
633   
634    :param genes: A list of gene names to search for.
635    :param regulation: Search for experiments in which `genes` are "up",
636        "down", "updown" or "none" regulated. If None all experiments
637        are searched.
638    :param organism: Search experiments for organism. If None all experiments
639        are searched.
640    :param condition: An EFO factor value (e.g. "brain")
641   
642    Example ::
643       
644        >>> run_simple_query(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
645        {u'...
646       
647        >>> run_simple_query(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
648        {u'...
649       
650        >>> run_simple_query(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
651        {u'...
652       
653    """
654    conditions = ConditionList()
655    if genes:
656        conditions.append(GenePropertyCondition("Gene", "Is", genes))
657    if regulation or condition:
658        regulation = "any" if regulation is None else regulation
659        condition = "" if condition is None else condition
660        conditions.append(ExperimentalFactorCondition("", regulation, 1, condition))
661    if organism:
662        conditions.append(OrganismCondition(organism))
663       
664    connection = GeneExpressionAtlasConenction()
665    results = connection.query(conditions, format=format, start=start,
666                               rows=rows)
667    if format == "json":
668        return parse_json(results)
669    else:
670        return parse_xml(results)
671
672"""\
673.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
674      Need a full list of accepted factors
675"""
676
677def run_query(condition, format="json", start=None, rows=None, indent=False, connection=None):
678    """ Query Atlas based on a `condition` (instance of :class:`Condition`)
679   
680    Example ::
681       
682        >>> condition1 = GenePropertyCondition("Goterm", "Is", "p53 binding")
683        >>> condition2 = ExperimentalFactorCondition("Organism_part", "up", 3, "heart")
684        >>> condition = ConditionList([condition1, condition2])
685        >>> run_query(condition)
686        {u'...
687       
688    """
689    if connection is None:
690        connection = GeneExpressionAtlasConenction()
691    results = connection.query(condition, format=format, start=start,
692                               rows=rows, indent=indent)
693    if format == "json":
694        response = parse_json(results)
695        return __check_atlas_error_json(response)
696    else:
697        response = parse_xml(results)
698        return __check_atlas_error_xml(response)
699   
700def test():
701    from pprint import pprint   
702    pprint(get_atlas_summary(['Pou5f1', 'Dppa3'], 'Mus musculus'))
703       
704    pprint(get_atlas_summary(['PDLIM5', 'FGFR2' ], 'Homo sapiens'))
705    import doctest 
706    doctest.testmod(optionflags=doctest.ELLIPSIS)
707   
708if __name__ == "__main__":
709    test()
Note: See TracBrowser for help on using the repository browser.