source: orange-bioinformatics/obiArrayExpress.py @ 1347:dd168923331f

Revision 1347:dd168923331f, 23.3 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff)

Added some docstrings, added results parsing in the module level functions.

Line 
1"""
2Array Express
3-------------
4
5A python module for accessing the ArrayExpress and GeneExpressionAtlas web services.
6
7Example::
8
9    >>> import obiArrayExpress
10    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
11    {u'experiments': ...
12   
13    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
14    XMLNode(...
15   
16.. note: Currently querying ArrayExpress files only works with the xml format.
17 
18"""
19
20import os, sys
21import urllib2
22
23import orngEnviron
24import warnings
25import posixpath
26import shelve
27import json
28from collections import defaultdict
29
30parse_json = json.load
31
32def parse_xml(stream):
33    #TODO: parse stream, return the same structure as parse_json           
34    from Orange.misc.xml import parse
35    tree = parse(stream)
36    return tree
37   
38#    raise NotImplementedError()
39
40
41# All searchable fields of ArrayExpress (see query_experiments docstring
42# for a description of the fields)
43ARRAYEXPRESS_FIELDS = \
44    ["keywords",
45     "accession",
46     "array",
47     "ef",
48     "efv",
49     "expdesign",
50     "exptype",
51     "gxa",
52     "pmid",
53     "sa",
54     "species",
55     "expandefo",
56     "directsub",
57     "assaycount",
58     "efcount",
59     "samplecount",
60     "sacount",
61     "rawcount",
62     "fgemcount",
63     "miamescore",
64     "date",
65     "wholewords",
66    ]
67
68class ArrayExpressConnection(object):
69    """ A connection to the ArrayExpress database with query caching.
70    """
71   
72    try:
73        DEFAULT_CACHE = shelve.open(os.path.join(orngEnviron.bufferDir, "ArrayExpressCache.shelve"))
74    except ImportError:
75        warnings.warn("Could not load persistent cache!")
76        DEFAULT_CACHE = {}
77   
78    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
79    DEFAULT_FORMAT = "json"
80   
81    # Order of arguments in the query
82    _ARGS_ORDER = ["keywords", "species", "array"]
83   
84    def __init__(self, address=None, cache=None, timeout=30,
85                 username=None, password=None):
86        """ Initialize the connection object.
87       
88        :param address: address of the ArrayExpress API
89        :param cache: a dict like object to use for caching
90        :param timeout: timeout for the socket connection
91       
92        .. todo: implement user login (see Accessing Private Data in API docs)
93       
94        """
95        self.address = address if address is not None else self.DEFAULT_ADDRESS
96        self.cache = cache if cache is not None else self.DEFAULT_CACHE
97        self.timeout = timeout
98       
99    def format_query(self, **kwargs):
100        """ Format the query arguments.
101       
102        Example::
103            >>> conn.format_query(gxa=True, efcount=(1, 5))
104            'efcount=[1 TO 5]&gxa=true'
105           
106        """
107        # Formaters:
108        def format_default(val):
109            if isinstance(val, basestring):
110                return val
111            else:
112                return "+".join(val)
113        def format_species(val):
114            return '"%s"' % val.lower()
115        def format_gxa(val):
116            if val:
117                return "true"
118            else:
119                raise ValueError("gxa={0}".format(val))
120        def format_expandefo(val):
121            if val:
122                return "on"
123            else:
124                raise ValueError("expandefo={0}".format(val))
125        def format_true_false(val):
126            return "true" if val else "false"
127        def format_interval(val):
128            if isinstance(val, tuple):
129                return "[{0} TO {1}]".format(*val)
130            else:
131                raise ValueError("Must be an interval argument (min, max)!")
132        def format_date(val):
133            return val
134        def format_wholewords(val):
135            if val:
136                return "on"
137            else:
138                raise ValueError("wholewords={0}".format(val))
139       
140        formaters = {"species": format_species,
141                     "gxa": format_gxa,
142                     "expandefo": format_expandefo,
143                     "directsub": format_true_false,
144                     "assaycount": format_interval,
145                     "efcount": format_interval,
146                     "samplecount": format_interval,
147                     "sacount": format_interval,
148                     "rawcount": format_interval,
149                     "fgemcount": format_interval,
150                     "miamescore": format_interval,
151                     "date": format_date,
152                     "wholewords": format_wholewords,
153                     }
154        parts = []
155        arg_items = kwargs.items()
156        ordered = sorted(arg_items, key=lambda arg: self._ARGS_ORDER.index(arg[0]) \
157                         if arg[0] in self._ARGS_ORDER else 100)
158       
159        for key, value in kwargs.iteritems():
160            if key == "format":
161                continue # format is handled in query_url
162            if key not in ARRAYEXPRESS_FIELDS:
163                raise ValueError("Invalid argument name: '{0}'".format(key))
164            if value is not None and value != []:
165                fmt = formaters.get(key, format_default)
166                value = fmt(value)
167                parts.append("{0}={1}".format(key, value))
168                 
169        return "&".join(parts)
170       
171    def query_url(self, what="experiments", **kwargs):
172        """ Return a formated query URL for the query arguments
173       
174        Example::
175            >>> conn.query_url(accession="E-MEXP-31")
176            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
177           
178        """
179        query = self.format_query(**kwargs)
180        url = posixpath.join(self.address, what)
181        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
182        url = url + ("?" + query if query else "")
183#        print url
184        url = url.replace(" ", "%20")
185        return url
186   
187    def query_url_experiments(self, **kwargs):
188        """ Return a formated experiments query url for the calls arguments
189        """
190        return self.query_url("experiments", **kwargs)
191   
192    def query_url_files(self, **kwargs):
193        """ Return a formated experiments query url for the calls arguments
194        """
195        return self.query_url("files", **kwargs)
196   
197    def query_experiment(self, **kwargs):
198        """ Return an open stream to the experiments query results.
199       
200        .. note: This member function takes the same arguments as the module
201                 level query_experiemnts function.
202         
203        """
204        url = self.query_url_experiments(**kwargs)
205        stream = urllib2.urlopen(url, timeout=self.timeout)
206        return stream
207   
208    def query_files(self, **kwargs):
209        """ Return an open stream to the files query results.
210       
211        .. note: This member function takes the same arguments as the module
212                 level query_files function.
213        """
214        url = self.query_url_files(**kwargs)
215        stream = urllib2.urlopen(url, timeout=self.timeout)
216        return stream
217   
218    def open_file(self, accession, kind="raw", ext=None):
219        """ Return a file handle to experiment data.
220        Possible values for kind:
221            - raw: return the raw data if available
222            - fgem: return the processed data if available
223            - biosamples: a png or svg design image
224            - idf: investigation description
225            - adf: array design description
226            - mageml: MAGE-ML file
227           
228        Example::
229       
230            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
231            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")
232             
233        """
234        from Orange.misc.xml import parse
235        files = parse(self.query_files(accession=accession), format="xml")
236        files = list(files.elements("file"))
237        for file in files:
238            filekind = file.elements("kind").next()
239            fileext = file.elements("extension").next()
240            if filekind.data.strip() == kind and (fileext.data.strip() == ext or ext is None): 
241                url = file.elements("url").next()
242                return urllib2.urlopen(url.data.strip(), timeout=self.timeout)
243   
244   
245def query_experiments(keywords=None, accession=None, array=None, ef=None,
246                      efv=None, expdesign=None, exptype=None,
247                      gxa=None, pmid=None, sa=None, species=None,
248                      expandefo=None, directsub=None, assaycount=None,
249                      efcount=None, samplecount=None, rawcount=None,
250                      fgemcount=None, miamescore=None, date=None, 
251                      format="json", wholewords=None, connection=None):
252    """ Query Array Express experiments.
253   
254    :param keywords: A list of keywords to search (e.g. ['gliobastoma']
255    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
256    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
257    :param ef: Experimental factor (names of main variables of experiments)
258    :param efv: Experimental factor value (Has EFO expansion)
259    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
260    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
261    :param gxa: If True limit the results to experiments from the Gene
262        Expreission Atlas only.
263    :param pmid: Search by PubMed identifier
264    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
265    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)
266   
267    :param expandefo: If True expand the search terms with all its child terms
268        in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
269        will be expanded to include for synonyms and sub types of cancer).
270    :param directsub: If True return only experiments submited directly to
271        Array Express else if False return only experiments imported from GEO
272        database (default None, return both)
273    :param assaycount: A two tuple (min, max) for filter on the number of
274        assays (e.g. (1, 5) will return only experiments with at least one
275        and no more then 5 assays).
276    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
277    :param sacount: Filter on the number of sample attribute categories
278    :param rawcount: Filter on the number or raw files
279    :param fgemcount: Filter on the number of final gene expression matrix
280        (processed data) files
281    :param miamescore: Filter on the MIAME complience score (max 5)
282    :param date: Filter by release date
283   
284    Example ::
285   
286        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
287        {u'experiments': ...
288       
289    .. _EFO: http://www.ebi.ac.uk/efo/
290   
291    """
292    if connection is None:
293        connection = ArrayExpressConnection()
294       
295    stream = connection.query_experiment(keywords=keywords, accession=accession,
296                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
297                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
298                directsub=directsub, assaycount=assaycount, efcount=efcount,
299                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
300                miamescore=miamescore, date=date,  format=format,
301                wholewords=wholewords)
302   
303    if format == "json":
304        return parse_json(stream)
305    else:
306        return parse_xml(stream)
307
308def query_files(**kwargs):
309    """ Query Array Express files.
310   
311    This function accepts the same arguments as `query_experiments`.
312   
313    Example ::
314   
315        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
316        XMLNode(...
317                       
318    """
319    connection = kwargs.get("connection", None)
320    if connection is None:
321        connection = ArrayExpressConnection()
322   
323    stream = connection.query_files(**kwargs)
324    if kwargs.get("format", "json") == "json":
325        return parse_json(stream)
326    else:
327        return parse_xml(stream)
328   
329__doc__ += """\
330Gene Expression Atlas
331---------------------
332
333Use `query_atlas_simple` for simple querys.
334
335Example (query human genes for experiments in which they are up regulated)::
336    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
337    {u'...
338   
339"""
340
341class GeneExpressionAtlasConenction(object):
342    """ A connection to Gene Expression Atlas database.
343    """
344    try:
345        DEFAULT_CACHE = shelve.open(os.path.join(orngEnviron.bufferDir, "GeneExpressionAtlasCache.shelve"))
346    except ImportError:
347        warnings.warn("Could not load persistent cache!")
348        DEFAULT_CACHE = {}
349   
350    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"
351   
352    def __init__(self, address=None, cache=None, timeout=30):
353        """ Initialize the conenction.
354       
355        :param address: Address of the server
356        :param cache: A dict like object to use for caching
357        :timeout: Socket timeout.
358       
359        """
360        self.address = address if address is not None else self.DEFAULT_ADDRESS
361        self.cache = cache if cache is not None else self.DEFAULT_CACHE
362        self.timeout = timeout
363       
364    def format_query(self,):
365        pass
366   
367    def query(self, condition, format="json", start=None, rows=None, indent=False):
368        url = self.address + "api?" + condition.rest()
369        if start and rows:
370            url += "&start={0}&rows={1}".format(start, rows)
371        url += "&format={0}".format(format)
372        if indent:
373            url += "&indent"
374#        print url
375        response = urllib2.urlopen(url)
376        return response
377   
378GENE_FILTERS = \
379    ["Name", # Gene name
380     "Goterm", #Gene Ontology Term
381     "Interproterm", #InterPro Term
382     "Disease", #Gene-Disease Assocation
383     "Keyword", #Gene Keyword
384     "Protein", #Protein
385
386     "Dbxref", #Other Database Cross-Refs
387     "Embl", #EMBL-Bank ID
388     "Ensfamily", #Ensembl Family
389     "Ensgene", #Ensembl Gene ID
390
391     "Ensprotein", #Ensembl Protein ID
392     "Enstranscript", #Ensembl Transcript ID
393     "Goid", #Gene Ontology ID
394     "Image", #IMAGE ID
395     "Interproid", #InterPro ID
396     "Locuslink", #Entrez Gene ID
397
398     "Omimid", #OMIM ID
399     "Orf", #ORF
400     "Refseq", #RefSeq ID
401     "Unigene", #UniGene ID
402     "Uniprot", #UniProt Accession
403
404     "Hmdb", #HMDB ID
405     "Chebi", #ChEBI ID
406     "Cas", #CAS
407     "Uniprotmetenz", #Uniprotmetenz
408     "Gene", #Gene Name or Identifier
409     "Synonym", #Gene Synonym
410     ]
411   
412GENE_FILTER_QUALIFIERS =\
413    ["Is",
414     "IsNot"
415     ]
416
417ATLAS_ORGANISMS = \
418    ["Anopheles gambiae",
419     "Arabidopsis thaliana",
420     "Bos taurus",
421     "Caenorhabditis elegans",
422     "Danio rerio",
423     "Drosophila melanogaster",
424     "Epstein barr virus",
425     "Gallus gallus",
426     "Homo sapiens",
427     "Human cytomegalovirus",
428     "Kaposi sarcoma-associated herpesvirus",
429     "Mus musculus",
430     "Rattus norvegicus",
431     "Saccharomyces cerevisiae",
432     "Schizosaccharomyces pombe",
433     "Unknown",
434     "Xenopus laevis"
435     ]
436   
437def ef_ontology():
438    """ Return the EF (Experimental Factor) ontology
439    """
440    import obiOntology
441#    return obiOntology.OBOOntology(urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo"))
442    import orngServerFiles
443    # Should this be in the OBOFoundry (Ontology) domain
444    file_name = orngServerFiles.localpath_download("ArrayExpress", "efo.obo")
445    return obiOntology.OBOOntology(open(filename, "rb"))
446
447
448class AtlasCondition(object):
449    """ Base class for Gene Expression Atlas query condition
450    """
451    def validate(self):
452        """ Validate condition in a subclass.
453        """
454        raise NotImplementedError
455   
456    def rest(self):
457        """ Return a REST query part in a subclass.
458        """
459        raise NotImplementedError
460   
461   
462class AtlasConditionList(list, AtlasCondition):
463    """ A list of AtlasCondition instances.
464    """ 
465    def validate(self):
466        for item in self:
467            item.validate()
468       
469    def rest(self):
470        return "&".join(cond.rest() for cond in self)
471
472class AtlasConditionGeneProperty(AtlasCondition):
473    """ An atlas gene filter condition.
474   
475    :param property: Property of the gene. If None or "" all properties
476        will be searched.
477    :param qualifier: Qualifier can be 'Is' or 'IsNot'
478    :param value: The value to search for.
479   
480    Example ::
481   
482        >>> # Condition on a gene name
483        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
484        >>> # Condition on genes from a GO Term
485        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
486        >>> # Condition on disease association
487        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")
488       
489    """
490    def __init__(self, property, qualifier, value):
491        self.property = property or ""
492        self.qualifier = qualifier
493        if isinstance(value, basestring):
494            self.value = value.replace(" ", "+")
495        elif isinstance(value, list):
496            self.value = "+".join(value)
497        else:
498            raise ValueError(value)
499       
500        self.validate()
501       
502    def validate(self):
503        assert(self.property in GENE_FILTERS + [""])
504        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
505       
506    def rest(self):
507        return "gene{property}{qualifier}={value}".format(**self.__dict__)
508       
509       
510class AtlasConditionExperimentalFactor(AtlasCondition):
511    """ An atlas experimental factor filter condition.
512   
513    :param factor: EFO experiamntal factor
514    :param regulation: "up", "down", "updown", "any" or "none"
515    :param n: Minimum number of of experimants with this condition
516    :param value: Experimantal factor value
517   
518    Example ::
519   
520        >>> # Any genes up regulated in at least 3 experiments involving cancer.
521        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
522        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
523        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")
524       
525    """
526    def __init__(self, factor, regulation, n, value):
527        self.factor = factor
528        self.regulation = regulation
529        self.n = n
530        self.value = value
531        self.validate()
532       
533    def validate(self):
534        # TODO: validate the factor and value
535#        assert(self.factor in ef_ontology())
536        assert(self.regulation in ["up", "down", "updown"])
537       
538    def rest(self):
539        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
540       
541class AtlasConditionOrganism(AtlasCondition):
542    """ Condition on organism.
543    """
544    def __init__(self, organism):
545        self.organism = organism
546        self.validate()
547       
548    def validate(self):
549        assert(self.organism in ATLAS_ORGANISMS)
550       
551    def rest(self):
552        return "species={0}".format(self.organism.replace(" ", "+").lower())
553       
554   
555def query_atlas_simple(genes=None, regulation=None, organism=None,
556                       condition=None, format="json", start=None,
557                       rows=None, indent=False):
558    """ A simple Atlas query.
559   
560    :param genes: A list of gene names to search for.
561    :param regulation: Search for experiments in which `genes` are "up",
562        "down", "updown" or "none" regulated. If None all experiments
563        are searched.
564    :param organism: Search experiments for organism. If None all experiments
565        are searched.
566    :param condition: An EFO factor value (e.g. "brain")
567   
568    Example::
569       
570        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
571        {u'...
572       
573        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
574        {u'...
575       
576        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
577        {u'...
578       
579    """
580    conditions = AtlasConditionList()
581    conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
582    if regulation or condition:
583        regulation = "any" if regulation is None else regulation
584        condition = "" if condition is None else condition
585        conditions.append(AtlasConditionExperimentalFactor("", regulation, 1, condition))
586    if organism:
587        conditions.append(AtlasConditionOrganism(organism))
588       
589    connection = GeneExpressionAtlasConenction()
590    results = connection.query(conditions, format=format, start=start,
591                               rows=rows)
592    if format == "json":
593        return parse_json(results)
594    else:
595        return parse_xml(results)
596
597"""\
598.. TODO: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
599      Need a full list of accepted factors
600"""
601
602def query_atlas(condition, format="json", start=None, rows=None, indent=False):
603    """ Query Atlas based on a `condition` (instance of AtlasCondition)
604   
605    Example::
606       
607        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
608        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
609        >>> condition = AtlasConditionList([condition1, condition2])
610        >>> query_atlas(condition)
611        {u'...
612       
613    """
614    connection = GeneExpressionAtlasConenction()
615    results = connection.query(condition, format=format, start=start,
616                               rows=rows, indent=indent)
617    if format == "json":
618        return parse_json(results)
619    else:
620        return parse_xml(results)
621
622
623def get_atlas_summary(genes, organism):
624    """ Return 3 dictionaries containing a summary of atlas information
625    about three experimental factors:
626   
627        - Organism Part (OP)
628        - Disease State (DS)
629        - Cell type (CT)
630   
631    Each dictionary contains query genes as keys. Values are dictionaries
632    mapping factor values to a 2-tuple containig the count of up regulated
633    and down regulated experiments.
634   
635    Example::
636   
637        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
638        ({u'RUNX1': ...
639       
640    """
641    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
642    org_condition = AtlasConditionOrganism(organism)
643    condition = AtlasConditionList([genes_condition, org_condition])
644    result = query_atlas(condition, format="json")
645   
646    org_part = collect_ef_summary(result, "organism_part")
647    disease_state = collect_ef_summary(result, "disease_state")
648    cell_type = collect_ef_summary(result, "cell_type")
649   
650    return org_part, disease_state, cell_type
651   
652def collect_ef_summary(info, ef):
653    """ Collect the results summary from query_atlas, result for experimental
654    factor `ef`.
655    """
656    summary = defaultdict(dict)
657    results = info["results"]
658    for res in results:
659        gene = res["gene"]
660        expressions = res["expressions"] 
661        for expression in expressions:
662            if expression["ef"] == ef:
663                efv = expression["efv"]
664                updown = (expression["upExperiments"],
665                          expression["downExperiments"]
666                          )
667               
668                if any(updown):
669                    summary[gene["name"]][efv] = updown
670   
671    return dict(summary)
672   
673   
674def test():
675    from pprint import pprint   
676    pprint(get_atlas_summary(['Pou5f1', 'Dppa3'], 'Mus musculus'))
677       
678    pprint(get_atlas_summary(['PDLIM5', 'FGFR2' ], 'Homo sapiens'))
679   
680   
681    conn = ArrayExpressConnection()
682    import doctest
683    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs={"conn": conn})
684   
685if __name__ == "__main__":
686    test()
687   
Note: See TracBrowser for help on using the repository browser.