source: orange-bioinformatics/obiArrayExpress.py @ 1350:63636452235c

Revision 1350:63636452235c, 23.8 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff)

Added more documentation.
Removed the unused caching parameters.

Line 
1"""
2obiArrayExpress
3===============
4
5A python module for accessing the ArrayExpress and GeneExpressionAtlas
6web services.
7
8
9Array Express
10-------------
11
12`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a database of gene expression experiments that you
13can query and download.
14
15Example of an Array Express query ::
16
17    >>> import obiArrayExpress
18    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
19    {u'experiments': ...
20   
21    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
22    XMLNode(...
23   
24.. note:: Currently querying ArrayExpress files only works with the xml format.
25
26.. note:: See the documentation of `query_experiments` for a full set of
27          parameters that these functions accept.
28
29"""
30
31import os, sys
32import urllib2
33
34import orngEnviron
35import warnings
36import posixpath
37import shelve
38import json
39from collections import defaultdict
40
41parse_json = json.load
42
43def parse_xml(stream):
44    #TODO: parse stream, return the same structure as parse_json           
45    from Orange.misc.xml import parse
46    tree = parse(stream)
47    return tree
48
49
50# All searchable fields of ArrayExpress (see query_experiments docstring
51# for a description of the fields)
52ARRAYEXPRESS_FIELDS = \
53    ["keywords",
54     "accession",
55     "array",
56     "ef",
57     "efv",
58     "expdesign",
59     "exptype",
60     "gxa",
61     "pmid",
62     "sa",
63     "species",
64     "expandefo",
65     "directsub",
66     "assaycount",
67     "efcount",
68     "samplecount",
69     "sacount",
70     "rawcount",
71     "fgemcount",
72     "miamescore",
73     "date",
74     "wholewords",
75    ]
76
77class ArrayExpressConnection(object):
78    """ A connection to the ArrayExpress. Used for query construction,
79    and user login.
80   
81    .. todo:: Implement user login.
82    """
83   
84    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
85    DEFAULT_FORMAT = "json"
86   
87    # Order of arguments in the query
88    _ARGS_ORDER = ["keywords", "species", "array"]
89   
90    def __init__(self, address=None, timeout=30,
91                 username=None, password=None):
92        """ Initialize the connection object.
93       
94        :param address: Address of the ArrayExpress API
95        :param timeout: Timeout for the socket connection
96       
97        .. todo:: Implement user login (see Accessing Private Data in API docs)
98       
99        """
100        self.address = address if address is not None else self.DEFAULT_ADDRESS
101        self.timeout = timeout
102       
103    def format_query(self, **kwargs):
104        """ Format the query arguments.
105       
106        Example ::
107       
108            >>> conn.format_query(gxa=True, efcount=(1, 5))
109            'efcount=[1 TO 5]&gxa=true'
110           
111        """
112        # Formaters:
113        def format_default(val):
114            if isinstance(val, basestring):
115                return val
116            else:
117                return "+".join(val)
118        def format_species(val):
119            return '"%s"' % val.lower()
120        def format_gxa(val):
121            if val:
122                return "true"
123            else:
124                raise ValueError("gxa={0}".format(val))
125        def format_expandefo(val):
126            if val:
127                return "on"
128            else:
129                raise ValueError("expandefo={0}".format(val))
130        def format_true_false(val):
131            return "true" if val else "false"
132        def format_interval(val):
133            if isinstance(val, tuple):
134                return "[{0} TO {1}]".format(*val)
135            else:
136                raise ValueError("Must be an interval argument (min, max)!")
137        def format_date(val):
138            return val
139        def format_wholewords(val):
140            if val:
141                return "on"
142            else:
143                raise ValueError("wholewords={0}".format(val))
144       
145        formaters = {"species": format_species,
146                     "gxa": format_gxa,
147                     "expandefo": format_expandefo,
148                     "directsub": format_true_false,
149                     "assaycount": format_interval,
150                     "efcount": format_interval,
151                     "samplecount": format_interval,
152                     "sacount": format_interval,
153                     "rawcount": format_interval,
154                     "fgemcount": format_interval,
155                     "miamescore": format_interval,
156                     "date": format_date,
157                     "wholewords": format_wholewords,
158                     }
159        parts = []
160        arg_items = kwargs.items()
161        ordered = sorted(arg_items, key=lambda arg: self._ARGS_ORDER.index(arg[0]) \
162                         if arg[0] in self._ARGS_ORDER else 100)
163       
164        for key, value in kwargs.iteritems():
165            if key == "format":
166                continue # format is handled in query_url
167            if key not in ARRAYEXPRESS_FIELDS:
168                raise ValueError("Invalid argument name: '{0}'".format(key))
169            if value is not None and value != []:
170                fmt = formaters.get(key, format_default)
171                value = fmt(value)
172                parts.append("{0}={1}".format(key, value))
173                 
174        return "&".join(parts)
175       
176    def query_url(self, what="experiments", **kwargs):
177        """ Return a formated query URL for the query arguments
178       
179        Example ::
180            >>> conn.query_url(accession="E-MEXP-31")
181            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
182           
183        """
184        query = self.format_query(**kwargs)
185        url = posixpath.join(self.address, what)
186        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
187        url = url + ("?" + query if query else "")
188#        print url
189        url = url.replace(" ", "%20")
190        return url
191   
192    def query_url_experiments(self, **kwargs):
193        """ Return a formated experiments query url for the calls arguments
194        """
195        return self.query_url("experiments", **kwargs)
196   
197    def query_url_files(self, **kwargs):
198        """ Return a formated experiments query url for the calls arguments
199        """
200        return self.query_url("files", **kwargs)
201   
202    def query_experiment(self, **kwargs):
203        """ Return an open stream to the experiments query results.
204       
205        .. note:: This member function takes the same arguments as the module
206                  level `query_experiemnts` function.
207         
208        """
209        url = self.query_url_experiments(**kwargs)
210        stream = urllib2.urlopen(url, timeout=self.timeout)
211        return stream
212   
213    def query_files(self, **kwargs):
214        """ Return an open stream to the files query results.
215       
216        .. note:: This member function takes the same arguments as the module
217                  level `query_files` function.
218        """
219        url = self.query_url_files(**kwargs)
220        stream = urllib2.urlopen(url, timeout=self.timeout)
221        return stream
222   
223    def open_file(self, accession, kind="raw", ext=None):
224        """ Return a file handle to experiment data.
225        Possible values for kind:
226            - raw: return the raw data if available
227            - fgem: return the processed data if available
228            - biosamples: a png or svg design image
229            - idf: investigation description
230            - adf: array design description
231            - mageml: MAGE-ML file
232           
233        Example ::
234       
235            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
236            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")
237             
238        """
239        from Orange.misc.xml import parse
240        files = parse(self.query_files(accession=accession), format="xml")
241        files = list(files.elements("file"))
242        for file in files:
243            filekind = file.elements("kind").next()
244            fileext = file.elements("extension").next()
245            if filekind.data.strip() == kind and (fileext.data.strip() == ext or ext is None): 
246                url = file.elements("url").next()
247                return urllib2.urlopen(url.data.strip(), timeout=self.timeout)
248   
249   
250def query_experiments(keywords=None, accession=None, array=None, ef=None,
251                      efv=None, expdesign=None, exptype=None,
252                      gxa=None, pmid=None, sa=None, species=None,
253                      expandefo=None, directsub=None, assaycount=None,
254                      efcount=None, samplecount=None, rawcount=None,
255                      fgemcount=None, miamescore=None, date=None, 
256                      format="json", wholewords=None, connection=None):
257    """ Query Array Express experiments.
258   
259    :param keywords: A list of keywords to search (e.g. ['gliobastoma']
260    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
261    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
262    :param ef: Experimental factor (names of main variables of experiments)
263    :param efv: Experimental factor value (Has EFO expansion)
264    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
265    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
266    :param gxa: If True limit the results to experiments from the Gene
267        Expreission Atlas only.
268    :param pmid: Search by PubMed identifier
269    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
270    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)
271   
272    :param expandefo: If True expand the search terms with all its child terms
273        in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
274        will be expanded to include for synonyms and sub types of cancer).
275    :param directsub: If True return only experiments submited directly to
276        Array Express else if False return only experiments imported from GEO
277        database (default None, return both)
278    :param assaycount: A two tuple (min, max) for filter on the number of
279        assays (e.g. (1, 5) will return only experiments with at least one
280        and no more then 5 assays).
281    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
282    :param sacount: Filter on the number of sample attribute categories
283    :param rawcount: Filter on the number or raw files
284    :param fgemcount: Filter on the number of final gene expression matrix
285        (processed data) files
286    :param miamescore: Filter on the MIAME complience score (max 5)
287    :param date: Filter by release date
288   
289    Example ::
290   
291        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
292        {u'experiments': ...
293       
294    .. _EFO: http://www.ebi.ac.uk/efo/
295   
296    """
297    if connection is None:
298        connection = ArrayExpressConnection()
299       
300    stream = connection.query_experiment(keywords=keywords, accession=accession,
301                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
302                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
303                directsub=directsub, assaycount=assaycount, efcount=efcount,
304                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
305                miamescore=miamescore, date=date,  format=format,
306                wholewords=wholewords)
307   
308    if format == "json":
309        return parse_json(stream)
310    else:
311        return parse_xml(stream)
312
313def query_files(**kwargs):
314    """ Query Array Express files.
315   
316    This function accepts the same arguments as `query_experiments`.
317   
318    Example ::
319   
320        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
321        XMLNode(...
322                       
323    """
324    connection = kwargs.get("connection", None)
325    if connection is None:
326        connection = ArrayExpressConnection()
327   
328    stream = connection.query_files(**kwargs)
329    if kwargs.get("format", "json") == "json":
330        return parse_json(stream)
331    else:
332        return parse_xml(stream)
333   
334__doc__ += """\
335Gene Expression Atlas
336---------------------
337
338`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
339gene expression experiments in Array Express Archive.
340
341Use `query_atlas_simple` for simple querys.
342
343Example (query human genes for experiments in which they are up regulated) ::
344
345    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
346    {u'...
347   
348Or use the `AtlasCondition` subclasses in this module to construct a more
349advanced query and use the `query_atlas` function.
350
351Example (query human genes annotated to the GO term 'transporter activity'
352that are up regulated in the liver in at least three experiments) ::
353
354    >>> go_cond = AtlasConditionGeneProperty("Goterm", "Is", "transporter activity")
355    >>> liver_cond = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "liver")
356    >>> org_cond = AtlasConditionOrganism("Homo sapiens")
357    >>> cond_list = AtlasConditionList([go_cond, liver_cond, org_cond])
358    >>> query_atlas(cond_list)
359    {u'...
360   
361"""
362
363class GeneExpressionAtlasConenction(object):
364    """ A connection to Gene Expression Atlas database.
365    """
366    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"
367   
368    def __init__(self, address=None, timeout=30):
369        """ Initialize the conenction.
370       
371        :param address: Address of the server.
372        :timeout: Socket timeout.
373       
374        """
375        self.address = address if address is not None else self.DEFAULT_ADDRESS
376        self.timeout = timeout
377   
378    def query(self, condition, format="json", start=None, rows=None, indent=False):
379        url = self.address + "api?" + condition.rest()
380        if start and rows:
381            url += "&start={0}&rows={1}".format(start, rows)
382        url += "&format={0}".format(format)
383        if indent:
384            url += "&indent"
385#        print url
386        response = urllib2.urlopen(url)
387        return response
388   
389# Names of all Gene Property filter names
390GENE_FILTERS = \
391    ["Name", # Gene name
392     "Goterm", #Gene Ontology Term
393     "Interproterm", #InterPro Term
394     "Disease", #Gene-Disease Assocation
395     "Keyword", #Gene Keyword
396     "Protein", #Protein
397
398     "Dbxref", #Other Database Cross-Refs
399     "Embl", #EMBL-Bank ID
400     "Ensfamily", #Ensembl Family
401     "Ensgene", #Ensembl Gene ID
402
403     "Ensprotein", #Ensembl Protein ID
404     "Enstranscript", #Ensembl Transcript ID
405     "Goid", #Gene Ontology ID
406     "Image", #IMAGE ID
407     "Interproid", #InterPro ID
408     "Locuslink", #Entrez Gene ID
409
410     "Omimid", #OMIM ID
411     "Orf", #ORF
412     "Refseq", #RefSeq ID
413     "Unigene", #UniGene ID
414     "Uniprot", #UniProt Accession
415
416     "Hmdb", #HMDB ID
417     "Chebi", #ChEBI ID
418     "Cas", #CAS
419     "Uniprotmetenz", #Uniprotmetenz
420     "Gene", #Gene Name or Identifier
421     "Synonym", #Gene Synonym
422     ]
423   
424# Valid Gene Property filter qualifiers
425GENE_FILTER_QUALIFIERS =\
426    ["Is",
427     "IsNot"
428     ]
429
430# Organisms in the Atlas
431ATLAS_ORGANISMS = \
432    ["Anopheles gambiae",
433     "Arabidopsis thaliana",
434     "Bos taurus",
435     "Caenorhabditis elegans",
436     "Danio rerio",
437     "Drosophila melanogaster",
438     "Epstein barr virus",
439     "Gallus gallus",
440     "Homo sapiens",
441     "Human cytomegalovirus",
442     "Kaposi sarcoma-associated herpesvirus",
443     "Mus musculus",
444     "Rattus norvegicus",
445     "Saccharomyces cerevisiae",
446     "Schizosaccharomyces pombe",
447     "Unknown",
448     "Xenopus laevis"
449     ]
450   
451def ef_ontology():
452    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
453    """
454    import obiOntology
455#    return obiOntology.OBOOntology(urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo"))
456    import orngServerFiles
457    # Should this be in the OBOFoundry (Ontology) domain
458    file_name = orngServerFiles.localpath_download("ArrayExpress", "efo.obo")
459    return obiOntology.OBOOntology(open(filename, "rb"))
460
461
462class AtlasCondition(object):
463    """ Base class for Gene Expression Atlas query condition
464    """
465    def validate(self):
466        """ Validate condition in a subclass.
467        """
468        raise NotImplementedError
469   
470    def rest(self):
471        """ Return a REST query part in a subclass.
472        """
473        raise NotImplementedError
474   
475   
476class AtlasConditionList(list, AtlasCondition):
477    """ A list of AtlasCondition instances.
478    """ 
479    def validate(self):
480        for item in self:
481            item.validate()
482       
483    def rest(self):
484        return "&".join(cond.rest() for cond in self)
485
486class AtlasConditionGeneProperty(AtlasCondition):
487    """ An atlas gene filter condition.
488   
489    :param property: Property of the gene. If None or "" all properties
490        will be searched.
491    :param qualifier: Qualifier can be 'Is' or 'IsNot'
492    :param value: The value to search for.
493   
494    Example ::
495   
496        >>> # Condition on a gene name
497        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
498        >>> # Condition on genes from a GO Term
499        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
500        >>> # Condition on disease association
501        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")
502       
503    """
504    def __init__(self, property, qualifier, value):
505        self.property = property or ""
506        self.qualifier = qualifier
507        if isinstance(value, basestring):
508            self.value = value.replace(" ", "+")
509        elif isinstance(value, list):
510            self.value = "+".join(value)
511        else:
512            raise ValueError(value)
513       
514        self.validate()
515       
516    def validate(self):
517        assert(self.property in GENE_FILTERS + [""])
518        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
519       
520    def rest(self):
521        return "gene{property}{qualifier}={value}".format(**self.__dict__)
522       
523       
524class AtlasConditionExperimentalFactor(AtlasCondition):
525    """ An atlas experimental factor filter condition.
526   
527    :param factor: EFO experiamntal factor
528    :param regulation: "up", "down", "updown", "any" or "none"
529    :param n: Minimum number of of experimants with this condition
530    :param value: Experimantal factor value
531   
532    Example ::
533   
534        >>> # Any genes up regulated in at least 3 experiments involving cancer.
535        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
536        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
537        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")
538       
539    """
540    def __init__(self, factor, regulation, n, value):
541        self.factor = factor
542        self.regulation = regulation
543        self.n = n
544        self.value = value
545        self.validate()
546       
547    def validate(self):
548        # TODO: validate the factor and value
549#        assert(self.factor in ef_ontology())
550        assert(self.regulation in ["up", "down", "updown"])
551       
552    def rest(self):
553        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
554       
555class AtlasConditionOrganism(AtlasCondition):
556    """ Condition on organism.
557    """
558    def __init__(self, organism):
559        self.organism = organism
560        self.validate()
561       
562    def validate(self):
563        assert(self.organism in ATLAS_ORGANISMS)
564       
565    def rest(self):
566        return "species={0}".format(self.organism.replace(" ", "+").lower())
567       
568   
569def query_atlas_simple(genes=None, regulation=None, organism=None,
570                       condition=None, format="json", start=None,
571                       rows=None):
572    """ A simple Atlas query.
573   
574    :param genes: A list of gene names to search for.
575    :param regulation: Search for experiments in which `genes` are "up",
576        "down", "updown" or "none" regulated. If None all experiments
577        are searched.
578    :param organism: Search experiments for organism. If None all experiments
579        are searched.
580    :param condition: An EFO factor value (e.g. "brain")
581   
582    Example ::
583       
584        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
585        {u'...
586       
587        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
588        {u'...
589       
590        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
591        {u'...
592       
593    """
594    conditions = AtlasConditionList()
595    if genes:
596        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
597    if regulation or condition:
598        regulation = "any" if regulation is None else regulation
599        condition = "" if condition is None else condition
600        conditions.append(AtlasConditionExperimentalFactor("", regulation, 1, condition))
601    if organism:
602        conditions.append(AtlasConditionOrganism(organism))
603       
604    connection = GeneExpressionAtlasConenction()
605    results = connection.query(conditions, format=format, start=start,
606                               rows=rows)
607    if format == "json":
608        return parse_json(results)
609    else:
610        return parse_xml(results)
611
612"""\
613.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
614      Need a full list of accepted factors
615"""
616
617def query_atlas(condition, format="json", start=None, rows=None, indent=False):
618    """ Query Atlas based on a `condition` (instance of AtlasCondition)
619   
620    Example ::
621       
622        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
623        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
624        >>> condition = AtlasConditionList([condition1, condition2])
625        >>> query_atlas(condition)
626        {u'...
627       
628    """
629    connection = GeneExpressionAtlasConenction()
630    results = connection.query(condition, format=format, start=start,
631                               rows=rows, indent=indent)
632    if format == "json":
633        return parse_json(results)
634    else:
635        return parse_xml(results)
636
637
638def get_atlas_summary(genes, organism):
639    """ Return 3 dictionaries containing a summary of atlas information
640    about three experimental factors:
641   
642        - Organism Part (OP)
643        - Disease State (DS)
644        - Cell type (CT)
645   
646    Each dictionary contains query genes as keys. Values are dictionaries
647    mapping factor values to a 2-tuple containig the count of up regulated
648    and down regulated experiments.
649   
650    Example ::
651   
652        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
653        ({u'RUNX1': ...
654       
655    """
656    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
657    org_condition = AtlasConditionOrganism(organism)
658    condition = AtlasConditionList([genes_condition, org_condition])
659    result = query_atlas(condition, format="json")
660   
661    org_part = collect_ef_summary(result, "organism_part")
662    disease_state = collect_ef_summary(result, "disease_state")
663    cell_type = collect_ef_summary(result, "cell_type")
664   
665    return org_part, disease_state, cell_type
666   
667def collect_ef_summary(info, ef):
668    """ Collect the results summary from query_atlas, result for experimental
669    factor `ef`.
670    """
671    summary = defaultdict(dict)
672    results = info["results"]
673    for res in results:
674        gene = res["gene"]
675        expressions = res["expressions"] 
676        for expression in expressions:
677            if expression["ef"] == ef:
678                efv = expression["efv"]
679                updown = (expression["upExperiments"],
680                          expression["downExperiments"]
681                          )
682               
683                if any(updown):
684                    summary[gene["name"]][efv] = updown
685   
686    return dict(summary)
687   
688   
689def test():
690    from pprint import pprint   
691    pprint(get_atlas_summary(['Pou5f1', 'Dppa3'], 'Mus musculus'))
692       
693    pprint(get_atlas_summary(['PDLIM5', 'FGFR2' ], 'Homo sapiens'))
694   
695   
696    conn = ArrayExpressConnection()
697    import doctest
698    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs={"conn": conn})
699   
700if __name__ == "__main__":
701    test()
702   
Note: See TracBrowser for help on using the repository browser.