source: orange-bioinformatics/orangecontrib/bio/obiArrayExpress.py @ 1886:5a859e2c6ae1

Revision 1886:5a859e2c6ae1, 60.1 KB checked in by Ales Erjavec <ales.erjavec@…>, 6 months ago (diff)

Fixes for Array Express query response parsing.

[1342]1"""
[1470]2===============
[1350]3obiArrayExpress
4===============
5
[1530]6A python module for accessing the ArrayExpress web services and database.
[1342]7
[1687]8`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a
9database of gene expression experiments that you can query and download.
[1342]10
[1530]11Example ::
12
[1687]13    >>> # Retrieve the object representing experiment with accession E-TABM-25
[1530]14    >>> experiment = ArrayExpressExperiment("E-TABM-25")
15    >>> print experiment.accession
16    E-TABM-25
[1687]17
[1530]18    >>> print experiment.name
19    Transcription profiling of aging in the primate brain
[1687]20
[1530]21    >>> print experiment.species
22    ['Pan troglodytes']
[1687]23
[1530]24    >>> print experiment.files
25    [{'kind': ...
[1687]26
[1530]27    >>> # Retrieve the data matrix for experiment 'E-MEXP-2917'
28    >>> experiment = ArrayExpressExperiment("E-MEXP-2917")
[1687]29    >>> table = experiment.fgem_to_table()
[1530]30
31
32Low level Array Express query using REST services::
[1342]33
[1885]34    >>> from Orange.bio import obiArrayExpress
[1342]35    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
[1347]36    {u'experiments': ...
[1687]37
    >>> obiArrayExpress.query_experiments(keywords='glioblastoma')
39    {u'experiments': ...
[1687]40
[1347]41    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
[1530]42    <xml.etree.ElementTree.ElementTree object ...
[1687]43
[1350]44.. note:: Currently querying ArrayExpress files only works with the xml format.
45
46.. note:: See the documentation of `query_experiments` for a full set of
47          parameters that these functions accept.
48
[1342]49"""
50
[1632]51from __future__ import absolute_import
52
[1687]53import os
[1342]54import urllib2
[1687]55import re
[1342]56import warnings
57import shelve
[1353]58import shutil
59import posixpath
[1347]60import json
[1353]61from xml.etree.ElementTree import ElementTree
[1440]62from StringIO import StringIO
[1353]63
[1342]64from collections import defaultdict
[1470]65from functools import partial
from contextlib import closing, contextmanager
[1342]67
[1885]68from Orange.orng import orngServerFiles
69
[1347]70parse_json = json.load
71
[1687]72
[1347]73def parse_xml(stream):
[1687]74    """ Parse an xml stream into an instance of
75    `xml.etree.ElementTree.ElementTree`.
[1353]76    """
[1687]77    return ElementTree(file=stream)
[1347]78
79# All searchable fields of ArrayExpress (see query_experiments docstring
80# for a description of the fields)
[1342]81ARRAYEXPRESS_FIELDS = \
[1347]82    ["keywords",
83     "accession",
[1342]84     "array",
85     "ef",
86     "efv",
87     "expdesign",
88     "exptype",
89     "gxa",
90     "pmid",
91     "sa",
92     "species",
[1347]93     "expandefo",
94     "directsub",
95     "assaycount",
96     "efcount",
97     "samplecount",
98     "sacount",
99     "rawcount",
100     "fgemcount",
101     "miamescore",
102     "date",
103     "wholewords",
[1342]104    ]
[1687]105
[1353]106
[1885]107def _open_shelve(filename, flag="r"):
108    dirname = os.path.dirname(filename)
109    if not os.path.isdir(dirname):
110        os.makedirs(dirname)
111    exists = os.path.exists(filename)
112    if flag in ["r", "w"] and not exists:
113        # needs to be created first
114        # XXX: Race condition
115        s = shelve.open(filename, "c")
116        s.close()
117
118    return shelve.open(filename, flag)
119
120
[1342]121class ArrayExpressConnection(object):
[1885]122    """
123    A connection to the Array Express.
[1687]124
[1885]125    Used to construct a REST query and run it.
126
127    :param address: Address of the ArrayExpress API.
128    :param timeout: Timeout for the socket connection.
129
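    Example ::

        >>> conn = ArrayExpressConnection()
        >>> conn.query_url_experiments(accession="E-MEXP-31")
        'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
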
130    .. todo::
131        Implement user login (see Accessing Private Data in API docs)
[1687]132
[1342]133    """
[1687]134
[1345]135    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
136    DEFAULT_FORMAT = "json"
[1687]137    DEFAULT_CACHE = orngServerFiles.localpath("ArrayExpress",
138                                              "ArrayExpressCache.shelve")
[1342]139    # Order of arguments in the query
140    _ARGS_ORDER = ["keywords", "species", "array"]
[1687]141
[1353]142    def __init__(self, address=None, timeout=30, cache=None,
[1347]143                 username=None, password=None):
[1342]144        self.address = address if address is not None else self.DEFAULT_ADDRESS
145        self.timeout = timeout
[1353]146        self.cache = cache if cache is not None else self.DEFAULT_CACHE
147        self.username = username
148        self.password = password
149       
[1342]150    def format_query(self, **kwargs):
[1347]151        """ Format the query arguments.
152       
[1350]153        Example ::
154       
[1347]155            >>> conn.format_query(gxa=True, efcount=(1, 5))
156            'efcount=[1 TO 5]&gxa=true'
157           
158        """
        # Formatters:
160        def format_default(val):
161            if isinstance(val, basestring):
162                return val
163            else:
164                return "+".join(val)
[1342]165        def format_species(val):
166            return '"%s"' % val.lower()
[1347]167        def format_gxa(val):
168            if val:
169                return "true"
170            else:
171                raise ValueError("gxa={0}".format(val))
172        def format_expandefo(val):
173            if val:
174                return "on"
175            else:
176                raise ValueError("expandefo={0}".format(val))
177        def format_true_false(val):
178            return "true" if val else "false"
179        def format_interval(val):
180            if isinstance(val, tuple):
181                return "[{0} TO {1}]".format(*val)
182            else:
183                raise ValueError("Must be an interval argument (min, max)!")
184        def format_date(val):
[1418]185            # TODO check if val contains a datetime.date object, assert proper format
186            return format_interval(val)
[1347]187        def format_wholewords(val):
188            if val:
189                return "on"
190            else:
191                raise ValueError("wholewords={0}".format(val))
192       
[1342]193        formaters = {"species": format_species,
[1347]194                     "gxa": format_gxa,
195                     "expandefo": format_expandefo,
196                     "directsub": format_true_false,
197                     "assaycount": format_interval,
198                     "efcount": format_interval,
199                     "samplecount": format_interval,
200                     "sacount": format_interval,
201                     "rawcount": format_interval,
202                     "fgemcount": format_interval,
203                     "miamescore": format_interval,
204                     "date": format_date,
205                     "wholewords": format_wholewords,
[1342]206                     }
207        parts = []
208        arg_items = kwargs.items()
209        ordered = sorted(arg_items, key=lambda arg: self._ARGS_ORDER.index(arg[0]) \
210                         if arg[0] in self._ARGS_ORDER else 100)
[1347]211       
        for key, value in ordered:
[1347]213            if key == "format":
214                continue # format is handled in query_url
215            if key not in ARRAYEXPRESS_FIELDS:
216                raise ValueError("Invalid argument name: '{0}'".format(key))
217            if value is not None and value != []:
218                fmt = formaters.get(key, format_default)
219                value = fmt(value)
220                parts.append("{0}={1}".format(key, value))
221                 
[1342]222        return "&".join(parts)
223       
224    def query_url(self, what="experiments", **kwargs):
[1347]225        """ Return a formated query URL for the query arguments
[1342]226       
[1350]227        Example ::
[1342]228            >>> conn.query_url(accession="E-MEXP-31")
[1347]229            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
[1342]230           
231        """
232        query = self.format_query(**kwargs)
233        url = posixpath.join(self.address, what)
[1345]234        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
[1347]235        url = url + ("?" + query if query else "")
[1342]236        url = url.replace(" ", "%20")
237        return url
238   
239    def query_url_experiments(self, **kwargs):
240        """ Return a formated experiments query url for the calls arguments
241        """
242        return self.query_url("experiments", **kwargs)
243   
244    def query_url_files(self, **kwargs):
245        """ Return a formated experiments query url for the calls arguments
246        """
247        return self.query_url("files", **kwargs)
248   
249    def query_experiment(self, **kwargs):
[1347]250        """ Return an open stream to the experiments query results.
251       
[1350]252        .. note:: This member function takes the same arguments as the module
                  level `query_experiments` function.
[1347]254         
255        """
[1342]256        url = self.query_url_experiments(**kwargs)
[1353]257        stream = self._cache_urlopen(url, timeout=self.timeout)
[1342]258        return stream
259   
260    def query_files(self, **kwargs):
[1347]261        """ Return an open stream to the files query results.
262       
[1350]263        .. note:: This member function takes the same arguments as the module
264                  level `query_files` function.
[1347]265        """
[1342]266        url = self.query_url_files(**kwargs)
[1353]267        stream = self._cache_urlopen(url, timeout=self.timeout)
[1342]268        return stream
269   
270    def open_file(self, accession, kind="raw", ext=None):
271        """ Return a file handle to experiment data.
272        Possible values for kind:
273            - raw: return the raw data if available
274            - fgem: return the processed data if available
275            - biosamples: a png or svg design image
276            - idf: investigation description
277            - adf: array design description
278            - mageml: MAGE-ML file
279           
[1350]280        Example ::
[1342]281       
282            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
283            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")
284             
285        """
[1886]286        stream = self.query_files(accession=accession, format="json")
287        data = json.load(stream)
288        try:
289            files = data["files"]["experiment"]["file"]
290        except KeyError:
291            raise ValueError(accession)
292
[1342]293        for file in files:
[1886]294            filekind = file["kind"]
295            fileext = file["extension"]
296            if (filekind == kind) and (fileext == ext or ext is None):
297                url = file["url"]
298                return self._cache_urlopen(str(url), timeout=self.timeout)
299
300        raise ValueError("%s does not have a file of kind: %r" %
301                         (accession, kind))
302
[1353]303    def _cache_urlopen(self, url, timeout=30):
[1470]304        if self.cache is not None:
[1885]305            with self.open_cache("r") as cache:
306                if url in cache:
307                    return StringIO(cache[url])
308
309            stream = urllib2.urlopen(url, timeout=timeout)
310            data = stream.read()
311            with self.open_cache("w") as cache:
312                cache[url] = data
313
[1353]314            return StringIO(data)
315        else:
316            return urllib2.urlopen(url, timeout=timeout)
[1885]317
318    def open_cache(self, flag="r"):
[1470]319        if isinstance(self.cache, basestring):
[1885]320            try:
321                return closing(_open_shelve(self.cache, flag))
322            except Exception:
323                return fake_closing({})
[1470]324        elif hasattr(self.cache, "close"):
325            return closing(self.cache)
326        elif self.cache is None:
327            return fake_closing({})
328        else:
329            return fake_closing(self.cache)
[1885]330
332
333@contextmanager
334def fake_closing(obj):
335    yield obj
[1342]336   
[1347]337def query_experiments(keywords=None, accession=None, array=None, ef=None,
338                      efv=None, expdesign=None, exptype=None,
339                      gxa=None, pmid=None, sa=None, species=None,
340                      expandefo=None, directsub=None, assaycount=None,
341                      efcount=None, samplecount=None, rawcount=None,
342                      fgemcount=None, miamescore=None, date=None, 
343                      format="json", wholewords=None, connection=None):
[1342]344    """ Query Array Express experiments.
345   
    :param keywords: A list of keywords to search (e.g. ['glioblastoma'])
347    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
348    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
349    :param ef: Experimental factor (names of main variables of experiments)
350    :param efv: Experimental factor value (Has EFO expansion)
351    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
352    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
353    :param gxa: If True limit the results to experiments from the Gene
        Expression Atlas only.
355    :param pmid: Search by PubMed identifier
356    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
357    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)
358   
    :param expandefo: If True expand the search terms with all of their child
        terms in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
        will be expanded to include synonyms and subtypes of cancer).
    :param directsub: If True return only experiments submitted directly to
        Array Express; if False return only experiments imported from the GEO
        database (default None, return both).
    :param assaycount: A two-tuple (min, max) to filter on the number of
        assays (e.g. (1, 5) will return only experiments with at least one
        and no more than 5 assays).
368    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
369    :param sacount: Filter on the number of sample attribute categories
    :param rawcount: Filter on the number of raw files
371    :param fgemcount: Filter on the number of final gene expression matrix
372        (processed data) files
    :param miamescore: Filter on the MIAME compliance score (max 5)
374    :param date: Filter by release date
375   
[1342]376    Example ::
377   
378        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
[1347]379        {u'experiments': ...
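
        >>> # A sketch of a more restrictive query: experiments in the Gene
        >>> # Expression Atlas with between 1 and 30 assays (the results
        >>> # depend on the live ArrayExpress service).
        >>> query_experiments(keywords="liver", gxa=True, assaycount=(1, 30))
        {u'experiments': ...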
[1342]380       
[1347]381    .. _EFO: http://www.ebi.ac.uk/efo/
382   
[1342]383    """
[1347]384    if connection is None:
385        connection = ArrayExpressConnection()
386       
387    stream = connection.query_experiment(keywords=keywords, accession=accession,
388                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
389                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
390                directsub=directsub, assaycount=assaycount, efcount=efcount,
391                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
392                miamescore=miamescore, date=date,  format=format,
393                wholewords=wholewords)
394   
395    if format == "json":
396        return parse_json(stream)
397    else:
398        return parse_xml(stream)
[1353]399   
400   
401def query_files(keywords=None, accession=None, array=None, ef=None,
402               efv=None, expdesign=None, exptype=None,
403               gxa=None, pmid=None, sa=None, species=None,
404               expandefo=None, directsub=None, assaycount=None,
405               efcount=None, samplecount=None, rawcount=None,
406               fgemcount=None, miamescore=None, date=None, 
407               format="json", wholewords=None, connection=None):
[1342]408    """ Query Array Express files.
409   
[1347]410    This function accepts the same arguments as `query_experiments`.
411   
[1342]412    Example ::
413   
[1347]414        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
[1470]415        <xml.etree.ElementTree.ElementTree object ...
[1353]416       
    .. todo:: Why does the json interface not work?
[1342]418                       
419    """
[1347]420    if connection is None:
421        connection = ArrayExpressConnection()
[1342]422   
[1353]423    stream = connection.query_files(keywords=keywords, accession=accession,
424                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
425                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
426                directsub=directsub, assaycount=assaycount, efcount=efcount,
427                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
428                miamescore=miamescore, date=date,  format=format,
429                wholewords=wholewords)
430   
431    if format == "json":
[1347]432        return parse_json(stream)
433    else:
434        return parse_xml(stream)
435   
[1353]436   
437def open_file(accession, kind="raw", ext=None, repo_dir=None):
438    """ Open a file for the experiment.
439     
440    Example ::
441   
442        >>> file = open_file("E-MTAB-369", kind="raw", repo_dir="~/.arrayexpress/")
443       
444    """
445    raise NotImplementedError
446
447"""\
MAGE-TAB convenience functions, classes
=======================================
450"""
451
452IDF_SINGLE_TAGS = \
453    ["Investigation Title",
454     "Date of Experiment",
455     "Public Release Date",
456     "Experiment Description",
457    ]
458
459def parse_idf(file):
460    """ Parse an idf.txt (Investigation Description  Format) formated file.
461    Return a list of tuples where the first element is the tag (first column)
462    and the second element is a list of all tag values.
463   
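    Example (a sketch, assuming 'foobar.idf' is a local MAGE-TAB idf file) ::

        >>> parse_idf("foobar.idf")
        [('Investigation Title', ['foo investigation']), ...
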
464    """
465    if isinstance(file, basestring):
466        file = open(file, "rb")
467    data = file.read()
468    lines = data.splitlines()
469    lines = [line.split("\t") for line in lines if line and not line.startswith("#")]
470    parsed = [(line[0], line[1:]) for line in lines]
471    return parsed
472   
473def parse_sdrf(file):
474    """ Parse an sdfr formated file. Return a tuple with the first element
475    a list of header values and the second element is a list of all rows
476    (a row is a list of string values).
477   
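    Example (a sketch, assuming 'foobar.sdrf' is a local MAGE-TAB sdrf file) ::

        >>> header, rows = parse_sdrf("foobar.sdrf")
        >>> header
        ['Source Name', ...
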
478    """
479    if isinstance(file, basestring):
480        file = open(file, "rb")
481    data = file.read()
482    lines = data.splitlines()
483    lines = [line.split("\t") for line in lines if line.strip() and not line.startswith("#")]
484    header = [h for h in lines[0] if h]
[1354]485    rows = [line[:len(header)] for line in lines[1:]]
[1353]486    assert(all([len(r) == len(header) for r in rows]))
487    return header, rows
488   
489def parse_adf(file):
490    pass
491
492def parse_data_matrix(file):
493    """ Parse the MAGE-TAB processed data matrix. Return a tuple where the
494    elements are:
495        - a (header REF, header values) tuple (e.g. ("Hybridization REF", ["Stage1", "Stage2", ...]) )
        - a list of quantitation types for the header values (e.g. ["log2 ratio", "log2 ratio", ...])
        - a (row REF, row names list) tuple (e.g. ("Reporter REF", ["Probe 1", "Probe 2", ...]) )
498        - a list of list matrix with values (as strings)
499       
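    Example (a sketch, assuming 'foobar.data.txt' is a local processed data
    matrix file) ::

        >>> header, quant_type, rows, matrix = parse_data_matrix("foobar.data.txt")
        >>> header
        ('Hybridization REF', ['Stage1', 'Stage2', ...
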
500    """
501    if isinstance(file, basestring):
502        file = open(file, "rb")
503    data = file.read()
504    lines = data.splitlines()
505    lines = [line.split("\t") for line in lines if line.strip()]
506    header = lines[0]
507    header_ref, header = header[0], header[1:]
508    line2 = lines[1]
[1687]509    row_ref, quant_type = line2[0], line2[1:]
[1353]510    row_names, rows = [], []
511    for line in lines[2:]:
512        r_name, row = line[0], line[1:]
513        row_names.append(r_name)
514        rows.append(row)
515       
516    return ((header_ref, header),
[1687]517            quant_type,
[1353]518            (row_ref, row_names),
519            rows) 
520   
[1530]521class InvestigationDesign(dict):
[1353]522    """ Investigation design (contains the contents of the .idf).
523   
524    Example ::
525   
526        >>> idf = InvestigationDesign("foobar.idf")
527        >>> print idf.investigation_title
528        foo investigation
529        >>> print idf.experimental_design
530        ['fubar', 'snafu']
531        >>> print idf.sdrf_file
532        ['foobar.sdrf']
533       
534    """
535    def __init__(self, idf_file=None):
536        idf = parse_idf(idf_file)
537        self._idf = idf
538        self.update(dict(idf))
539        for tag, values in idf:
540            if tag in IDF_SINGLE_TAGS:
541                values = values[0] if values else None
542            ttag = self.transform_tag(tag)
543            setattr(self, ttag, values)
544       
545    def transform_tag(self, tag):
546        """ Transform the tag into a proper python attribute name by
547        replacing all spaces and special characters (e.g '[', ']' into
548        underscores).
549       
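        Example ::

            >>> idf.transform_tag("Date of Experiment")
            'date_of_experiment'
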
550        """
551        toreplace = [" ", "-", "[", "]"]
552        for s in toreplace:
553            tag = tag.replace(s, "_")
554        return tag.lower()
555           
556    def __getitem__(self, tag):
557        """ Return the tag values
558       
559        Example ::
560       
561            >>> idf["Investigation Title"]
562            'foo investigation'
563           
564        """
565        try:
            return dict.__getitem__(self, tag)
567        except KeyError:
568            pass
569       
570        ttag = self.transform_tag(tag)
571        if hasattr(self, ttag):
572            return getattr(self, ttag)
573        else:
574            raise KeyError(tag)
575       
576class SampleDataRelationship(object):
577    """ Sample-Data Relationship (contains the contents of the .sdrf file).
578   
579    Example ::
580   
581        >>> sdr = SampleDataRelationship("foobar.sdrf")
[1687]582        >>> sdr.source_name()
[1353]583        ['foo', ...
584       
[1687]585        >>> sdr.sample_name()
[1353]586        ['sampled foo', ...
587       
[1687]588        >>> sdr.extract_protocol_ref()
[1353]589        ['bar the foo', ...
590       
[1687]591        >>> sdr.derived_array_data_matrix_file()
[1353]592        ['foobar.data.txt', ...
593       
594        >>>
595         
596    """
597   
    # Nodes and edges
599    NODES_EDGES = ["Source Name", "Sample Name", "Extract Name",
600                   "Labeled Extract Name", "Hybridization Name",
601                   "Assay Name", "Scan Name", "Normalization Name",
602                   "Array Data File", "Derived Array Data File",
603                   "Array Data Matrix File", "Derived Array Data Matrix File",
604                   "Image File", "Protocol REF"]
605   
606    # Attributes for nodes and edges
607    NODE_EDGE_ATTRIBUTES = \
608    {"Source Name": ["Characteristics", "Provider", "Material Type", "Description", "Comment"],
609     "Sample Name": ["Characteristics", "Material Type", "Description", "Comment"],
610     "Extract Name":["Characteristics", "Material Type", "Description", "Comment"],
611     "Labeled Extract Name": ["Characteristics", "Material Type", "Description", "Label", "Comment"],
612     "Hybridization Name": ["Array Design File", "Array Design REF", "Comment"],
613     "Assay Name": ["Technology Type", "Comment"],
614     "Scan Name": ["Comment"],
615     "Normalization Name": ["Comment"],
616     "Array Data File": ["Comment"],
617     "Derived Array Data File": ["Comment"],
618     "Array Data Matrix File": ["Comment"],
619     "Derived Array Data Matrix File": ["Comment"],
620     "Image File": ["Comment"],
621     "Protocol REF": ["Term Source REF", "Parameter", "Performer", "Date", "Comment"]
622     }
623   
624    # Attributes
625    ATTRIBUTE_COLUMNS = \
626    {"Characteristics []": ["Unit", "Term Source REF"],
627     "Provider": ["Comment"],
628     "Material Type": ["Term Source REF"],
629     "Label": ["Term Source REF"],
630     "Array Design File": ["Comment"],
631     "Array Design REF": ["Term Source REF", "Comment"],   
632     "Technology Type": ["Term Source REF"],
633     "Factor Value [] ()": ["Unit", "Term Source REF"],
634     "Performer": ["Comment"],
635     "Date": [],
636     "Parameter Value []": ["Unit", "Comment"],
637     "Unit []": ["Term Source REF"],
638     "Description": [],
639     "Term Source REF": ["Term Accession Number"],
640     "Term Accession Number": [],
641     "Comment []": []
642     }
643    def __init__(self, sdrf_file=None):
644        header, rows = parse_sdrf(sdrf_file)
645        self.header = header
646        self.rows = rows
647       
648    def transform_tag(self, tag):
649        """ Transform the tag into a proper python attribute name by
650        replacing all spaces and special characters (e.g '[', ']' into
651        underscores).
652       
653        """
654        toreplace = [" ", "-", "[", "]"]
655        for s in toreplace:
656            tag = tag.replace(s, "_")
657        return tag.lower()
658   
659    def _subsection(self, name):
660        """ Return the named subsection (name must be from the
661        NODES_EDGES list).
662       
663        """
664        idx = self.NODES_EDGES.index(name)
665        start = self.header.index(name)
666        end = -1
667        for name in self.NODES_EDGES[idx + 1:]:
668            if name in self.header[start + 1:]:
669                end = self.header.index(name, start + 1)
670                break
671        return self.header[start:end], [r[start:end] for r in self.rows]
672   
673    def _column(self, name):
674        """ Return the named column.
675        """
[1687]676        if isinstance(name, basestring):
677            index = self.header.index(name)
678        else:
679            index = name
[1353]680        return [r[index] for r in self.rows]
681       
682    def source(self):
683        """ Return the Source subsection
684        """
685        return self._subsection("Source Name")
686       
687    def source_name(self):
688        """ Return the Source Name subsection
689        """
690        return self._column("Source Name")
691       
692    def sample(self):
693        """ Return the Sample subsection
694        """
695        return self._subsection("Sample Name")
696       
697    def sample_name(self):
698        """ Return the Sample Name subsection
699        """
700        return self._column("Sample Name")
701       
702    def extract(self):
703        """ Return the Extract subsection
704        """
705        return self._subsection("Extract Name")
706       
707    def extract_name(self):
708        """ Return the Extract Name subsection
709        """
710        return self._column("Extract Name")
711       
712    def labeled_extract(self):
713        """ Return the Labeled Extract subsection
714        """
715        return self._subsection("Labeled Extract Name")
716       
717    def labeled_extract_name(self):
718        """ Return the Labeled Extract Name subsection
719        """
720        return self._column("Labeled Extract Name")
721       
722    def hybridization(self):
[1687]723        """ Return the Hybridization subsection.
[1353]724        """
[1687]725        return self._subsection("Hybridization Name")
[1353]726       
727    def hybridization_name(self):
728        """ Return the Hibridization Name subsection.
729        """
[1687]730        return self._column("Hybridization Name")
[1353]731       
732    def assay(self):
733        """ Return the Assay subsection
734        """
735        return self._subsection("Assay Name")
736       
737    def assay_name(self):
738        """ Return the Assay Name subsection
739        """
740        return self._column("Assay Name")
741       
742    def scan(self):
743        """ Return the Scan subsection
744        """
745        return self._subsection("Scan Name")
746       
747    def scan_name(self):
748        """ Return the Scan name subsection
749        """
750        return self._column("Scan Name")
751       
752    def normalization(self):
753        """ Return the Normalization subsection.
754        """
755        return self._subsection("Normalization Name")
756       
757    def normalization_name(self):
758        """ Return the Normalization Name subsection.
759        """
760        return self._column("Normalization Name")
761         
762    def array_data(self):
763        """ Return the Array Data subsection
764        """
765        return self._subsection("Array Data File")
766   
767    def array_data_file(self):
768        """ Return the Array Data File subsection
769        """
770        return self._column("Array Data File")
771       
772    def derived_array_data(self):
773        """ Return the Derived Array Data subsection
774        """
775        return self._subsection("Derived Array Data File")
776   
777    def derived_array_data_file(self):
778        """ Return the Derived Array Data File subsection
779        """
780        return self._column("Derived Array Data File")
781       
782    def array_data_matrix(self):
783        """ Return the Array Data Matrix subsection.
784        """
785        return self._subsection("Array Data Matrix File")
786   
787    def array_data_matrix_file(self):
788        """ Return the Array Data Matrix File subsection.
789        """
790        return self._column("Array Data Matrix File")
791       
792    def derived_array_data_matrix(self):
793        """ Return the Derived Array Data Matrix subsection.
794        """
795        return self._subsection("Derived Array Data Matrix File")
796   
797    def derived_array_data_matrix_file(self):
798        """ Return the Derived Array Data Matrix File subsection.
799        """
800        return self._column("Derived Array Data Matrix File")
801       
802    def image(self):
803        """ Return the Image subsection
804        """
805        return self._subsection("Image File")
806   
807    def image_file(self):
808        """ Return the Image File subsection.
809        """
810        return self._column("Image File")
811       
812class ArrayDesign(object):
813    """ Arary design (contains the contents of the .adf file).
814    """
815    def __init__(self, adf_file=None):
816        adf = parse_adf(adf_file)
817        self._adf = adf
818   
819def _is_float(str):
820    try:
821        float(str)
822        return True
823    except ValueError:
824        return False
825   
826def _is_continuous(items, check_count=100):
827    """ Are the strings in items continous numbers.
828    """
829    count = 0
830    i = 0
831    for i, item in enumerate(items):
832        if _is_float(item):
833            count += 1
834        if i >= check_count:
835            break
836    return count >= i * 0.5
837   
[1687]838def processed_matrix_to_orange(matrix_file, sdrf=None):
[1354]839    """ Load a single processed matrix file in to an Orange.data.Table
840    instance.
841    """
842    import numpy
843    import Orange
844   
845    if isinstance(matrix_file, basestring):
846        matrix_file = open(matrix_file, "rb")
[1687]847
848    header, quant_type, rows, matrix = parse_data_matrix(matrix_file)
[1354]849    header_ref, header = header
850    row_ref, rows = rows
851   
852    matrix = numpy.array(matrix, dtype=object)
853   
854   
855    features = []
    is_float = numpy.frompyfunc(_is_float, 1, 1)  # a numpy ufunc
857         
[1687]858    for header_name, quant, column in zip(header, quant_type, matrix.T):
[1354]859        if _is_continuous(column):
[1555]860            feature = Orange.feature.Continuous(header_name)
            column[numpy.where(1 - is_float(column))] = "?"  # replace all non-parsable floats with '?'
862        else:
863            values = set(column)
[1555]864            feature = Orange.feature.Discrete(header_name, values=sorted(values))
[1687]865        feature.attributes["quantitation type"] = quant
[1354]866        features.append(feature)
867       
[1555]868    row_ref_feature = Orange.feature.String(row_ref)
[1354]869    domain = Orange.data.Domain(features, None)
[1555]870    domain.addmeta(Orange.feature.Descriptor.new_meta_id(), row_ref_feature)
[1354]871   
872    table = Orange.data.Table(domain, [list(row) for row in matrix])
[1687]873    table.setattr("header_ref", header_ref)
[1354]874    # Add row identifiers
875    for instance, row in zip(table, rows):
876        instance[row_ref_feature] = row
[1687]877
878    if sdrf is not None:
879        pattern = re.compile(r"((Characteristics)|(Factor Value)|(Parameter Value)) \[(?P<name>.*?)\].*")
880        # Row name in sdrf
881        row_name = header_ref[:header_ref.find(" REF")] + " Name"
882        # feature names as listed in sdrf
883        feature_names = sdrf._column(row_name)
884        annotations = defaultdict(partial(defaultdict, set))
885        for i, header in enumerate(sdrf.header):
886            match = pattern.match(header)
887            if match:
888                name = match.group("name")
889                for val, feature_name in zip(sdrf._column(i), feature_names):
890                    annotations[feature_name][name].add(val)
891
892        def to_str(values):
893            if len(values) > 1:
894                return str(list(values))
895            else:
896                return str(list(values)[0])
897
898        for feature in table.domain.features:
899            feature.attributes.update([(key, to_str(value)) for key, value in \
900                                       annotations[feature.name].items()])
[1354]901    return table
[1687]902
[1354]903
[1353]904def mage_tab_to_orange(idf_filename):
[1687]905    """Convert an MAGE-TAB annotated experiment into an Orange.data.Table
[1353]906    instance (assumes all the associated MAGE-TAB files are in the same
907    directory.
[1687]908
[1353]909    """
910    dirname = os.path.dirname(idf_filename)
[1530]911    idf = InvestigationDesign(idf_filename)
[1353]912   
913    sdrf_filename = os.path.join(dirname, idf.sdrf_file[0])
914    sdrf = SampleDataRelationship(sdrf_filename)
915   
916    data_matrices = set(sdrf.derived_array_data_matrix_file())
[1354]917    data_matrices = [name for name in data_matrices if name.strip()]
[1353]918   
[1354]919    tables = []
920    for filename in data_matrices:
921        matrix_file = os.path.join(dirname, filename)
[1687]922        table = processed_matrix_to_orange(matrix_file, sdrf)
[1354]923        tables.append(table)
[1687]924    table = hstack_tables(tables)
925
926    return table
927
928
[1354]929def hstack_tables(tables):
930    """ Stack the tables horizontaly.
931    """
932    import Orange
933    max_len = max([len(table) for table in tables])
934    stacked_features = []
935    stacked_values = [[] for i in range(max_len)]
936    stacked_meta_features = []
937    stacked_meta_values = [{} for i in range(max_len)]
[1353]938   
[1354]939    for table in tables:
940        stacked_features.extend(table.domain.variables)
941        stacked_meta_features.extend(table.domain.getmetas().items())
[1353]942       
[1354]943        for i, instance in enumerate(table):
944            stacked_values[i].extend(list(instance))
945            stacked_meta_values[i].update(instance.getmetas())
946           
947        # Fill extra lines with unknowns
948        for i in range(len(table), max_len):
949            stacked_values[i].extend(["?"] * len(table.domain.variables))
950       
951    domain = Orange.data.Domain(stacked_features, tables[-1].domain.class_var)
952    domain.addmetas(dict(set(stacked_meta_features)))
953    table = Orange.data.Table(domain, stacked_values)
[1353]954   
[1354]955    # Add meta attributes
956    for instance, metas in zip(table, stacked_meta_values):
957        for m, val in metas.iteritems():
958            instance[m] = val
959           
[1353]960    return table
961   
962def _dictify(element):
[1530]963    """ 'Dictify' and xml.etree.Element.Element instance by taking all
964    subnode tags as keys and the corresponding text values as items i.e. turn
965    `<node><bla>foo</bla><a>b</b></node>` into a {"bla":"foo", "a":b"}
966       
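    Example ::

        >>> from xml.etree.ElementTree import fromstring
        >>> sorted(_dictify(fromstring("<node><bla>foo</bla><a>b</a></node>")).items())
        [('a', 'b'), ('bla', 'foo')]
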
[1353]967    """
968    if element is None:
969        element = []
970    dict = {}
971    strip = lambda s: s.strip() if s else s
972    for node in element:
973        dict[node.tag] = strip(getattr(node, "text", None))
974    return dict
975   
[1530]976class SearchableList(list):
[1687]977    """ A list with an advanced `search` method
[1530]978    """
979    def search(self, **kwargs):
980        """ Search the list for elements with members that correspond the
981        supplied keyword arguments.
982       
983            >>> foo.bar = "foo"
984            >>> list = SearchableList([foo, bar])
985            >>> list.search(bar="foo") # Search for objects which have a member named "bar" and that member equals "foo"
986            [<__main__.foo object ...
[1687]987
[1530]988        """
989        ret = []
990        for obj in self:
991            for member, value in kwargs.items():
992                if hasattr(obj, member) and getattr(obj, member) == value:
993                    ret.append(obj)
994        return ret
[1353]995   
996class ArrayExpressExperiment(object):
997    """ An convinience class representing an Array Express Experiment.
998   
999    Example ::
1000   
1001        >>> ae = ArrayExpressExperiment("E-MEXP-2917")
1002        >>> print ae.name
1003        Characterization of Data Variation in Gene Expression Profiling of Human Peripheral Blood Samples
1004       
1005        >>> for file in ae.files:
1006        ...     print file["name"], file["url"]
[1470]1007        E-MEXP-2917.sdrf.txt http://www.ebi.ac.uk/arrayexpress/files/E-MEXP-2917/E-MEXP-2917.sdrf.txt
[1353]1008        ...
[1530]1009       
        >>> table = ae.fgem_to_table() # Retrieve the experiment data table
[1353]1011           
1012    """
1013   
1014    def __init__(self, accession, connection=None):
1015        self.accession = accession
1016        self.connection = connection
1017        self._etree = tree = query_experiments(accession=accession, connection=self.connection, format="xml")
1018        experiments = tree.findall("experiment")
        # find the exact match (more than one experiment can be listed in the query result)
1020        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
[1418]1021        self._experiment = experiment = experiments[0]
[1353]1022       
1023        self.species = [e.text for e in experiment.findall("species")]
1024        bool_values = {"true": True, "false": False}
1025        self.rawdatafiles = bool_values[experiment.find("rawdatafiles").get("available","false")]
[1886]1026        self.fgemdatafiles = bool_values[experiment.find("processeddatafiles").get("available", "false")]
[1353]1027       
1028        self.sampleattributes = []
1029        for sa in experiment.findall("sampleattribute"):
1030            category = sa.find("category").text.strip()
1031            values = [val.text for val in sa.findall("value")]
1032            self.sampleattributes.append((category, values))
1033           
1034        self.experimentalfactors = []
1035        for ef in experiment.findall("experimentalfactor"):
1036            name = ef.find("name").text.strip()
1037            values = [val.text.strip() for val in ef.findall("values")]
1038            self.experimentalfactors.append((name, values))
1039           
1040        self.miamescores = _dictify(experiment.find("miamescores"))
1041           
1042        self.id = experiment.find("id").text
1043        self.secondaryaccession = getattr(experiment.find("secondaryaccession"), "text", None)
1044        self.name = experiment.find("name").text
1045        self.experimenttype = experiment.find("experimenttype").text.strip()
1046        self.releasedate = experiment.find("releasedate").text
1047        self.lastupdatedate = getattr(experiment.find("lastupdatedate"), "text", None)
1048        self.samples = int(experiment.find("samples").text)
1049        self.assays = int(experiment.find("assays").text)
1050       
1051        self.arraydesign = [_dictify(e) for e in experiment.findall("arraydesign")]
1052           
1053        self.bioassaydatagroups = [_dictify(group) for group in experiment.findall("bioassaydatagroup")]
1054        self.bibliography = [_dictify(e) for e in experiment.findall("bibliography")]
1055        self.provider = [_dictify(e) for e in experiment.findall("provider")]
1056       
1057        self.experimentdesign = []
1058        for expd in experiment.findall("experimentdesign"):
1059            self.experimentdesign.append(expd.text)
1060           
1061        self.description = [_dictify(e) for e in experiment.findall("description")]
1062       
1063        tree = query_files(accession=self.accession, format="xml", connection=self.connection)
1064        experiments = tree.findall("experiment")
1065        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
1066        experiment = experiments[0]
1067        files = experiment.findall("file")
1068        self.files = [_dictify(file) for file in files]
1069       
1070    def _download_processed(self):
1071        """ Download the processed matrix file, and associated MAGE-TAB files (idf, sdrf, adf)
[1530]1072       
1073        .. todo:: What about the raw data files (we need converters for other file types)
[1353]1074        """
1075        assert(self.fgemdatafiles)
1076        exp_files = [(f["kind"], f) for f in self.files if f.get("kind") in ["idf", "sdrf"] and f.get("extension") == "txt"]
1077        exp_files += [(f["kind"], f) for f in self.files if f.get("kind") == "fgem"]
1078        array_files = [(f["kind"], f) for f in self.files if f.get("kind") == "adf" and f.get("extension") == "txt"]
        assert(len(exp_files) == 3)

        for kind, file in exp_files + array_files:
1082            url = file["url"].strip()
1083            rest, basename = os.path.split(url)
1084            _, dirname = os.path.split(rest)
1085           
1086            repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1087            try:
1088                os.makedirs(repo_dir)
1089            except OSError:
1090                pass
1091            local_filename = os.path.join(repo_dir, basename)
1092            stream = urllib2.urlopen(url)
1093            shutil.copyfileobj(stream, open(local_filename, "wb"))
1094           
1095            if file["extension"] == "zip":
1096                import zipfile
                zfile = zipfile.ZipFile(local_filename)
1098                zfile.extractall(repo_dir)
1099            elif file["extension"] == "gz":
1100                import gzip
1101                gzfile = gzip.open(local_filename)
1102                gzfile.extractall(repo_dir)
[1418]1103            elif file["extension"] in ["tgz", "tar"]:
[1353]1104                import tarfile
                tfile = tarfile.open(local_filename)
1106                tfile.extractall(repo_dir)
1107            elif file["extension"] == "txt":
1108                pass
1109            else:
1110                raise ValueError("Unknown extension ('{0}').".format(basename))
1111           
1112    def _download_file(self, url, extract=True):
1113        """ Download the `file` from the ArrayExpress saving it to a local
1114        repository directory.
1115         
1116        """
1117        rest, basename = posixpath.split(url)
1118        dirname = posixpath.basename(rest)
1119        repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1120        try:
1121            os.makedirs(repo_dir)
1122        except OSError:
1123            pass
1124        stream = urllib2.urlopen(url)
1125        local_filename = os.path.join(repo_dir, basename)
1126        shutil.copyfileobj(stream, open(local_filename, "wb"))
1127       
1128        if extract:
1129            _, extension = os.path.splitext(local_filename)
1130            if extension == ".zip":
1131                import zipfile
1132                zfile = zipfile.ZipFile(local_filename)
1133                zfile.extractall(repo_dir)
1134            elif extension == ".gz":
                import gzip
                # a gzip archive holds a single member; decompress it next to the archive
                gzfile = gzip.open(local_filename)
                with open(os.path.splitext(local_filename)[0], "wb") as out:
                    shutil.copyfileobj(gzfile, out)
1138            elif extension in [".tgz"]:
1139                import tarfile
                tfile = tarfile.open(local_filename)
1141                tfile.extractall(repo_dir)
1142            elif extension == ".txt":
1143                pass
1144            else:
1145                raise ValueError("Unknown extension ('{0}').".format(basename))
1146           
1147    def _is_local(self, url):
1148        """ Is the `url` stored in the local repository.
1149        """
1150        return os.path.exists(self._local_filepath(url))
1151   
1152    def _local_filepath(self, url):
1153        """ Return the local file path for url.
1154        """
1155        rest, basename = posixpath.split(url)
1156        dirname = posixpath.basename(rest)
1157        return orngServerFiles.localpath("ArrayExpress", os.path.join(dirname, basename))
1158   
1159    def _open(self, url):
1160        """ Return an open file like handle to url (ArrayExpress file).
1161        The file is cached in the local repository for future access.
1162       
1163        """
1164        if not self._is_local(url):
1165            self._download_file(url, extract=True)
1166        file = self._local_filepath(url)
1167        return open(file, "rb")
1168   
1169    def _search_files(self, kind=None, extension=None):
1170        """ Search files by `kind` and `extension`.
1171        """
1172        res = []
1173        for file in self.files:
1174            kind_match = kind == file.get("kind") or kind is None
1175            extension_match = extension == file.get("extension") or extension is None
1176           
1177            if kind_match and extension_match:
1178                res.append(file)
1179        return res
1180       
1181    def array_design(self):
1182        """ Return a list of `ArrayDesign` instances used in this experiment.
1183        """
1184        files = [f for f in self.files if f.get("kind") == "adf" and \
1185                 f.get("extension") == "txt"]
1186       
1187        array_design = []
1188        for file in files:
1189            url = file.get("url")
1190            if not self._is_local(url):
1191                self._download_file(url)
1192            array_design.append(ArrayDesign(self._open(url)))
1193        return array_design
1194       
1195    def investigation_design(self):
1196        """ Return an `InvestigationDesgin` instance for this experiment
1197        """
1198        files = [f for f in self.files if f.get("kind") == "idf" and \
1199                 f.get("extension") == "txt"]
1200        if not files:
1201            raise ValueError("The experiment '{0}' does not have an investigation design file".format(self.accession))
1202        file = files[0]
[1530]1203        return InvestigationDesign(self._open(file.get("url")))
[1353]1204       
1205       
1206    def sample_data_relationship(self):
1207        """ Return an `SampleDataRelationship` instance describing this experiment.
1208        """
1209        files = [f for f in self.files if f.get("kind") == "sdrf" and \
1210                 f.get("extension") == "txt"]
1211        if not files:
            raise ValueError("The experiment '{0}' does not have a sample and data relationship file".format(self.accession))
1213        file = files[0]
1214        return SampleDataRelationship(self._open(file.get("url")))
1215       
1216    def fgem_to_table(self):
[1530]1217        """ Retrieve the processed matrix from the Array Express ftp
1218        server and convert it to an :class:`Orange.data.Table` instance.
1219         
1220        """
[1353]1221        assert(self.fgemdatafiles)
1222        repo_dir = orngServerFiles.localpath("ArrayExpress", self.accession)
1223        # Find the file listing the data matrix files (should be in sdrf but sometimes it is in 2column file only, why?)
1224        sdrf = self._search_files("sdrf", "txt")
1225        if sdrf:
1226            sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
1227            if "Derived Array Data Matrix File" not in sdrf.header:
1228                twocol = self._search_files("twocolumn", "txt")
1229                if twocol:
1230                    sdrf = SampleDataRelationship(self._open(twocol[0].get("url")))
1231        matrix_file = self._search_files("fgem")[0]
1232        self._open(matrix_file.get("url")) 
1233        matrix_files = sorted(set(sdrf.derived_array_data_matrix_file()))
1234       
[1354]1235        idf_file = self._search_files("idf", "txt")[0]
1236        self._open(idf_file.get("url")) # To download if not cached
1237        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
[1353]1238       
1239   
[1530]1240"""\
[1342]1241Gene Expression Atlas
1242---------------------
[1347]1243
[1470]1244.. WARNING:: Deprecated, use ``obiGeneAtlas`` instead.
1245
[1350]1246`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
1247gene expression experiments in Array Express Archive.
1248
Use `query_atlas_simple` for simple queries.
1250
[1350]1251Example (query human genes for experiments in which they are up regulated) ::
1252
[1347]1253    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
1254    {u'...
1255   
[1350]1256Or use the `AtlasCondition` subclasses in this module to construct a more
1257advanced query and use the `query_atlas` function.
1258
1259Example (query human genes annotated to the GO term 'transporter activity'
1260that are up regulated in the liver in at least three experiments) ::
1261
1262    >>> go_cond = AtlasConditionGeneProperty("Goterm", "Is", "transporter activity")
1263    >>> liver_cond = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "liver")
1264    >>> org_cond = AtlasConditionOrganism("Homo sapiens")
1265    >>> cond_list = AtlasConditionList([go_cond, liver_cond, org_cond])
1266    >>> query_atlas(cond_list)
1267    {u'...
1268   
[1342]1269"""
1270
[1885]1271
[1342]1272class GeneExpressionAtlasConenction(object):
1273    """
[1885]1274    A connection to Gene Expression Atlas database.
1275
1276    :param address:
1277        Address of the GXA server (default: http://www.ebi.ac.uk/gxa/api/deprecated).
1278    :param timeout:
1279        Socket timeout (default 30).
1280    :param cache:
1281        A dict like object to use as a cache.
1282
1283    """
1284    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/gxa/api/deprecated"
1285
1286    DEFAULT_CACHE = orngServerFiles.localpath(
1287        "ArrayExpress", "GeneAtlasCache.shelve")
1288
[1440]1289    def __init__(self, address=None, timeout=30, cache=None):
[1885]1290        """
1291        Initialize the connection.
1292
[1347]1293        """
[1342]1294        self.address = address if address is not None else self.DEFAULT_ADDRESS
1295        self.timeout = timeout
[1885]1296        if cache is None:
1297            cache = self.DEFAULT_CACHE
1298
1299        self.cache = cache
1300
[1342]1301    def query(self, condition, format="json", start=None, rows=None, indent=False):
[1885]1302        url = self.address + "?" + condition.rest()
[1470]1303        if start is not None and rows is not None:
[1342]1304            url += "&start={0}&rows={1}".format(start, rows)
1305        url += "&format={0}".format(format)
1306        if indent:
1307            url += "&indent"
1308#        print url
[1440]1309        if self.cache is not None:
1310            return self._query_cached(url)
1311        else:
1312            return urllib2.urlopen(url)
[1885]1313
[1440]1314    def _query_cached(self, url):
1315        if self.cache is not None:
[1885]1316            with self.open_cache("r") as cache:
1317                if url in cache:
1318                    return StringIO(cache[url])
1319
1320            response = urllib2.urlopen(url)
1321            contents = response.read()
1322            with self.open_cache("w") as cache:
1323                cache[url] = contents
1324
1325            return StringIO(contents)
[1440]1326        else:
1327            return urllib2.urlopen(url)
[1885]1328
1329    def open_cache(self, flag="r"):
1330        """
1331        Return a context manager for a dict like object.
1332        """
1333        if isinstance(self.cache, basestring):
1334            try:
1335                return closing(_open_shelve(self.cache, flag))
1336            except Exception:
1337                return fake_closing({})
1338        else:
1339            return fake_closing(self.cache)
1340
1341
# Names of all Gene Property filters
[1342]1343GENE_FILTERS = \
1344    ["Name", # Gene name
1345     "Goterm", #Gene Ontology Term
1346     "Interproterm", #InterPro Term
1347     "Disease", #Gene-Disease Assocation
1348     "Keyword", #Gene Keyword
1349     "Protein", #Protein
1350
1351     "Dbxref", #Other Database Cross-Refs
1352     "Embl", #EMBL-Bank ID
1353     "Ensfamily", #Ensembl Family
1354     "Ensgene", #Ensembl Gene ID
1355
1356     "Ensprotein", #Ensembl Protein ID
1357     "Enstranscript", #Ensembl Transcript ID
1358     "Goid", #Gene Ontology ID
1359     "Image", #IMAGE ID
1360     "Interproid", #InterPro ID
1361     "Locuslink", #Entrez Gene ID
1362
1363     "Omimid", #OMIM ID
1364     "Orf", #ORF
1365     "Refseq", #RefSeq ID
1366     "Unigene", #UniGene ID
1367     "Uniprot", #UniProt Accession
1368
1369     "Hmdb", #HMDB ID
1370     "Chebi", #ChEBI ID
1371     "Cas", #CAS
1372     "Uniprotmetenz", #Uniprotmetenz
1373     "Gene", #Gene Name or Identifier
1374     "Synonym", #Gene Synonym
1375     ]
1376   
[1350]1377# Valid Gene Property filter qualifiers
[1342]1378GENE_FILTER_QUALIFIERS =\
1379    ["Is",
1380     "IsNot"
1381     ]
1382
[1350]1383# Organisms in the Atlas
[1342]1384ATLAS_ORGANISMS = \
1385    ["Anopheles gambiae",
1386     "Arabidopsis thaliana",
1387     "Bos taurus",
1388     "Caenorhabditis elegans",
1389     "Danio rerio",
1390     "Drosophila melanogaster",
1391     "Epstein barr virus",
1392     "Gallus gallus",
1393     "Homo sapiens",
1394     "Human cytomegalovirus",
1395     "Kaposi sarcoma-associated herpesvirus",
1396     "Mus musculus",
1397     "Rattus norvegicus",
1398     "Saccharomyces cerevisiae",
1399     "Schizosaccharomyces pombe",
[1440]1400#     "Unknown",
[1342]1401     "Xenopus laevis"
1402     ]
1403   
[1470]1404#_COMMON_TAXIDS = \
1405#    {"Anopheles gambiae",
1406#     "Arabidopsis thaliana",
1407#     "Bos taurus",
1408#     "Caenorhabditis elegans",
1409#     "Danio rerio",
1410#     "Drosophila melanogaster",
1411#     "Epstein barr virus",
1412#     "Gallus gallus",
1413#     "Homo sapiens",
1414#     "Human cytomegalovirus",
1415#     "Kaposi sarcoma-associated herpesvirus",
1416#     "Mus musculus",
1417#     "Rattus norvegicus",
1418#     "Saccharomyces cerevisiae",
1419#     "Schizosaccharomyces pombe",
1420##     "Unknown",
1421#     "Xenopus laevis"
1422#     }
1423   
[1342]1424def ef_ontology():
[1350]1425    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
[1342]1426    """
[1632]1427    from . import obiOntology
[1345]1428#    return obiOntology.OBOOntology(urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo"))
[1632]1429    from Orange.orng import orngServerFiles
[1345]1430    # Should this be in the OBOFoundry (Ontology) domain
[1440]1431    try:
1432        file = open(orngServerFiles.localpath_download("ArrayExpress", "efo.obo"), "rb")
1433    except urllib2.HTTPError:
1434        file = urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo")
1435    return obiOntology.OBOOntology(file)
[1342]1436
1437
1438class AtlasCondition(object):
1439    """ Base class for Gene Expression Atlas query condition
1440    """
1441    def validate(self):
1442        """ Validate condition in a subclass.
1443        """
1444        raise NotImplementedError
1445   
1446    def rest(self):
1447        """ Return a REST query part in a subclass.
1448        """
1449        raise NotImplementedError
1450   
1451   
1452class AtlasConditionList(list, AtlasCondition):
1453    """ A list of AtlasCondition instances.
1454    """ 
1455    def validate(self):
1456        for item in self:
1457            item.validate()
1458       
1459    def rest(self):
1460        return "&".join(cond.rest() for cond in self)
1461
[1440]1462
[1342]1463class AtlasConditionGeneProperty(AtlasCondition):
1464    """ An atlas gene filter condition.
1465   
1466    :param property: Property of the gene. If None or "" all properties
1467        will be searched.
1468    :param qualifier: Qualifier can be 'Is' or 'IsNot'
1469    :param value: The value to search for.
1470   
1471    Example ::
1472   
1473        >>> # Condition on a gene name
1474        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
1475        >>> # Condition on genes from a GO Term
1476        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1477        >>> # Condition on disease association
1478        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")
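        >>> # The corresponding REST query fragment:
        >>> condition.rest()
        'geneDiseaseIs=cancer'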
1479       
1480    """
1481    def __init__(self, property, qualifier, value):
1482        self.property = property or ""
1483        self.qualifier = qualifier
1484        if isinstance(value, basestring):
1485            self.value = value.replace(" ", "+")
1486        elif isinstance(value, list):
1487            self.value = "+".join(value)
1488        else:
1489            raise ValueError(value)
1490       
1491        self.validate()
1492       
1493    def validate(self):
1494        assert(self.property in GENE_FILTERS + [""])
1495        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
1496       
1497    def rest(self):
1498        return "gene{property}{qualifier}={value}".format(**self.__dict__)
1499       
1500       
1501class AtlasConditionExperimentalFactor(AtlasCondition):
1502    """ An atlas experimental factor filter condition.
1503   
1504    :param factor: EFO experimental factor
1505    :param regulation: "up", "down", "updown", "any" or "none"
1506    :param n: Minimum number of experiments with this condition
1507    :param value: Experimental factor value
1508   
1509    Example ::
1510   
1511        >>> # Any genes up regulated in at least 3 experiments involving cancer.
1512        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
1513        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
1514        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")
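        >>> # The corresponding REST query fragment:
        >>> condition.rest()
        'updown1InOrganism_part=heart'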
1515       
1516    """
1517    def __init__(self, factor, regulation, n, value):
1518        self.factor = factor
1519        self.regulation = regulation
1520        self.n = n
1521        self.value = value
1522        self.validate()
1523       
1524    def validate(self):
1525        # TODO: validate the factor and value
[1347]1526#        assert(self.factor in ef_ontology())
[1342]1527        assert(self.regulation in ["up", "down", "updown", "any", "none"])
1528       
1529    def rest(self):
1530        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
1531       
1532class AtlasConditionOrganism(AtlasCondition):
1533    """ Condition on organism.
1534    """
1535    def __init__(self, organism):
1536        self.organism = organism
1537        self.validate()
1538       
1539    def validate(self):
1540        assert(self.organism in ATLAS_ORGANISMS)
1541       
1542    def rest(self):
1543        return "species={0}".format(self.organism.replace(" ", "+").lower())
1544       
[1418]1545       
1546class AtlasConditionExperiment(AtlasCondition):
1547    """ Condition on experiement
[1342]1548   
[1418]1549    :param property: Property of the experiment. If None or "" all properties
1550        will be searched.
1551    :param qualifier: Qualifier can be 'Has' or 'HasNot'
1552    :param value: The value to search for.
1553   
1554    Example ::
1555   
1556        >>> # Condition on an experiment accession
1557        >>> condition = AtlasConditionExperiment("", "", "E-GEOD-24283")
1558        >>> # Condition on experiments involving lung
[1530]1559        >>> condition = AtlasConditionExperiment("Organism_part", "Has", "lung")
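        >>> # The corresponding REST query fragment:
        >>> condition.rest()
        'experimentOrganism_partHas=lung'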
[1418]1560       
1561    """
1562#    EXPERIMENT_FILTERS = [
1563#                "Organism"
1564#                "Factor"]
1565   
1566    EXPERIMENT_FILTER_QUALIFIERS = [
1567                "Has",
1568                "HasNot"]
1569   
1570    def __init__(self, property, qualifier, value):
1571        self.property = property
1572        self.qualifier = qualifier
1573        if isinstance(value, basestring):
1574            self.value = value.replace(" ", "+")
1575        elif isinstance(value, list):
1576            self.value = "+".join(value)
1577        else:
1578            raise ValueError(value)
1579       
1580        self.validate()
1581       
1582    def validate(self):
1583        # TODO: check to EFO factors
1584#        assert(self.property in EXPERIMENT_FILTERS + [""])
1585        assert(self.qualifier in self.EXPERIMENT_FILTER_QUALIFIERS + [""])
1586       
1587    def rest(self):
1588        return "experiment{property}{qualifier}={value}".format(**self.__dict__)
1589       
1590       
1591class GeneAtlasError(ValueError):
1592    """ An error response from the Atlas server.
1593    """
1594    pass
1595   
1596   
1597def __check_atlas_error_json(response):
[1470]1598    if "error" in response:
1599        raise GeneAtlasError(response["error"])
1600    return response
1601 
[1418]1602     
1603def __check_atlas_error_xml(response):
1604    error = response.find("error")
1605    if error is not None:
1606        raise GeneAtlasError(error.text)
1607    return response
1608   
1609       
[1347]1610def query_atlas_simple(genes=None, regulation=None, organism=None,
1611                       condition=None, format="json", start=None,
[1350]1612                       rows=None):
[1342]1613    """ A simple Atlas query.
1614   
[1347]1615    :param genes: A list of gene names to search for.
1616    :param regulation: Search for experiments in which `genes` are "up",
1617        "down", "updown" or "none" regulated. If None all experiments
1618        are searched.
1619    :param organism: Search experiments for organism. If None all experiments
1620        are searched.
1621    :param condition: An EFO factor value (e.g. "brain")
1622   
[1350]1623    Example ::
[1342]1624       
1625        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
[1347]1626        {u'...
[1342]1627       
[1347]1628        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
1629        {u'...
1630       
1631        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
1632        {u'...
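
        >>> # Illustrative: paginate the results (assumes the service
        >>> # honours the 'start' and 'rows' parameters)
        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus", rows=10, start=0)
        {u'...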
[1342]1633       
1634    """
[1470]1635    warnings.warn("Use 'obiGeneAtlas.run_simple_query' instead.", DeprecationWarning)
[1342]1636    conditions = AtlasConditionList()
[1350]1637    if genes:
1638        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
[1347]1639    if regulation or condition:
1640        regulation = "any" if regulation is None else regulation
1641        condition = "" if condition is None else condition
1642        conditions.append(AtlasConditionExperimentalFactor("", regulation, 1, condition))
[1342]1643    if organism:
1644        conditions.append(AtlasConditionOrganism(organism))
[1347]1645       
[1342]1646    connection = GeneExpressionAtlasConenction()
[1347]1647    results = connection.query(conditions, format=format, start=start,
1648                               rows=rows)
1649    if format == "json":
1650        return parse_json(results)
1651    else:
1652        return parse_xml(results)
[1342]1653
1654"""\
[1350]1655.. todo:: Can this be implemented as query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="...", downInEFO="...")?
[1342]1656      Need a full list of accepted factors.
1657"""
1658
[1440]1659def query_atlas(condition, format="json", start=None, rows=None, indent=False, connection=None):
[1342]1660    """ Query Atlas based on a `condition` (instance of AtlasCondition)
1661   
[1350]1662    Example ::
[1342]1663       
[1347]1664        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1665        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
1666        >>> condition = AtlasConditionList([condition1, condition2])
1667        >>> query_atlas(condition)
1668        {u'...
[1342]1669       
1670    """
[1470]1671    warnings.warn("Use 'obiGeneAtlas.run_query' instead.", DeprecationWarning)
[1440]1672    if connection is None:
1673        connection = GeneExpressionAtlasConenction()
[1345]1674    results = connection.query(condition, format=format, start=start,
1675                               rows=rows, indent=indent)
[1347]1676    if format == "json":
[1418]1677        response = parse_json(results)
1678        return __check_atlas_error_json(response)
[1347]1679    else:
[1418]1680        response = parse_xml(results)
1681        return __check_atlas_error_xml(response)
[1342]1682
1683
[1440]1684def get_atlas_summary(genes, organism, connection=None):
[1344]1685    """ Return 3 dictionaries containing a summary of atlas information
1686    about three experimental factors:
1687   
1688        - Organism Part (OP)
1689        - Disease State (DS)
1690        - Cell type (CT)
1691   
1692    Each dictionary contains query genes as keys. Values are dictionaries
1693    mapping factor values to a 2-tuple containing the counts of up-regulated
1694    and down-regulated experiments.
1695   
[1350]1696    Example ::
[1344]1697   
1698        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
1699        ({u'RUNX1': ...
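
        >>> # The three summaries can be unpacked directly (illustrative)
        >>> org_part, disease_state, cell_type = get_atlas_summary(
        ...     ["RUNX1"], "Homo sapiens")  # doctest: +SKIP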
1700       
1701    """
[1470]1702    warnings.warn("Use 'obiGeneAtlas.get_atlas_summary' instead.", DeprecationWarning)
[1344]1703    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
[1342]1704    org_condition = AtlasConditionOrganism(organism)
1705    condition = AtlasConditionList([genes_condition, org_condition])
[1440]1706    result = query_atlas(condition, format="json", connection=connection)
[1342]1707   
1708    org_part = collect_ef_summary(result, "organism_part")
1709    disease_state = collect_ef_summary(result, "disease_state")
1710    cell_type = collect_ef_summary(result, "cell_type")
1711   
[1470]1712    return dict(org_part), dict(disease_state), dict(cell_type)
[1342]1713   
[1440]1714   
[1470]1715def collect_ef_summary(info, ef, summary=None):
[1342]1716    """ Collect the results summary from query_atlas, result for experimental
1717    factor `ef`.
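
    A minimal sketch (`result` is assumed to be a parsed json response
    from `query_atlas`; the printed gene and factor values are
    hypothetical) ::

        >>> summary = collect_ef_summary(result, "organism_part")  # doctest: +SKIP
        >>> dict(summary)  # doctest: +SKIP
        {u'RUNX1': {u'heart': (1, 0), ...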
1718    """
[1470]1719    if summary is None:
1720        summary = defaultdict(dict)
1721       
[1342]1722    results = info["results"]
1723    for res in results:
1724        gene = res["gene"]
1725        expressions = res["expressions"] 
1726        for expression in expressions:
1727            if expression["ef"] == ef:
1728                efv = expression["efv"]
1729                updown = (expression["upExperiments"],
1730                          expression["downExperiments"]
1731                          )
1732               
1733                if any(updown):
1734                    summary[gene["name"]][efv] = updown
1735   
[1470]1736    return summary
[1687]1737
1738
[1470]1739def test():   
[1342]1740    conn = ArrayExpressConnection()
1741    import doctest
[1530]1742    foo = type("foo", (object,), {})()
1743    bar = type("bar", (object,), {})()
1744   
1745    doctest.testmod(optionflags=doctest.ELLIPSIS,
1746                    extraglobs={"conn": conn, "foo": foo, "bar": bar})
[1347]1747   
[1440]1748   
[1347]1749if __name__ == "__main__":
1750    test()
[1555]1751   