source: orange-bioinformatics/_bioinformatics/obiArrayExpress.py @ 1636:10d234fdadb9

Revision 1636:10d234fdadb9, 58.4 KB checked in by mitar, 2 years ago (diff)

Restructuring because we will not be using namespaces.

Line 
1"""
2===============
3obiArrayExpress
4===============
5
6A python module for accessing the ArrayExpress web services and database.
7
8`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a database of gene expression experiments that you
9can query and download.
10
11Example ::
12
13    >>> # Retrieve the object representing experiment with accession E-TABM-25
14    >>> experiment = ArrayExpressExperiment("E-TABM-25")
15    >>> print experiment.accession
16    E-TABM-25
17   
18    >>> print experiment.name
19    Transcription profiling of aging in the primate brain
20   
21    >>> print experiment.species
22    ['Pan troglodytes']
23   
24    >>> print experiment.files
25    [{'kind': ...
26   
27    >>> # Retrieve the data matrix for experiment 'E-MEXP-2917'
28    >>> experiment = ArrayExpressExperiment("E-MEXP-2917")
29    >>> table = experiment.fgem_to_table()
30
31
32Low level Array Express query using REST services::
33
34    >>> from Orange.bioinformatics import obiArrayExpress
35    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
36    {u'experiments': ...
37   
38    >>> obiArrayExpress.query_experiments(keywords='gliobastoma')
39    {u'experiments': ...
40   
41    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
42    <xml.etree.ElementTree.ElementTree object ...
43   
44.. note:: Currently querying ArrayExpress files only works with the xml format.
45
46.. note:: See the documentation of `query_experiments` for a full set of
47          parameters that these functions accept.
48
49"""
50
51from __future__ import absolute_import
52
53import os, sys
54import urllib2
55
56from Orange.orng import orngServerFiles
57import warnings
58import posixpath
59import shelve
60import shutil
61import posixpath
62import json
63from xml.etree.ElementTree import ElementTree
64from StringIO import StringIO
65
66from collections import defaultdict
67from functools import partial
68from contextlib import closing
69
70parse_json = json.load
71
72def parse_xml(stream):
73    """ Parse an xml stream into an instance of xml.etree.ElementTree.ElementTree.
74    """
75    return ElementTree(file=stream) 
76
77# All searchable fields of ArrayExpress (see query_experiments docstring
78# for a description of the fields)
79ARRAYEXPRESS_FIELDS = \
80    ["keywords",
81     "accession",
82     "array",
83     "ef",
84     "efv",
85     "expdesign",
86     "exptype",
87     "gxa",
88     "pmid",
89     "sa",
90     "species",
91     "expandefo",
92     "directsub",
93     "assaycount",
94     "efcount",
95     "samplecount",
96     "sacount",
97     "rawcount",
98     "fgemcount",
99     "miamescore",
100     "date",
101     "wholewords",
102    ]
103   
104
105class ArrayExpressConnection(object):
106    """ A connection to the ArrayExpress. Used to construct a REST query
107    and run it.
108   
109    .. todo:: Implement user login.
110   
111    """
112   
113    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
114    DEFAULT_FORMAT = "json"
115    DEFAULT_CACHE = orngServerFiles.localpath("ArrayExpress", "ArrayExpressCache.shelve")
116    # Order of arguments in the query
117    _ARGS_ORDER = ["keywords", "species", "array"]
118   
119    def __init__(self, address=None, timeout=30, cache=None,
120                 username=None, password=None):
121        """ Initialize the connection object.
122       
123        :param address: Address of the ArrayExpress API
124        :param timeout: Timeout for the socket connection
125       
126        .. todo:: Implement user login (see Accessing Private Data in API docs)
127       
128        """
129        self.address = address if address is not None else self.DEFAULT_ADDRESS
130        self.timeout = timeout
131        self.cache = cache if cache is not None else self.DEFAULT_CACHE
132        self.username = username
133        self.password = password
134       
135    def format_query(self, **kwargs):
136        """ Format the query arguments.
137       
138        Example ::
139       
140            >>> conn.format_query(gxa=True, efcount=(1, 5))
141            'efcount=[1 TO 5]&gxa=true'
142           
143        """
144        # Formaters:
145        def format_default(val):
146            if isinstance(val, basestring):
147                return val
148            else:
149                return "+".join(val)
150        def format_species(val):
151            return '"%s"' % val.lower()
152        def format_gxa(val):
153            if val:
154                return "true"
155            else:
156                raise ValueError("gxa={0}".format(val))
157        def format_expandefo(val):
158            if val:
159                return "on"
160            else:
161                raise ValueError("expandefo={0}".format(val))
162        def format_true_false(val):
163            return "true" if val else "false"
164        def format_interval(val):
165            if isinstance(val, tuple):
166                return "[{0} TO {1}]".format(*val)
167            else:
168                raise ValueError("Must be an interval argument (min, max)!")
169        def format_date(val):
170            # TODO check if val contains a datetime.date object, assert proper format
171            return format_interval(val)
172        def format_wholewords(val):
173            if val:
174                return "on"
175            else:
176                raise ValueError("wholewords={0}".format(val))
177       
178        formaters = {"species": format_species,
179                     "gxa": format_gxa,
180                     "expandefo": format_expandefo,
181                     "directsub": format_true_false,
182                     "assaycount": format_interval,
183                     "efcount": format_interval,
184                     "samplecount": format_interval,
185                     "sacount": format_interval,
186                     "rawcount": format_interval,
187                     "fgemcount": format_interval,
188                     "miamescore": format_interval,
189                     "date": format_date,
190                     "wholewords": format_wholewords,
191                     }
192        parts = []
193        arg_items = kwargs.items()
194        ordered = sorted(arg_items, key=lambda arg: self._ARGS_ORDER.index(arg[0]) \
195                         if arg[0] in self._ARGS_ORDER else 100)
196       
197        for key, value in kwargs.iteritems():
198            if key == "format":
199                continue # format is handled in query_url
200            if key not in ARRAYEXPRESS_FIELDS:
201                raise ValueError("Invalid argument name: '{0}'".format(key))
202            if value is not None and value != []:
203                fmt = formaters.get(key, format_default)
204                value = fmt(value)
205                parts.append("{0}={1}".format(key, value))
206                 
207        return "&".join(parts)
208       
209    def query_url(self, what="experiments", **kwargs):
210        """ Return a formated query URL for the query arguments
211       
212        Example ::
213            >>> conn.query_url(accession="E-MEXP-31")
214            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
215           
216        """
217        query = self.format_query(**kwargs)
218        url = posixpath.join(self.address, what)
219        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
220        url = url + ("?" + query if query else "")
221        url = url.replace(" ", "%20")
222        return url
223   
224    def query_url_experiments(self, **kwargs):
225        """ Return a formated experiments query url for the calls arguments
226        """
227        return self.query_url("experiments", **kwargs)
228   
229    def query_url_files(self, **kwargs):
230        """ Return a formated experiments query url for the calls arguments
231        """
232        return self.query_url("files", **kwargs)
233   
234    def query_experiment(self, **kwargs):
235        """ Return an open stream to the experiments query results.
236       
237        .. note:: This member function takes the same arguments as the module
238                  level `query_experiemnts` function.
239         
240        """
241        url = self.query_url_experiments(**kwargs)
242        stream = self._cache_urlopen(url, timeout=self.timeout)
243        return stream
244   
245    def query_files(self, **kwargs):
246        """ Return an open stream to the files query results.
247       
248        .. note:: This member function takes the same arguments as the module
249                  level `query_files` function.
250        """
251        url = self.query_url_files(**kwargs)
252        stream = self._cache_urlopen(url, timeout=self.timeout)
253        return stream
254   
255    def open_file(self, accession, kind="raw", ext=None):
256        """ Return a file handle to experiment data.
257        Possible values for kind:
258            - raw: return the raw data if available
259            - fgem: return the processed data if available
260            - biosamples: a png or svg design image
261            - idf: investigation description
262            - adf: array design description
263            - mageml: MAGE-ML file
264           
265        Example ::
266       
267            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
268            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")
269             
270        """
271        stream = self.query_files(accession=accession, format="xml")
272        tree = ElementTree(file=stream)
273        files = tree.findall("experiment/file")
274        for file in files:
275            filekind = file.find("kind").text
276            fileext = file.find("extension").text
277            if filekind.strip() == kind and (fileext.strip() == ext or ext is None): 
278                url = file.find("url").text
279                return self._cache_urlopen(url.strip(), timeout=self.timeout)
280           
281    def _cache_urlopen(self, url, timeout=30):
282        if self.cache is not None:
283            with self.open_cache() as cache:
284                cached = url in cache
285            if not cached:
286                stream = urllib2.urlopen(url, timeout=timeout)
287                data = stream.read()
288                with self.open_cache() as cache:
289                    cache[url] = data
290            else:
291                with self.open_cache() as cache:
292                    data = cache[url]
293            return StringIO(data)
294        else:
295            return urllib2.urlopen(url, timeout=timeout)
296       
297    def open_cache(self):
298        if isinstance(self.cache, basestring):
299            return closing(shelve.open(self.cache))
300        elif hasattr(self.cache, "close"):
301            return closing(self.cache)
302        elif self.cache is None:
303            return fake_closing({})
304        else:
305            return fake_closing(self.cache)
306       
307from contextlib import contextmanager
308
309@contextmanager
310def fake_closing(obj):
311    yield obj
312   
313def query_experiments(keywords=None, accession=None, array=None, ef=None,
314                      efv=None, expdesign=None, exptype=None,
315                      gxa=None, pmid=None, sa=None, species=None,
316                      expandefo=None, directsub=None, assaycount=None,
317                      efcount=None, samplecount=None, rawcount=None,
318                      fgemcount=None, miamescore=None, date=None, 
319                      format="json", wholewords=None, connection=None):
320    """ Query Array Express experiments.
321   
322    :param keywords: A list of keywords to search (e.g. ['gliobastoma']
323    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
324    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
325    :param ef: Experimental factor (names of main variables of experiments)
326    :param efv: Experimental factor value (Has EFO expansion)
327    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
328    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
329    :param gxa: If True limit the results to experiments from the Gene
330        Expreission Atlas only.
331    :param pmid: Search by PubMed identifier
332    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
333    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)
334   
335    :param expandefo: If True expand the search terms with all its child terms
336        in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
337        will be expanded to include for synonyms and sub types of cancer).
338    :param directsub: If True return only experiments submited directly to
339        Array Express else if False return only experiments imported from GEO
340        database (default None, return both)
341    :param assaycount: A two tuple (min, max) for filter on the number of
342        assays (e.g. (1, 5) will return only experiments with at least one
343        and no more then 5 assays).
344    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
345    :param sacount: Filter on the number of sample attribute categories
346    :param rawcount: Filter on the number or raw files
347    :param fgemcount: Filter on the number of final gene expression matrix
348        (processed data) files
349    :param miamescore: Filter on the MIAME complience score (max 5)
350    :param date: Filter by release date
351   
352    Example ::
353   
354        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
355        {u'experiments': ...
356       
357    .. _EFO: http://www.ebi.ac.uk/efo/
358   
359    """
360    if connection is None:
361        connection = ArrayExpressConnection()
362       
363    stream = connection.query_experiment(keywords=keywords, accession=accession,
364                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
365                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
366                directsub=directsub, assaycount=assaycount, efcount=efcount,
367                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
368                miamescore=miamescore, date=date,  format=format,
369                wholewords=wholewords)
370   
371    if format == "json":
372        return parse_json(stream)
373    else:
374        return parse_xml(stream)
375   
376   
377def query_files(keywords=None, accession=None, array=None, ef=None,
378               efv=None, expdesign=None, exptype=None,
379               gxa=None, pmid=None, sa=None, species=None,
380               expandefo=None, directsub=None, assaycount=None,
381               efcount=None, samplecount=None, rawcount=None,
382               fgemcount=None, miamescore=None, date=None, 
383               format="json", wholewords=None, connection=None):
384    """ Query Array Express files.
385   
386    This function accepts the same arguments as `query_experiments`.
387   
388    Example ::
389   
390        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
391        <xml.etree.ElementTree.ElementTree object ...
392       
393    .. todo:: why does the json interface not work.
394                       
395    """
396    if connection is None:
397        connection = ArrayExpressConnection()
398   
399    stream = connection.query_files(keywords=keywords, accession=accession,
400                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
401                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
402                directsub=directsub, assaycount=assaycount, efcount=efcount,
403                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
404                miamescore=miamescore, date=date,  format=format,
405                wholewords=wholewords)
406   
407    if format == "json":
408        return parse_json(stream)
409    else:
410        return parse_xml(stream)
411   
412   
413def open_file(accession, kind="raw", ext=None, repo_dir=None):
414    """ Open a file for the experiment.
415     
416    Example ::
417   
418        >>> file = open_file("E-MTAB-369", kind="raw", repo_dir="~/.arrayexpress/")
419       
420    """
421    raise NotImplementedError
422
423"""\
424MAGE-TAB convinence functions, classes
425======================================
426"""
427
428IDF_SINGLE_TAGS = \
429    ["Investigation Title",
430     "Date of Experiment",
431     "Public Release Date",
432     "Experiment Description",
433    ]
434
435def parse_idf(file):
436    """ Parse an idf.txt (Investigation Description  Format) formated file.
437    Return a list of tuples where the first element is the tag (first column)
438    and the second element is a list of all tag values.
439   
440    """
441    if isinstance(file, basestring):
442        file = open(file, "rb")
443    data = file.read()
444    lines = data.splitlines()
445    lines = [line.split("\t") for line in lines if line and not line.startswith("#")]
446    parsed = [(line[0], line[1:]) for line in lines]
447    return parsed
448   
449def parse_sdrf(file):
450    """ Parse an sdfr formated file. Return a tuple with the first element
451    a list of header values and the second element is a list of all rows
452    (a row is a list of string values).
453   
454    """
455    if isinstance(file, basestring):
456        file = open(file, "rb")
457    data = file.read()
458    lines = data.splitlines()
459    lines = [line.split("\t") for line in lines if line.strip() and not line.startswith("#")]
460    header = [h for h in lines[0] if h]
461    rows = [line[:len(header)] for line in lines[1:]]
462    assert(all([len(r) == len(header) for r in rows]))
463    return header, rows
464   
465def parse_adf(file):
466    pass
467
468def parse_data_matrix(file):
469    """ Parse the MAGE-TAB processed data matrix. Return a tuple where the
470    elements are:
471        - a (header REF, header values) tuple (e.g. ("Hybridization REF", ["Stage1", "Stage2", ...]) )
472        - a list of quantitations for header values (e.g. ["log2 ratio", "log2 ratio", ...])
473        - a (row REF, row names list) tuple ("e.g. ("Reporter REF", ["Probe 1", "Probe 2", ...]) )
474        - a list of list matrix with values (as strings)
475       
476    """
477    if isinstance(file, basestring):
478        file = open(file, "rb")
479    data = file.read()
480    lines = data.splitlines()
481    lines = [line.split("\t") for line in lines if line.strip()]
482    header = lines[0]
483    header_ref, header = header[0], header[1:]
484    line2 = lines[1]
485    row_ref, quanifications = line2[0], line2[1:]
486    row_names, rows = [], []
487    for line in lines[2:]:
488        r_name, row = line[0], line[1:]
489        row_names.append(r_name)
490        rows.append(row)
491       
492    return ((header_ref, header),
493            quanifications,
494            (row_ref, row_names),
495            rows) 
496   
497class InvestigationDesign(dict):
498    """ Investigation design (contains the contents of the .idf).
499   
500    Example ::
501   
502        >>> idf = InvestigationDesign("foobar.idf")
503        >>> print idf.investigation_title
504        foo investigation
505        >>> print idf.experimental_design
506        ['fubar', 'snafu']
507        >>> print idf.sdrf_file
508        ['foobar.sdrf']
509       
510    """
511    def __init__(self, idf_file=None):
512        idf = parse_idf(idf_file)
513        self._idf = idf
514        self.update(dict(idf))
515        for tag, values in idf:
516            if tag in IDF_SINGLE_TAGS:
517                values = values[0] if values else None
518            ttag = self.transform_tag(tag)
519            setattr(self, ttag, values)
520       
521    def transform_tag(self, tag):
522        """ Transform the tag into a proper python attribute name by
523        replacing all spaces and special characters (e.g '[', ']' into
524        underscores).
525       
526        """
527        toreplace = [" ", "-", "[", "]"]
528        for s in toreplace:
529            tag = tag.replace(s, "_")
530        return tag.lower()
531           
532    def __getitem__(self, tag):
533        """ Return the tag values
534       
535        Example ::
536       
537            >>> idf["Investigation Title"]
538            'foo investigation'
539           
540        """
541        try:
542            return self._idf_dict[tag]
543        except KeyError:
544            pass
545       
546        ttag = self.transform_tag(tag)
547        if hasattr(self, ttag):
548            return getattr(self, ttag)
549        else:
550            raise KeyError(tag)
551       
552class SampleDataRelationship(object):
553    """ Sample-Data Relationship (contains the contents of the .sdrf file).
554   
555    Example ::
556   
557        >>> sdr = SampleDataRelationship("foobar.sdrf")
558        >>> sdr.source_name
559        ['foo', ...
560       
561        >>> sdr.sample_name
562        ['sampled foo', ...
563       
564        >>> sdr.extract_protocol_ref
565        ['bar the foo', ...
566       
567        >>> sdr.derived_array_data_matrix_file
568        ['foobar.data.txt', ...
569       
570        >>>
571         
572    """
573   
574    # Nodes an edges
575    NODES_EDGES = ["Source Name", "Sample Name", "Extract Name",
576                   "Labeled Extract Name", "Hybridization Name",
577                   "Assay Name", "Scan Name", "Normalization Name",
578                   "Array Data File", "Derived Array Data File",
579                   "Array Data Matrix File", "Derived Array Data Matrix File",
580                   "Image File", "Protocol REF"]
581   
582    # Attributes for nodes and edges
583    NODE_EDGE_ATTRIBUTES = \
584    {"Source Name": ["Characteristics", "Provider", "Material Type", "Description", "Comment"],
585     "Sample Name": ["Characteristics", "Material Type", "Description", "Comment"],
586     "Extract Name":["Characteristics", "Material Type", "Description", "Comment"],
587     "Labeled Extract Name": ["Characteristics", "Material Type", "Description", "Label", "Comment"],
588     "Hybridization Name": ["Array Design File", "Array Design REF", "Comment"],
589     "Assay Name": ["Technology Type", "Comment"],
590     "Scan Name": ["Comment"],
591     "Normalization Name": ["Comment"],
592     "Array Data File": ["Comment"],
593     "Derived Array Data File": ["Comment"],
594     "Array Data Matrix File": ["Comment"],
595     "Derived Array Data Matrix File": ["Comment"],
596     "Image File": ["Comment"],
597     "Protocol REF": ["Term Source REF", "Parameter", "Performer", "Date", "Comment"]
598     }
599   
600    # Attributes
601    ATTRIBUTE_COLUMNS = \
602    {"Characteristics []": ["Unit", "Term Source REF"],
603     "Provider": ["Comment"],
604     "Material Type": ["Term Source REF"],
605     "Label": ["Term Source REF"],
606     "Array Design File": ["Comment"],
607     "Array Design REF": ["Term Source REF", "Comment"],   
608     "Technology Type": ["Term Source REF"],
609     "Factor Value [] ()": ["Unit", "Term Source REF"],
610     "Performer": ["Comment"],
611     "Date": [],
612     "Parameter Value []": ["Unit", "Comment"],
613     "Unit []": ["Term Source REF"],
614     "Description": [],
615     "Term Source REF": ["Term Accession Number"],
616     "Term Accession Number": [],
617     "Comment []": []
618     }
619    def __init__(self, sdrf_file=None):
620        header, rows = parse_sdrf(sdrf_file)
621        self.header = header
622        self.rows = rows
623       
624    def transform_tag(self, tag):
625        """ Transform the tag into a proper python attribute name by
626        replacing all spaces and special characters (e.g '[', ']' into
627        underscores).
628       
629        """
630        toreplace = [" ", "-", "[", "]"]
631        for s in toreplace:
632            tag = tag.replace(s, "_")
633        return tag.lower()
634   
635    def _subsection(self, name):
636        """ Return the named subsection (name must be from the
637        NODES_EDGES list).
638       
639        """
640        idx = self.NODES_EDGES.index(name)
641        start = self.header.index(name)
642        end = -1
643        for name in self.NODES_EDGES[idx + 1:]:
644            if name in self.header[start + 1:]:
645                end = self.header.index(name, start + 1)
646                break
647        return self.header[start:end], [r[start:end] for r in self.rows]
648   
649    def _column(self, name):
650        """ Return the named column.
651        """
652        index = self.header.index(name)
653        return [r[index] for r in self.rows]
654       
655    def source(self):
656        """ Return the Source subsection
657        """
658        return self._subsection("Source Name")
659       
660    def source_name(self):
661        """ Return the Source Name subsection
662        """
663        return self._column("Source Name")
664       
665    def sample(self):
666        """ Return the Sample subsection
667        """
668        return self._subsection("Sample Name")
669       
670    def sample_name(self):
671        """ Return the Sample Name subsection
672        """
673        return self._column("Sample Name")
674       
675    def extract(self):
676        """ Return the Extract subsection
677        """
678        return self._subsection("Extract Name")
679       
680    def extract_name(self):
681        """ Return the Extract Name subsection
682        """
683        return self._column("Extract Name")
684       
685    def labeled_extract(self):
686        """ Return the Labeled Extract subsection
687        """
688        return self._subsection("Labeled Extract Name")
689       
690    def labeled_extract_name(self):
691        """ Return the Labeled Extract Name subsection
692        """
693        return self._column("Labeled Extract Name")
694       
695    def hybridization(self):
696        """ Return the Hibridization subsection.
697        """
698        return self._subsection("Hibridization Name")
699       
700    def hybridization_name(self):
701        """ Return the Hibridization Name subsection.
702        """
703        return self._column("Hibridization Name")
704       
705    def assay(self):
706        """ Return the Assay subsection
707        """
708        return self._subsection("Assay Name")
709       
710    def assay_name(self):
711        """ Return the Assay Name subsection
712        """
713        return self._column("Assay Name")
714       
715    def scan(self):
716        """ Return the Scan subsection
717        """
718        return self._subsection("Scan Name")
719       
720    def scan_name(self):
721        """ Return the Scan name subsection
722        """
723        return self._column("Scan Name")
724       
725    def normalization(self):
726        """ Return the Normalization subsection.
727        """
728        return self._subsection("Normalization Name")
729       
730    def normalization_name(self):
731        """ Return the Normalization Name subsection.
732        """
733        return self._column("Normalization Name")
734         
735    def array_data(self):
736        """ Return the Array Data subsection
737        """
738        return self._subsection("Array Data File")
739   
740    def array_data_file(self):
741        """ Return the Array Data File subsection
742        """
743        return self._column("Array Data File")
744       
745    def derived_array_data(self):
746        """ Return the Derived Array Data subsection
747        """
748        return self._subsection("Derived Array Data File")
749   
750    def derived_array_data_file(self):
751        """ Return the Derived Array Data File subsection
752        """
753        return self._column("Derived Array Data File")
754       
755    def array_data_matrix(self):
756        """ Return the Array Data Matrix subsection.
757        """
758        return self._subsection("Array Data Matrix File")
759   
760    def array_data_matrix_file(self):
761        """ Return the Array Data Matrix File subsection.
762        """
763        return self._column("Array Data Matrix File")
764       
765    def derived_array_data_matrix(self):
766        """ Return the Derived Array Data Matrix subsection.
767        """
768        return self._subsection("Derived Array Data Matrix File")
769   
770    def derived_array_data_matrix_file(self):
771        """ Return the Derived Array Data Matrix File subsection.
772        """
773        return self._column("Derived Array Data Matrix File")
774       
775    def image(self):
776        """ Return the Image subsection
777        """
778        return self._subsection("Image File")
779   
780    def image_file(self):
781        """ Return the Image File subsection.
782        """
783        return self._column("Image File")
784       
785class ArrayDesign(object):
786    """ Arary design (contains the contents of the .adf file).
787    """
788    def __init__(self, adf_file=None):
789        adf = parse_adf(adf_file)
790        self._adf = adf
791   
792def _is_float(str):
793    try:
794        float(str)
795        return True
796    except ValueError:
797        return False
798   
799def _is_continuous(items, check_count=100):
800    """ Are the strings in items continous numbers.
801    """
802    count = 0
803    i = 0
804    for i, item in enumerate(items):
805        if _is_float(item):
806            count += 1
807        if i >= check_count:
808            break
809    return count >= i * 0.5
810   
811def processed_matrix_to_orange(matrix_file):
812    """ Load a single processed matrix file in to an Orange.data.Table
813    instance.
814    """
815    import numpy
816    import Orange
817   
818    if isinstance(matrix_file, basestring):
819        matrix_file = open(matrix_file, "rb")
820       
821#    data_matrix = matrix_file.read()
822    header, quantification, rows, matrix = parse_data_matrix(matrix_file)
823    header_ref, header = header
824    row_ref, rows = rows
825   
826    matrix = numpy.array(matrix, dtype=object)
827   
828   
829    features = []
830    is_float = numpy.frompyfunc(_is_float, 1, 1) # an numpy ufunc
831         
832    for header_name, quant, column in zip(header, quantification, matrix.T):
833        if _is_continuous(column):
834            feature = Orange.feature.Continuous(header_name)
835            column[numpy.where(1 - is_float(column))] = "?" # relace all non parsable floats with '?'
836        else:
837            values = set(column)
838            feature = Orange.feature.Discrete(header_name, values=sorted(values))
839        feature.attributes["quantification"] = quant
840        features.append(feature)
841       
842    row_ref_feature = Orange.feature.String(row_ref)
843    domain = Orange.data.Domain(features, None)
844    domain.addmeta(Orange.feature.Descriptor.new_meta_id(), row_ref_feature)
845   
846    table = Orange.data.Table(domain, [list(row) for row in matrix])
847   
848    # Add row identifiers
849    for instance, row in zip(table, rows):
850        instance[row_ref_feature] = row
851   
852    return table
853   
854
855def mage_tab_to_orange(idf_filename):
856    """ Convert an MAGE-TAB annotated experiment into an Orange.data.Table
857    instance (assumes all the associated MAGE-TAB files are in the same
858    directory.
859   
860    .. todo:: Add Characteristics, Factor Values ... to the feature.attributes dict
861   
862    """
863    import Orange
864    dirname = os.path.dirname(idf_filename)
865    idf = InvestigationDesign(idf_filename)
866   
867    sdrf_filename = os.path.join(dirname, idf.sdrf_file[0])
868    sdrf = SampleDataRelationship(sdrf_filename)
869   
870    data_matrices = set(sdrf.derived_array_data_matrix_file())
871    data_matrices = [name for name in data_matrices if name.strip()]
872   
873    tables = []
874    for filename in data_matrices:
875        matrix_file = os.path.join(dirname, filename)
876        table = processed_matrix_to_orange(matrix_file)
877        tables.append(table)
878       
879    return hstack_tables(tables)
880   
881def hstack_tables(tables):
882    """ Stack the tables horizontaly.
883    """
884    import Orange
885    max_len = max([len(table) for table in tables])
886    stacked_features = []
887    stacked_values = [[] for i in range(max_len)]
888    stacked_meta_features = []
889    stacked_meta_values = [{} for i in range(max_len)]
890   
891    for table in tables:
892        stacked_features.extend(table.domain.variables)
893        stacked_meta_features.extend(table.domain.getmetas().items())
894       
895        for i, instance in enumerate(table):
896            stacked_values[i].extend(list(instance))
897            stacked_meta_values[i].update(instance.getmetas())
898           
899        # Fill extra lines with unknowns
900        for i in range(len(table), max_len):
901            stacked_values[i].extend(["?"] * len(table.domain.variables))
902       
903    domain = Orange.data.Domain(stacked_features, tables[-1].domain.class_var)
904    domain.addmetas(dict(set(stacked_meta_features)))
905    table = Orange.data.Table(domain, stacked_values)
906   
907    # Add meta attributes
908    for instance, metas in zip(table, stacked_meta_values):
909        for m, val in metas.iteritems():
910            instance[m] = val
911           
912    return table
913   
914def _dictify(element):
915    """ 'Dictify' and xml.etree.Element.Element instance by taking all
916    subnode tags as keys and the corresponding text values as items i.e. turn
917    `<node><bla>foo</bla><a>b</b></node>` into a {"bla":"foo", "a":b"}
918       
919    """
920    if element is None:
921        element = []
922    dict = {}
923    strip = lambda s: s.strip() if s else s
924    for node in element:
925        dict[node.tag] = strip(getattr(node, "text", None))
926    return dict
927   
928class SearchableList(list):
929    """ A list with a `search` method
930    """
931    def search(self, **kwargs):
932        """ Search the list for elements with members that correspond the
933        supplied keyword arguments.
934       
935            >>> foo.bar = "foo"
936            >>> list = SearchableList([foo, bar])
937            >>> list.search(bar="foo") # Search for objects which have a member named "bar" and that member equals "foo"
938            [<__main__.foo object ...
939           
940        """
941        ret = []
942        for obj in self:
943            for member, value in kwargs.items():
944                if hasattr(obj, member) and getattr(obj, member) == value:
945                    ret.append(obj)
946        return ret
947   
948class ArrayExpressExperiment(object):
949    """ An convinience class representing an Array Express Experiment.
950   
951    Example ::
952   
953        >>> ae = ArrayExpressExperiment("E-MEXP-2917")
954        >>> print ae.name
955        Characterization of Data Variation in Gene Expression Profiling of Human Peripheral Blood Samples
956       
957        >>> for file in ae.files:
958        ...     print file["name"], file["url"]
959        E-MEXP-2917.sdrf.txt http://www.ebi.ac.uk/arrayexpress/files/E-MEXP-2917/E-MEXP-2917.sdrf.txt
960        ...
961       
962        >>> table = ae.fgem_to_table() # Retieve the experiment data table
963           
964    """
965   
966    def __init__(self, accession, connection=None):
967        self.accession = accession
968        self.connection = connection
969        self._etree = tree = query_experiments(accession=accession, connection=self.connection, format="xml")
970        experiments = tree.findall("experiment")
971        # find the exact match (more then one experiment can be listed in the query result)
972        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
973        self._experiment = experiment = experiments[0]
974       
975        self.species = [e.text for e in experiment.findall("species")]
976        bool_values = {"true": True, "false": False}
977        self.rawdatafiles = bool_values[experiment.find("rawdatafiles").get("available","false")]
978        self.fgemdatafiles = bool_values[experiment.find("fgemdatafiles").get("available", "false")]
979       
980        self.sampleattributes = []
981        for sa in experiment.findall("sampleattribute"):
982            category = sa.find("category").text.strip()
983            values = [val.text for val in sa.findall("value")]
984            self.sampleattributes.append((category, values))
985           
986        self.experimentalfactors = []
987        for ef in experiment.findall("experimentalfactor"):
988            name = ef.find("name").text.strip()
989            values = [val.text.strip() for val in ef.findall("values")]
990            self.experimentalfactors.append((name, values))
991           
992        self.miamescores = _dictify(experiment.find("miamescores"))
993           
994        self.id = experiment.find("id").text
995        self.secondaryaccession = getattr(experiment.find("secondaryaccession"), "text", None)
996        self.name = experiment.find("name").text
997        self.experimenttype = experiment.find("experimenttype").text.strip()
998        self.releasedate = experiment.find("releasedate").text
999        self.lastupdatedate = getattr(experiment.find("lastupdatedate"), "text", None)
1000        self.samples = int(experiment.find("samples").text)
1001        self.assays = int(experiment.find("assays").text)
1002       
1003        self.arraydesign = [_dictify(e) for e in experiment.findall("arraydesign")]
1004           
1005        self.bioassaydatagroups = [_dictify(group) for group in experiment.findall("bioassaydatagroup")]
1006        self.bibliography = [_dictify(e) for e in experiment.findall("bibliography")]
1007        self.provider = [_dictify(e) for e in experiment.findall("provider")]
1008       
1009        self.experimentdesign = []
1010        for expd in experiment.findall("experimentdesign"):
1011            self.experimentdesign.append(expd.text)
1012           
1013        self.description = [_dictify(e) for e in experiment.findall("description")]
1014       
1015        tree = query_files(accession=self.accession, format="xml", connection=self.connection)
1016        experiments = tree.findall("experiment")
1017        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
1018        experiment = experiments[0]
1019        files = experiment.findall("file")
1020        self.files = [_dictify(file) for file in files]
1021       
1022    def _download_processed(self):
1023        """ Download the processed matrix file, and associated MAGE-TAB files (idf, sdrf, adf)
1024       
1025        .. todo:: What about the raw data files (we need converters for other file types)
1026        """
1027        assert(self.fgemdatafiles)
1028        exp_files = [(f["kind"], f) for f in self.files if f.get("kind") in ["idf", "sdrf"] and f.get("extension") == "txt"]
1029        exp_files += [(f["kind"], f) for f in self.files if f.get("kind") == "fgem"]
1030        array_files = [(f["kind"], f) for f in self.files if f.get("kind") == "adf" and f.get("extension") == "txt"]
1031        assert(len(files) == 3)
1032       
1033        for type, file in files.iteritems():
1034            url = file["url"].strip()
1035            rest, basename = os.path.split(url)
1036            _, dirname = os.path.split(rest)
1037           
1038            repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1039            try:
1040                os.makedirs(repo_dir)
1041            except OSError:
1042                pass
1043            local_filename = os.path.join(repo_dir, basename)
1044            stream = urllib2.urlopen(url)
1045            shutil.copyfileobj(stream, open(local_filename, "wb"))
1046           
1047            if file["extension"] == "zip":
1048                import zipfile
1049                zfile = zlib.ZipFile(local_filename)
1050                zfile.extractall(repo_dir)
1051            elif file["extension"] == "gz":
1052                import gzip
1053                gzfile = gzip.open(local_filename)
1054                gzfile.extractall(repo_dir)
1055            elif file["extension"] in ["tgz", "tar"]:
1056                import tarfile
1057                tfile = tarfile.TarFile(local_filename)
1058                tfile.extractall(repo_dir)
1059            elif file["extension"] == "txt":
1060                pass
1061            else:
1062                raise ValueError("Unknown extension ('{0}').".format(basename))
1063           
1064    def _download_file(self, url, extract=True):
1065        """ Download the `file` from the ArrayExpress saving it to a local
1066        repository directory.
1067         
1068        """
1069        rest, basename = posixpath.split(url)
1070        dirname = posixpath.basename(rest)
1071        repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1072        try:
1073            os.makedirs(repo_dir)
1074        except OSError:
1075            pass
1076        stream = urllib2.urlopen(url)
1077        local_filename = os.path.join(repo_dir, basename)
1078        shutil.copyfileobj(stream, open(local_filename, "wb"))
1079       
1080        if extract:
1081            _, extension = os.path.splitext(local_filename)
1082            if extension == ".zip":
1083                import zipfile
1084                zfile = zipfile.ZipFile(local_filename)
1085                zfile.extractall(repo_dir)
1086            elif extension == ".gz":
1087                import gzip
1088                gzfile = gzip.open(local_filename)
1089                gzfile.extractall(repo_dir)
1090            elif extension in [".tgz"]:
1091                import tarfile
1092                tfile = tarfile.TarFile(local_filename)
1093                tfile.extractall(repo_dir)
1094            elif extension == ".txt":
1095                pass
1096            else:
1097                raise ValueError("Unknown extension ('{0}').".format(basename))
1098           
1099    def _is_local(self, url):
1100        """ Is the `url` stored in the local repository.
1101        """
1102        return os.path.exists(self._local_filepath(url))
1103   
1104    def _local_filepath(self, url):
1105        """ Return the local file path for url.
1106        """
1107        rest, basename = posixpath.split(url)
1108        dirname = posixpath.basename(rest)
1109        return orngServerFiles.localpath("ArrayExpress", os.path.join(dirname, basename))
1110   
1111    def _open(self, url):
1112        """ Return an open file like handle to url (ArrayExpress file).
1113        The file is cached in the local repository for future access.
1114       
1115        """
1116        if not self._is_local(url):
1117            self._download_file(url, extract=True)
1118        file = self._local_filepath(url)
1119        return open(file, "rb")
1120   
1121    def _search_files(self, kind=None, extension=None):
1122        """ Search files by `kind` and `extension`.
1123        """
1124        res = []
1125        for file in self.files:
1126            kind_match = kind == file.get("kind") or kind is None
1127            extension_match = extension == file.get("extension") or extension is None
1128           
1129            if kind_match and extension_match:
1130                res.append(file)
1131        return res
1132       
1133    def array_design(self):
1134        """ Return a list of `ArrayDesign` instances used in this experiment.
1135        """
1136        files = [f for f in self.files if f.get("kind") == "adf" and \
1137                 f.get("extension") == "txt"]
1138       
1139        array_design = []
1140        for file in files:
1141            url = file.get("url")
1142            if not self._is_local(url):
1143                self._download_file(url)
1144            array_design.append(ArrayDesign(self._open(url)))
1145        return array_design
1146       
1147    def investigation_design(self):
1148        """ Return an `InvestigationDesgin` instance for this experiment
1149        """
1150        files = [f for f in self.files if f.get("kind") == "idf" and \
1151                 f.get("extension") == "txt"]
1152        if not files:
1153            raise ValueError("The experiment '{0}' does not have an investigation design file".format(self.accession))
1154        file = files[0]
1155        return InvestigationDesign(self._open(file.get("url")))
1156       
1157       
1158    def sample_data_relationship(self):
1159        """ Return an `SampleDataRelationship` instance describing this experiment.
1160        """
1161        files = [f for f in self.files if f.get("kind") == "sdrf" and \
1162                 f.get("extension") == "txt"]
1163        if not files:
1164            raise ValueError("The experiment '{0}' does not have an sample and data relationship file".format(self.accession))
1165        file = files[0]
1166        return SampleDataRelationship(self._open(file.get("url")))
1167       
1168    def fgem_to_table(self):
1169        """ Retrieve the processed matrix from the Array Express ftp
1170        server and convert it to an :class:`Orange.data.Table` instance.
1171         
1172        """
1173        assert(self.fgemdatafiles)
1174        repo_dir = orngServerFiles.localpath("ArrayExpress", self.accession)
1175        # Find the file listing the data matrix files (should be in sdrf but sometimes it is in 2column file only, why?)
1176        sdrf = self._search_files("sdrf", "txt")
1177        if sdrf:
1178            sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
1179            if "Derived Array Data Matrix File" not in sdrf.header:
1180                twocol = self._search_files("twocolumn", "txt")
1181                if twocol:
1182                    sdrf = SampleDataRelationship(self._open(twocol[0].get("url")))
1183        matrix_file = self._search_files("fgem")[0]
1184        self._open(matrix_file.get("url")) 
1185        matrix_files = sorted(set(sdrf.derived_array_data_matrix_file()))
1186       
1187        idf_file = self._search_files("idf", "txt")[0]
1188        self._open(idf_file.get("url")) # To download if not cached
1189        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
1190       
1191   
1192"""\
1193Gene Expression Atlas
1194---------------------
1195
1196.. WARNING:: Deprecated, use ``obiGeneAtlas`` instead.
1197
1198`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
1199gene expression experiments in Array Express Archive.
1200
1201Use `query_atlas_simple` for simple querys.
1202
1203Example (query human genes for experiments in which they are up regulated) ::
1204
1205    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
1206    {u'...
1207   
1208Or use the `AtlasCondition` subclasses in this module to construct a more
1209advanced query and use the `query_atlas` function.
1210
1211Example (query human genes annotated to the GO term 'transporter activity'
1212that are up regulated in the liver in at least three experiments) ::
1213
1214    >>> go_cond = AtlasConditionGeneProperty("Goterm", "Is", "transporter activity")
1215    >>> liver_cond = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "liver")
1216    >>> org_cond = AtlasConditionOrganism("Homo sapiens")
1217    >>> cond_list = AtlasConditionList([go_cond, liver_cond, org_cond])
1218    >>> query_atlas(cond_list)
1219    {u'...
1220   
1221"""
1222
1223class GeneExpressionAtlasConenction(object):
1224    """ A connection to Gene Expression Atlas database.
1225    """
1226    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"
1227    try:
1228        DEFAULT_CACHE = shelve.open(orngServerFiles.localpath("ArrayExpress", "GeneAtlasCache.shelve"))
1229    except Exception, ex:
1230        print ex
1231        DEFAULT_CACHE = {}
1232    def __init__(self, address=None, timeout=30, cache=None):
1233        """ Initialize the conenction.
1234       
1235        :param address: Address of the server.
1236        :param timeout: Socket timeout.
1237        :param cache : A dict like object to use as a cache.
1238       
1239        """
1240        self.address = address if address is not None else self.DEFAULT_ADDRESS
1241        self.timeout = timeout
1242        self.cache = cache if cache is not None else self.DEFAULT_CACHE
1243   
1244    def query(self, condition, format="json", start=None, rows=None, indent=False):
1245        url = self.address + "api/vx?" + condition.rest()
1246        if start is not None and rows is not None:
1247            url += "&start={0}&rows={1}".format(start, rows)
1248        url += "&format={0}".format(format)
1249        if indent:
1250            url += "&indent"
1251#        print url
1252        if self.cache is not None:
1253            return self._query_cached(url)
1254        else:
1255            return urllib2.urlopen(url)
1256        return response
1257   
1258    def _query_cached(self, url):
1259        if self.cache is not None:
1260            if url not in self.cache:
1261                response = urllib2.urlopen(url)
1262                contents = response.read()
1263                self.cache[url] = contents
1264                if hasattr(self.cache, "sync"):
1265                    self.cache.sync()
1266            return StringIO(self.cache[url])
1267        else:
1268            return urllib2.urlopen(url)
1269               
1270   
1271# Names of all Gene Property filter names
1272GENE_FILTERS = \
1273    ["Name", # Gene name
1274     "Goterm", #Gene Ontology Term
1275     "Interproterm", #InterPro Term
1276     "Disease", #Gene-Disease Assocation
1277     "Keyword", #Gene Keyword
1278     "Protein", #Protein
1279
1280     "Dbxref", #Other Database Cross-Refs
1281     "Embl", #EMBL-Bank ID
1282     "Ensfamily", #Ensembl Family
1283     "Ensgene", #Ensembl Gene ID
1284
1285     "Ensprotein", #Ensembl Protein ID
1286     "Enstranscript", #Ensembl Transcript ID
1287     "Goid", #Gene Ontology ID
1288     "Image", #IMAGE ID
1289     "Interproid", #InterPro ID
1290     "Locuslink", #Entrez Gene ID
1291
1292     "Omimid", #OMIM ID
1293     "Orf", #ORF
1294     "Refseq", #RefSeq ID
1295     "Unigene", #UniGene ID
1296     "Uniprot", #UniProt Accession
1297
1298     "Hmdb", #HMDB ID
1299     "Chebi", #ChEBI ID
1300     "Cas", #CAS
1301     "Uniprotmetenz", #Uniprotmetenz
1302     "Gene", #Gene Name or Identifier
1303     "Synonym", #Gene Synonym
1304     ]
1305   
1306# Valid Gene Property filter qualifiers
1307GENE_FILTER_QUALIFIERS =\
1308    ["Is",
1309     "IsNot"
1310     ]
1311
1312# Organisms in the Atlas
1313ATLAS_ORGANISMS = \
1314    ["Anopheles gambiae",
1315     "Arabidopsis thaliana",
1316     "Bos taurus",
1317     "Caenorhabditis elegans",
1318     "Danio rerio",
1319     "Drosophila melanogaster",
1320     "Epstein barr virus",
1321     "Gallus gallus",
1322     "Homo sapiens",
1323     "Human cytomegalovirus",
1324     "Kaposi sarcoma-associated herpesvirus",
1325     "Mus musculus",
1326     "Rattus norvegicus",
1327     "Saccharomyces cerevisiae",
1328     "Schizosaccharomyces pombe",
1329#     "Unknown",
1330     "Xenopus laevis"
1331     ]
1332   
1333#_COMMON_TAXIDS = \
1334#    {"Anopheles gambiae",
1335#     "Arabidopsis thaliana",
1336#     "Bos taurus",
1337#     "Caenorhabditis elegans",
1338#     "Danio rerio",
1339#     "Drosophila melanogaster",
1340#     "Epstein barr virus",
1341#     "Gallus gallus",
1342#     "Homo sapiens",
1343#     "Human cytomegalovirus",
1344#     "Kaposi sarcoma-associated herpesvirus",
1345#     "Mus musculus",
1346#     "Rattus norvegicus",
1347#     "Saccharomyces cerevisiae",
1348#     "Schizosaccharomyces pombe",
1349##     "Unknown",
1350#     "Xenopus laevis"
1351#     }
1352   
1353def ef_ontology():
1354    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
1355    """
1356    from . import obiOntology
1357#    return obiOntology.OBOOntology(urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo"))
1358    from Orange.orng import orngServerFiles
1359    # Should this be in the OBOFoundry (Ontology) domain
1360    try:
1361        file = open(orngServerFiles.localpath_download("ArrayExpress", "efo.obo"), "rb")
1362    except urllib2.HTTPError:
1363        file = urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo")
1364    return obiOntology.OBOOntology(file)
1365
1366
1367class AtlasCondition(object):
1368    """ Base class for Gene Expression Atlas query condition
1369    """
1370    def validate(self):
1371        """ Validate condition in a subclass.
1372        """
1373        raise NotImplementedError
1374   
1375    def rest(self):
1376        """ Return a REST query part in a subclass.
1377        """
1378        raise NotImplementedError
1379   
1380   
1381class AtlasConditionList(list, AtlasCondition):
1382    """ A list of AtlasCondition instances.
1383    """ 
1384    def validate(self):
1385        for item in self:
1386            item.validate()
1387       
1388    def rest(self):
1389        return "&".join(cond.rest() for cond in self)
1390
1391
1392class AtlasConditionGeneProperty(AtlasCondition):
1393    """ An atlas gene filter condition.
1394   
1395    :param property: Property of the gene. If None or "" all properties
1396        will be searched.
1397    :param qualifier: Qualifier can be 'Is' or 'IsNot'
1398    :param value: The value to search for.
1399   
1400    Example ::
1401   
1402        >>> # Condition on a gene name
1403        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
1404        >>> # Condition on genes from a GO Term
1405        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1406        >>> # Condition on disease association
1407        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")
1408       
1409    """
1410    def __init__(self, property, qualifier, value):
1411        self.property = property or ""
1412        self.qualifier = qualifier
1413        if isinstance(value, basestring):
1414            self.value = value.replace(" ", "+")
1415        elif isinstance(value, list):
1416            self.value = "+".join(value)
1417        else:
1418            raise ValueError(value)
1419       
1420        self.validate()
1421       
1422    def validate(self):
1423        assert(self.property in GENE_FILTERS + [""])
1424        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
1425       
1426    def rest(self):
1427        return "gene{property}{qualifier}={value}".format(**self.__dict__)
1428       
1429       
1430class AtlasConditionExperimentalFactor(AtlasCondition):
1431    """ An atlas experimental factor filter condition.
1432   
1433    :param factor: EFO experiamntal factor
1434    :param regulation: "up", "down", "updown", "any" or "none"
1435    :param n: Minimum number of of experimants with this condition
1436    :param value: Experimantal factor value
1437   
1438    Example ::
1439   
1440        >>> # Any genes up regulated in at least 3 experiments involving cancer.
1441        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
1442        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
1443        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")
1444       
1445    """
1446    def __init__(self, factor, regulation, n, value):
1447        self.factor = factor
1448        self.regulation = regulation
1449        self.n = n
1450        self.value = value
1451        self.validate()
1452       
1453    def validate(self):
1454        # TODO: validate the factor and value
1455#        assert(self.factor in ef_ontology())
1456        assert(self.regulation in ["up", "down", "updown"])
1457       
1458    def rest(self):
1459        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
1460       
1461class AtlasConditionOrganism(AtlasCondition):
1462    """ Condition on organism.
1463    """
1464    def __init__(self, organism):
1465        self.organism = organism
1466        self.validate()
1467       
1468    def validate(self):
1469        assert(self.organism in ATLAS_ORGANISMS)
1470       
1471    def rest(self):
1472        return "species={0}".format(self.organism.replace(" ", "+").lower())
1473       
1474       
1475class AtlasConditionExperiment(AtlasCondition):
1476    """ Condition on experiement
1477   
1478    :param property: Property of the experiment. If None or "" all properties
1479        will be searched.
1480    :param qualifier: Qualifier can be 'Has' or 'HasNot'
1481    :param value: The value to search for.
1482   
1483    Example ::
1484   
1485        >>> # Condition on a experiemnt acession
1486        >>> condition = AtlasConditionExperiment("", "", "E-GEOD-24283")
1487        >>> # Condition on experiments involving lung
1488        >>> condition = AtlasConditionExperiment("Organism_part", "Has", "lung")
1489       
1490    """
1491#    EXPERIMENT_FILTERS = [
1492#                "Organism"
1493#                "Factor"]
1494   
1495    EXPERIMENT_FILTER_QUALIFIERS = [
1496                "Has",
1497                "HasNot"]
1498   
1499    def __init__(self, property, qualifier, value):
1500        self.property = property
1501        self.qualifier = qualifier
1502        if isinstance(value, basestring):
1503            self.value = value.replace(" ", "+")
1504        elif isinstance(value, list):
1505            self.value = "+".join(value)
1506        else:
1507            raise ValueError(value)
1508       
1509        self.validate()
1510       
1511    def validate(self):
1512        # TODO: check to EFO factors
1513#        assert(self.property in EXPERIMENT_FILTERS + [""])
1514        assert(self.qualifier in self.EXPERIMENT_FILTER_QUALIFIERS + [""])
1515       
1516    def rest(self):
1517        return "experiment{property}{qualifier}={value}".format(**self.__dict__)
1518       
1519       
1520class GeneAtlasError(ValueError):
1521    """ An error response from the Atlas server.
1522    """
1523    pass
1524   
1525   
1526def __check_atlas_error_json(response):
1527    if "error" in response:
1528        raise GeneAtlasError(response["error"])
1529    return response
1530 
1531     
1532def __check_atlas_error_xml(response):
1533    error = response.find("error")
1534    if error is not None:
1535        raise GeneAtlasError(error.text)
1536    return response
1537   
1538       
1539def query_atlas_simple(genes=None, regulation=None, organism=None,
1540                       condition=None, format="json", start=None,
1541                       rows=None):
1542    """ A simple Atlas query.
1543   
1544    :param genes: A list of gene names to search for.
1545    :param regulation: Search for experiments in which `genes` are "up",
1546        "down", "updown" or "none" regulated. If None all experiments
1547        are searched.
1548    :param organism: Search experiments for organism. If None all experiments
1549        are searched.
1550    :param condition: An EFO factor value (e.g. "brain")
1551   
1552    Example ::
1553       
1554        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
1555        {u'...
1556       
1557        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
1558        {u'...
1559       
1560        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
1561        {u'...
1562       
1563    """
1564    import warnings
1565    warnings.warn("Use 'obiGeneAtlas.run_simple_query' instead.", DeprecationWarning)
1566    conditions = AtlasConditionList()
1567    if genes:
1568        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
1569    if regulation or condition:
1570        regulation = "any" if regulation is None else regulation
1571        condition = "" if condition is None else condition
1572        conditions.append(AtlasConditionExperimentalFactor("", regulation, 1, condition))
1573    if organism:
1574        conditions.append(AtlasConditionOrganism(organism))
1575       
1576    connection = GeneExpressionAtlasConenction()
1577    results = connection.query(conditions, format=format, start=start,
1578                               rows=rows)
1579    if format == "json":
1580        return parse_json(results)
1581    else:
1582        return parse_xml(results)
1583
1584"""\
1585.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
1586      Need a full list of accepted factors
1587"""
1588
1589def query_atlas(condition, format="json", start=None, rows=None, indent=False, connection=None):
1590    """ Query Atlas based on a `condition` (instance of AtlasCondition)
1591   
1592    Example ::
1593       
1594        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1595        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
1596        >>> condition = AtlasConditionList([condition1, condition2])
1597        >>> query_atlas(condition)
1598        {u'...
1599       
1600    """
1601    import warnings
1602    warnings.warn("Use 'obiGeneAtlas.run_query' instead.", DeprecationWarning)
1603    if connection is None:
1604        connection = GeneExpressionAtlasConenction()
1605    results = connection.query(condition, format=format, start=start,
1606                               rows=rows, indent=indent)
1607    if format == "json":
1608        response = parse_json(results)
1609        return __check_atlas_error_json(response)
1610    else:
1611        response = parse_xml(results)
1612        return __check_atlas_error_xml(response)
1613
1614
1615def get_atlas_summary(genes, organism, connection=None):
1616    """ Return 3 dictionaries containing a summary of atlas information
1617    about three experimental factors:
1618   
1619        - Organism Part (OP)
1620        - Disease State (DS)
1621        - Cell type (CT)
1622   
1623    Each dictionary contains query genes as keys. Values are dictionaries
1624    mapping factor values to a 2-tuple containig the count of up regulated
1625    and down regulated experiments.
1626   
1627    Example ::
1628   
1629        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
1630        ({u'RUNX1': ...
1631       
1632    """
1633    import warnings
1634    warnings.warn("Use 'obiGeneAtlas.get_atlas_summary' instead.", DeprecationWarning)
1635    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
1636    org_condition = AtlasConditionOrganism(organism)
1637    condition = AtlasConditionList([genes_condition, org_condition])
1638    result = query_atlas(condition, format="json", connection=connection)
1639   
1640    org_part = collect_ef_summary(result, "organism_part")
1641    disease_state = collect_ef_summary(result, "disease_state")
1642    cell_type = collect_ef_summary(result, "cell_type")
1643   
1644    return dict(org_part), dict(disease_state), dict(cell_type)
1645   
1646   
1647def collect_ef_summary(info, ef, summary=None):
1648    """ Collect the results summary from query_atlas, result for experimental
1649    factor `ef`.
1650    """
1651    if summary is None:
1652        summary = defaultdict(dict)
1653       
1654    results = info["results"]
1655    for res in results:
1656        gene = res["gene"]
1657        expressions = res["expressions"] 
1658        for expression in expressions:
1659            if expression["ef"] == ef:
1660                efv = expression["efv"]
1661                updown = (expression["upExperiments"],
1662                          expression["downExperiments"]
1663                          )
1664               
1665                if any(updown):
1666                    summary[gene["name"]][efv] = updown
1667   
1668    return summary
1669   
1670   
1671def test():   
1672    conn = ArrayExpressConnection()
1673    import doctest
1674    foo = type("foo", (object,), {})()
1675    bar = type("bar", (object,), {})()
1676   
1677    doctest.testmod(optionflags=doctest.ELLIPSIS,
1678                    extraglobs={"conn": conn, "foo": foo,"bar": bar})
1679   
1680   
1681if __name__ == "__main__":
1682    test()
1683   
Note: See TracBrowser for help on using the repository browser.