source: orange-bioinformatics/obiArrayExpress.py @ 1470:e8cddaaee459

Revision 1470:e8cddaaee459, 56.4 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff)

Better caching.
Deprecated the Gene Atlas Expression interface, replaced by obiGeneAtlas module.

Line 
1"""
2===============
3obiArrayExpress
4===============
5
6A python module for accessing the ArrayExpress web services.
7
8`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a database of gene expression experiments that you
9can query and download.
10
11Example of an Array Express query ::
12
13    >>> import obiArrayExpress
14    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
15    {u'experiments': ...
16   
17    >>> obiArrayExpress.query_experiments(keywords='gliobastoma')
18    {u'experiments': ...
19   
20    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
21    <xml.etree.ElementTree.ElementTree instance...
22   
23.. note:: Currently querying ArrayExpress files only works with the xml format.
24
25.. note:: See the documentation of `query_experiments` for a full set of
26          parameters that these functions accept.
27
28"""
29
30import os, sys
31import urllib2
32
33import orngEnviron
34import orngServerFiles
35import warnings
36import posixpath
37import shelve
38import shutil
39import posixpath
40import json
41from xml.etree.ElementTree import ElementTree
42from StringIO import StringIO
43
44from collections import defaultdict
45from functools import partial
46from contextlib import closing
47
48parse_json = json.load
49
50def parse_xml(stream):
51    """ Parse an xml stream into an instance of xml.etree.ElementTree.ElementTree.
52    """
53    return ElementTree(file=stream) 
54
55# All searchable fields of ArrayExpress (see query_experiments docstring
56# for a description of the fields)
57ARRAYEXPRESS_FIELDS = \
58    ["keywords",
59     "accession",
60     "array",
61     "ef",
62     "efv",
63     "expdesign",
64     "exptype",
65     "gxa",
66     "pmid",
67     "sa",
68     "species",
69     "expandefo",
70     "directsub",
71     "assaycount",
72     "efcount",
73     "samplecount",
74     "sacount",
75     "rawcount",
76     "fgemcount",
77     "miamescore",
78     "date",
79     "wholewords",
80    ]
81   
82
83class ArrayExpressConnection(object):
84    """ A connection to the ArrayExpress. Used to construct a REST query
85    and run it.
86   
87    .. todo:: Implement user login.
88   
89    """
90   
91    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
92    DEFAULT_FORMAT = "json"
93    DEFAULT_CACHE = orngServerFiles.localpath("ArrayExpress", "ArrayExpressCache.shelve")
94    # Order of arguments in the query
95    _ARGS_ORDER = ["keywords", "species", "array"]
96   
97    def __init__(self, address=None, timeout=30, cache=None,
98                 username=None, password=None):
99        """ Initialize the connection object.
100       
101        :param address: Address of the ArrayExpress API
102        :param timeout: Timeout for the socket connection
103       
104        .. todo:: Implement user login (see Accessing Private Data in API docs)
105       
106        """
107        self.address = address if address is not None else self.DEFAULT_ADDRESS
108        self.timeout = timeout
109        self.cache = cache if cache is not None else self.DEFAULT_CACHE
110        self.username = username
111        self.password = password
112       
113    def format_query(self, **kwargs):
114        """ Format the query arguments.
115       
116        Example ::
117       
118            >>> conn.format_query(gxa=True, efcount=(1, 5))
119            'efcount=[1 TO 5]&gxa=true'
120           
121        """
122        # Formaters:
123        def format_default(val):
124            if isinstance(val, basestring):
125                return val
126            else:
127                return "+".join(val)
128        def format_species(val):
129            return '"%s"' % val.lower()
130        def format_gxa(val):
131            if val:
132                return "true"
133            else:
134                raise ValueError("gxa={0}".format(val))
135        def format_expandefo(val):
136            if val:
137                return "on"
138            else:
139                raise ValueError("expandefo={0}".format(val))
140        def format_true_false(val):
141            return "true" if val else "false"
142        def format_interval(val):
143            if isinstance(val, tuple):
144                return "[{0} TO {1}]".format(*val)
145            else:
146                raise ValueError("Must be an interval argument (min, max)!")
147        def format_date(val):
148            # TODO check if val contains a datetime.date object, assert proper format
149            return format_interval(val)
150        def format_wholewords(val):
151            if val:
152                return "on"
153            else:
154                raise ValueError("wholewords={0}".format(val))
155       
156        formaters = {"species": format_species,
157                     "gxa": format_gxa,
158                     "expandefo": format_expandefo,
159                     "directsub": format_true_false,
160                     "assaycount": format_interval,
161                     "efcount": format_interval,
162                     "samplecount": format_interval,
163                     "sacount": format_interval,
164                     "rawcount": format_interval,
165                     "fgemcount": format_interval,
166                     "miamescore": format_interval,
167                     "date": format_date,
168                     "wholewords": format_wholewords,
169                     }
170        parts = []
171        arg_items = kwargs.items()
172        ordered = sorted(arg_items, key=lambda arg: self._ARGS_ORDER.index(arg[0]) \
173                         if arg[0] in self._ARGS_ORDER else 100)
174       
175        for key, value in kwargs.iteritems():
176            if key == "format":
177                continue # format is handled in query_url
178            if key not in ARRAYEXPRESS_FIELDS:
179                raise ValueError("Invalid argument name: '{0}'".format(key))
180            if value is not None and value != []:
181                fmt = formaters.get(key, format_default)
182                value = fmt(value)
183                parts.append("{0}={1}".format(key, value))
184                 
185        return "&".join(parts)
186       
187    def query_url(self, what="experiments", **kwargs):
188        """ Return a formated query URL for the query arguments
189       
190        Example ::
191            >>> conn.query_url(accession="E-MEXP-31")
192            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
193           
194        """
195        query = self.format_query(**kwargs)
196        url = posixpath.join(self.address, what)
197        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
198        url = url + ("?" + query if query else "")
199        url = url.replace(" ", "%20")
200        return url
201   
202    def query_url_experiments(self, **kwargs):
203        """ Return a formated experiments query url for the calls arguments
204        """
205        return self.query_url("experiments", **kwargs)
206   
207    def query_url_files(self, **kwargs):
208        """ Return a formated experiments query url for the calls arguments
209        """
210        return self.query_url("files", **kwargs)
211   
212    def query_experiment(self, **kwargs):
213        """ Return an open stream to the experiments query results.
214       
215        .. note:: This member function takes the same arguments as the module
216                  level `query_experiemnts` function.
217         
218        """
219        url = self.query_url_experiments(**kwargs)
220        stream = self._cache_urlopen(url, timeout=self.timeout)
221        return stream
222   
223    def query_files(self, **kwargs):
224        """ Return an open stream to the files query results.
225       
226        .. note:: This member function takes the same arguments as the module
227                  level `query_files` function.
228        """
229        url = self.query_url_files(**kwargs)
230        stream = self._cache_urlopen(url, timeout=self.timeout)
231        return stream
232   
233    def open_file(self, accession, kind="raw", ext=None):
234        """ Return a file handle to experiment data.
235        Possible values for kind:
236            - raw: return the raw data if available
237            - fgem: return the processed data if available
238            - biosamples: a png or svg design image
239            - idf: investigation description
240            - adf: array design description
241            - mageml: MAGE-ML file
242           
243        Example ::
244       
245            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
246            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")
247             
248        """
249        stream = self.query_files(accession=accession, format="xml")
250        tree = ElementTree(file=stream)
251        files = tree.findall("experiment/file")
252        for file in files:
253            filekind = file.find("kind").text
254            fileext = file.find("extension").text
255            if filekind.strip() == kind and (fileext.strip() == ext or ext is None): 
256                url = file.find("url").text
257                return self._cache_urlopen(url.strip(), timeout=self.timeout)
258           
259    def _cache_urlopen(self, url, timeout=30):
260        if self.cache is not None:
261            with self.open_cache() as cache:
262                cached = url in cache
263            if not cached:
264                stream = urllib2.urlopen(url, timeout=timeout)
265                data = stream.read()
266                with self.open_cache() as cache:
267                    cache[url] = data
268            else:
269                with self.open_cache() as cache:
270                    data = cache[url]
271            return StringIO(data)
272        else:
273            return urllib2.urlopen(url, timeout=timeout)
274       
275    def open_cache(self):
276        if isinstance(self.cache, basestring):
277            return closing(shelve.open(self.cache))
278        elif hasattr(self.cache, "close"):
279            return closing(self.cache)
280        elif self.cache is None:
281            return fake_closing({})
282        else:
283            return fake_closing(self.cache)
284       
285from contextlib import contextmanager
286
287@contextmanager
288def fake_closing(obj):
289    yield obj
290   
291def query_experiments(keywords=None, accession=None, array=None, ef=None,
292                      efv=None, expdesign=None, exptype=None,
293                      gxa=None, pmid=None, sa=None, species=None,
294                      expandefo=None, directsub=None, assaycount=None,
295                      efcount=None, samplecount=None, rawcount=None,
296                      fgemcount=None, miamescore=None, date=None, 
297                      format="json", wholewords=None, connection=None):
298    """ Query Array Express experiments.
299   
300    :param keywords: A list of keywords to search (e.g. ['gliobastoma']
301    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
302    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
303    :param ef: Experimental factor (names of main variables of experiments)
304    :param efv: Experimental factor value (Has EFO expansion)
305    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
306    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
307    :param gxa: If True limit the results to experiments from the Gene
308        Expreission Atlas only.
309    :param pmid: Search by PubMed identifier
310    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
311    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)
312   
313    :param expandefo: If True expand the search terms with all its child terms
314        in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
315        will be expanded to include for synonyms and sub types of cancer).
316    :param directsub: If True return only experiments submited directly to
317        Array Express else if False return only experiments imported from GEO
318        database (default None, return both)
319    :param assaycount: A two tuple (min, max) for filter on the number of
320        assays (e.g. (1, 5) will return only experiments with at least one
321        and no more then 5 assays).
322    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
323    :param sacount: Filter on the number of sample attribute categories
324    :param rawcount: Filter on the number or raw files
325    :param fgemcount: Filter on the number of final gene expression matrix
326        (processed data) files
327    :param miamescore: Filter on the MIAME complience score (max 5)
328    :param date: Filter by release date
329   
330    Example ::
331   
332        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
333        {u'experiments': ...
334       
335    .. _EFO: http://www.ebi.ac.uk/efo/
336   
337    """
338    if connection is None:
339        connection = ArrayExpressConnection()
340       
341    stream = connection.query_experiment(keywords=keywords, accession=accession,
342                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
343                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
344                directsub=directsub, assaycount=assaycount, efcount=efcount,
345                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
346                miamescore=miamescore, date=date,  format=format,
347                wholewords=wholewords)
348   
349    if format == "json":
350        return parse_json(stream)
351    else:
352        return parse_xml(stream)
353   
354   
355def query_files(keywords=None, accession=None, array=None, ef=None,
356               efv=None, expdesign=None, exptype=None,
357               gxa=None, pmid=None, sa=None, species=None,
358               expandefo=None, directsub=None, assaycount=None,
359               efcount=None, samplecount=None, rawcount=None,
360               fgemcount=None, miamescore=None, date=None, 
361               format="json", wholewords=None, connection=None):
362    """ Query Array Express files.
363   
364    This function accepts the same arguments as `query_experiments`.
365   
366    Example ::
367   
368        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
369        <xml.etree.ElementTree.ElementTree object ...
370       
371    .. todo:: why does the json interface not work.
372                       
373    """
374    if connection is None:
375        connection = ArrayExpressConnection()
376   
377    stream = connection.query_files(keywords=keywords, accession=accession,
378                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
379                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
380                directsub=directsub, assaycount=assaycount, efcount=efcount,
381                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
382                miamescore=miamescore, date=date,  format=format,
383                wholewords=wholewords)
384   
385    if format == "json":
386        return parse_json(stream)
387    else:
388        return parse_xml(stream)
389   
390   
391def open_file(accession, kind="raw", ext=None, repo_dir=None):
392    """ Open a file for the experiment.
393     
394    Example ::
395   
396        >>> file = open_file("E-MTAB-369", kind="raw", repo_dir="~/.arrayexpress/")
397       
398    """
399    raise NotImplementedError
400
401"""\
402MAGE-TAB convinence functions, classes
403======================================
404"""
405
406IDF_SINGLE_TAGS = \
407    ["Investigation Title",
408     "Date of Experiment",
409     "Public Release Date",
410     "Experiment Description",
411    ]
412
413def parse_idf(file):
414    """ Parse an idf.txt (Investigation Description  Format) formated file.
415    Return a list of tuples where the first element is the tag (first column)
416    and the second element is a list of all tag values.
417   
418    """
419    if isinstance(file, basestring):
420        file = open(file, "rb")
421    data = file.read()
422    lines = data.splitlines()
423    lines = [line.split("\t") for line in lines if line and not line.startswith("#")]
424    parsed = [(line[0], line[1:]) for line in lines]
425    return parsed
426   
427def parse_sdrf(file):
428    """ Parse an sdfr formated file. Return a tuple with the first element
429    a list of header values and the second element is a list of all rows
430    (a row is a list of string values).
431   
432    """
433    if isinstance(file, basestring):
434        file = open(file, "rb")
435    data = file.read()
436    lines = data.splitlines()
437    lines = [line.split("\t") for line in lines if line.strip() and not line.startswith("#")]
438    header = [h for h in lines[0] if h]
439    rows = [line[:len(header)] for line in lines[1:]]
440    assert(all([len(r) == len(header) for r in rows]))
441    return header, rows
442   
443def parse_adf(file):
444    pass
445
446def parse_data_matrix(file):
447    """ Parse the MAGE-TAB processed data matrix. Return a tuple where the
448    elements are:
449        - a (header REF, header values) tuple (e.g. ("Hybridization REF", ["Stage1", "Stage2", ...]) )
450        - a list of quantitations for header values (e.g. ["log2 ratio", "log2 ratio", ...])
451        - a (row REF, row names list) tuple ("e.g. ("Reporter REF", ["Probe 1", "Probe 2", ...]) )
452        - a list of list matrix with values (as strings)
453       
454    """
455    if isinstance(file, basestring):
456        file = open(file, "rb")
457    data = file.read()
458    lines = data.splitlines()
459    lines = [line.split("\t") for line in lines if line.strip()]
460    header = lines[0]
461    header_ref, header = header[0], header[1:]
462    line2 = lines[1]
463    row_ref, quanifications = line2[0], line2[1:]
464    row_names, rows = [], []
465    for line in lines[2:]:
466        r_name, row = line[0], line[1:]
467        row_names.append(r_name)
468        rows.append(row)
469       
470    return ((header_ref, header),
471            quanifications,
472            (row_ref, row_names),
473            rows) 
474   
475class InvesigationDesign(dict):
476    """ Investigation design (contains the contents of the .idf).
477   
478    Example ::
479   
480        >>> idf = InvestigationDesign("foobar.idf")
481        >>> print idf.investigation_title
482        foo investigation
483        >>> print idf.experimental_design
484        ['fubar', 'snafu']
485        >>> print idf.sdrf_file
486        ['foobar.sdrf']
487       
488    """
489    def __init__(self, idf_file=None):
490        idf = parse_idf(idf_file)
491        self._idf = idf
492        self.update(dict(idf))
493        for tag, values in idf:
494            if tag in IDF_SINGLE_TAGS:
495                values = values[0] if values else None
496            ttag = self.transform_tag(tag)
497            setattr(self, ttag, values)
498       
499    def transform_tag(self, tag):
500        """ Transform the tag into a proper python attribute name by
501        replacing all spaces and special characters (e.g '[', ']' into
502        underscores).
503       
504        """
505        toreplace = [" ", "-", "[", "]"]
506        for s in toreplace:
507            tag = tag.replace(s, "_")
508        return tag.lower()
509           
510    def __getitem__(self, tag):
511        """ Return the tag values
512       
513        Example ::
514       
515            >>> idf["Investigation Title"]
516            'foo investigation'
517           
518        """
519        try:
520            return self._idf_dict[tag]
521        except KeyError:
522            pass
523       
524        ttag = self.transform_tag(tag)
525        if hasattr(self, ttag):
526            return getattr(self, ttag)
527        else:
528            raise KeyError(tag)
529       
530class SampleDataRelationship(object):
531    """ Sample-Data Relationship (contains the contents of the .sdrf file).
532   
533    Example ::
534   
535        >>> sdr = SampleDataRelationship("foobar.sdrf")
536        >>> sdr.source_name
537        ['foo', ...
538       
539        >>> sdr.sample_name
540        ['sampled foo', ...
541       
542        >>> sdr.extract_protocol_ref
543        ['bar the foo', ...
544       
545        >>> sdr.derived_array_data_matrix_file
546        ['foobar.data.txt', ...
547       
548        >>>
549         
550    """
551   
552    # Nodes an edges
553    NODES_EDGES = ["Source Name", "Sample Name", "Extract Name",
554                   "Labeled Extract Name", "Hybridization Name",
555                   "Assay Name", "Scan Name", "Normalization Name",
556                   "Array Data File", "Derived Array Data File",
557                   "Array Data Matrix File", "Derived Array Data Matrix File",
558                   "Image File", "Protocol REF"]
559   
560    # Attributes for nodes and edges
561    NODE_EDGE_ATTRIBUTES = \
562    {"Source Name": ["Characteristics", "Provider", "Material Type", "Description", "Comment"],
563     "Sample Name": ["Characteristics", "Material Type", "Description", "Comment"],
564     "Extract Name":["Characteristics", "Material Type", "Description", "Comment"],
565     "Labeled Extract Name": ["Characteristics", "Material Type", "Description", "Label", "Comment"],
566     "Hybridization Name": ["Array Design File", "Array Design REF", "Comment"],
567     "Assay Name": ["Technology Type", "Comment"],
568     "Scan Name": ["Comment"],
569     "Normalization Name": ["Comment"],
570     "Array Data File": ["Comment"],
571     "Derived Array Data File": ["Comment"],
572     "Array Data Matrix File": ["Comment"],
573     "Derived Array Data Matrix File": ["Comment"],
574     "Image File": ["Comment"],
575     "Protocol REF": ["Term Source REF", "Parameter", "Performer", "Date", "Comment"]
576     }
577   
578    # Attributes
579    ATTRIBUTE_COLUMNS = \
580    {"Characteristics []": ["Unit", "Term Source REF"],
581     "Provider": ["Comment"],
582     "Material Type": ["Term Source REF"],
583     "Label": ["Term Source REF"],
584     "Array Design File": ["Comment"],
585     "Array Design REF": ["Term Source REF", "Comment"],   
586     "Technology Type": ["Term Source REF"],
587     "Factor Value [] ()": ["Unit", "Term Source REF"],
588     "Performer": ["Comment"],
589     "Date": [],
590     "Parameter Value []": ["Unit", "Comment"],
591     "Unit []": ["Term Source REF"],
592     "Description": [],
593     "Term Source REF": ["Term Accession Number"],
594     "Term Accession Number": [],
595     "Comment []": []
596     }
597    def __init__(self, sdrf_file=None):
598        header, rows = parse_sdrf(sdrf_file)
599        self.header = header
600        self.rows = rows
601       
602    def transform_tag(self, tag):
603        """ Transform the tag into a proper python attribute name by
604        replacing all spaces and special characters (e.g '[', ']' into
605        underscores).
606       
607        """
608        toreplace = [" ", "-", "[", "]"]
609        for s in toreplace:
610            tag = tag.replace(s, "_")
611        return tag.lower()
612   
613    def _subsection(self, name):
614        """ Return the named subsection (name must be from the
615        NODES_EDGES list).
616       
617        """
618        idx = self.NODES_EDGES.index(name)
619        start = self.header.index(name)
620        end = -1
621        for name in self.NODES_EDGES[idx + 1:]:
622            if name in self.header[start + 1:]:
623                end = self.header.index(name, start + 1)
624                break
625        return self.header[start:end], [r[start:end] for r in self.rows]
626   
627    def _column(self, name):
628        """ Return the named column.
629        """
630        index = self.header.index(name)
631        return [r[index] for r in self.rows]
632       
633    def source(self):
634        """ Return the Source subsection
635        """
636        return self._subsection("Source Name")
637       
638    def source_name(self):
639        """ Return the Source Name subsection
640        """
641        return self._column("Source Name")
642       
643    def sample(self):
644        """ Return the Sample subsection
645        """
646        return self._subsection("Sample Name")
647       
648    def sample_name(self):
649        """ Return the Sample Name subsection
650        """
651        return self._column("Sample Name")
652       
653    def extract(self):
654        """ Return the Extract subsection
655        """
656        return self._subsection("Extract Name")
657       
658    def extract_name(self):
659        """ Return the Extract Name subsection
660        """
661        return self._column("Extract Name")
662       
663    def labeled_extract(self):
664        """ Return the Labeled Extract subsection
665        """
666        return self._subsection("Labeled Extract Name")
667       
668    def labeled_extract_name(self):
669        """ Return the Labeled Extract Name subsection
670        """
671        return self._column("Labeled Extract Name")
672       
673    def hybridization(self):
674        """ Return the Hibridization subsection.
675        """
676        return self._subsection("Hibridization Name")
677       
678    def hybridization_name(self):
679        """ Return the Hibridization Name subsection.
680        """
681        return self._column("Hibridization Name")
682       
683    def assay(self):
684        """ Return the Assay subsection
685        """
686        return self._subsection("Assay Name")
687       
688    def assay_name(self):
689        """ Return the Assay Name subsection
690        """
691        return self._column("Assay Name")
692       
693    def scan(self):
694        """ Return the Scan subsection
695        """
696        return self._subsection("Scan Name")
697       
698    def scan_name(self):
699        """ Return the Scan name subsection
700        """
701        return self._column("Scan Name")
702       
703    def normalization(self):
704        """ Return the Normalization subsection.
705        """
706        return self._subsection("Normalization Name")
707       
708    def normalization_name(self):
709        """ Return the Normalization Name subsection.
710        """
711        return self._column("Normalization Name")
712         
713    def array_data(self):
714        """ Return the Array Data subsection
715        """
716        return self._subsection("Array Data File")
717   
718    def array_data_file(self):
719        """ Return the Array Data File subsection
720        """
721        return self._column("Array Data File")
722       
723    def derived_array_data(self):
724        """ Return the Derived Array Data subsection
725        """
726        return self._subsection("Derived Array Data File")
727   
728    def derived_array_data_file(self):
729        """ Return the Derived Array Data File subsection
730        """
731        return self._column("Derived Array Data File")
732       
733    def array_data_matrix(self):
734        """ Return the Array Data Matrix subsection.
735        """
736        return self._subsection("Array Data Matrix File")
737   
738    def array_data_matrix_file(self):
739        """ Return the Array Data Matrix File subsection.
740        """
741        return self._column("Array Data Matrix File")
742       
743    def derived_array_data_matrix(self):
744        """ Return the Derived Array Data Matrix subsection.
745        """
746        return self._subsection("Derived Array Data Matrix File")
747   
748    def derived_array_data_matrix_file(self):
749        """ Return the Derived Array Data Matrix File subsection.
750        """
751        return self._column("Derived Array Data Matrix File")
752       
753    def image(self):
754        """ Return the Image subsection
755        """
756        return self._subsection("Image File")
757   
758    def image_file(self):
759        """ Return the Image File subsection.
760        """
761        return self._column("Image File")
762       
763class ArrayDesign(object):
764    """ Arary design (contains the contents of the .adf file).
765    """
766    def __init__(self, adf_file=None):
767        adf = parse_adf(adf_file)
768        self._adf = adf
769   
770def _is_float(str):
771    try:
772        float(str)
773        return True
774    except ValueError:
775        return False
776   
777def _is_continuous(items, check_count=100):
778    """ Are the strings in items continous numbers.
779    """
780    count = 0
781    i = 0
782    for i, item in enumerate(items):
783        if _is_float(item):
784            count += 1
785        if i >= check_count:
786            break
787    return count >= i * 0.5
788   
789def processed_matrix_to_orange(matrix_file):
790    """ Load a single processed matrix file in to an Orange.data.Table
791    instance.
792    """
793    import numpy
794    import Orange
795   
796    if isinstance(matrix_file, basestring):
797        matrix_file = open(matrix_file, "rb")
798       
799#    data_matrix = matrix_file.read()
800    header, quantification, rows, matrix = parse_data_matrix(matrix_file)
801    header_ref, header = header
802    row_ref, rows = rows
803   
804    matrix = numpy.array(matrix, dtype=object)
805   
806   
807    features = []
808    is_float = numpy.frompyfunc(_is_float, 1, 1) # an numpy ufunc
809         
810    for header_name, quant, column in zip(header, quantification, matrix.T):
811        if _is_continuous(column):
812            feature = Orange.data.variable.Continuous(header_name)
813            column[numpy.where(1 - is_float(column))] = "?" # relace all non parsable floats with '?'
814        else:
815            values = set(column)
816            feature = Orange.data.variable.Discrete(header_name, values=sorted(values))
817        feature.attributes["quantification"] = quant
818        features.append(feature)
819       
820    row_ref_feature = Orange.data.variable.String(row_ref)
821    domain = Orange.data.Domain(features, None)
822    domain.addmeta(Orange.data.new_meta_id(), row_ref_feature)
823   
824    table = Orange.data.Table(domain, [list(row) for row in matrix])
825   
826    # Add row identifiers
827    for instance, row in zip(table, rows):
828        instance[row_ref_feature] = row
829   
830    return table
831   
832
833def mage_tab_to_orange(idf_filename):
834    """ Convert an MAGE-TAB annotated experiment into an Orange.data.Table
835    instance (assumes all the associated MAGE-TAB files are in the same
836    directory.
837   
838    .. todo:: Add Characteristics, Factor Values ... to the feture.attributes dict
839   
840    """
841    import Orange
842    dirname = os.path.dirname(idf_filename)
843    idf = InvesigationDesign(idf_filename)
844   
845    sdrf_filename = os.path.join(dirname, idf.sdrf_file[0])
846    sdrf = SampleDataRelationship(sdrf_filename)
847   
848    data_matrices = set(sdrf.derived_array_data_matrix_file())
849    data_matrices = [name for name in data_matrices if name.strip()]
850   
851    tables = []
852    for filename in data_matrices:
853        matrix_file = os.path.join(dirname, filename)
854        table = processed_matrix_to_orange(matrix_file)
855        tables.append(table)
856       
857    return hstack_tables(tables)
858   
859def hstack_tables(tables):
860    """ Stack the tables horizontaly.
861    """
862    import Orange
863    max_len = max([len(table) for table in tables])
864    stacked_features = []
865    stacked_values = [[] for i in range(max_len)]
866    stacked_meta_features = []
867    stacked_meta_values = [{} for i in range(max_len)]
868   
869    for table in tables:
870        stacked_features.extend(table.domain.variables)
871        stacked_meta_features.extend(table.domain.getmetas().items())
872       
873        for i, instance in enumerate(table):
874            stacked_values[i].extend(list(instance))
875            stacked_meta_values[i].update(instance.getmetas())
876           
877        # Fill extra lines with unknowns
878        for i in range(len(table), max_len):
879            stacked_values[i].extend(["?"] * len(table.domain.variables))
880       
881    domain = Orange.data.Domain(stacked_features, tables[-1].domain.class_var)
882    domain.addmetas(dict(set(stacked_meta_features)))
883    table = Orange.data.Table(domain, stacked_values)
884   
885    # Add meta attributes
886    for instance, metas in zip(table, stacked_meta_values):
887        for m, val in metas.iteritems():
888            instance[m] = val
889           
890    return table
891   
892def _dictify(element):
893    """ Dictify and xml.etree.Element.Element instance.
894    """
895    if element is None:
896        element = []
897    dict = {}
898    strip = lambda s: s.strip() if s else s
899    for node in element:
900        dict[node.tag] = strip(getattr(node, "text", None))
901    return dict
902   
903   
904class ArrayExpressExperiment(object):
905    """ An convinience class representing an Array Express Experiment.
906   
907    Example ::
908   
909        >>> ae = ArrayExpressExperiment("E-MEXP-2917")
910        >>> print ae.name
911        Characterization of Data Variation in Gene Expression Profiling of Human Peripheral Blood Samples
912       
913        >>> for file in ae.files:
914        ...     print file["name"], file["url"]
915        E-MEXP-2917.sdrf.txt http://www.ebi.ac.uk/arrayexpress/files/E-MEXP-2917/E-MEXP-2917.sdrf.txt
916        ...
917           
918    """
919   
920    def __init__(self, accession, connection=None):
921        self.accession = accession
922        self.connection = connection
923        self._etree = tree = query_experiments(accession=accession, connection=self.connection, format="xml")
924        experiments = tree.findall("experiment")
925        # find the exact match (more then one experiment can be listed in the query result)
926        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
927        self._experiment = experiment = experiments[0]
928       
929        self.species = [e.text for e in experiment.findall("species")]
930        bool_values = {"true": True, "false": False}
931        self.rawdatafiles = bool_values[experiment.find("rawdatafiles").get("available","false")]
932        self.fgemdatafiles = bool_values[experiment.find("fgemdatafiles").get("available", "false")]
933       
934        self.sampleattributes = []
935        for sa in experiment.findall("sampleattribute"):
936            category = sa.find("category").text.strip()
937            values = [val.text for val in sa.findall("value")]
938            self.sampleattributes.append((category, values))
939           
940        self.experimentalfactors = []
941        for ef in experiment.findall("experimentalfactor"):
942            name = ef.find("name").text.strip()
943            values = [val.text.strip() for val in ef.findall("values")]
944            self.experimentalfactors.append((name, values))
945           
946        self.miamescores = _dictify(experiment.find("miamescores"))
947           
948        self.id = experiment.find("id").text
949        self.secondaryaccession = getattr(experiment.find("secondaryaccession"), "text", None)
950        self.name = experiment.find("name").text
951        self.experimenttype = experiment.find("experimenttype").text.strip()
952        self.releasedate = experiment.find("releasedate").text
953        self.lastupdatedate = getattr(experiment.find("lastupdatedate"), "text", None)
954        self.samples = int(experiment.find("samples").text)
955        self.assays = int(experiment.find("assays").text)
956       
957        self.arraydesign = [_dictify(e) for e in experiment.findall("arraydesign")]
958           
959        self.bioassaydatagroups = [_dictify(group) for group in experiment.findall("bioassaydatagroup")]
960        self.bibliography = [_dictify(e) for e in experiment.findall("bibliography")]
961        self.provider = [_dictify(e) for e in experiment.findall("provider")]
962       
963        self.experimentdesign = []
964        for expd in experiment.findall("experimentdesign"):
965            self.experimentdesign.append(expd.text)
966           
967        self.description = [_dictify(e) for e in experiment.findall("description")]
968       
969        tree = query_files(accession=self.accession, format="xml", connection=self.connection)
970        experiments = tree.findall("experiment")
971        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
972        experiment = experiments[0]
973        files = experiment.findall("file")
974        self.files = [_dictify(file) for file in files]
975       
976    def _download_processed(self):
977        """ Download the processed matrix file, and associated MAGE-TAB files (idf, sdrf, adf)
978        """
979        assert(self.fgemdatafiles)
980        exp_files = [(f["kind"], f) for f in self.files if f.get("kind") in ["idf", "sdrf"] and f.get("extension") == "txt"]
981        exp_files += [(f["kind"], f) for f in self.files if f.get("kind") == "fgem"]
982        array_files = [(f["kind"], f) for f in self.files if f.get("kind") == "adf" and f.get("extension") == "txt"]
983        assert(len(files) == 3)
984       
985        for type, file in files.iteritems():
986            url = file["url"].strip()
987            rest, basename = os.path.split(url)
988            _, dirname = os.path.split(rest)
989           
990            repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
991            try:
992                os.makedirs(repo_dir)
993            except OSError:
994                pass
995            local_filename = os.path.join(repo_dir, basename)
996            stream = urllib2.urlopen(url)
997            shutil.copyfileobj(stream, open(local_filename, "wb"))
998           
999            if file["extension"] == "zip":
1000                import zipfile
1001                zfile = zlib.ZipFile(local_filename)
1002                zfile.extractall(repo_dir)
1003            elif file["extension"] == "gz":
1004                import gzip
1005                gzfile = gzip.open(local_filename)
1006                gzfile.extractall(repo_dir)
1007            elif file["extension"] in ["tgz", "tar"]:
1008                import tarfile
1009                tfile = tarfile.TarFile(local_filename)
1010                tfile.extractall(repo_dir)
1011            elif file["extension"] == "txt":
1012                pass
1013            else:
1014                raise ValueError("Unknown extension ('{0}').".format(basename))
1015           
1016    def _download_file(self, url, extract=True):
1017        """ Download the `file` from the ArrayExpress saving it to a local
1018        repository directory.
1019         
1020        """
1021        rest, basename = posixpath.split(url)
1022        dirname = posixpath.basename(rest)
1023        repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1024        try:
1025            os.makedirs(repo_dir)
1026        except OSError:
1027            pass
1028        stream = urllib2.urlopen(url)
1029        local_filename = os.path.join(repo_dir, basename)
1030        shutil.copyfileobj(stream, open(local_filename, "wb"))
1031       
1032        if extract:
1033            _, extension = os.path.splitext(local_filename)
1034            if extension == ".zip":
1035                import zipfile
1036                zfile = zipfile.ZipFile(local_filename)
1037                zfile.extractall(repo_dir)
1038            elif extension == ".gz":
1039                import gzip
1040                gzfile = gzip.open(local_filename)
1041                gzfile.extractall(repo_dir)
1042            elif extension in [".tgz"]:
1043                import tarfile
1044                tfile = tarfile.TarFile(local_filename)
1045                tfile.extractall(repo_dir)
1046            elif extension == ".txt":
1047                pass
1048            else:
1049                raise ValueError("Unknown extension ('{0}').".format(basename))
1050           
1051    def _is_local(self, url):
1052        """ Is the `url` stored in the local repository.
1053        """
1054        return os.path.exists(self._local_filepath(url))
1055   
1056    def _local_filepath(self, url):
1057        """ Return the local file path for url.
1058        """
1059        rest, basename = posixpath.split(url)
1060        dirname = posixpath.basename(rest)
1061        return orngServerFiles.localpath("ArrayExpress", os.path.join(dirname, basename))
1062   
1063    def _open(self, url):
1064        """ Return an open file like handle to url (ArrayExpress file).
1065        The file is cached in the local repository for future access.
1066       
1067        """
1068        if not self._is_local(url):
1069            self._download_file(url, extract=True)
1070        file = self._local_filepath(url)
1071        return open(file, "rb")
1072   
1073    def _search_files(self, kind=None, extension=None):
1074        """ Search files by `kind` and `extension`.
1075        """
1076        res = []
1077        for file in self.files:
1078            kind_match = kind == file.get("kind") or kind is None
1079            extension_match = extension == file.get("extension") or extension is None
1080           
1081            if kind_match and extension_match:
1082                res.append(file)
1083        return res
1084       
1085    def array_design(self):
1086        """ Return a list of `ArrayDesign` instances used in this experiment.
1087        """
1088        files = [f for f in self.files if f.get("kind") == "adf" and \
1089                 f.get("extension") == "txt"]
1090       
1091        array_design = []
1092        for file in files:
1093            url = file.get("url")
1094            if not self._is_local(url):
1095                self._download_file(url)
1096            array_design.append(ArrayDesign(self._open(url)))
1097        return array_design
1098       
1099    def investigation_design(self):
1100        """ Return an `InvestigationDesgin` instance for this experiment
1101        """
1102        files = [f for f in self.files if f.get("kind") == "idf" and \
1103                 f.get("extension") == "txt"]
1104        if not files:
1105            raise ValueError("The experiment '{0}' does not have an investigation design file".format(self.accession))
1106        file = files[0]
1107        return InvesigationDesign(self._open(file.get("url")))
1108       
1109       
1110    def sample_data_relationship(self):
1111        """ Return an `SampleDataRelationship` instance describing this experiment.
1112        """
1113        files = [f for f in self.files if f.get("kind") == "sdrf" and \
1114                 f.get("extension") == "txt"]
1115        if not files:
1116            raise ValueError("The experiment '{0}' does not have an sample and data relationship file".format(self.accession))
1117        file = files[0]
1118        return SampleDataRelationship(self._open(file.get("url")))
1119       
1120    def fgem_to_table(self):
1121        assert(self.fgemdatafiles)
1122        repo_dir = orngServerFiles.localpath("ArrayExpress", self.accession)
1123        # Find the file listing the data matrix files (should be in sdrf but sometimes it is in 2column file only, why?)
1124        sdrf = self._search_files("sdrf", "txt")
1125        if sdrf:
1126            sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
1127            if "Derived Array Data Matrix File" not in sdrf.header:
1128                twocol = self._search_files("twocolumn", "txt")
1129                if twocol:
1130                    sdrf = SampleDataRelationship(self._open(twocol[0].get("url")))
1131        matrix_file = self._search_files("fgem")[0]
1132        self._open(matrix_file.get("url")) 
1133        matrix_files = sorted(set(sdrf.derived_array_data_matrix_file()))
1134       
1135        idf_file = self._search_files("idf", "txt")[0]
1136        self._open(idf_file.get("url")) # To download if not cached
1137        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
1138       
1139   
1140__doc__ += """\
1141Gene Expression Atlas
1142---------------------
1143
1144.. WARNING:: Deprecated, use ``obiGeneAtlas`` instead.
1145
1146`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
1147gene expression experiments in Array Express Archive.
1148
1149Use `query_atlas_simple` for simple querys.
1150
1151Example (query human genes for experiments in which they are up regulated) ::
1152
1153    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
1154    {u'...
1155   
1156Or use the `AtlasCondition` subclasses in this module to construct a more
1157advanced query and use the `query_atlas` function.
1158
1159Example (query human genes annotated to the GO term 'transporter activity'
1160that are up regulated in the liver in at least three experiments) ::
1161
1162    >>> go_cond = AtlasConditionGeneProperty("Goterm", "Is", "transporter activity")
1163    >>> liver_cond = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "liver")
1164    >>> org_cond = AtlasConditionOrganism("Homo sapiens")
1165    >>> cond_list = AtlasConditionList([go_cond, liver_cond, org_cond])
1166    >>> query_atlas(cond_list)
1167    {u'...
1168   
1169"""
1170
1171class GeneExpressionAtlasConenction(object):
1172    """ A connection to Gene Expression Atlas database.
1173    """
1174    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"
1175    try:
1176        DEFAULT_CACHE = shelve.open(orngServerFiles.localpath("ArrayExpress", "GeneAtlasCache.shelve"))
1177    except Exception, ex:
1178        print ex
1179        DEFAULT_CACHE = {}
1180    def __init__(self, address=None, timeout=30, cache=None):
1181        """ Initialize the conenction.
1182       
1183        :param address: Address of the server.
1184        :param timeout: Socket timeout.
1185        :param cache : A dict like object to use as a cache.
1186       
1187        """
1188        self.address = address if address is not None else self.DEFAULT_ADDRESS
1189        self.timeout = timeout
1190        self.cache = cache if cache is not None else self.DEFAULT_CACHE
1191   
1192    def query(self, condition, format="json", start=None, rows=None, indent=False):
1193        url = self.address + "api/vx?" + condition.rest()
1194        if start is not None and rows is not None:
1195            url += "&start={0}&rows={1}".format(start, rows)
1196        url += "&format={0}".format(format)
1197        if indent:
1198            url += "&indent"
1199#        print url
1200        if self.cache is not None:
1201            return self._query_cached(url)
1202        else:
1203            return urllib2.urlopen(url)
1204        return response
1205   
1206    def _query_cached(self, url):
1207        if self.cache is not None:
1208            if url not in self.cache:
1209                response = urllib2.urlopen(url)
1210                contents = response.read()
1211                self.cache[url] = contents
1212                if hasattr(self.cache, "sync"):
1213                    self.cache.sync()
1214            return StringIO(self.cache[url])
1215        else:
1216            return urllib2.urlopen(url)
1217               
1218   
1219# Names of all Gene Property filter names
1220GENE_FILTERS = \
1221    ["Name", # Gene name
1222     "Goterm", #Gene Ontology Term
1223     "Interproterm", #InterPro Term
1224     "Disease", #Gene-Disease Assocation
1225     "Keyword", #Gene Keyword
1226     "Protein", #Protein
1227
1228     "Dbxref", #Other Database Cross-Refs
1229     "Embl", #EMBL-Bank ID
1230     "Ensfamily", #Ensembl Family
1231     "Ensgene", #Ensembl Gene ID
1232
1233     "Ensprotein", #Ensembl Protein ID
1234     "Enstranscript", #Ensembl Transcript ID
1235     "Goid", #Gene Ontology ID
1236     "Image", #IMAGE ID
1237     "Interproid", #InterPro ID
1238     "Locuslink", #Entrez Gene ID
1239
1240     "Omimid", #OMIM ID
1241     "Orf", #ORF
1242     "Refseq", #RefSeq ID
1243     "Unigene", #UniGene ID
1244     "Uniprot", #UniProt Accession
1245
1246     "Hmdb", #HMDB ID
1247     "Chebi", #ChEBI ID
1248     "Cas", #CAS
1249     "Uniprotmetenz", #Uniprotmetenz
1250     "Gene", #Gene Name or Identifier
1251     "Synonym", #Gene Synonym
1252     ]
1253   
1254# Valid Gene Property filter qualifiers
1255GENE_FILTER_QUALIFIERS =\
1256    ["Is",
1257     "IsNot"
1258     ]
1259
1260# Organisms in the Atlas
1261ATLAS_ORGANISMS = \
1262    ["Anopheles gambiae",
1263     "Arabidopsis thaliana",
1264     "Bos taurus",
1265     "Caenorhabditis elegans",
1266     "Danio rerio",
1267     "Drosophila melanogaster",
1268     "Epstein barr virus",
1269     "Gallus gallus",
1270     "Homo sapiens",
1271     "Human cytomegalovirus",
1272     "Kaposi sarcoma-associated herpesvirus",
1273     "Mus musculus",
1274     "Rattus norvegicus",
1275     "Saccharomyces cerevisiae",
1276     "Schizosaccharomyces pombe",
1277#     "Unknown",
1278     "Xenopus laevis"
1279     ]
1280   
1281#_COMMON_TAXIDS = \
1282#    {"Anopheles gambiae",
1283#     "Arabidopsis thaliana",
1284#     "Bos taurus",
1285#     "Caenorhabditis elegans",
1286#     "Danio rerio",
1287#     "Drosophila melanogaster",
1288#     "Epstein barr virus",
1289#     "Gallus gallus",
1290#     "Homo sapiens",
1291#     "Human cytomegalovirus",
1292#     "Kaposi sarcoma-associated herpesvirus",
1293#     "Mus musculus",
1294#     "Rattus norvegicus",
1295#     "Saccharomyces cerevisiae",
1296#     "Schizosaccharomyces pombe",
1297##     "Unknown",
1298#     "Xenopus laevis"
1299#     }
1300   
1301def ef_ontology():
1302    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
1303    """
1304    import obiOntology
1305#    return obiOntology.OBOOntology(urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo"))
1306    import orngServerFiles
1307    # Should this be in the OBOFoundry (Ontology) domain
1308    try:
1309        file = open(orngServerFiles.localpath_download("ArrayExpress", "efo.obo"), "rb")
1310    except urllib2.HTTPError:
1311        file = urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo")
1312    return obiOntology.OBOOntology(file)
1313
1314
1315class AtlasCondition(object):
1316    """ Base class for Gene Expression Atlas query condition
1317    """
1318    def validate(self):
1319        """ Validate condition in a subclass.
1320        """
1321        raise NotImplementedError
1322   
1323    def rest(self):
1324        """ Return a REST query part in a subclass.
1325        """
1326        raise NotImplementedError
1327   
1328   
1329class AtlasConditionList(list, AtlasCondition):
1330    """ A list of AtlasCondition instances.
1331    """ 
1332    def validate(self):
1333        for item in self:
1334            item.validate()
1335       
1336    def rest(self):
1337        return "&".join(cond.rest() for cond in self)
1338
1339
1340class AtlasConditionGeneProperty(AtlasCondition):
1341    """ An atlas gene filter condition.
1342   
1343    :param property: Property of the gene. If None or "" all properties
1344        will be searched.
1345    :param qualifier: Qualifier can be 'Is' or 'IsNot'
1346    :param value: The value to search for.
1347   
1348    Example ::
1349   
1350        >>> # Condition on a gene name
1351        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
1352        >>> # Condition on genes from a GO Term
1353        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1354        >>> # Condition on disease association
1355        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")
1356       
1357    """
1358    def __init__(self, property, qualifier, value):
1359        self.property = property or ""
1360        self.qualifier = qualifier
1361        if isinstance(value, basestring):
1362            self.value = value.replace(" ", "+")
1363        elif isinstance(value, list):
1364            self.value = "+".join(value)
1365        else:
1366            raise ValueError(value)
1367       
1368        self.validate()
1369       
1370    def validate(self):
1371        assert(self.property in GENE_FILTERS + [""])
1372        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
1373       
1374    def rest(self):
1375        return "gene{property}{qualifier}={value}".format(**self.__dict__)
1376       
1377       
1378class AtlasConditionExperimentalFactor(AtlasCondition):
1379    """ An atlas experimental factor filter condition.
1380   
1381    :param factor: EFO experiamntal factor
1382    :param regulation: "up", "down", "updown", "any" or "none"
1383    :param n: Minimum number of of experimants with this condition
1384    :param value: Experimantal factor value
1385   
1386    Example ::
1387   
1388        >>> # Any genes up regulated in at least 3 experiments involving cancer.
1389        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
1390        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
1391        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")
1392       
1393    """
1394    def __init__(self, factor, regulation, n, value):
1395        self.factor = factor
1396        self.regulation = regulation
1397        self.n = n
1398        self.value = value
1399        self.validate()
1400       
1401    def validate(self):
1402        # TODO: validate the factor and value
1403#        assert(self.factor in ef_ontology())
1404        assert(self.regulation in ["up", "down", "updown"])
1405       
1406    def rest(self):
1407        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
1408       
1409class AtlasConditionOrganism(AtlasCondition):
1410    """ Condition on organism.
1411    """
1412    def __init__(self, organism):
1413        self.organism = organism
1414        self.validate()
1415       
1416    def validate(self):
1417        assert(self.organism in ATLAS_ORGANISMS)
1418       
1419    def rest(self):
1420        return "species={0}".format(self.organism.replace(" ", "+").lower())
1421       
1422       
1423class AtlasConditionExperiment(AtlasCondition):
1424    """ Condition on experiement
1425   
1426    :param property: Property of the experiment. If None or "" all properties
1427        will be searched.
1428    :param qualifier: Qualifier can be 'Has' or 'HasNot'
1429    :param value: The value to search for.
1430   
1431    Example ::
1432   
1433        >>> # Condition on a experiemnt acession
1434        >>> condition = AtlasConditionExperiment("", "", "E-GEOD-24283")
1435        >>> # Condition on experiments involving lung
1436        >>> condition = AtlasConditionGeneProperty("Organism_part", "Has", "lung")
1437       
1438    """
1439#    EXPERIMENT_FILTERS = [
1440#                "Organism"
1441#                "Factor"]
1442   
1443    EXPERIMENT_FILTER_QUALIFIERS = [
1444                "Has",
1445                "HasNot"]
1446   
1447    def __init__(self, property, qualifier, value):
1448        self.property = property
1449        self.qualifier = qualifier
1450        if isinstance(value, basestring):
1451            self.value = value.replace(" ", "+")
1452        elif isinstance(value, list):
1453            self.value = "+".join(value)
1454        else:
1455            raise ValueError(value)
1456       
1457        self.validate()
1458       
1459    def validate(self):
1460        # TODO: check to EFO factors
1461#        assert(self.property in EXPERIMENT_FILTERS + [""])
1462        assert(self.qualifier in self.EXPERIMENT_FILTER_QUALIFIERS + [""])
1463       
1464    def rest(self):
1465        return "experiment{property}{qualifier}={value}".format(**self.__dict__)
1466       
1467       
1468class GeneAtlasError(ValueError):
1469    """ An error response from the Atlas server.
1470    """
1471    pass
1472   
1473   
1474def __check_atlas_error_json(response):
1475    if "error" in response:
1476        raise GeneAtlasError(response["error"])
1477    return response
1478 
1479     
1480def __check_atlas_error_xml(response):
1481    error = response.find("error")
1482    if error is not None:
1483        raise GeneAtlasError(error.text)
1484    return response
1485   
1486       
1487def query_atlas_simple(genes=None, regulation=None, organism=None,
1488                       condition=None, format="json", start=None,
1489                       rows=None):
1490    """ A simple Atlas query.
1491   
1492    :param genes: A list of gene names to search for.
1493    :param regulation: Search for experiments in which `genes` are "up",
1494        "down", "updown" or "none" regulated. If None all experiments
1495        are searched.
1496    :param organism: Search experiments for organism. If None all experiments
1497        are searched.
1498    :param condition: An EFO factor value (e.g. "brain")
1499   
1500    Example ::
1501       
1502        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
1503        {u'...
1504       
1505        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
1506        {u'...
1507       
1508        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
1509        {u'...
1510       
1511    """
1512    import warnings
1513    warnings.warn("Use 'obiGeneAtlas.run_simple_query' instead.", DeprecationWarning)
1514    conditions = AtlasConditionList()
1515    if genes:
1516        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
1517    if regulation or condition:
1518        regulation = "any" if regulation is None else regulation
1519        condition = "" if condition is None else condition
1520        conditions.append(AtlasConditionExperimentalFactor("", regulation, 1, condition))
1521    if organism:
1522        conditions.append(AtlasConditionOrganism(organism))
1523       
1524    connection = GeneExpressionAtlasConenction()
1525    results = connection.query(conditions, format=format, start=start,
1526                               rows=rows)
1527    if format == "json":
1528        return parse_json(results)
1529    else:
1530        return parse_xml(results)
1531
1532"""\
1533.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
1534      Need a full list of accepted factors
1535"""
1536
1537def query_atlas(condition, format="json", start=None, rows=None, indent=False, connection=None):
1538    """ Query Atlas based on a `condition` (instance of AtlasCondition)
1539   
1540    Example ::
1541       
1542        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1543        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
1544        >>> condition = AtlasConditionList([condition1, condition2])
1545        >>> query_atlas(condition)
1546        {u'...
1547       
1548    """
1549    import warnings
1550    warnings.warn("Use 'obiGeneAtlas.run_query' instead.", DeprecationWarning)
1551    if connection is None:
1552        connection = GeneExpressionAtlasConenction()
1553    results = connection.query(condition, format=format, start=start,
1554                               rows=rows, indent=indent)
1555    if format == "json":
1556        response = parse_json(results)
1557        return __check_atlas_error_json(response)
1558    else:
1559        response = parse_xml(results)
1560        return __check_atlas_error_xml(response)
1561
1562
1563def get_atlas_summary(genes, organism, connection=None):
1564    """ Return 3 dictionaries containing a summary of atlas information
1565    about three experimental factors:
1566   
1567        - Organism Part (OP)
1568        - Disease State (DS)
1569        - Cell type (CT)
1570   
1571    Each dictionary contains query genes as keys. Values are dictionaries
1572    mapping factor values to a 2-tuple containig the count of up regulated
1573    and down regulated experiments.
1574   
1575    Example ::
1576   
1577        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
1578        ({u'RUNX1': ...
1579       
1580    """
1581    import warnings
1582    warnings.warn("Use 'obiGeneAtlas.get_atlas_summary' instead.", DeprecationWarning)
1583    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
1584    org_condition = AtlasConditionOrganism(organism)
1585    condition = AtlasConditionList([genes_condition, org_condition])
1586    result = query_atlas(condition, format="json", connection=connection)
1587   
1588    org_part = collect_ef_summary(result, "organism_part")
1589    disease_state = collect_ef_summary(result, "disease_state")
1590    cell_type = collect_ef_summary(result, "cell_type")
1591   
1592    return dict(org_part), dict(disease_state), dict(cell_type)
1593   
1594   
1595def collect_ef_summary(info, ef, summary=None):
1596    """ Collect the results summary from query_atlas, result for experimental
1597    factor `ef`.
1598    """
1599    if summary is None:
1600        summary = defaultdict(dict)
1601       
1602    results = info["results"]
1603    for res in results:
1604        gene = res["gene"]
1605        expressions = res["expressions"] 
1606        for expression in expressions:
1607            if expression["ef"] == ef:
1608                efv = expression["efv"]
1609                updown = (expression["upExperiments"],
1610                          expression["downExperiments"]
1611                          )
1612               
1613                if any(updown):
1614                    summary[gene["name"]][efv] = updown
1615   
1616    return summary
1617   
1618   
1619def test():   
1620    conn = ArrayExpressConnection()
1621    import doctest
1622    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs={"conn": conn})
1623   
1624   
1625if __name__ == "__main__":
1626    test()
1627   
Note: See TracBrowser for help on using the repository browser.