Changeset 1687:460a66cfdc01 in orange-bioinformatics for _bioinformatics/obiArrayExpress.py


Ignore:
Timestamp:
06/27/12 14:32:18 (22 months ago)
Author:
Ales Erjavec <ales.erjavec@…>
Branch:
default
Message:

Basic annotations for data table features (Characteristics, Factor/Parameter Values).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • _bioinformatics/obiArrayExpress.py

    r1636 r1687  
    66A python module for accessing the ArrayExpress web services and database. 
    77 
    8 `Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a database of gene expression experiments that you 
    9 can query and download. 
     8`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a 
     9database of gene expression experiments that you can query and download. 
    1010 
    1111Example :: 
    1212 
    13     >>> # Retrieve the object representing experiment with accession E-TABM-25  
     13    >>> # Retrieve the object representing experiment with accession E-TABM-25 
    1414    >>> experiment = ArrayExpressExperiment("E-TABM-25") 
    1515    >>> print experiment.accession 
    1616    E-TABM-25 
    17      
     17 
    1818    >>> print experiment.name 
    1919    Transcription profiling of aging in the primate brain 
    20      
     20 
    2121    >>> print experiment.species 
    2222    ['Pan troglodytes'] 
    23      
     23 
    2424    >>> print experiment.files 
    2525    [{'kind': ... 
    26      
     26 
    2727    >>> # Retrieve the data matrix for experiment 'E-MEXP-2917' 
    2828    >>> experiment = ArrayExpressExperiment("E-MEXP-2917") 
    29     >>> table = experiment.fgem_to_table()  
     29    >>> table = experiment.fgem_to_table() 
    3030 
    3131 
     
    3535    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31') 
    3636    {u'experiments': ... 
    37      
     37 
    3838    >>> obiArrayExpress.query_experiments(keywords='gliobastoma') 
    3939    {u'experiments': ... 
    40      
     40 
    4141    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml") 
    4242    <xml.etree.ElementTree.ElementTree object ... 
    43     
     43 
    4444.. note:: Currently querying ArrayExpress files only works with the xml format. 
    4545 
     
    5151from __future__ import absolute_import 
    5252 
    53 import os, sys 
     53import os 
     54import sys 
    5455import urllib2 
     56import re 
    5557 
    5658from Orange.orng import orngServerFiles 
     59 
    5760import warnings 
    58 import posixpath 
    5961import shelve 
    6062import shutil 
     
    7072parse_json = json.load 
    7173 
     74 
    7275def parse_xml(stream): 
    73     """ Parse an xml stream into an instance of xml.etree.ElementTree.ElementTree. 
    74     """ 
    75     return ElementTree(file=stream)  
     76    """ Parse an xml stream into an instance of 
     77    `xml.etree.ElementTree.ElementTree`. 
     78    """ 
     79    return ElementTree(file=stream) 
    7680 
    7781# All searchable fields of ArrayExpress (see query_experiments docstring 
     
    101105     "wholewords", 
    102106    ] 
    103      
     107 
    104108 
    105109class ArrayExpressConnection(object): 
    106110    """ A connection to the ArrayExpress. Used to construct a REST query 
    107111    and run it. 
    108      
     112 
    109113    .. todo:: Implement user login. 
    110      
    111     """ 
    112      
     114 
     115    """ 
     116 
    113117    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/" 
    114118    DEFAULT_FORMAT = "json" 
    115     DEFAULT_CACHE = orngServerFiles.localpath("ArrayExpress", "ArrayExpressCache.shelve") 
     119    DEFAULT_CACHE = orngServerFiles.localpath("ArrayExpress", 
     120                                              "ArrayExpressCache.shelve") 
    116121    # Order of arguments in the query 
    117122    _ARGS_ORDER = ["keywords", "species", "array"] 
    118      
     123 
    119124    def __init__(self, address=None, timeout=30, cache=None, 
    120125                 username=None, password=None): 
    121126        """ Initialize the connection object. 
    122          
     127 
    123128        :param address: Address of the ArrayExpress API 
    124129        :param timeout: Timeout for the socket connection 
    125          
     130 
    126131        .. todo:: Implement user login (see Accessing Private Data in API docs) 
    127          
     132 
    128133        """ 
    129134        self.address = address if address is not None else self.DEFAULT_ADDRESS 
     
    470475    elements are: 
    471476        - a (header REF, header values) tuple (e.g. ("Hybridization REF", ["Stage1", "Stage2", ...]) ) 
    472         - a list of quantitations for header values (e.g. ["log2 ratio", "log2 ratio", ...]) 
     477        - a list of quantitation types for header values (e.g. ["log2 ratio", "log2 ratio", ...]) 
    473478        - a (row REF, row names list) tuple (e.g. ("Reporter REF", ["Probe 1", "Probe 2", ...]) ) 
    474479        - a list of list matrix with values (as strings) 
     
    483488    header_ref, header = header[0], header[1:] 
    484489    line2 = lines[1] 
    485     row_ref, quanifications = line2[0], line2[1:] 
     490    row_ref, quant_type = line2[0], line2[1:] 
    486491    row_names, rows = [], [] 
    487492    for line in lines[2:]: 
     
    491496         
    492497    return ((header_ref, header), 
    493             quanifications, 
     498            quant_type, 
    494499            (row_ref, row_names), 
    495500            rows)  
     
    556561     
    557562        >>> sdr = SampleDataRelationship("foobar.sdrf") 
    558         >>> sdr.source_name 
     563        >>> sdr.source_name() 
    559564        ['foo', ... 
    560565         
    561         >>> sdr.sample_name 
     566        >>> sdr.sample_name() 
    562567        ['sampled foo', ... 
    563568         
    564         >>> sdr.extract_protocol_ref 
     569        >>> sdr.extract_protocol_ref() 
    565570        ['bar the foo', ... 
    566571         
    567         >>> sdr.derived_array_data_matrix_file 
     572        >>> sdr.derived_array_data_matrix_file() 
    568573        ['foobar.data.txt', ... 
    569574         
     
    650655        """ Return the named column. 
    651656        """ 
    652         index = self.header.index(name) 
     657        if isinstance(name, basestring): 
     658            index = self.header.index(name) 
     659        else: 
     660            index = name 
    653661        return [r[index] for r in self.rows] 
    654662         
     
    694702         
    695703    def hybridization(self): 
    696         """ Return the Hibridization subsection. 
    697         """ 
    698         return self._subsection("Hibridization Name") 
     704        """ Return the Hybridization subsection. 
     705        """ 
     706        return self._subsection("Hybridization Name") 
    699707         
    700708    def hybridization_name(self): 
    701709        """ Return the Hybridization Name subsection. 
    702710        """ 
    703         return self._column("Hibridization Name") 
     711        return self._column("Hybridization Name") 
    704712         
    705713    def assay(self): 
     
    809817    return count >= i * 0.5 
    810818     
    811 def processed_matrix_to_orange(matrix_file): 
     819def processed_matrix_to_orange(matrix_file, sdrf=None): 
    812820    """ Load a single processed matrix file into an Orange.data.Table 
    813821    instance.  
     
    818826    if isinstance(matrix_file, basestring): 
    819827        matrix_file = open(matrix_file, "rb") 
    820          
    821 #    data_matrix = matrix_file.read() 
    822     header, quantification, rows, matrix = parse_data_matrix(matrix_file) 
     828 
     829    header, quant_type, rows, matrix = parse_data_matrix(matrix_file) 
    823830    header_ref, header = header 
    824831    row_ref, rows = rows 
     
    830837    is_float = numpy.frompyfunc(_is_float, 1, 1) # an numpy ufunc 
    831838          
    832     for header_name, quant, column in zip(header, quantification, matrix.T): 
     839    for header_name, quant, column in zip(header, quant_type, matrix.T): 
    833840        if _is_continuous(column): 
    834841            feature = Orange.feature.Continuous(header_name) 
     
    837844            values = set(column) 
    838845            feature = Orange.feature.Discrete(header_name, values=sorted(values)) 
    839         feature.attributes["quantification"] = quant 
     846        feature.attributes["quantitation type"] = quant 
    840847        features.append(feature) 
    841848         
     
    845852     
    846853    table = Orange.data.Table(domain, [list(row) for row in matrix]) 
    847      
     854    table.setattr("header_ref", header_ref) 
    848855    # Add row identifiers 
    849856    for instance, row in zip(table, rows): 
    850857        instance[row_ref_feature] = row 
    851      
     858 
     859    if sdrf is not None: 
     860        pattern = re.compile(r"((Characteristics)|(Factor Value)|(Parameter Value)) \[(?P<name>.*?)\].*") 
     861        # Row name in sdrf 
     862        row_name = header_ref[:header_ref.find(" REF")] + " Name" 
     863        # feature names as listed in sdrf 
     864        feature_names = sdrf._column(row_name) 
     865        annotations = defaultdict(partial(defaultdict, set)) 
     866        for i, header in enumerate(sdrf.header): 
     867            match = pattern.match(header) 
     868            if match: 
     869                name = match.group("name") 
     870                for val, feature_name in zip(sdrf._column(i), feature_names): 
     871                    annotations[feature_name][name].add(val) 
     872 
     873        def to_str(values): 
     874            if len(values) > 1: 
     875                return str(list(values)) 
     876            else: 
     877                return str(list(values)[0]) 
     878 
     879        for feature in table.domain.features: 
     880            feature.attributes.update([(key, to_str(value)) for key, value in \ 
     881                                       annotations[feature.name].items()]) 
    852882    return table 
    853      
     883 
    854884 
    855885def mage_tab_to_orange(idf_filename): 
    856     """ Convert an MAGE-TAB annotated experiment into an Orange.data.Table 
     886    """Convert an MAGE-TAB annotated experiment into an Orange.data.Table 
    857887    instance (assumes all the associated MAGE-TAB files are in the same 
    858888    directory). 
    859      
    860     .. todo:: Add Characteristics, Factor Values ... to the feature.attributes dict 
    861      
    862     """ 
    863     import Orange 
     889 
     890    """ 
    864891    dirname = os.path.dirname(idf_filename) 
    865892    idf = InvestigationDesign(idf_filename) 
     
    874901    for filename in data_matrices: 
    875902        matrix_file = os.path.join(dirname, filename) 
    876         table = processed_matrix_to_orange(matrix_file) 
     903        table = processed_matrix_to_orange(matrix_file, sdrf) 
    877904        tables.append(table) 
    878          
    879     return hstack_tables(tables) 
    880      
     905    table = hstack_tables(tables) 
     906 
     907    return table 
     908 
     909 
    881910def hstack_tables(tables): 
    882911    """ Stack the tables horizontally. 
     
    927956     
    928957class SearchableList(list): 
    929     """ A list with a `search` method 
     958    """ A list with an advanced `search` method 
    930959    """ 
    931960    def search(self, **kwargs): 
     
    937966            >>> list.search(bar="foo") # Search for objects which have a member named "bar" and that member equals "foo" 
    938967            [<__main__.foo object ... 
    939              
     968 
    940969        """ 
    941970        ret = [] 
     
    15621591         
    15631592    """ 
    1564     import warnings 
    15651593    warnings.warn("Use 'obiGeneAtlas.run_simple_query' instead.", DeprecationWarning) 
    15661594    conditions = AtlasConditionList() 
     
    15991627         
    16001628    """ 
    1601     import warnings 
    16021629    warnings.warn("Use 'obiGeneAtlas.run_query' instead.", DeprecationWarning) 
    16031630    if connection is None: 
     
    16311658         
    16321659    """ 
    1633     import warnings 
    16341660    warnings.warn("Use 'obiGeneAtlas.get_atlas_summary' instead.", DeprecationWarning) 
    16351661    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes) 
     
    16671693     
    16681694    return summary 
    1669      
    1670      
     1695 
     1696 
    16711697def test():     
    16721698    conn = ArrayExpressConnection() 
Note: See TracChangeset for help on using the changeset viewer.