Changeset 1354:f5dceb747aff in orange-bioinformatics


Ignore:
Timestamp:
03/25/11 12:14:07 (3 years ago)
Author:
ales_erjavec <ales.erjavec@…>
Branch:
default
Convert:
72d5c1d516054af8b12e8fe6f3d5f395062e16f7
Message:

Can now load data from multiple processed matrix files.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • obiArrayExpress.py

    r1353 r1354  
    433433    lines = [line.split("\t") for line in lines if line.strip() and not line.startswith("#")] 
    434434    header = [h for h in lines[0] if h] 
    435     rows = lines[1:] 
     435    rows = [line[:len(header)] for line in lines[1:]] 
    436436    assert(all([len(r) == len(header) for r in rows])) 
    437437    return header, rows 
     
    784784    return count >= i * 0.5 
    785785     
    786 def mage_tab_to_orange(idf_filename): 
    787     """ Convert an MAGE-TAB annotated experiment into an Orange.data.Table 
    788     instance (assumes all the associated MAGE-TAB files are in the same 
    789     directory. 
    790      
    791     .. todo:: Add Characteristics, Factor Values ... to the feture.attributes dict 
    792      
    793     """ 
     786def processed_matrix_to_orange(matrix_file): 
     787    """ Load a single processed matrix file in to an Orange.data.Table 
     788    instance.  
     789    """ 
     790    import numpy 
    794791    import Orange 
    795     dirname = os.path.dirname(idf_filename) 
    796     idf = InvesigationDesign(idf_filename) 
    797      
    798     sdrf_filename = os.path.join(dirname, idf.sdrf_file[0]) 
    799     sdrf = SampleDataRelationship(sdrf_filename) 
    800      
    801     data_matrices = set(sdrf.derived_array_data_matrix_file()) 
    802     assert(len(data_matrices) == 1) # How to handle multiple array designs. 
    803     data_matrix = open(os.path.join(dirname, data_matrices.pop()), "rb") 
    804     header, quantification, rows, matrix = parse_data_matrix(data_matrix) 
     792     
     793    if isinstance(matrix_file, basestring): 
     794        matrix_file = open(matrix_file, "rb") 
     795         
     796#    data_matrix = matrix_file.read() 
     797    header, quantification, rows, matrix = parse_data_matrix(matrix_file) 
    805798    header_ref, header = header 
    806799    row_ref, rows = rows 
    807800     
    808     import numpy 
    809     matrix = numpy.array(matrix, dtype=str) 
     801    matrix = numpy.array(matrix, dtype=object) 
    810802     
    811803     
    812804    features = [] 
     805    is_float = numpy.frompyfunc(_is_float, 1, 1) # an numpy ufunc 
     806          
    813807    for header_name, quant, column in zip(header, quantification, matrix.T): 
    814808        if _is_continuous(column): 
    815809            feature = Orange.data.variable.Continuous(header_name) 
     810            column[numpy.where(1 - is_float(column))] = "?" # relace all non parsable floats with '?' 
    816811        else: 
    817812            values = set(column) 
     
    824819    domain.addmeta(Orange.data.new_meta_id(), row_ref_feature) 
    825820     
    826     matrix[numpy.where(matrix == "NA")] = "?" 
    827      
    828821    table = Orange.data.Table(domain, [list(row) for row in matrix]) 
    829822     
     823    # Add row identifiers 
    830824    for instance, row in zip(table, rows): 
    831825        instance[row_ref_feature] = row 
    832826     
     827    return table 
     828     
     829 
     830def mage_tab_to_orange(idf_filename): 
     831    """ Convert an MAGE-TAB annotated experiment into an Orange.data.Table 
     832    instance (assumes all the associated MAGE-TAB files are in the same 
     833    directory. 
     834     
     835    .. todo:: Add Characteristics, Factor Values ... to the feture.attributes dict 
     836     
     837    """ 
     838    import Orange 
     839    dirname = os.path.dirname(idf_filename) 
     840    idf = InvesigationDesign(idf_filename) 
     841     
     842    sdrf_filename = os.path.join(dirname, idf.sdrf_file[0]) 
     843    sdrf = SampleDataRelationship(sdrf_filename) 
     844     
     845    data_matrices = set(sdrf.derived_array_data_matrix_file()) 
     846    data_matrices = [name for name in data_matrices if name.strip()] 
     847     
     848    tables = [] 
     849    for filename in data_matrices: 
     850        matrix_file = os.path.join(dirname, filename) 
     851        table = processed_matrix_to_orange(matrix_file) 
     852        tables.append(table) 
     853         
     854    return hstack_tables(tables) 
     855     
     856def hstack_tables(tables): 
     857    """ Stack the tables horizontaly. 
     858    """ 
     859    import Orange 
     860    max_len = max([len(table) for table in tables]) 
     861    stacked_features = [] 
     862    stacked_values = [[] for i in range(max_len)] 
     863    stacked_meta_features = [] 
     864    stacked_meta_values = [{} for i in range(max_len)] 
     865     
     866    for table in tables: 
     867        stacked_features.extend(table.domain.variables) 
     868        stacked_meta_features.extend(table.domain.getmetas().items()) 
     869         
     870        for i, instance in enumerate(table): 
     871            stacked_values[i].extend(list(instance)) 
     872            stacked_meta_values[i].update(instance.getmetas()) 
     873             
     874        # Fill extra lines with unknowns 
     875        for i in range(len(table), max_len): 
     876            stacked_values[i].extend(["?"] * len(table.domain.variables)) 
     877         
     878    domain = Orange.data.Domain(stacked_features, tables[-1].domain.class_var) 
     879    domain.addmetas(dict(set(stacked_meta_features))) 
     880    table = Orange.data.Table(domain, stacked_values) 
     881     
     882    # Add meta attributes 
     883    for instance, metas in zip(table, stacked_meta_values): 
     884        for m, val in metas.iteritems(): 
     885            instance[m] = val 
     886             
    833887    return table 
    834888     
     
    10801134        matrix_files = sorted(set(sdrf.derived_array_data_matrix_file())) 
    10811135         
    1082         return mage_tab_to_orange(os.path.join(repo_dir, self.accession + ".idf.txt")) 
     1136        idf_file = self._search_files("idf", "txt")[0] 
     1137        self._open(idf_file.get("url")) # To download if not cached 
     1138        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name"))) 
    10831139         
    10841140     
Note: See TracChangeset for help on using the changeset viewer.