source: orange-bioinformatics/Orange/bioinformatics/widgets/chipimpute.py @ 1625:cefeb35cbfc9

Revision 1625:cefeb35cbfc9, 5.9 KB checked in by mitar, 2 years ago (diff)

Moving files around.

Line 
1## Automatically adapted for numpy.oldnumeric Oct 04, 2007 by
2
3import numpy.oldnumeric as Numeric, numpy.oldnumeric.ma as MA
4import statc
5   
6
7def kNNimputeMA(arr2d, K=20, callback=None):
8    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.
9    Find K rows (axis 0) with the most similar values where similarity measure corresponds to weighted Euclidean distance.
10    Imputed value = weighted average of the corresponding values of K nearest neighbours,
11    where weights equal to tricubic distribution of distances to all rows.
12    Impute missing rows by average over all rows.
13    Version: 30.8.2005
14    """
15    arr2d = MA.asarray(arr2d)
16    assert len(arr2d.shape) == 2, "2D array expected"
17    # make a copy for imputation
18    aImp2 = MA.array(arr2d)
19    # leave out columns with 0 known values (columnInd: non-zero columns)
20    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
21    columnIndAll = Numeric.arange(arr2d.shape[1])
22    columnInd = Numeric.compress(columnCond, columnIndAll)
23    # impute the rows where 0 < #known_values < #non_zero_columns, i.e. exclude the rows with 0 and all (non-zero-column) values
24    countByRows = MA.count(arr2d, axis=1)
25    for rowIdx in Numeric.compress(Numeric.logical_and(Numeric.greater(countByRows, 0), Numeric.less(countByRows, columnInd.shape[0])), Numeric.arange(arr2d.shape[0])):
26        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
27        diff = arr2d - rowResized
28        distances = MA.sqrt(MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1))
29        # nearest neighbours row indices (without the current row index)
30        indSorted = MA.argsort(distances)[1:]
31        distSorted = distances.take(indSorted)
32        # number of distances different from MA.masked
33        numNonMasked = distSorted.shape[0] - Numeric.add.reduce(Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int))
34        # number of distances to account for (K or less)
35        if numNonMasked > 1:
36            weightsSorted = MA.power(1-MA.power(distSorted/distSorted[numNonMasked-1],3),3) # tricubic distribution of all weights
37        else:
38            weightsSorted = Numeric.ones(distSorted.shape[0])
39        # compute average for each column separately in order to account for K non-masked values
40        colInd4CurrRow = Numeric.compress(Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond), columnIndAll)
41        for colIdx in colInd4CurrRow:
42            # column values sorted by distances
43            columnVals = arr2d[:,colIdx].take(indSorted)
44            # take only those weights where columnVals does not equal MA.masked
45            weightsSortedCompressed = MA.compress(1-MA.getmaskarray(columnVals), weightsSorted)
46            # impute from K (or possibly less) values
47            aImp2[rowIdx,colIdx] = MA.average(columnVals.compressed()[:K], weights=weightsSortedCompressed[:K])
48        if callback:
49            callback()
50    # impute the unknown rows with average profile
51    avrgRow = MA.average(arr2d, 0)
52    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0), Numeric.arange(arr2d.shape[0])):
53        aImp2[rowIdx] = avrgRow
54        if callback:
55            callback()
56    return aImp2
57
58
59def loessMA(m, windowSize, axis=0, approxMasked=True, verbose=False, callback=None):
60    """Returns a new array with values at the given axis smoothed by loess;
61    if approxMasked==True: the masked values are approximated by loess;
62    assumes equidistant spacing of points on the given axis.
63    """
64    assert 0 < windowSize <= m.shape[axis]+0.1, "0 < windowSize[%s] <= 1 OR windowSize in range(1.1,m.shape[axis]+1) expected, got %f" % ("%", windowSize)
65    m = MA.asarray(m)
66    if m.dtype.char <> Numeric.Float:
67        m = m.astype(Numeric.Float)
68    shp_other = list(m.shape)
69    shp_other.pop(axis)
70    # get a transposed and reshaped mask and data from m; if m.mask() == None, construct a new array of zeros
71    mask = Numeric.reshape(Numeric.transpose(MA.getmaskarray(m), [axis] + range(0,axis) + range(axis+1,len(m.shape))), (m.shape[axis], Numeric.multiply.reduce(shp_other)))
72    data = MA.reshape(MA.transpose(m, [axis] + range(0,axis) + range(axis+1,len(m.shape))), (m.shape[axis], Numeric.multiply.reduce(shp_other)))
73    maskInv = -1*(mask-1)
74    xall = Numeric.arange(data.shape[0])
75    xallList = xall.tolist()
76    for ii in Numeric.compress(Numeric.add.reduce(maskInv,0) > 1, range(data.shape[1])):    # run loess if the profile contains more than 2 values
77        try:
78            data[:,ii] = MA.array(statc.loess(zip(MA.compress(maskInv[:,ii], xall).tolist(), MA.compress(maskInv[:,ii], data[:,ii]).tolist()), xallList, windowSize))[:,1]
79        except:
80            if verbose:
81                print "Warning: loessMA: could not loess axis %i index %i" % (axis, ii)
82        if callback:
83            callback()
84    if not approxMasked:
85        data = MA.array(data, mask=mask)
86    return MA.transpose(MA.reshape(data, [m.shape[axis]] + shp_other), [axis] + range(0,axis) + range(axis+1,len(m.shape)))
87
88
89
90if __name__=="__main__":
91    import numpy.oldnumeric.random_array as RandomArray, time
92
93#   10x4 array   
94##    m2 = MA.asarray(RandomArray.random((10, 4)))
95##    m2[0,0] = MA.masked
96##    m2[1,1] = MA.masked
97##    m2[2,3] = MA.masked
98##    m2[3,:2] = MA.masked
99##    m2[4,2:] = MA.masked
100##    m2[5,:] = MA.masked
101##    m2[5,1] = 0.42423
102##    m2[6,:] = MA.masked   
103##    m2i3 = kNNimputeMA(m2, 3)
104##    m2i30 = kNNimputeMA(m2, 30)
105
106    # 0 column
107##    m3 = MA.concatenate([m2, MA.zeros((10,1))*MA.masked], 1)
108##    m3i3 = kNNimputeMA(m3, 3)
109
110    # 3 rows only
111##    m34 = MA.asarray(RandomArray.random((3,4)))
112##    m34[0,:2] = MA.masked
113##    m34[1,1:3] = MA.masked
114##    m34[-1] = MA.masked
115##    m34i = kNNimputeMA(m34,3)
116
117    # large array
118##    shp = (1000,10)
119##    l1 = RandomArray.random(shp)
120##    m1 = RandomArray.randint(0,2,shp)
121##    r1 = MA.array(l1, mask=m1)
122##    print time.ctime()
123##    r1i = kNNimputeMA(r1)
124##    print time.ctime()
125
Note: See TracBrowser for help on using the repository browser.