# source:orange-bioinformatics/orangecontrib/bio/widgets/chipimpute.py@1873:0810c5708cc5

Revision 1873:0810c5708cc5, 5.9 KB checked in by Ales Erjavec <ales.erjavec@…>, 6 months ago (diff)

Moved '_bioinformatics' into orangecontrib namespace.

Line
1## Automatically adapted for numpy.oldnumeric Oct 04, 2007 by
2
3import numpy.oldnumeric as Numeric, numpy.oldnumeric.ma as MA
4import statc
5
6
7def kNNimputeMA(arr2d, K=20, callback=None):
8    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.
9    Find K rows (axis 0) with the most similar values where similarity measure corresponds to weighted Euclidean distance.
10    Imputed value = weighted average of the corresponding values of K nearest neighbours,
11    where weights equal to tricubic distribution of distances to all rows.
12    Impute missing rows by average over all rows.
13    Version: 30.8.2005
14    """
15    arr2d = MA.asarray(arr2d)
16    assert len(arr2d.shape) == 2, "2D array expected"
17    # make a copy for imputation
18    aImp2 = MA.array(arr2d)
19    # leave out columns with 0 known values (columnInd: non-zero columns)
20    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
21    columnIndAll = Numeric.arange(arr2d.shape[1])
22    columnInd = Numeric.compress(columnCond, columnIndAll)
23    # impute the rows where 0 < #known_values < #non_zero_columns, i.e. exclude the rows with 0 and all (non-zero-column) values
24    countByRows = MA.count(arr2d, axis=1)
25    for rowIdx in Numeric.compress(Numeric.logical_and(Numeric.greater(countByRows, 0), Numeric.less(countByRows, columnInd.shape[0])), Numeric.arange(arr2d.shape[0])):
26        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
27        diff = arr2d - rowResized
28        distances = MA.sqrt(MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1))
29        # nearest neighbours row indices (without the current row index)
30        indSorted = MA.argsort(distances)[1:]
31        distSorted = distances.take(indSorted)
32        # number of distances different from MA.masked
34        # number of distances to account for (K or less)
36            weightsSorted = MA.power(1-MA.power(distSorted/distSorted[numNonMasked-1],3),3) # tricubic distribution of all weights
37        else:
38            weightsSorted = Numeric.ones(distSorted.shape[0])
39        # compute average for each column separately in order to account for K non-masked values
40        colInd4CurrRow = Numeric.compress(Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond), columnIndAll)
41        for colIdx in colInd4CurrRow:
42            # column values sorted by distances
43            columnVals = arr2d[:,colIdx].take(indSorted)
44            # take only those weights where columnVals does not equal MA.masked
46            # impute from K (or possibly less) values
47            aImp2[rowIdx,colIdx] = MA.average(columnVals.compressed()[:K], weights=weightsSortedCompressed[:K])
48        if callback:
49            callback()
50    # impute the unknown rows with average profile
51    avrgRow = MA.average(arr2d, 0)
52    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0), Numeric.arange(arr2d.shape[0])):
53        aImp2[rowIdx] = avrgRow
54        if callback:
55            callback()
56    return aImp2
57
58
59def loessMA(m, windowSize, axis=0, approxMasked=True, verbose=False, callback=None):
60    """Returns a new array with values at the given axis smoothed by loess;
62    assumes equidistant spacing of points on the given axis.
63    """
64    assert 0 < windowSize <= m.shape[axis]+0.1, "0 < windowSize[%s] <= 1 OR windowSize in range(1.1,m.shape[axis]+1) expected, got %f" % ("%", windowSize)
65    m = MA.asarray(m)
66    if m.dtype.char <> Numeric.Float:
67        m = m.astype(Numeric.Float)
68    shp_other = list(m.shape)
69    shp_other.pop(axis)
70    # get a transposed and reshaped mask and data from m; if m.mask() == None, construct a new array of zeros
72    data = MA.reshape(MA.transpose(m, [axis] + range(0,axis) + range(axis+1,len(m.shape))), (m.shape[axis], Numeric.multiply.reduce(shp_other)))
74    xall = Numeric.arange(data.shape[0])
75    xallList = xall.tolist()
76    for ii in Numeric.compress(Numeric.add.reduce(maskInv,0) > 1, range(data.shape[1])):    # run loess if the profile contains more than 2 values
77        try:
79        except:
80            if verbose:
81                print "Warning: loessMA: could not loess axis %i index %i" % (axis, ii)
82        if callback:
83            callback()
86    return MA.transpose(MA.reshape(data, [m.shape[axis]] + shp_other), [axis] + range(0,axis) + range(axis+1,len(m.shape)))
87
88
89
90if __name__=="__main__":
91    import numpy.oldnumeric.random_array as RandomArray, time
92
93#   10x4 array
94##    m2 = MA.asarray(RandomArray.random((10, 4)))
101##    m2[5,1] = 0.42423
103##    m2i3 = kNNimputeMA(m2, 3)
104##    m2i30 = kNNimputeMA(m2, 30)
105
106    # 0 column
107##    m3 = MA.concatenate([m2, MA.zeros((10,1))*MA.masked], 1)
108##    m3i3 = kNNimputeMA(m3, 3)
109
110    # 3 rows only
111##    m34 = MA.asarray(RandomArray.random((3,4)))