#
source:
orange/orange/Orange/statistics/distribution.py
@
9526:cc86517bcbb0

Revision 9526:cc86517bcbb0, 12.3 KB checked in by markotoplak, 2 years ago (diff) |
---|

Line | |
---|---|

1 | """ |

2 | ============= |

3 | Distributions |

4 | ============= |

5 | |

6 | :obj:`Distribution` and derived classes store empirical |

7 | distributions of discrete and continuous variables. |

8 | |

9 | .. class:: Distribution |

10 | |

11 | This class can |

12 | store absolute or relative frequencies. It provides a convenience constructor |

13 | which constructs instances of derived classes. :: |

14 | |

15 | >>> import Orange |

16 | >>> data = Orange.data.Table("adult_sample") |

17 | >>> disc = Orange.statistics.distribution.Distribution("workclass", data) |

18 | >>> print disc |

19 | <685.000, 72.000, 28.000, 29.000, 59.000, 43.000, 2.000> |

20 | >>> print type(disc) |

21 | <type 'DiscDistribution'> |

22 | |

23 | The resulting distribution is of type :obj:`DiscDistribution` since variable |

24 | `workclass` is discrete. The printed numbers are counts of examples that have a particular |

25 | attribute value. :: |

26 | |

27 | >>> workclass = data.domain["workclass"] |

28 | >>> for i in range(len(workclass.values)): |

29 | ... print "%20s: %5.3f" % (workclass.values[i], disc[i]) |

30 | Private: 685.000 |

31 | Self-emp-not-inc: 72.000 |

32 | Self-emp-inc: 28.000 |

33 | Federal-gov: 29.000 |

34 | Local-gov: 59.000 |

35 | State-gov: 43.000 |

36 | Without-pay: 2.000 |

37 | Never-worked: 0.000 |

38 | |

39 | Distributions resemble dictionaries, supporting indexing by instances of |

40 | :obj:`Orange.data.Value`, integers or floats (depending on the distribution |

41 | type), and symbolic names (if :obj:`variable` is defined). |

42 | |

43 | For instance, the number of examples with `workclass="private"` can be |

44 | obtained in three ways:: |

45 | |

46 | print "Private: ", disc["Private"] |

47 | print "Private: ", disc[0] |

48 | print "Private: ", disc[Orange.data.Value(workclass, "Private")] |

49 | |

50 | Elements cannot be removed from distributions. |

51 | |

52 | Length of distribution equals the number of possible values for discrete |

53 | distributions (if :obj:`variable` is set), the value with the highest index |

54 | encountered (if distribution is discrete and :obj:`variable` is |

55 | :obj:`None`) or the number of different values encountered (for continuous |

56 | distributions). |

57 | |

58 | .. attribute:: variable |

59 | |

60 | Variable to which the distribution applies; may be :obj:`None` if not |

61 | applicable. |

62 | |

63 | .. attribute:: unknowns |

64 | |

65 | The number of instances for which the value of the variable was |

66 | undefined. |

67 | |

68 | .. attribute:: abs |

69 | |

70 | Sum of all elements in the distribution. Usually it equals either |

71 | :obj:`cases` if the instance stores absolute frequencies or 1 if the |

72 | stored frequencies are relative, e.g. after calling :obj:`normalize`. |

73 | |

74 | .. attribute:: cases |

75 | |

76 | The number of instances from which the distribution is computed, |

77 | excluding those on which the value was undefined. If instances were |

78 | weighted, this is the sum of weights. |

79 | |

80 | .. attribute:: normalized |

81 | |

82 | :obj:`True` if distribution is normalized. |

83 | |

84 | .. attribute:: randomGenerator |

85 | |

86 | A pseudo-random number generator used for method :obj:`random`. |

87 | |

88 | .. method:: __init__(variable[, data[, weightId=0]]) |

89 | |

90 | Construct either :obj:`DiscDistribution` or :obj:`ContDistribution`, |

91 | depending on the variable type. If the variable is the only argument, it |

92 | must be an instance of :obj:`Orange.data.variable.Variable`. In that case, |

93 | an empty distribution is constructed. If data is given as well, the |

94 | variable can also be specified by name or index in the |

95 | domain. Constructor then computes the distribution of the specified |

96 | variable on the given data. If instances are weighted, the id of |

97 | meta-attribute with weights can be passed as the third argument. |

98 | |

99 | If variable is given by descriptor, it doesn't need to exist in the |

100 | domain, but it must be computable from given instances. For example, the |

101 | variable can be a discretized version of a variable from data. |

102 | |

103 | .. method:: keys() |

104 | |

105 | Return a list of possible values (if distribution is discrete and |

106 | :obj:`variable` is set) or a list of encountered values otherwise. |

107 | |

108 | .. method:: values() |

109 | |

110 | Return a list of frequencies of values such as described above. |

111 | |

112 | .. method:: items() |

113 | |

114 | Return a list of pairs of elements of the above lists. |

115 | |

116 | .. method:: native() |

117 | |

118 | Return the distribution as a list (for discrete distributions) or as a |

119 | dictionary (for continuous distributions) |

120 | |

121 | .. method:: add(value[, weight=1]) |

122 | |

123 | Increase the count of the element corresponding to ``value`` by |

124 | ``weight``. |

125 | |

126 | :param value: Value |

127 | :type value: :obj:`Orange.data.Value`, string (if :obj:`variable` is set), :obj:`int` for discrete distributions or :obj:`float` for continuous distributions |

128 | :param weight: Weight to be added to the count for ``value`` |

129 | :type weight: float |

130 | |

131 | .. method:: normalize() |

132 | |

133 | Divide the counts by their sum, set :obj:`normalized` to :obj:`True` and |

134 | :obj:`abs` to 1. Attributes :obj:`cases` and :obj:`unknowns` are |

135 | unchanged. This changes absolute frequencies into relative ones. |

136 | |

137 | .. method:: modus() |

138 | |

139 | Return the most common value. If there are multiple such values, one is |

140 | chosen at random, although the chosen value will always be the same for |

141 | the same distribution. |

142 | |

143 | .. method:: random() |

144 | |

145 | Return a random value based on the stored empirical probability |

146 | distribution. For continuous distributions, this will always be one of |

147 | the values which actually appeared (e.g. one of the values from |

148 | :obj:`keys`). |

149 | |

150 | The method uses :obj:`randomGenerator`. If none has been constructed or |

151 | assigned yet, a new one is constructed and stored for further use. |

152 | |

153 | |

154 | .. class:: Discrete |

155 | |

156 | Stores a discrete distribution of values. The class differs from its parent |

157 | class in having a few additional constructors. |

158 | |

159 | .. method:: __init__(variable) |

160 | |

161 | Construct an instance of :obj:`Discrete` and set the variable |

162 | attribute. |

163 | |

164 | :param variable: A discrete variable |

165 | :type variable: Orange.data.variable.Discrete |

166 | |

167 | .. method:: __init__(frequencies) |

168 | |

169 | Construct an instance and initialize the frequencies from the list, but |

170 | leave `Distribution.variable` empty. |

171 | |

172 | :param frequencies: A list of frequencies |

173 | :type frequencies: list |

174 | |

175 | Distribution constructed in this way can be used, for instance, to |

176 | generate random numbers from a given discrete distribution:: |

177 | |

178 | disc = Orange.statistics.distribution.Discrete([0.5, 0.3, 0.2]) |

179 | for i in range(20): |

180 | print disc.random(), |

181 | |

182 | This prints out approximately ten 0's, six 1's and four 2's. The values |

183 | can be named by assigning a variable:: |

184 | |

185 | v = Orange.data.variable.Discrete(values = ["red", "green", "blue"]) |

186 | disc.variable = v |

187 | |

188 | .. method:: __init__(distribution) |

189 | |

190 | Copy constructor; makes a shallow copy of the given distribution |

191 | |

192 | :param distribution: An existing discrete distribution |

193 | :type distribution: Discrete |

194 | |

195 | |

196 | .. class:: Continuous |

197 | |

198 | Stores a continuous distribution, that is, a dictionary-like structure with |

199 | values and their frequencies. |

200 | |

201 | .. method:: __init__(variable) |

202 | |

203 | Construct an instance of :obj:`Continuous` and set the variable |

204 | attribute. |

205 | |

206 | :param variable: A continuous variable |

207 | :type variable: Orange.data.variable.Continuous |

208 | |

209 | .. method:: __init__(frequencies) |

210 | |

211 | Construct an instance of :obj:`Continuous` and initialize it from |

212 | the given dictionary with frequencies, whose keys and values must be integers. |

213 | |

214 | :param frequencies: Values and their corresponding frequencies |

215 | :type frequencies: dict |

216 | |

217 | .. method:: __init__(distribution) |

218 | |

219 | Copy constructor; makes a shallow copy of the given distribution |

220 | |

221 | :param distribution: An existing continuous distribution |

222 | :type distribution: Continuous |

223 | |

224 | .. method:: average() |

225 | |

226 | Return the average value. Note that the average can also be |

227 | computed using simpler and faster classes from module |

228 | :obj:`Orange.statistics.basic`. |

229 | |

230 | .. method:: var() |

231 | |

232 | Return the variance of distribution. |

233 | |

234 | .. method:: dev() |

235 | |

236 | Return the standard deviation. |

237 | |

238 | .. method:: error() |

239 | |

240 | Return the standard error. |

241 | |

242 | .. method:: percentile(p) |

243 | |

244 | Return the value at the `p`-th percentile. |

245 | |

246 | :param p: The percentile, must be between 0 and 100 |

247 | :type p: float |

248 | :rtype: float |

249 | |

250 | For example, if `dage` is a continuous distribution, the quartiles can |

251 | be printed by :: |

252 | |

253 | print "Quartiles: %5.3f - %5.3f - %5.3f" % ( |

254 | dage.percentile(25), dage.percentile(50), dage.percentile(75)) |

255 | |

256 | .. method:: density(x) |

257 | |

258 | Return the probability density at `x`. If the value is not in |

259 | :obj:`Distribution.keys`, it is interpolated. |

260 | |

261 | |

262 | .. class:: Gaussian |

263 | |

264 | A class imitating :obj:`Continuous` by returning the statistics and |

265 | densities for Gaussian distribution. The class is meant only as a |

266 | convenient substitution for code which expects an instance of |

267 | :obj:`Distribution`. For general use, Python module :obj:`random` |

268 | provides a comprehensive set of functions for various random distributions. |

269 | |

270 | .. attribute:: mean |

271 | |

272 | The mean value parameter of the Gauss distribution. |

273 | |

274 | .. attribute:: sigma |

275 | |

276 | The standard deviation of the distribution |

277 | |

278 | .. attribute:: abs |

279 | |

280 | The simulated number of instances; in effect, the Gaussian distribution |

281 | density, as returned by method :obj:`density` is multiplied by |

282 | :obj:`abs`. |

283 | |

284 | .. method:: __init__([mean=0, sigma=1]) |

285 | |

286 | Construct an instance, set :obj:`mean` and :obj:`sigma` to the given |

287 | values and :obj:`abs` to 1. |

288 | |

289 | .. method:: __init__(distribution) |

290 | |

291 | Construct a distribution which approximates the given distribution, |

292 | which must be either :obj:`Continuous`, in which case its |

293 | average and deviation will be used for mean and sigma, or an existing |

294 | :obj:`GaussianDistribution`, which will be copied. Attribute :obj:`abs` |

295 | is set to the given distribution's ``abs``. |

296 | |

297 | .. method:: average() |

298 | |

299 | Return :obj:`mean`. |

300 | |

301 | .. method:: dev() |

302 | |

303 | Return :obj:`sigma`. |

304 | |

305 | .. method:: var() |

306 | |

307 | Return square of :obj:`sigma`. |

308 | |

309 | .. method:: density(x) |

310 | |

311 | Return the density at point ``x``, that is, the Gaussian distribution |

312 | density multiplied by :obj:`abs`. |

313 | |

314 | |

315 | Class distributions |

316 | =================== |

317 | |

318 | There is a convenience function for computing empirical class distributions from |

319 | data. |

320 | |

321 | .. function:: getClassDistribution(data[, weightID=0]) |

322 | |

323 | Return a class distribution for the given data. |

324 | |

325 | :param data: A set of instances. |

326 | :type data: Orange.data.Table |

327 | :param weightID: An id for meta attribute with weights of instances |

328 | :type weightID: int |

329 | :rtype: :obj:`Discrete` or :obj:`Continuous`, depending on the class type |

330 | |

331 | Distributions of all variables |

332 | ============================== |

333 | |

334 | Distributions of all variables can be computed and stored in |

335 | :obj:`Domain`. The list-like object can be indexed by variable |

336 | indices in the domain, as well as by variables and their names. |

337 | |

338 | .. class:: Domain |

339 | |

340 | .. method:: __init__(data[, weightID=0]) |

341 | |

342 | Construct an instance with distributions of all discrete and continuous |

343 | variables from the given data. |

344 | |

345 | :param data: A set of instances. |

346 | :type data: Orange.data.Table |

347 | :param weightID: An id for meta attribute with weights of instances |

348 | :type weightID: int |

349 | |

350 | The script below computes distributions for all attributes in the data and |

351 | prints out distributions for discrete and averages for continuous attributes. :: |

352 | |

353 | dist = Orange.statistics.distribution.Domain(data) |

354 | |

355 | for d in dist: |

356 | if d.variable.var_type == Orange.data.Type.Discrete: |

357 | print "%30s: %s" % (d.variable.name, d) |

358 | else: |

359 | print "%30s: avg. %5.3f" % (d.variable.name, d.average()) |

360 | |

361 | The distribution for, say, attribute `age` can be obtained by its index and also |

362 | by its name:: |

363 | |

364 | dist_age = dist["age"] |

365 | |

366 | """ |

367 | |

368 | |

369 | from Orange.core import Distribution |

370 | from Orange.core import DiscDistribution as Discrete |

371 | from Orange.core import ContDistribution as Continuous |

372 | from Orange.core import GaussianDistribution as Gaussian |

373 | |

374 | from Orange.core import DomainDistributions as Domain |

**Note:** See TracBrowser for help on using the repository browser.