source: orange/orange/doc/datasets/DATA_INFO.py @ 526:fe2d65da2b2a

Revision 526:fe2d65da2b2a, 4.1 KB checked in by janezd <janez.demsar@…>, 10 years ago (diff)
  • moved documentation from a separate module to this one
Line 
1import os, glob, sys, orange, time, os.path, string
2
# Column names of data_info.txt, in the order in which dataset_statistics()
# fills its result list.
descriptors = [
  'fname',        # data file name
  'inst',         # number of instances
  'size',         # file size
  'att',          # number of attributes
  'categ',        # number of discrete attributes
  'cont',         # number of non-discrete attributes
  '%cont',        # share of non-discrete attributes
  'class',        # class variable name
  'values',       # class type / number of class values
  '%major',       # share of the majority class
  'date',         # file modification date
  'description',  # free-text description
]

# When true, compute_statistics() reports every file it processes.
verbose = 1
5
def build_datasets():
  """Return the names of all ``*.tab`` files in the current directory.

  The names are sorted because glob.glob() returns matches in arbitrary
  order; sorting makes the processing order (and the -list output)
  reproducible between runs.
  """
  return sorted(glob.glob("*.tab"))
11
def dataset_statistics(fname, trace=0):
  """Compute one row of statistics for the data file *fname*.

  The file is loaded with orange.ExampleTable.  The returned list has one
  entry per column of the module-level ``descriptors`` list, in that order:
  file name, number of instances, file size (bytes / 1000, formatted),
  number of attributes, number of discrete attributes, number of other
  attributes, percentage of the latter, class name, class type, percentage
  of the majority class, modification date and a '-' placeholder for the
  description.

  Percentages that cannot be computed (no attributes, no instances, no
  class values) are reported as 'n/a' instead of raising.

  If *trace* is true, the file name and the resulting row are printed.
  """
  data = orange.ExampleTable(fname)
  ninst = len(data)

  # instances and size [kBytes]
  size = '%5.1f' % (os.path.getsize(fname) / 1000.)
  s = [fname, ninst, size]

  # attributes: everything that is not discrete is counted as continuous
  natt = len(data.domain.attributes)
  ndisc = 0
  for a in data.domain.attributes:
    if a.varType == orange.VarTypes.Discrete:
      ndisc += 1
  ncont = natt - ndisc
  if natt:
    pcont = '%5.1f' % (100.0 * ncont / natt)
  else:
    pcont = 'n/a'  # no attributes: avoid dividing by zero
  s = s + [natt, ndisc, ncont, pcont]

  # class name, values, majority class
  cname = cval = cmaj = 'n/a'
  class_var = data.domain.classVar
  if class_var:
    cname = class_var.name
    if class_var.varType == orange.VarTypes.Discrete:  # categorical data set
      nvalues = len(class_var.values)
      cval = 'discrete/' + str(nvalues)
      # the majority share is only defined for a non-empty table whose
      # class has at least one value
      if ninst and nvalues:
        counts = [0] * nvalues
        for e in data:
          counts[int(e.getclass())] += 1
        cmaj = '%5.1f' % (100.0 * max(counts) / ninst)
    else:  # continuous data set
      cval = 'continuous'
  s = s + [cname, cval, cmaj]

  # date of the last modification (UTC)
  rtime = time.gmtime(os.path.getmtime(fname))
  s = s + [time.strftime("%m/%d/%y", rtime)]

  # description placeholder
  s = s + ['-']

  # wrap up
  if trace:
    print('%s %s' % (fname, s))
  return s
56
def compute_statistics(flist, trace=0):
  """Return a dict mapping each file name in *flist* to its statistics row.

  Each row is produced by dataset_statistics(name, trace).  When the
  module-level ``verbose`` flag is true, a progress line is printed before
  each file is processed.
  """
  results = {}
  for name in flist:
    if verbose:
      print("processing %s" % name)
    results[name] = dataset_statistics(name, trace)
  return results
66
def _read_and_archive_info():
  """Read data_info.txt into a dict and rename the file out of the way.

  Returns a dict mapping the first tab-separated field of each line to the
  list of all fields of that line.  Blank lines are skipped, and so is the
  header row written by save_info(), whose first field is 'fname'; without
  that check the header would be reported as a data set named 'fname'.

  If data_info.txt does not exist, an empty dict is returned and nothing
  is renamed.  Otherwise the file is renamed to
  data_info_<month-day-year_hour-minute-second>.txt (local time).
  """
  rows = {}
  if not os.path.exists("data_info.txt"):
    return rows

  with open("data_info.txt") as f:
    for line in f:
      # strip only the line terminator; the last line may not have one
      line = line.rstrip('\r\n')
      if not line:
        continue
      att = line.split('\t')
      if att[0] == 'fname':  # header row, not a data set
        continue
      rows[att[0]] = att

  t = time.strftime("%m-%d-%y_%H-%M-%S", time.localtime(time.time()))
  os.rename('data_info.txt', 'data_info_%s.txt' % t)
  return rows

# obtain past statistics (all fields) from the info file
def get_past():
  """Return {file name: list of all fields} read from data_info.txt.

  The info file, if present, is renamed to a time-stamped backup as a side
  effect.  Returns an empty dict when there is no info file.
  """
  return _read_and_archive_info()

# obtain past descriptions (last field only) from the info file
def get_past_desc():
  """Return {file name: description} read from data_info.txt.

  The description is the last tab-separated field of each line.  The info
  file, if present, is renamed to a time-stamped backup as a side effect.
  Returns an empty dict when there is no info file.
  """
  past_desc = {}
  for name, att in _read_and_archive_info().items():
    past_desc[name] = att[-1]
  return past_desc
97
def save_info(stat):
  """Write the statistics in *stat* to data_info.txt, replacing the file.

  *stat* maps a file name to its row (a list of values).  The first line
  written is the tab-separated module-level ``descriptors`` list; one
  tab-separated line per entry of *stat* follows, ordered by key.  The
  sorted list of keys is also printed.
  """
  # the with-block closes the file even if writing a row fails
  with open("data_info.txt", 'w') as f:
    f.write("\t".join([str(d) for d in descriptors]) + '\n')
    keys = sorted(stat.keys())
    print(keys)
    for k in keys:
      f.write("\t".join([str(v) for v in stat[k]]) + "\n")
109
def help():
  """Print the command-line usage summary."""
  usage = (
    'data_info.py [-help|-list|-update|-add]',
    '  -help   prints this message',
    '  -list   lists statistics for data files',
    '  -update updates statistics in data_info.txt, maintains description fields',
    '  -add    adds statistics for data files not present in data_info.txt',
  )
  for text in usage:
    print(text)
116
def main():
  """Run the action selected on the command line.

  -help    print the usage summary
  -list    compute and print statistics for all data files
  -add     keep the rows already stored in data_info.txt and add rows for
           data files that are not listed there yet
  -update  recompute all statistics, carrying over stored descriptions

  -update is also what runs when none of the other options is given.
  """
  flist = build_datasets()
  if '-help' in sys.argv:
    help()
  elif '-list' in sys.argv:
    compute_statistics(flist, trace=1)
  elif '-add' in sys.argv:
    past = get_past()
    fnew = [f for f in flist if f not in past]
    print('new= %s' % (fnew,))
    stat = compute_statistics(fnew)
    # append past statistics
    stat.update(past)
    save_info(stat)
  else:
    # -update, and the default action.  Only the file name and description
    # are read from the old info file; all other fields are recomputed, so
    # this also works after the set of descriptive fields has changed.
    past_desc = get_past_desc()
    stat = compute_statistics(flist)
    for k in stat:
      if k in past_desc:
        stat[k][-1] = past_desc[k]
    save_info(stat)
145 
# Run only when executed as a script, so that importing this module does not
# rename and rewrite data_info.txt as a side effect.
if __name__ == "__main__":
  main()
Note: See TracBrowser for help on using the repository browser.