source: orange/source/orange/lib_io.cpp @ 10460:f2b41eb984eb

Revision 10460:f2b41eb984eb, 12.2 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Accept unicode objects in sys.getfilesystemencoding() as filenames to Table save/load functions.

Line 
1/*
2    This file is part of Orange.
3   
4    Copyright 1996-2010 Faculty of Computer and Information Science, University of Ljubljana
5    Contact: janez.demsar@fri.uni-lj.si
6
7    Orange is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation, either version 3 of the License, or
10    (at your option) any later version.
11
12    Orange is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with Orange.  If not, see <http://www.gnu.org/licenses/>.
19*/
20
21
22#ifdef _MSC_VER
23  #pragma warning (disable : 4786 4114 4018 4267 4244)
24#endif
25
26#include "vars.hpp"
27#include "domain.hpp"
28#include "examplegen.hpp"
29#include "table.hpp"
30
31#include "cls_orange.hpp"
32#include "externs.px"
33
34PVarList knownVars(PyObject *keywords); // defined in lib_kernel.cpp
35TMetaVector *knownMetas(PyObject *keywords); // ibid
36PDomain knownDomain(PyObject *keywords); // ibid
37
38PyObject *encodeStatus(const vector<int> &Status);  // in cls_misc.cpp
39PyObject *encodeStatus(const vector<pair<int, int> > &metaStatus);
40
41
42/* Same as sys.getfilesystemencoding()
43 * (the returned pointer points to a PyString_Object internal buffer
44 * and should not be modified).
45 */
46char* getFileSystemEncoding()
47{
48    PyObject *fsencoding = PySys_GetObject("getfilesystemencoding"); // Borrowed ref
49    fsencoding = PyObject_CallObject(fsencoding, NULL); // This should be a string.
50    assert(PyString_Check(fsencoding));
51    return PyString_AsString(fsencoding);
52}
53
54/* ************ FILE EXAMPLE GENERATORS ************ */
55
56#include "filegen.hpp"
57BASED_ON(FileExampleGenerator, ExampleGenerator)
58
59#include "tabdelim.hpp"
60#include "c45inter.hpp"
61#include "basket.hpp"
62
63
/* Splits `name` at its last '.'.
 *
 * On success returns true, stores everything before the last dot in
 * `before` and everything from the dot onwards (the dot included) in
 * `after`. When `name` contains no dot -- including when it is empty --
 * returns false and leaves `before` and `after` untouched.
 */
bool divDot(const std::string &name, std::string &before, std::string &after)
{
  const std::string::size_type dot = name.rfind('.');
  if (dot == std::string::npos)
    return false;

  before.assign(name, 0, dot);
  // `after` deliberately keeps the leading '.', as it always has.
  after.assign(name, dot, std::string::npos);
  return true;
}
72
73
74NO_PICKLE(BasketExampleGenerator)
75NO_PICKLE(C45ExampleGenerator)
76NO_PICKLE(FileExampleGenerator)
77NO_PICKLE(TabDelimExampleGenerator)
78NO_PICKLE(BasketFeeder)
79
80BASED_ON(BasketFeeder, Orange)
81
82
83
84
85PyObject *TabDelimExampleGenerator_new(PyTypeObject *type, PyObject *args, PyObject *keywords) BASED_ON(FileExampleGenerator, "(examples[, use=domain|varlist])")
86{ PyTRY
87    char *fileName;
88    int createNewOn = TVariable::Incompatible;
89    if (!PyArg_ParseTuple(args, "s|i:TabDelimExampleGenerator.__new__", &fileName, &createNewOn))
90      return NULL;
91
92    string name(fileName), b, a;
93    if (!divDot(name, b, a))
94      name+=".tab";
95   
96    vector<int> status;
97    vector<pair<int, int> > metaStatus;
98    TExampleGenerator *egen = mlnew TTabDelimExampleGenerator(name, false, false, createNewOn, status, metaStatus);
99    return Py_BuildValue("NNN", WrapNewOrange(egen, type), encodeStatus(status), encodeStatus(metaStatus));
100  PyCATCH
101}
102
103
104PyObject *BasketExampleGenerator_new(PyTypeObject *type, PyObject *args, PyObject *keywords) BASED_ON(FileExampleGenerator, "(examples[, use=domain])")
105{ PyTRY
106    char *fileName;
107    int createNewOn = TVariable::Incompatible;
108    if (!PyArg_ParseTuple(args, "s|i:BasketExampleGenerator.__new__", &fileName, &createNewOn))
109      return NULL;
110
111    string name(fileName), b, a;
112    if (!divDot(name, b, a))
113      name+=".basket";
114
115    vector<int> status;
116    vector<pair<int, int> > metaStatus;
117    TExampleGenerator *egen = mlnew TBasketExampleGenerator(name, PDomain(), createNewOn, status, metaStatus);
118    return Py_BuildValue("NNN", WrapNewOrange(egen, type), encodeStatus(status), encodeStatus(metaStatus));
119  PyCATCH
120}
121
122
123PyObject *BasketFeeder_clearCache(PyObject *, PyObject *) PYARGS(METH_O, "() -> None")
124{ PyTRY
125    TBasketFeeder::clearCache();
126    RETURN_NONE;
127  PyCATCH
128}
129
130
131
132PyObject *C45ExampleGenerator_new(PyTypeObject *type, PyObject *args, PyObject *keywords) BASED_ON(FileExampleGenerator, "(examples[, use=domain|varlist])")
133{ PyTRY
134    char *stem;
135    int createNewOn = TVariable::Incompatible;
136    if (!PyArg_ParseTuple(args, "s|i:C45ExampleGenerator.__new__", &stem, &createNewOn))
137      return NULL;
138
139    string domain, data;
140    string b, a;
141    if (divDot(stem, b, a))
142      { data=stem; domain=b+".names"; }
143    else
144      { data=string(stem)+".data"; domain=string(stem)+".names"; }
145
146    vector<int> status;
147    vector<pair<int, int> > metaStatus;
148    TExampleGenerator *egen = mlnew TC45ExampleGenerator(data, domain, createNewOn, status, metaStatus);
149    return Py_BuildValue("NNO", WrapNewOrange(egen, type), encodeStatus(status), encodeStatus(metaStatus));
150  PyCATCH
151}
152
153
154
155
156int pt_ExampleGenerator(PyObject *args, void *egen);
157
158void tabDelim_writeDomain(FILE *, PDomain, bool autodetect, char delim = '\t', bool listDiscreteValues = true);
159void tabDelim_writeExamples(FILE *, PExampleGenerator, char delim = '\t', const char *DK = NULL, const char *DC = NULL);
160
161
162FILE *openWReplacedExtension(const char *filename, const char *extension, const char *oldExtension)
163{
164  const char *newname = replaceExtension(filename, extension, oldExtension);
165  FILE *ostr = fopen(newname, "wt");
166  if (!ostr)
167    PyErr_Format(PyExc_SystemError, "cannot open file '%s'", newname);
168  mldelete const_cast<char *>(newname);
169  return ostr;
170}
171
172   
173FILE *openExtended(const char *filename, const char *defaultExtension)
174{
175  const char *extension = getExtension(filename);
176  const char *extended = extension ? filename : replaceExtension(filename, defaultExtension, NULL);
177  FILE *ostr = fopen(extended, "wt");
178  if (!ostr)
179    PyErr_Format(PyExc_SystemError, "cannot open file '%s'", extended);
180  if (!extension)
181    mldelete const_cast<char *>(extended);
182  return ostr;
183}
184
185
186int getStringIfExists(PyObject *keyws, const char *name, char *&res)
187{
188  PyObject *ldv = PyDict_GetItemString(keyws, name);
189  if (ldv) {
190    if (!PyString_Check(ldv)) {
191      PyErr_Format(PyExc_TypeError, "string value expected for '%s'", name);
192      return -1;
193    }
194   
195    res = PyString_AsString(ldv);
196    return 0;
197  }
198
199  return 1;
200}
201
202
203bool readUndefinedSpecs(PyObject *keyws, char *&DK, char *&DC)
204{
205  if (keyws) {
206    int res;
207
208    char *tmp;
209    res = getStringIfExists(keyws, "NA", tmp);
210    if (res == -1)
211      return false;
212    if (!res)
213      DK = DC = tmp;
214
215    res = getStringIfExists(keyws, "DC", DC);
216    if (res == -1)
217      return false;
218
219    res = getStringIfExists(keyws, "DK", DK);
220    if (res == -1)
221      return false;
222  }
223
224  return true;
225}
226
227
228PyObject *tabDelimBasedWrite(PyObject *args, PyObject *keyws, const char *defaultExtension, bool skipAttrTypes, char delim, bool listDiscreteValues = true)
229{ PyTRY
230    char *filename;
231    bool free_filename = false;
232    PExampleGenerator gen;
233
234    if (!PyArg_ParseTuple(args, "sO&", &filename, pt_ExampleGenerator, &gen))
235    {
236        char *encoding = getFileSystemEncoding();
237        if (!PyArg_ParseTuple(args, "esO&", encoding, &filename, pt_ExampleGenerator, &gen))
238            PYERROR(PyExc_TypeError, "string and example generator expected", PYNULL);
239        PyErr_Clear();
240        free_filename = true;
241    }
242
243    if (skipAttrTypes && !gen->domain->classVar) {
244      PyErr_Format(PyExc_TypeError, "Format .%s cannot save classless data sets", defaultExtension);
245      if (free_filename)
246          PyMem_Free(filename);
247      return PYNULL;
248    }
249   
250    char *DK = NULL, *DC = NULL;
251    if (!readUndefinedSpecs(keyws, DK, DC))
252    {
253      if (free_filename)
254          PyMem_Free(filename);
255      return PYNULL;
256    }
257 
258    FILE *ostr = openExtended(filename, defaultExtension);
259    if (!ostr)
260    {
261      if (free_filename)
262          PyMem_Free(filename);
263      return PYNULL;
264    }
265
266    if (free_filename)
267        PyMem_Free(filename);
268    tabDelim_writeDomain(ostr, gen->domain, skipAttrTypes, delim, listDiscreteValues);
269    tabDelim_writeExamples(ostr, gen, delim, DK, DC);
270    fclose(ostr);
271
272    RETURN_NONE
273  PyCATCH
274}
275
276
277PyObject *saveTabDelimited(PyObject *, PyObject *args, PyObject *keyws) PYARGS(METH_VARARGS | METH_KEYWORDS, "(filename, examples[, list_discrete_values=1]) -> None")
278{
279  bool listDiscrete = true;
280
281  if (keyws) {
282    PyObject *ldv = PyDict_GetItemString(keyws, "list_discrete_values");
283    if (!ldv) {
284        ldv = PyDict_GetItemString(keyws, "listDiscreteValues");
285    }
286    listDiscrete = !ldv || (PyObject_IsTrue(ldv)!=0);
287  }
288
289  return tabDelimBasedWrite(args, keyws, "tab", false, '\t', listDiscrete);
290}
291
292PyObject *saveTxt(PyObject *, PyObject *args, PyObject *keyws) PYARGS(METH_VARARGS | METH_KEYWORDS, "(filename, examples) -> None")
293{
294  return tabDelimBasedWrite(args, keyws, "txt", true, '\t');
295}
296
297
298PyObject *saveCsv(PyObject *, PyObject *args, PyObject *keyws) PYARGS(METH_VARARGS | METH_KEYWORDS, "(filename, examples) -> None")
299{
300  return tabDelimBasedWrite(args, keyws, "csv", true, ',');
301}
302
303
304void c45_writeDomain(FILE *, PDomain);
305void c45_writeExamples(FILE *, PExampleGenerator);
306
307PyObject *saveC45(PyObject *, PyObject *args) PYARGS(METH_VARARGS, "(filename, examples) -> None")
308{ PyTRY
309    char *filename;
310    PExampleGenerator gen;
311
312    if (!PyArg_ParseTuple(args, "sO&", &filename, pt_ExampleGenerator, &gen))
313      PYERROR(PyExc_TypeError, "string and example generator expected", PYNULL)
314 
315    if (!gen->domain->classVar)
316      PYERROR(PyExc_SystemError, "C4.5 file cannot store classless data sets.", PYNULL);
317
318    if (gen->domain->classVar->varType!=TValue::INTVAR)
319      PYERROR(PyExc_SystemError, "Class in C4.5 file must be discrete.", PYNULL);
320
321    const char *oldExtension = getExtension(filename);
322
323    FILE *ostr;
324    ostr = openWReplacedExtension(filename, "names", oldExtension);
325    if (!ostr)
326      return PYNULL;
327    c45_writeDomain(ostr, gen->domain);
328    fclose(ostr);
329
330    ostr = openWReplacedExtension(filename, "data", oldExtension);
331    if (!ostr)
332      return PYNULL;
333    c45_writeExamples(ostr, gen);
334    fclose(ostr);
335
336    RETURN_NONE
337  PyCATCH
338}
339
340
341
342#include "spec_gen.hpp"
343
344
345void basket_writeExamples(FILE *, PExampleGenerator, set<int> &missing);
346void raiseWarning(bool, const char *s);
347
348PyObject *saveBasket(PyObject *, PyObject *args) PYARGS(METH_VARARGS, "(filename, examples) -> None")
349{ PyTRY
350    char *filename;
351    bool free_filename = false;
352    PExampleGenerator gen;
353
354    if (!PyArg_ParseTuple(args, "sO&:saveBasket", &filename, pt_ExampleGenerator, &gen))
355    {
356      char *encoding = getFileSystemEncoding();
357      if (!PyArg_ParseTuple(args, "esO&:saveBasket", encoding, &filename, pt_ExampleGenerator, &gen))
358          return PYNULL;
359      PyErr_Clear();
360      free_filename = true;
361    }
362
363    if (gen->domain->variables->size())
364    {
365      if (free_filename)
366          PyMem_Free(filename);
367      PYERROR(PyExc_TypeError, ".basket format can only store meta-attribute values", PYNULL);
368    }
369
370    FILE *ostr = openExtended(filename, "basket");
371
372    if (!ostr)
373    {
374      if (free_filename)
375          PyMem_Free(filename);
376      return PYNULL;
377    }
378
379    set<int> missing;
380
381    try {
382      basket_writeExamples(ostr, gen, missing);
383    }
384    catch (...) {
385      fclose(ostr);
386      remove(filename);
387      if (free_filename)
388          PyMem_Free(filename);
389      throw;
390    }
391
392    fclose(ostr);
393
394    if (free_filename)
395        PyMem_Free(filename);
396
397    if (!missing.empty()) {
398      if (missing.size() == 1) {
399        char excbuf[512];
400        snprintf(excbuf, 512, "saveBasket: attribute with id %i was not found in Domain and has not been stored", *(missing.begin()));
401        raiseWarning(false, excbuf);
402      }
403
404      else {
405        string misss;
406        bool comma = false;
407        const_ITERATE(set<int>, mi, missing) {
408          if (comma)
409            misss += ", ";
410          else
411            comma = true;
412
413          char ns[20];
414          sprintf(ns, "%i", (*mi));
415          misss += ns;
416        }
417
418        char *excbuf = mlnew char[misss.length() + 128];
419        sprintf(excbuf, "saveBasket: attributes with ids not found in Domain have not been stored (%s)", misss.c_str());
420        try {
421          raiseWarning(false, excbuf);
422        }
423        catch (...) {
424          mldelete excbuf;
425          throw;
426        }
427
428        mldelete excbuf;
429      }
430    }
431
432    RETURN_NONE
433  PyCATCH
434}
435
436
437#include "lib_io.px"
Note: See TracBrowser for help on using the repository browser.