| 1 | /* |
|---|
| 2 | This file is part of Orange. |
|---|
| 3 | |
|---|
| 4 | Copyright 1996-2010 Faculty of Computer and Information Science, University of Ljubljana |
|---|
| 5 | Contact: janez.demsar@fri.uni-lj.si |
|---|
| 6 | |
|---|
| 7 | Orange is free software: you can redistribute it and/or modify |
|---|
| 8 | it under the terms of the GNU General Public License as published by |
|---|
| 9 | the Free Software Foundation, either version 3 of the License, or |
|---|
| 10 | (at your option) any later version. |
|---|
| 11 | |
|---|
| 12 | Orange is distributed in the hope that it will be useful, |
|---|
| 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 15 | GNU General Public License for more details. |
|---|
| 16 | |
|---|
| 17 | You should have received a copy of the GNU General Public License |
|---|
| 18 | along with Orange. If not, see <http://www.gnu.org/licenses/>. |
|---|
| 19 | */ |
|---|
| 20 | |
|---|
| 21 | |
|---|
| 22 | #ifdef _MSC_VER |
|---|
| 23 | #pragma warning (disable : 4786 4114 4018 4267 4244) |
|---|
| 24 | #endif |
|---|
| 25 | |
|---|
| 26 | #include "vars.hpp" |
|---|
| 27 | #include "domain.hpp" |
|---|
| 28 | #include "examplegen.hpp" |
|---|
| 29 | #include "table.hpp" |
|---|
| 30 | |
|---|
| 31 | #include "cls_orange.hpp" |
|---|
| 32 | #include "externs.px" |
|---|
| 33 | |
|---|
| 34 | PVarList knownVars(PyObject *keywords); // defined in lib_kernel.cpp |
|---|
| 35 | TMetaVector *knownMetas(PyObject *keywords); // ibid |
|---|
| 36 | PDomain knownDomain(PyObject *keywords); // ibid |
|---|
| 37 | |
|---|
| 38 | PyObject *encodeStatus(const vector<int> &Status); // in cls_misc.cpp |
|---|
| 39 | PyObject *encodeStatus(const vector<pair<int, int> > &metaStatus); |
|---|
| 40 | |
|---|
| 41 | /* ************ FILE EXAMPLE GENERATORS ************ */ |
|---|
| 42 | |
|---|
| 43 | #include "filegen.hpp" |
|---|
| 44 | BASED_ON(FileExampleGenerator, ExampleGenerator) |
|---|
| 45 | |
|---|
| 46 | #include "tabdelim.hpp" |
|---|
| 47 | #include "c45inter.hpp" |
|---|
| 48 | #include "basket.hpp" |
|---|
| 49 | |
|---|
| 50 | |
|---|
/* Splits 'name' at its last '.'.

   On success returns true, stores everything before the dot in 'before' and
   everything from the dot onwards (the dot INCLUDED, e.g. ".tab") in 'after'.
   If 'name' contains no dot -- which covers the empty string -- returns false
   and leaves 'before' and 'after' untouched.

   The search looks at the whole string, so a dot in a directory component
   also counts when the file part itself has none. */
bool divDot(const std::string &name, std::string &before, std::string &after)
{
  const std::string::size_type dot = name.rfind('.');
  if (dot == std::string::npos)
    return false;

  // Build both parts before assigning, so the result is right even when an
  // output argument refers to the same string as 'name'.
  std::string head(name, 0, dot);
  std::string tail(name, dot);
  before.swap(head);
  after.swap(tail);
  return true;
}
|---|
| 59 | |
|---|
| 60 | |
|---|
| 61 | NO_PICKLE(BasketExampleGenerator) |
|---|
| 62 | NO_PICKLE(C45ExampleGenerator) |
|---|
| 63 | NO_PICKLE(FileExampleGenerator) |
|---|
| 64 | NO_PICKLE(TabDelimExampleGenerator) |
|---|
| 65 | NO_PICKLE(BasketFeeder) |
|---|
| 66 | |
|---|
| 67 | BASED_ON(BasketFeeder, Orange) |
|---|
| 68 | |
|---|
| 69 | |
|---|
| 70 | |
|---|
| 71 | |
|---|
| 72 | PyObject *TabDelimExampleGenerator_new(PyTypeObject *type, PyObject *args, PyObject *keywords) BASED_ON(FileExampleGenerator, "(examples[, use=domain|varlist])") |
|---|
| 73 | { PyTRY |
|---|
| 74 | char *fileName; |
|---|
| 75 | int createNewOn = TVariable::Incompatible; |
|---|
| 76 | if (!PyArg_ParseTuple(args, "s|i:TabDelimExampleGenerator.__new__", &fileName, &createNewOn)) |
|---|
| 77 | return NULL; |
|---|
| 78 | |
|---|
| 79 | string name(fileName), b, a; |
|---|
| 80 | if (!divDot(name, b, a)) |
|---|
| 81 | name+=".tab"; |
|---|
| 82 | |
|---|
| 83 | vector<int> status; |
|---|
| 84 | vector<pair<int, int> > metaStatus; |
|---|
| 85 | TExampleGenerator *egen = mlnew TTabDelimExampleGenerator(name, false, false, createNewOn, status, metaStatus); |
|---|
| 86 | return Py_BuildValue("NNN", WrapNewOrange(egen, type), encodeStatus(status), encodeStatus(metaStatus)); |
|---|
| 87 | PyCATCH |
|---|
| 88 | } |
|---|
| 89 | |
|---|
| 90 | |
|---|
| 91 | PyObject *BasketExampleGenerator_new(PyTypeObject *type, PyObject *args, PyObject *keywords) BASED_ON(FileExampleGenerator, "(examples[, use=domain])") |
|---|
| 92 | { PyTRY |
|---|
| 93 | char *fileName; |
|---|
| 94 | int createNewOn = TVariable::Incompatible; |
|---|
| 95 | if (!PyArg_ParseTuple(args, "s|i:BasketExampleGenerator.__new__", &fileName, &createNewOn)) |
|---|
| 96 | return NULL; |
|---|
| 97 | |
|---|
| 98 | string name(fileName), b, a; |
|---|
| 99 | if (!divDot(name, b, a)) |
|---|
| 100 | name+=".basket"; |
|---|
| 101 | |
|---|
| 102 | vector<int> status; |
|---|
| 103 | vector<pair<int, int> > metaStatus; |
|---|
| 104 | TExampleGenerator *egen = mlnew TBasketExampleGenerator(name, PDomain(), createNewOn, status, metaStatus); |
|---|
| 105 | return Py_BuildValue("NNN", WrapNewOrange(egen, type), encodeStatus(status), encodeStatus(metaStatus)); |
|---|
| 106 | PyCATCH |
|---|
| 107 | } |
|---|
| 108 | |
|---|
| 109 | |
|---|
| 110 | PyObject *BasketFeeder_clearCache(PyObject *, PyObject *) PYARGS(METH_O, "() -> None") |
|---|
| 111 | { PyTRY |
|---|
| 112 | TBasketFeeder::clearCache(); |
|---|
| 113 | RETURN_NONE; |
|---|
| 114 | PyCATCH |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | |
|---|
| 118 | |
|---|
| 119 | PyObject *C45ExampleGenerator_new(PyTypeObject *type, PyObject *args, PyObject *keywords) BASED_ON(FileExampleGenerator, "(examples[, use=domain|varlist])") |
|---|
| 120 | { PyTRY |
|---|
| 121 | char *stem; |
|---|
| 122 | int createNewOn = TVariable::Incompatible; |
|---|
| 123 | if (!PyArg_ParseTuple(args, "s|i:C45ExampleGenerator.__new__", &stem, &createNewOn)) |
|---|
| 124 | return NULL; |
|---|
| 125 | |
|---|
| 126 | string domain, data; |
|---|
| 127 | string b, a; |
|---|
| 128 | if (divDot(stem, b, a)) |
|---|
| 129 | { data=stem; domain=b+".names"; } |
|---|
| 130 | else |
|---|
| 131 | { data=string(stem)+".data"; domain=string(stem)+".names"; } |
|---|
| 132 | |
|---|
| 133 | vector<int> status; |
|---|
| 134 | vector<pair<int, int> > metaStatus; |
|---|
| 135 | TExampleGenerator *egen = mlnew TC45ExampleGenerator(data, domain, createNewOn, status, metaStatus); |
|---|
| 136 | return Py_BuildValue("NNO", WrapNewOrange(egen, type), encodeStatus(status), encodeStatus(metaStatus)); |
|---|
| 137 | PyCATCH |
|---|
| 138 | } |
|---|
| 139 | |
|---|
| 140 | |
|---|
| 141 | |
|---|
| 142 | |
|---|
| 143 | int pt_ExampleGenerator(PyObject *args, void *egen); |
|---|
| 144 | |
|---|
| 145 | void tabDelim_writeDomain(FILE *, PDomain, bool autodetect, char delim = '\t', bool listDiscreteValues = true); |
|---|
| 146 | void tabDelim_writeExamples(FILE *, PExampleGenerator, char delim = '\t', const char *DK = NULL, const char *DC = NULL); |
|---|
| 147 | |
|---|
| 148 | |
|---|
| 149 | FILE *openWReplacedExtension(const char *filename, const char *extension, const char *oldExtension) |
|---|
| 150 | { |
|---|
| 151 | const char *newname = replaceExtension(filename, extension, oldExtension); |
|---|
| 152 | FILE *ostr = fopen(newname, "wt"); |
|---|
| 153 | if (!ostr) |
|---|
| 154 | PyErr_Format(PyExc_SystemError, "cannot open file '%s'", newname); |
|---|
| 155 | mldelete const_cast<char *>(newname); |
|---|
| 156 | return ostr; |
|---|
| 157 | } |
|---|
| 158 | |
|---|
| 159 | |
|---|
| 160 | FILE *openExtended(const char *filename, const char *defaultExtension) |
|---|
| 161 | { |
|---|
| 162 | const char *extension = getExtension(filename); |
|---|
| 163 | const char *extended = extension ? filename : replaceExtension(filename, defaultExtension, NULL); |
|---|
| 164 | FILE *ostr = fopen(extended, "wt"); |
|---|
| 165 | if (!ostr) |
|---|
| 166 | PyErr_Format(PyExc_SystemError, "cannot open file '%s'", extended); |
|---|
| 167 | if (!extension) |
|---|
| 168 | mldelete const_cast<char *>(extended); |
|---|
| 169 | return ostr; |
|---|
| 170 | } |
|---|
| 171 | |
|---|
| 172 | |
|---|
| 173 | int getStringIfExists(PyObject *keyws, const char *name, char *&res) |
|---|
| 174 | { |
|---|
| 175 | PyObject *ldv = PyDict_GetItemString(keyws, name); |
|---|
| 176 | if (ldv) { |
|---|
| 177 | if (!PyString_Check(ldv)) { |
|---|
| 178 | PyErr_Format(PyExc_TypeError, "string value expected for '%s'", name); |
|---|
| 179 | return -1; |
|---|
| 180 | } |
|---|
| 181 | |
|---|
| 182 | res = PyString_AsString(ldv); |
|---|
| 183 | return 0; |
|---|
| 184 | } |
|---|
| 185 | |
|---|
| 186 | return 1; |
|---|
| 187 | } |
|---|
| 188 | |
|---|
| 189 | |
|---|
| 190 | bool readUndefinedSpecs(PyObject *keyws, char *&DK, char *&DC) |
|---|
| 191 | { |
|---|
| 192 | if (keyws) { |
|---|
| 193 | int res; |
|---|
| 194 | |
|---|
| 195 | char *tmp; |
|---|
| 196 | res = getStringIfExists(keyws, "NA", tmp); |
|---|
| 197 | if (res == -1) |
|---|
| 198 | return false; |
|---|
| 199 | if (!res) |
|---|
| 200 | DK = DC = tmp; |
|---|
| 201 | |
|---|
| 202 | res = getStringIfExists(keyws, "DC", DC); |
|---|
| 203 | if (res == -1) |
|---|
| 204 | return false; |
|---|
| 205 | |
|---|
| 206 | res = getStringIfExists(keyws, "DK", DK); |
|---|
| 207 | if (res == -1) |
|---|
| 208 | return false; |
|---|
| 209 | } |
|---|
| 210 | |
|---|
| 211 | return true; |
|---|
| 212 | } |
|---|
| 213 | |
|---|
| 214 | |
|---|
| 215 | PyObject *tabDelimBasedWrite(PyObject *args, PyObject *keyws, const char *defaultExtension, bool skipAttrTypes, char delim, bool listDiscreteValues = true) |
|---|
| 216 | { PyTRY |
|---|
| 217 | char *filename; |
|---|
| 218 | PExampleGenerator gen; |
|---|
| 219 | |
|---|
| 220 | if (!PyArg_ParseTuple(args, "sO&", &filename, pt_ExampleGenerator, &gen)) |
|---|
| 221 | PYERROR(PyExc_TypeError, "string and example generator expected", PYNULL); |
|---|
| 222 | |
|---|
| 223 | if (skipAttrTypes && !gen->domain->classVar) { |
|---|
| 224 | PyErr_Format(PyExc_TypeError, "Format .%s cannot save classless data sets", defaultExtension); |
|---|
| 225 | return PYNULL; |
|---|
| 226 | } |
|---|
| 227 | |
|---|
| 228 | char *DK = NULL, *DC = NULL; |
|---|
| 229 | if (!readUndefinedSpecs(keyws, DK, DC)) |
|---|
| 230 | return PYNULL; |
|---|
| 231 | |
|---|
| 232 | FILE *ostr = openExtended(filename, defaultExtension); |
|---|
| 233 | if (!ostr) |
|---|
| 234 | return PYNULL; |
|---|
| 235 | |
|---|
| 236 | tabDelim_writeDomain(ostr, gen->domain, skipAttrTypes, delim, listDiscreteValues); |
|---|
| 237 | tabDelim_writeExamples(ostr, gen, delim, DK, DC); |
|---|
| 238 | fclose(ostr); |
|---|
| 239 | |
|---|
| 240 | RETURN_NONE |
|---|
| 241 | PyCATCH |
|---|
| 242 | } |
|---|
| 243 | |
|---|
| 244 | |
|---|
| 245 | PyObject *saveTabDelimited(PyObject *, PyObject *args, PyObject *keyws) PYARGS(METH_VARARGS | METH_KEYWORDS, "(filename, examples[, list_discrete_values=1]) -> None") |
|---|
| 246 | { |
|---|
| 247 | bool listDiscrete = true; |
|---|
| 248 | |
|---|
| 249 | if (keyws) { |
|---|
| 250 | PyObject *ldv = PyDict_GetItemString(keyws, "list_discrete_values"); |
|---|
| 251 | if (!ldv) { |
|---|
| 252 | ldv = PyDict_GetItemString(keyws, "listDiscreteValues"); |
|---|
| 253 | } |
|---|
| 254 | listDiscrete = !ldv || (PyObject_IsTrue(ldv)!=0); |
|---|
| 255 | } |
|---|
| 256 | |
|---|
| 257 | return tabDelimBasedWrite(args, keyws, "tab", false, '\t', listDiscrete); |
|---|
| 258 | } |
|---|
| 259 | |
|---|
| 260 | PyObject *saveTxt(PyObject *, PyObject *args, PyObject *keyws) PYARGS(METH_VARARGS | METH_KEYWORDS, "(filename, examples) -> None") |
|---|
| 261 | { |
|---|
| 262 | return tabDelimBasedWrite(args, keyws, "txt", true, '\t'); |
|---|
| 263 | } |
|---|
| 264 | |
|---|
| 265 | |
|---|
| 266 | PyObject *saveCsv(PyObject *, PyObject *args, PyObject *keyws) PYARGS(METH_VARARGS | METH_KEYWORDS, "(filename, examples) -> None") |
|---|
| 267 | { |
|---|
| 268 | return tabDelimBasedWrite(args, keyws, "csv", true, ','); |
|---|
| 269 | } |
|---|
| 270 | |
|---|
| 271 | |
|---|
| 272 | void c45_writeDomain(FILE *, PDomain); |
|---|
| 273 | void c45_writeExamples(FILE *, PExampleGenerator); |
|---|
| 274 | |
|---|
| 275 | PyObject *saveC45(PyObject *, PyObject *args) PYARGS(METH_VARARGS, "(filename, examples) -> None") |
|---|
| 276 | { PyTRY |
|---|
| 277 | char *filename; |
|---|
| 278 | PExampleGenerator gen; |
|---|
| 279 | |
|---|
| 280 | if (!PyArg_ParseTuple(args, "sO&", &filename, pt_ExampleGenerator, &gen)) |
|---|
| 281 | PYERROR(PyExc_TypeError, "string and example generator expected", PYNULL) |
|---|
| 282 | |
|---|
| 283 | if (!gen->domain->classVar) |
|---|
| 284 | PYERROR(PyExc_SystemError, "C4.5 file cannot store classless data sets.", PYNULL); |
|---|
| 285 | |
|---|
| 286 | if (gen->domain->classVar->varType!=TValue::INTVAR) |
|---|
| 287 | PYERROR(PyExc_SystemError, "Class in C4.5 file must be discrete.", PYNULL); |
|---|
| 288 | |
|---|
| 289 | const char *oldExtension = getExtension(filename); |
|---|
| 290 | |
|---|
| 291 | FILE *ostr; |
|---|
| 292 | ostr = openWReplacedExtension(filename, "names", oldExtension); |
|---|
| 293 | if (!ostr) |
|---|
| 294 | return PYNULL; |
|---|
| 295 | c45_writeDomain(ostr, gen->domain); |
|---|
| 296 | fclose(ostr); |
|---|
| 297 | |
|---|
| 298 | ostr = openWReplacedExtension(filename, "data", oldExtension); |
|---|
| 299 | if (!ostr) |
|---|
| 300 | return PYNULL; |
|---|
| 301 | c45_writeExamples(ostr, gen); |
|---|
| 302 | fclose(ostr); |
|---|
| 303 | |
|---|
| 304 | RETURN_NONE |
|---|
| 305 | PyCATCH |
|---|
| 306 | } |
|---|
| 307 | |
|---|
| 308 | |
|---|
| 309 | |
|---|
| 310 | #include "spec_gen.hpp" |
|---|
| 311 | |
|---|
| 312 | |
|---|
| 313 | void basket_writeExamples(FILE *, PExampleGenerator, set<int> &missing); |
|---|
| 314 | void raiseWarning(bool, const char *s); |
|---|
| 315 | |
|---|
| 316 | PyObject *saveBasket(PyObject *, PyObject *args) PYARGS(METH_VARARGS, "(filename, examples) -> None") |
|---|
| 317 | { PyTRY |
|---|
| 318 | char *filename; |
|---|
| 319 | PExampleGenerator gen; |
|---|
| 320 | |
|---|
| 321 | if (!PyArg_ParseTuple(args, "sO&:saveBasket", &filename, pt_ExampleGenerator, &gen)) |
|---|
| 322 | return PYNULL; |
|---|
| 323 | |
|---|
| 324 | if (gen->domain->variables->size()) |
|---|
| 325 | PYERROR(PyExc_TypeError, ".basket format can only store meta-attribute values", PYNULL); |
|---|
| 326 | |
|---|
| 327 | FILE *ostr = openExtended(filename, "basket"); |
|---|
| 328 | if (!ostr) |
|---|
| 329 | return PYNULL; |
|---|
| 330 | |
|---|
| 331 | set<int> missing; |
|---|
| 332 | |
|---|
| 333 | try { |
|---|
| 334 | basket_writeExamples(ostr, gen, missing); |
|---|
| 335 | } |
|---|
| 336 | catch (...) { |
|---|
| 337 | fclose(ostr); |
|---|
| 338 | remove(filename); |
|---|
| 339 | throw; |
|---|
| 340 | } |
|---|
| 341 | |
|---|
| 342 | fclose(ostr); |
|---|
| 343 | |
|---|
| 344 | if (!missing.empty()) { |
|---|
| 345 | if (missing.size() == 1) { |
|---|
| 346 | char excbuf[512]; |
|---|
| 347 | snprintf(excbuf, 512, "saveBasket: attribute with id %i was not found in Domain and has not been stored", *(missing.begin())); |
|---|
| 348 | raiseWarning(false, excbuf); |
|---|
| 349 | } |
|---|
| 350 | |
|---|
| 351 | else { |
|---|
| 352 | string misss; |
|---|
| 353 | bool comma = false; |
|---|
| 354 | const_ITERATE(set<int>, mi, missing) { |
|---|
| 355 | if (comma) |
|---|
| 356 | misss += ", "; |
|---|
| 357 | else |
|---|
| 358 | comma = true; |
|---|
| 359 | |
|---|
| 360 | char ns[20]; |
|---|
| 361 | sprintf(ns, "%i", (*mi)); |
|---|
| 362 | misss += ns; |
|---|
| 363 | } |
|---|
| 364 | |
|---|
| 365 | char *excbuf = mlnew char[misss.length() + 128]; |
|---|
| 366 | sprintf(excbuf, "saveBasket: attributes with ids not found in Domain have not been stored (%s)", misss.c_str()); |
|---|
| 367 | try { |
|---|
| 368 | raiseWarning(false, excbuf); |
|---|
| 369 | } |
|---|
| 370 | catch (...) { |
|---|
| 371 | mldelete excbuf; |
|---|
| 372 | throw; |
|---|
| 373 | } |
|---|
| 374 | |
|---|
| 375 | mldelete excbuf; |
|---|
| 376 | } |
|---|
| 377 | } |
|---|
| 378 | |
|---|
| 379 | RETURN_NONE |
|---|
| 380 | PyCATCH |
|---|
| 381 | } |
|---|
| 382 | |
|---|
| 383 | |
|---|
| 384 | #include "lib_io.px" |
|---|