| 1 | /* |
|---|
| 2 | This file is part of Orange. |
|---|
| 3 | |
|---|
| 4 | Copyright 1996-2010 Faculty of Computer and Information Science, University of Ljubljana |
|---|
| 5 | Contact: janez.demsar@fri.uni-lj.si |
|---|
| 6 | |
|---|
| 7 | Orange is free software: you can redistribute it and/or modify |
|---|
| 8 | it under the terms of the GNU General Public License as published by |
|---|
| 9 | the Free Software Foundation, either version 3 of the License, or |
|---|
| 10 | (at your option) any later version. |
|---|
| 11 | |
|---|
| 12 | Orange is distributed in the hope that it will be useful, |
|---|
| 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 15 | GNU General Public License for more details. |
|---|
| 16 | |
|---|
| 17 | You should have received a copy of the GNU General Public License |
|---|
| 18 | along with Orange. If not, see <http://www.gnu.org/licenses/>. |
|---|
| 19 | */ |
|---|
| 20 | |
|---|
| 21 | |
|---|
| 22 | #include <string> |
|---|
| 23 | #include <vector> |
|---|
| 24 | #include <list> |
|---|
| 25 | #include <map> |
|---|
| 26 | |
|---|
| 27 | #include <math.h> |
|---|
| 28 | #include "stladdon.hpp" |
|---|
| 29 | #include "strings.hpp" |
|---|
| 30 | #include "getarg.hpp" |
|---|
| 31 | |
|---|
| 32 | #include "values.hpp" |
|---|
| 33 | #include "vars.hpp" |
|---|
| 34 | #include "stringvars.hpp" |
|---|
| 35 | #include "pythonvars.hpp" |
|---|
| 36 | #include "domain.hpp" |
|---|
| 37 | #include "examples.hpp" |
|---|
| 38 | |
|---|
| 39 | #include "tabdelim.ppp" |
|---|
| 40 | |
|---|
| 41 | int readTabAtom(TFileExampleIteratorData &fei, vector<string> &atoms, bool escapeSpaces=true, bool csv = false, bool allowEmpty=false); |
|---|
| 42 | bool atomsEmpty(const vector<string> &atoms); |
|---|
| 43 | |
|---|
| 44 | |
|---|
| 45 | const TTabDelimExampleGenerator::TIdentifierDeclaration TTabDelimExampleGenerator::typeIdentifiers[] = |
|---|
| 46 | {{"discrete", 0, TValue::INTVAR}, {"d", 0, TValue::INTVAR}, |
|---|
| 47 | {"continuous", 0, TValue::FLOATVAR}, {"c", 0, TValue::FLOATVAR}, |
|---|
| 48 | {"string", 0, STRINGVAR}, {"s", 0, STRINGVAR}, |
|---|
| 49 | {"python", 0, PYTHONVAR}, {"python:", 7, PYTHONVAR}, |
|---|
| 50 | {NULL, 0}}; |
|---|
| 51 | |
|---|
| 52 | |
|---|
| 53 | TTabDelimExampleGenerator::TTabDelimExampleGenerator(const TTabDelimExampleGenerator &old) |
|---|
| 54 | : TFileExampleGenerator(old), |
|---|
| 55 | attributeTypes(mlnew TIntList(old.attributeTypes.getReference())), |
|---|
| 56 | classPos(old.classPos), |
|---|
| 57 | headerLines(old.headerLines), |
|---|
| 58 | csv(old.csv) |
|---|
| 59 | {} |
|---|
| 60 | |
|---|
| 61 | |
|---|
| 62 | TTabDelimExampleGenerator::TTabDelimExampleGenerator(const string &afname, bool autoDetect, bool acsv, const int createNewOn, vector<int> &status, vector<pair<int, int> > &metaStatus, const char *aDK, const char *aDC, bool noCodedDiscrete, bool noClass) |
|---|
| 63 | : TFileExampleGenerator(afname, PDomain()), |
|---|
| 64 | attributeTypes(mlnew TIntList()), |
|---|
| 65 | DK(aDK ? strcpy((char *)malloc(strlen(aDK)+1), aDK) : NULL), |
|---|
| 66 | DC(aDC ? strcpy((char *)malloc(strlen(aDC)+1), aDC) : NULL), |
|---|
| 67 | classPos(-1), |
|---|
| 68 | headerLines(0), |
|---|
| 69 | csv(acsv) |
|---|
| 70 | { |
|---|
| 71 | // domain needs to be initialized after attributeTypes, classPos, headerLines |
|---|
| 72 | domain = readDomain(afname, autoDetect, createNewOn, status, metaStatus, noCodedDiscrete, noClass); |
|---|
| 73 | |
|---|
| 74 | TFileExampleIteratorData fei(afname); |
|---|
| 75 | |
|---|
| 76 | vector<string> atoms; |
|---|
| 77 | for (int i = headerLines; !feof(fei.file) && i--; ) |
|---|
| 78 | // read one line (not counting comment lines, but the flag line may be empty) |
|---|
| 79 | while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv, (headerLines==3) && !i) == -1)); |
|---|
| 80 | |
|---|
| 81 | startDataPos = ftell(fei.file); |
|---|
| 82 | startDataLine = fei.line; |
|---|
| 83 | } |
|---|
| 84 | |
|---|
| 85 | |
|---|
| 86 | TTabDelimExampleGenerator::~TTabDelimExampleGenerator() |
|---|
| 87 | { |
|---|
| 88 | if (DK) |
|---|
| 89 | free(DK); |
|---|
| 90 | |
|---|
| 91 | if (DC) |
|---|
| 92 | free(DC); |
|---|
| 93 | } |
|---|
| 94 | |
|---|
| 95 | bool TTabDelimExampleGenerator::readExample(TFileExampleIteratorData &fei, TExample &exam) |
|---|
| 96 | { |
|---|
| 97 | vector<string> atoms; |
|---|
| 98 | // read lines until eof or a non-empty line |
|---|
| 99 | while(!feof(fei.file) && ((readTabAtom(fei, atoms, true, csv)>0) || atomsEmpty(atoms))) { |
|---|
| 100 | vector<string>::iterator ii(atoms.begin()), ie(atoms.end()); |
|---|
| 101 | while ((ii!=ie) && !(*ii).length()) |
|---|
| 102 | ii++; |
|---|
| 103 | if (ii==ie) |
|---|
| 104 | atoms.clear(); |
|---|
| 105 | else |
|---|
| 106 | break; |
|---|
| 107 | } |
|---|
| 108 | |
|---|
| 109 | if (!atoms.size()) |
|---|
| 110 | return false; |
|---|
| 111 | |
|---|
| 112 | // Add an appropriate number of empty atoms, if needed |
|---|
| 113 | while (atoms.size()<attributeTypes->size()) |
|---|
| 114 | atoms.push_back(string("")); |
|---|
| 115 | _ASSERT(exam.domain==domain); |
|---|
| 116 | |
|---|
| 117 | exam.removeMetas(); |
|---|
| 118 | |
|---|
| 119 | TExample::iterator ei(exam.begin()); |
|---|
| 120 | TVarList::iterator vi(domain->attributes->begin()); |
|---|
| 121 | vector<string>::iterator ai(atoms.begin()); |
|---|
| 122 | TIntList::iterator si(attributeTypes->begin()), se(attributeTypes->end()); |
|---|
| 123 | TIntList::iterator cb, cp, ce; |
|---|
| 124 | if (classPoses) { |
|---|
| 125 | cb = cp = classPoses->begin(); |
|---|
| 126 | ce = classPoses->end(); |
|---|
| 127 | } |
|---|
| 128 | int pos=0; |
|---|
| 129 | for (; (si!=se); pos++, si++, ai++) { |
|---|
| 130 | if (*si) { // if attribute is not to be skipped and is not a basket |
|---|
| 131 | string valstr; |
|---|
| 132 | |
|---|
| 133 | // Check for don't care |
|---|
| 134 | valstr = *ai; |
|---|
| 135 | if (!valstr.length() || (valstr == "NA") || (valstr == ".") || (DC && (valstr == DC))) |
|---|
| 136 | valstr = "?"; |
|---|
| 137 | else if ((valstr == "*") || (DK && (valstr == DK))) |
|---|
| 138 | valstr = "~"; |
|---|
| 139 | |
|---|
| 140 | try { |
|---|
| 141 | if (*si==-1) |
|---|
| 142 | if (pos==classPos) { // if this is class value |
|---|
| 143 | TValue cval; |
|---|
| 144 | domain->classVar->filestr2val(valstr, cval, exam); |
|---|
| 145 | exam.setClass(cval); |
|---|
| 146 | } |
|---|
| 147 | else if (classPoses && (cp != ce) && (pos == *cp)) { |
|---|
| 148 | const int ind = cp - cb; |
|---|
| 149 | domain->classes->at(ind)->filestr2val(valstr, exam.values_end[ind], exam); |
|---|
| 150 | cp++; |
|---|
| 151 | } |
|---|
| 152 | else { // if this is a normal value |
|---|
| 153 | (*vi++)->filestr2val(valstr, *ei++, exam); |
|---|
| 154 | } |
|---|
| 155 | else { // if this is a meta value |
|---|
| 156 | TMetaDescriptor *md = domain->metas[*si]; |
|---|
| 157 | _ASSERT(md!=NULL); |
|---|
| 158 | TValue mval; |
|---|
| 159 | md->variable->filestr2val(valstr, mval, exam); |
|---|
| 160 | |
|---|
| 161 | exam.setMeta(*si, mval); |
|---|
| 162 | } |
|---|
| 163 | } |
|---|
| 164 | catch (mlexception &err) { |
|---|
| 165 | raiseError("file '%s', line '%i': %s", fei.filename.c_str(), fei.line, err.what()); |
|---|
| 166 | } |
|---|
| 167 | } |
|---|
| 168 | |
|---|
| 169 | // the attribute is marked to be skipped, but may also be a basket |
|---|
| 170 | else { |
|---|
| 171 | if (pos == basketPos) { |
|---|
| 172 | TSplits splits; |
|---|
| 173 | split(*ai, splits); |
|---|
| 174 | ITERATE(TSplits, si, splits) |
|---|
| 175 | basketFeeder->addItem(exam, string(si->first, si->second), fei.line); |
|---|
| 176 | } |
|---|
| 177 | } |
|---|
| 178 | } |
|---|
| 179 | |
|---|
| 180 | if (pos==classPos) // if class is the last value in the line, it is set here |
|---|
| 181 | domain->classVar->filestr2val(ai==atoms.end() ? "?" : *(ai++), exam[domain->variables->size()-1], exam); |
|---|
| 182 | /* I'm not sure that this is needed; this code is a mess but I don't wish to |
|---|
| 183 | waste time studying it since we are moving to 3.0 */ |
|---|
| 184 | else if (classPoses && (cp != ce) && (pos == *cp)) { |
|---|
| 185 | const int ind = cp - cb; |
|---|
| 186 | domain->classes->at(ind)->filestr2val(ai==atoms.end() ? "?" : *(ai++), exam.values_end[ind], exam); |
|---|
| 187 | } |
|---|
| 188 | |
|---|
| 189 | while ((ai!=atoms.end()) && !(*ai).length()) ai++; // line must be empty from now on |
|---|
| 190 | |
|---|
| 191 | if (ai!=atoms.end()) { |
|---|
| 192 | vector<string>::iterator ii=atoms.begin(); |
|---|
| 193 | string s=*ii; |
|---|
| 194 | while(++ii!=atoms.end()) s+=" "+*ii; |
|---|
| 195 | raiseError("example of invalid length (%s)", s.c_str()); |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | return true; |
|---|
| 199 | } |
|---|
| 200 | |
|---|
| 201 | |
|---|
| 202 | char *TTabDelimExampleGenerator::mayBeTabFile(const string &stem) |
|---|
| 203 | { |
|---|
| 204 | vector<string> varNames, atoms; |
|---|
| 205 | vector<string>::const_iterator vi, ai, ei; |
|---|
| 206 | |
|---|
| 207 | TFileExampleIteratorData fei(stem); |
|---|
| 208 | |
|---|
| 209 | // if there is no names line, it is not .tab |
|---|
| 210 | while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv)==-1)); |
|---|
| 211 | if (varNames.empty()) { |
|---|
| 212 | char *res = mlnew char[128]; |
|---|
| 213 | res = strcpy(res, "empty file"); |
|---|
| 214 | return res; |
|---|
| 215 | } |
|---|
| 216 | |
|---|
| 217 | // if any name contains the correct hash formatting it is not tab-delim it's more likely .txt |
|---|
| 218 | for(vi = varNames.begin(), ei = varNames.end(); vi!=ei; vi++) { |
|---|
| 219 | const char *c = (*vi).c_str(); |
|---|
| 220 | if ((*c=='m') || (*c=='c') || (*c=='i')) |
|---|
| 221 | c++; |
|---|
| 222 | if ( ((*c=='D') || (*c=='C') || (*c=='S')) |
|---|
| 223 | && (c[1]=='#')) { |
|---|
| 224 | char *res= mlnew char[128 + (*vi).size()]; |
|---|
| 225 | sprintf(res, "attribute name '%s' looks suspicious", (*vi).c_str()); |
|---|
| 226 | return res; |
|---|
| 227 | } |
|---|
| 228 | } |
|---|
| 229 | |
|---|
| 230 | // if there is no var types line, it is not .tab |
|---|
| 231 | while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv)==-1)); |
|---|
| 232 | if (atoms.empty()) { |
|---|
| 233 | char *res = mlnew char[128]; |
|---|
| 234 | res = strcpy(res, "no line with attribute types"); |
|---|
| 235 | return res; |
|---|
| 236 | } |
|---|
| 237 | |
|---|
| 238 | if (atoms.size() != varNames.size()) |
|---|
| 239 | raiseError("the number of attribute types does not match the number of attributes"); |
|---|
| 240 | |
|---|
| 241 | // Each atom must be either 'd', 'c' or 's', or contain a space |
|---|
| 242 | for(vi = varNames.begin(), ai = atoms.begin(), ei = atoms.end(); ai != ei; ai++, vi++) { |
|---|
| 243 | const char *c = (*ai).c_str(); |
|---|
| 244 | if (!*c) { |
|---|
| 245 | char *res= mlnew char[128 + (*vi).size()]; |
|---|
| 246 | sprintf(res, "empty type entry for attribute '%s'", (*vi).c_str()); |
|---|
| 247 | return res; |
|---|
| 248 | } |
|---|
| 249 | |
|---|
| 250 | if (!strcmp("basket", c)) |
|---|
| 251 | continue; |
|---|
| 252 | |
|---|
| 253 | const TIdentifierDeclaration *tid = typeIdentifiers; |
|---|
| 254 | for(; tid->identifier && (tid->matchRoot ? strncmp(tid->identifier, c, tid->matchRoot) : strcmp(tid->identifier, c)); tid++); |
|---|
| 255 | if (tid->identifier) |
|---|
| 256 | continue; |
|---|
| 257 | |
|---|
| 258 | for(; *c && (*c!=' '); c++); |
|---|
| 259 | if (!*c) { |
|---|
| 260 | char *res= mlnew char[128 + (*vi).size() + (*ai).size()]; |
|---|
| 261 | sprintf(res, "attribute '%s' is defined as having only one value ('%s')", (*vi).c_str(), (*ai).c_str()); |
|---|
| 262 | return res; |
|---|
| 263 | } |
|---|
| 264 | } |
|---|
| 265 | |
|---|
| 266 | // if there is no flags line, it is not .tab |
|---|
| 267 | while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv, true)==-1)); |
|---|
| 268 | if (feof(fei.file)) { |
|---|
| 269 | char *res = mlnew char[128]; |
|---|
| 270 | res = strcpy(res, "file has only two lines"); |
|---|
| 271 | return res; |
|---|
| 272 | } |
|---|
| 273 | |
|---|
| 274 | if (atoms.size() > varNames.size()) |
|---|
| 275 | raiseError("the number of attribute options is greater than the number of attributes"); |
|---|
| 276 | |
|---|
| 277 | // Check flags |
|---|
| 278 | for(vi = varNames.begin(), ai = atoms.begin(), ei = atoms.end(); ai != ei; ai++, vi++) { |
|---|
| 279 | TProgArguments args("dc: ordered", *ai, false, true); |
|---|
| 280 | |
|---|
| 281 | /* Not any more: now they go into the Variable's dictionary |
|---|
| 282 | |
|---|
| 283 | if (args.unrecognized.size()) { |
|---|
| 284 | char *res= mlnew char[128 + (*vi).size()]; |
|---|
| 285 | sprintf(res, "unrecognized options at attribute '%s'", (*vi).c_str()); |
|---|
| 286 | return res; |
|---|
| 287 | } |
|---|
| 288 | */ |
|---|
| 289 | if (args.direct.size()) { |
|---|
| 290 | if (args.direct.size()>1) { |
|---|
| 291 | char *res= mlnew char[128 + (*vi).size()]; |
|---|
| 292 | sprintf(res, "too many direct options at attribute '%s'", (*vi).c_str()); |
|---|
| 293 | return res; |
|---|
| 294 | } |
|---|
| 295 | |
|---|
| 296 | static const char *legalDirects[] = {"s", "skip","i", "ignore", "c", "class", "m", "meta", NULL}; |
|---|
| 297 | string &direct = args.direct.front(); |
|---|
| 298 | const char **lc = legalDirects; |
|---|
| 299 | while(*lc && strcmp(*lc, direct.c_str())) |
|---|
| 300 | lc++; |
|---|
| 301 | if (!*lc) { |
|---|
| 302 | char *res= mlnew char[128 + (*vi).size() + (*ai).size()]; |
|---|
| 303 | sprintf(res, "unrecognized option ('%s') at attribute '%s'", (*ai).c_str(), (*vi).c_str()); |
|---|
| 304 | return res; |
|---|
| 305 | } |
|---|
| 306 | } |
|---|
| 307 | } |
|---|
| 308 | |
|---|
| 309 | return NULL; |
|---|
| 310 | } |
|---|
| 311 | |
|---|
| 312 | PDomain TTabDelimExampleGenerator::readDomain(const string &stem, const bool autoDetect, const int createNewOn, vector<int> &status, vector<pair<int, int> > &metaStatus, bool noCodedDiscrete, bool noClass) |
|---|
| 313 | { |
|---|
| 314 | // non-NULL when this cannot be tab file (reason given as result) |
|---|
| 315 | // NULL if this seems a valid tab file |
|---|
| 316 | char *isNotTab = mayBeTabFile(stem); |
|---|
| 317 | |
|---|
| 318 | TDomainDepot::TAttributeDescriptions descriptions; |
|---|
| 319 | |
|---|
| 320 | if (autoDetect) { |
|---|
| 321 | if (!isNotTab) |
|---|
| 322 | raiseWarning("'%s' is being loaded as .txt, but could be .tab file", stem.c_str()); |
|---|
| 323 | readTxtHeader(stem, descriptions); |
|---|
| 324 | } |
|---|
| 325 | else { |
|---|
| 326 | if (isNotTab) |
|---|
| 327 | raiseWarning("'%s' is being loaded as .tab, but looks more like .txt file\n(%s)", stem.c_str(), isNotTab); |
|---|
| 328 | readTabHeader(stem, descriptions); |
|---|
| 329 | } |
|---|
| 330 | |
|---|
| 331 | if (isNotTab) |
|---|
| 332 | mldelete isNotTab; |
|---|
| 333 | |
|---|
| 334 | scanAttributeValues(stem, descriptions); |
|---|
| 335 | |
|---|
| 336 | TIntList::iterator ati(attributeTypes->begin()); |
|---|
| 337 | TDomainDepot::TPAttributeDescriptions attributeDescriptions, metaDescriptions, classDescriptions; |
|---|
| 338 | int ind = 0, lastRegular = -1; |
|---|
| 339 | TIntList::const_iterator cp, ce; |
|---|
| 340 | if (classPoses) { |
|---|
| 341 | cp = classPoses->begin(); |
|---|
| 342 | ce = classPoses->end(); |
|---|
| 343 | } |
|---|
| 344 | |
|---|
| 345 | for(TDomainDepot::TAttributeDescriptions::iterator adi(descriptions.begin()), ade(descriptions.end()); adi != ade; adi++, ati++, ind++) { |
|---|
| 346 | if (!*ati) |
|---|
| 347 | continue; |
|---|
| 348 | |
|---|
| 349 | if (adi->varType == -1) { |
|---|
| 350 | switch (detectAttributeType(*adi, noCodedDiscrete)) { |
|---|
| 351 | case 0: |
|---|
| 352 | case 2: |
|---|
| 353 | adi->varType = TValue::INTVAR; |
|---|
| 354 | break; |
|---|
| 355 | |
|---|
| 356 | case 1: |
|---|
| 357 | adi->varType = TValue::FLOATVAR; |
|---|
| 358 | break; |
|---|
| 359 | |
|---|
| 360 | case 4: |
|---|
| 361 | adi->varType = STRINGVAR; |
|---|
| 362 | *ati = 1; |
|---|
| 363 | break; |
|---|
| 364 | |
|---|
| 365 | default: |
|---|
| 366 | raiseWarning("cannot determine type for attribute '%s'; the attribute will be ignored", adi->name.c_str()); |
|---|
| 367 | *ati = 0; |
|---|
| 368 | continue; |
|---|
| 369 | } |
|---|
| 370 | } |
|---|
| 371 | |
|---|
| 372 | if (*ati == 1) |
|---|
| 373 | metaDescriptions.push_back(&*adi); |
|---|
| 374 | else if (classPoses && (cp != ce) && (*cp == ind)) { |
|---|
| 375 | classDescriptions.push_back(&*adi); |
|---|
| 376 | cp++; |
|---|
| 377 | } |
|---|
| 378 | else if ((classPos != ind) && (basketPos != ind)) { |
|---|
| 379 | attributeDescriptions.push_back(&*adi); |
|---|
| 380 | lastRegular = ind; |
|---|
| 381 | } |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | if (classPos > -1) |
|---|
| 385 | attributeDescriptions.push_back(&descriptions[classPos]); |
|---|
| 386 | else if (autoDetect && !noClass) |
|---|
| 387 | classPos = lastRegular; |
|---|
| 388 | |
|---|
| 389 | if (basketPos >= 0) |
|---|
| 390 | // basketFeeder = mlnew TBasketFeeder(sourceDomain, createNewOn == TVariable::OK, false); |
|---|
| 391 | basketFeeder = mlnew TBasketFeeder(PDomain(), createNewOn == TVariable::OK, false); |
|---|
| 392 | |
|---|
| 393 | /* if (sourceDomain) { |
|---|
| 394 | if (!domainDepot_tab.checkDomain(sourceDomain.AS(TDomain), &attributeDescriptions, classPos >= 0, NULL)) |
|---|
| 395 | raiseError("given domain does not match the file"); |
|---|
| 396 | |
|---|
| 397 | if (basketFeeder) |
|---|
| 398 | basketFeeder->domain = sourceDomain; |
|---|
| 399 | return sourceDomain; |
|---|
| 400 | } |
|---|
| 401 | */ |
|---|
| 402 | PDomain newDomain = domainDepot.prepareDomain(&attributeDescriptions, classPos>-1, &classDescriptions, &metaDescriptions, createNewOn, status, metaStatus); |
|---|
| 403 | |
|---|
| 404 | vector<pair<int, int> >::const_iterator mid(metaStatus.begin()); |
|---|
| 405 | PITERATE(TIntList, ii, attributeTypes) |
|---|
| 406 | if (*ii == 1) |
|---|
| 407 | *ii = mid++ ->first; |
|---|
| 408 | |
|---|
| 409 | if (basketFeeder) |
|---|
| 410 | basketFeeder->domain = newDomain; |
|---|
| 411 | |
|---|
| 412 | return newDomain; |
|---|
| 413 | } |
|---|
| 414 | |
|---|
| 415 | |
|---|
| 416 | |
|---|
| 417 | int TTabDelimExampleGenerator::detectAttributeType(TDomainDepot::TAttributeDescription &desc, const bool noCodedDiscrete) |
|---|
| 418 | { |
|---|
| 419 | char numTest[64]; |
|---|
| 420 | |
|---|
| 421 | int status = 3; // 3 - not encountered any values, 2 - can be coded discrete, 1 - can be float, 0 - must be nominal |
|---|
| 422 | // 4 (set later) - string value |
|---|
| 423 | typedef map<string, int> msi; |
|---|
| 424 | ITERATE(msi, vli, desc.values) { |
|---|
| 425 | |
|---|
| 426 | if (vli->first.length() > 63) { |
|---|
| 427 | status = 0; |
|---|
| 428 | break; |
|---|
| 429 | } |
|---|
| 430 | |
|---|
| 431 | const char *ceni = vli->first.c_str(); |
|---|
| 432 | if ( !*ceni |
|---|
| 433 | || !ceni[1] && ((*ceni=='?') || (*ceni=='.') || (*ceni=='~') || (*ceni=='*')) |
|---|
| 434 | || !strcmp(ceni, "NA") || (DC && !strcmp(ceni, DC)) || (DK && !strcmp(ceni, DK))) |
|---|
| 435 | continue; |
|---|
| 436 | |
|---|
| 437 | if (status == 3) |
|---|
| 438 | status = 2; |
|---|
| 439 | |
|---|
| 440 | if ((status == 2) && (ceni[1] || (*ceni<'0') || (*ceni>'9'))) |
|---|
| 441 | status = noCodedDiscrete ? 2 : 1; |
|---|
| 442 | |
|---|
| 443 | if (status == 1) { |
|---|
| 444 | strcpy(numTest, ceni); |
|---|
| 445 | for(char *sc = numTest; *sc; sc++) |
|---|
| 446 | if (*sc == ',') |
|---|
| 447 | *sc = '.'; |
|---|
| 448 | |
|---|
| 449 | char *eptr; |
|---|
| 450 | strtod(numTest, &eptr); |
|---|
| 451 | while (*eptr==32) |
|---|
| 452 | eptr++; |
|---|
| 453 | if (*eptr) { |
|---|
| 454 | status = 0; |
|---|
| 455 | break; |
|---|
| 456 | } |
|---|
| 457 | } |
|---|
| 458 | } |
|---|
| 459 | |
|---|
| 460 | /* Check whether this is a string attribute: |
|---|
| 461 | - has more than 20 values |
|---|
| 462 | - less than half of the values appear more than once */ |
|---|
| 463 | if ((status==0) && (desc.values.size() > 20)) { |
|---|
| 464 | int more2 = 0; |
|---|
| 465 | for(map<string, int>::const_iterator dvi(desc.values.begin()), dve(desc.values.end()); dvi != dve; dvi++) { |
|---|
| 466 | if (dvi->second > 1) |
|---|
| 467 | more2++; |
|---|
| 468 | } |
|---|
| 469 | if (more2*2 < desc.values.size()) { |
|---|
| 470 | status = 4; |
|---|
| 471 | } |
|---|
| 472 | } |
|---|
| 473 | return status; |
|---|
| 474 | } |
|---|
| 475 | |
|---|
| 476 | |
|---|
| 477 | |
|---|
| 478 | |
|---|
| 479 | /* These are the rules for determining the attribute types. |
|---|
| 480 | |
|---|
| 481 | There are three ways to determine a type. |
|---|
| 482 | |
|---|
| 483 | 1. By header prefixes to attribute names. |
|---|
| 484 | The prefix is formed by [cmi][DCS]# |
|---|
| 485 | c, m and i mean class attribute, meta attribute and ignore, |
|---|
| 486 | respectively. |
|---|
| 487 | D, C and S mean discrete, continuous and string attributes. |
|---|
| 488 | |
|---|
| 489 | |
|---|
| 490 | !!! NOT TRUE: |
|---|
| 491 | |
|---|
| 492 | 2. By knownVars. |
|---|
| 493 | If the type is not determined from header row (either because |
|---|
| 494 | there was no prefix or it only contained c, m or i) |
|---|
| 495 | knownVars is checked for the attribute with the same name. |
|---|
| 496 | If found, the attribute from knownVars will be used. |
|---|
| 497 | |
|---|
| 498 | 3. From the data. |
|---|
| 499 | These attributes can be either continuous or discrete. |
|---|
| 500 | The file is parsed and values for each attribute are checked. |
|---|
| 501 | Values denoting undefined values ('?', '.', '~', '*', 'NA' and |
|---|
| 502 | empty strings) are ignored. |
|---|
| 503 | If all values can be parsed as numbers, the attribute is continuous. |
|---|
| 504 | An exception to this rule are attributes with values 0, 1, 2, ..., 9. |
|---|
| 505 | These are treated as discrete (the assumption is that those numbers |
|---|
| 506 | are just codes for otherwise discrete values). |
|---|
| 507 | */ |
|---|
| 508 | |
|---|
| 509 | |
|---|
| 510 | void TTabDelimExampleGenerator::scanAttributeValues(const string &stem, TDomainDepot::TAttributeDescriptions &desc) |
|---|
| 511 | { |
|---|
| 512 | TFileExampleIteratorData fei(stem); |
|---|
| 513 | |
|---|
| 514 | vector<string> atoms; |
|---|
| 515 | vector<string>::const_iterator ai, ae; |
|---|
| 516 | TDomainDepot::TAttributeDescriptions::iterator di, db(desc.begin()), de(desc.end()); |
|---|
| 517 | TIntList::const_iterator ati, atb(attributeTypes->begin()); |
|---|
| 518 | |
|---|
| 519 | for (int i = headerLines; !feof(fei.file) && i--; ) |
|---|
| 520 | while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv, (headerLines==3) && !i) == -1)); |
|---|
| 521 | |
|---|
| 522 | while (!feof(fei.file)) { |
|---|
| 523 | if (readTabAtom(fei, atoms, true, csv) <= 0) |
|---|
| 524 | continue; |
|---|
| 525 | |
|---|
| 526 | for(di = db, ati = atb, ai = atoms.begin(), ae = atoms.end(); (di != de) && (ai != ae); di++, ai++, ati++) { |
|---|
| 527 | if (!*atb) |
|---|
| 528 | continue; |
|---|
| 529 | |
|---|
| 530 | //skip the attribute if it is a FLOATVAR or STRINGVAR |
|---|
| 531 | if ((di->varType != TValue::FLOATVAR) && (di->varType != STRINGVAR)) { |
|---|
| 532 | |
|---|
| 533 | const char *ceni = ai->c_str(); |
|---|
| 534 | if ( !*ceni |
|---|
| 535 | || !ceni[1] && ((*ceni=='?') || (*ceni=='.') || (*ceni=='~') || (*ceni=='*')) |
|---|
| 536 | || (*ai == "NA") || (DC && (*ai == DC)) || (DK && (*ai == DK))) |
|---|
| 537 | continue; |
|---|
| 538 | |
|---|
| 539 | map<string, int>::iterator vf = di->values.lower_bound(*ai); |
|---|
| 540 | if ((vf != di->values.end()) && (vf->first == *ai)) { |
|---|
| 541 | vf->second++; |
|---|
| 542 | } |
|---|
| 543 | else { |
|---|
| 544 | di->values.insert(vf, make_pair(*ai, 1)); |
|---|
| 545 | } |
|---|
| 546 | } |
|---|
| 547 | } |
|---|
| 548 | } |
|---|
| 549 | } |
|---|
| 550 | |
|---|
| 551 | |
|---|
| 552 | void TTabDelimExampleGenerator::readTxtHeader(const string &stem, TDomainDepot::TAttributeDescriptions &descs) |
|---|
| 553 | { |
|---|
| 554 | TFileExampleIteratorData fei(stem); |
|---|
| 555 | |
|---|
| 556 | vector<string> varNames; |
|---|
| 557 | while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv)==-1)); |
|---|
| 558 | if (varNames.empty()) |
|---|
| 559 | ::raiseError("unexpected end of file '%s' while searching for attribute names", fei.filename.c_str()); |
|---|
| 560 | |
|---|
| 561 | headerLines = 1; |
|---|
| 562 | classPos = -1; |
|---|
| 563 | basketPos = -1; |
|---|
| 564 | attributeTypes = mlnew TIntList(varNames.size(), -1); |
|---|
| 565 | TIntList::iterator attributeType(attributeTypes->begin()); |
|---|
| 566 | vector<string>::const_iterator ni(varNames.begin()), ne(varNames.end()); |
|---|
| 567 | int ind = 0; |
|---|
| 568 | |
|---|
| 569 | for(; ni != ne; ni++, ind++, attributeType++) { |
|---|
| 570 | /* Parses the header line |
|---|
| 571 | - sets *ni to a real name (without prefix) |
|---|
| 572 | - sets varType to TValue::varType or -1 if the type is not specified and -2 if it's a basket |
|---|
| 573 | - sets classPos/basketPos to the current position, if the attribute is class/basket attribute |
|---|
| 574 | (and reports an error if there is more than one such attribute) |
|---|
| 575 | - to attributeTypes, appends -1 for ordinary atributes, 1 for metas and 0 for ignored or baskets*/ |
|---|
| 576 | int varType = -1; // varType, or -1 for unnown, -2 for basket |
|---|
| 577 | |
|---|
| 578 | const char *cptr = (*ni).c_str(); |
|---|
| 579 | if (*cptr && (cptr[1]=='#') || (cptr[2] == '#')) { |
|---|
| 580 | if (*cptr == 'm') { |
|---|
| 581 | *attributeType = 1; |
|---|
| 582 | cptr++; |
|---|
| 583 | } |
|---|
| 584 | else if (*cptr == 'i') { |
|---|
| 585 | *attributeType = 0; |
|---|
| 586 | cptr++; |
|---|
| 587 | } |
|---|
| 588 | else if (*cptr == 'c') { |
|---|
| 589 | if (classPos>-1) |
|---|
| 590 | ::raiseError("more than one attribute marked as class"); |
|---|
| 591 | else |
|---|
| 592 | classPos = ind; |
|---|
| 593 | cptr++; |
|---|
| 594 | } |
|---|
| 595 | |
|---|
| 596 | // we may have encountered a m, i or c, so cptr points to the second character, |
|---|
| 597 | // or it can still point to the first |
|---|
| 598 | if (*cptr == 'D') { |
|---|
| 599 | varType = TValue::INTVAR; |
|---|
| 600 | cptr++; |
|---|
| 601 | } |
|---|
| 602 | else if (*cptr == 'C') { |
|---|
| 603 | varType = TValue::FLOATVAR; |
|---|
| 604 | cptr++; |
|---|
| 605 | } |
|---|
| 606 | else if (*cptr == 'S') { |
|---|
| 607 | varType = STRINGVAR; |
|---|
| 608 | cptr++; |
|---|
| 609 | } |
|---|
| 610 | else if (*cptr == 'B') { |
|---|
| 611 | varType = -2; |
|---|
| 612 | if ((*attributeType != -1) || (classPos == ind)) |
|---|
| 613 | ::raiseError("flag 'B' is incompatible with 'i', 'm' and 'c'"); |
|---|
| 614 | *attributeType = 0; |
|---|
| 615 | if (basketPos > -1) |
|---|
| 616 | ::raiseError("more than one basket attribute"); |
|---|
| 617 | else |
|---|
| 618 | basketPos = ind; |
|---|
| 619 | cptr++; |
|---|
| 620 | } |
|---|
| 621 | |
|---|
| 622 | if (*cptr != '#') |
|---|
| 623 | ::raiseError("unrecognized flags in attribute name '%s'", cptr); |
|---|
| 624 | cptr++; |
|---|
| 625 | } |
|---|
| 626 | |
|---|
| 627 | descs.push_back(TDomainDepot::TAttributeDescription(cptr, varType)); |
|---|
| 628 | } |
|---|
| 629 | } |
|---|
| 630 | |
|---|
| 631 | |
|---|
| 632 | |
|---|
| 633 | void TTabDelimExampleGenerator::readTabHeader(const string &stem, TDomainDepot::TAttributeDescriptions &descs) |
|---|
| 634 | { |
|---|
| 635 | classPos = -1; |
|---|
| 636 | classPoses = mlnew TIntList; |
|---|
| 637 | basketPos = -1; |
|---|
| 638 | headerLines = 3; |
|---|
| 639 | |
|---|
| 640 | TFileExampleIteratorData fei(stem); |
|---|
| 641 | |
|---|
| 642 | vector<string> varNames, varTypes, varFlags; |
|---|
| 643 | |
|---|
| 644 | while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv) == -1)); |
|---|
| 645 | if (varNames.empty()) |
|---|
| 646 | ::raiseError("empty file"); |
|---|
| 647 | |
|---|
| 648 | while(!feof(fei.file) && (readTabAtom(fei, varTypes, false, csv) == -1)); |
|---|
| 649 | if (varTypes.empty()) |
|---|
| 650 | ::raiseError("cannot read types of attributes"); |
|---|
| 651 | |
|---|
| 652 | while(!feof(fei.file) && (readTabAtom(fei, varFlags, true, csv, true) == -1)); |
|---|
| 653 | |
|---|
| 654 | if (varNames.size() != varTypes.size()) |
|---|
| 655 | ::raiseError("mismatching number of attributes and their types."); |
|---|
| 656 | if (varNames.size() < varFlags.size()) |
|---|
| 657 | ::raiseError("too many flags (third line too long)"); |
|---|
| 658 | while (varFlags.size() < varNames.size()) |
|---|
| 659 | varFlags.push_back(""); |
|---|
| 660 | |
|---|
| 661 | attributeTypes = mlnew TIntList(varNames.size(), -1); |
|---|
| 662 | |
|---|
| 663 | vector<string>::iterator vni(varNames.begin()), vne(varNames.end()); |
|---|
| 664 | vector<string>::iterator ti(varTypes.begin()); |
|---|
| 665 | vector<string>::iterator fi(varFlags.begin()), fe(varFlags.end()); |
|---|
| 666 | TIntList::iterator attributeType(attributeTypes->begin()); |
|---|
| 667 | int ind = 0; |
|---|
| 668 | |
|---|
| 669 | for(; vni!=vne; fi++, vni++, ti++, attributeType++, ind++) { |
|---|
| 670 | |
|---|
| 671 | descs.push_back(TDomainDepot::TAttributeDescription(*vni, 0)); |
|---|
| 672 | TDomainDepot::TAttributeDescription &desc = descs.back(); |
|---|
| 673 | |
|---|
| 674 | bool ordered = false; |
|---|
| 675 | |
|---|
| 676 | if (fi!=fe) { |
|---|
| 677 | TProgArguments args("dc: ordered", *fi, false, true); |
|---|
| 678 | |
|---|
| 679 | if (args.direct.size()) { |
|---|
| 680 | |
|---|
| 681 | if (args.direct.size()>1) |
|---|
| 682 | ::raiseError("invalid flags for attribute '%s'", (*vni).c_str()); |
|---|
| 683 | |
|---|
| 684 | string direct = args.direct.front(); |
|---|
| 685 | if ((direct=="s") || (direct=="skip") || (direct=="i") || (direct=="ignore")) |
|---|
| 686 | *attributeType = 0; |
|---|
| 687 | |
|---|
| 688 | else if ((direct=="c") || (direct=="class")) { |
|---|
| 689 | if (classPos != -1) |
|---|
| 690 | ::raiseError("multiple attributes are specified as class attribute ('%s' and '%s')", (*vni).c_str(), (*vni).c_str()); |
|---|
| 691 | classPos = ind; |
|---|
| 692 | } |
|---|
| 693 | |
|---|
| 694 | else if (direct=="multiclass") { |
|---|
| 695 | classPoses->push_back(ind); |
|---|
| 696 | } |
|---|
| 697 | |
|---|
| 698 | else if ((direct=="m") || (direct=="meta")) |
|---|
| 699 | *attributeType = 1; |
|---|
| 700 | } |
|---|
| 701 | |
|---|
| 702 | ITERATE(TMultiStringParameters, mi, args.options) |
|---|
| 703 | if ((*mi).first == "dc") |
|---|
| 704 | raiseWarning("argument -dc is not supported any more"); |
|---|
| 705 | |
|---|
| 706 | ordered = args.exists("ordered"); |
|---|
| 707 | |
|---|
| 708 | desc.userFlags = args.unrecognized; |
|---|
| 709 | } |
|---|
| 710 | |
|---|
| 711 | if (!strcmp((*ti).c_str(), "basket")) { |
|---|
| 712 | if (basketPos > -1) |
|---|
| 713 | ::raiseError("multiple basket attributes are defined"); |
|---|
| 714 | if (ordered || (classPos == ind) || (*attributeType != -1)) |
|---|
| 715 | ::raiseError("'basket' flag is incompatible with other flags"); |
|---|
| 716 | basketPos = ind; |
|---|
| 717 | *attributeType = 0; |
|---|
| 718 | } |
|---|
| 719 | |
|---|
| 720 | if (!*attributeType) |
|---|
| 721 | continue; |
|---|
| 722 | |
|---|
| 723 | if (!(*ti).length()) |
|---|
| 724 | ::raiseError("type for attribute '%s' is missing", (*vni).c_str()); |
|---|
| 725 | |
|---|
| 726 | const TIdentifierDeclaration *tid = typeIdentifiers; |
|---|
| 727 | for(; tid->identifier; tid++) |
|---|
| 728 | if (!(tid->matchRoot ? strncmp(tid->identifier, (*ti).c_str(), tid->matchRoot) |
|---|
| 729 | : strcmp(tid->identifier, (*ti).c_str()))) { |
|---|
| 730 | desc.varType = tid->varType; |
|---|
| 731 | desc.typeDeclaration = *ti; |
|---|
| 732 | break; |
|---|
| 733 | } |
|---|
| 734 | |
|---|
| 735 | if (!tid->identifier) { |
|---|
| 736 | desc.varType = TValue::INTVAR; |
|---|
| 737 | |
|---|
| 738 | string vals; |
|---|
| 739 | ITERATE(string, ci, *ti) { |
|---|
| 740 | if (*ci==' ') { |
|---|
| 741 | if (vals.length()) |
|---|
| 742 | desc.addValue(vals); |
|---|
| 743 | vals=""; |
|---|
| 744 | } |
|---|
| 745 | else { |
|---|
| 746 | if ((*ci=='\\') && (ci[1]==' ')) { |
|---|
| 747 | vals += ' '; |
|---|
| 748 | ci++; |
|---|
| 749 | } |
|---|
| 750 | else |
|---|
| 751 | vals += *ci; |
|---|
| 752 | } |
|---|
| 753 | } |
|---|
| 754 | |
|---|
| 755 | if (vals.length()) |
|---|
| 756 | desc.addValue(vals); |
|---|
| 757 | } |
|---|
| 758 | } |
|---|
| 759 | |
|---|
| 760 | if (!classPoses->size()) |
|---|
| 761 | classPoses = PIntList(); |
|---|
| 762 | } |
|---|
| 763 | |
|---|
| 764 | |
|---|
/* Tells whether the list holds nothing but empty strings.
   An empty list counts as empty, too. */
bool atomsEmpty(const vector<string> &atoms)
{
  vector<string>::const_iterator cur = atoms.begin();
  const vector<string>::const_iterator stop = atoms.end();

  while (cur != stop) {
    if (!cur->empty())
      return false;
    ++cur;
  }

  return true;
}
|---|
| 771 | |
|---|
| 772 | |
|---|
/* Removes the run of empty strings at the end of the list, in place.
   Empty strings that are followed by a non-empty one are kept.
   Returns the number of atoms left (0 if all of them were empty). */
int trimAtomsList(vector<string> &atoms)
{
  // index one past the last non-empty atom
  vector<string>::size_type keep = atoms.size();
  while (keep && atoms[keep-1].empty())
    keep--;

  atoms.erase(atoms.begin() + keep, atoms.end());
  return atoms.size();
}
|---|
| 786 | |
|---|
| 787 | /* Reads a list of atoms from a line of tab or comma delimited file. Atom consists of any characters |
|---|
| 788 | except \n, \r and \t (and ',' if csv=true). Multiple spaces are replaced by a single space. Atoms |
|---|
| 789 | are separated by \t or ',' if csv=true. Lines end with \n or \r. Lines which begin with | are ignored. |
|---|
| 790 | |
|---|
| 791 | Returns number of atoms, -1 for comment line and -2 for EOF |
|---|
| 792 | */ |
|---|
| 793 | int readTabAtom(TFileExampleIteratorData &fei, vector<string> &atoms, bool escapeSpaces, bool csv, bool allowEmpty) |
|---|
| 794 | { |
|---|
| 795 | atoms.clear(); |
|---|
| 796 | |
|---|
| 797 | if (!fei.file) |
|---|
| 798 | raiseErrorWho("TabDelimExampleGenerator", "file not opened"); |
|---|
| 799 | |
|---|
| 800 | if (feof(fei.file)) |
|---|
| 801 | return -2; |
|---|
| 802 | |
|---|
| 803 | fei.line++; |
|---|
| 804 | |
|---|
| 805 | char c, c2; |
|---|
| 806 | int col = 0; |
|---|
| 807 | string atom; |
|---|
| 808 | for(;;) { |
|---|
| 809 | c = fgetc(fei.file); |
|---|
| 810 | |
|---|
| 811 | if (c==(char)EOF) |
|---|
| 812 | break; |
|---|
| 813 | if (!col && (c=='|')) { |
|---|
| 814 | for (c=fgetc(fei.file); (c!='\r') && (c!='\n') && (c!=(char)EOF); c=fgetc(fei.file)); |
|---|
| 815 | return -1; |
|---|
| 816 | } |
|---|
| 817 | |
|---|
| 818 | col++; |
|---|
| 819 | |
|---|
| 820 | switch(c) { |
|---|
| 821 | case '\r': |
|---|
| 822 | case '\n': |
|---|
| 823 | c2 = fgetc(fei.file); |
|---|
| 824 | if ((c2!='\r') && (c2!='\n') || (c2 == c)) |
|---|
| 825 | ungetc(c2, fei.file); |
|---|
| 826 | if (atom.length() || atoms.size()) |
|---|
| 827 | atoms.push_back(trim(atom)); // end of line |
|---|
| 828 | if (allowEmpty || atoms.size()) |
|---|
| 829 | return trimAtomsList(atoms); |
|---|
| 830 | break; |
|---|
| 831 | |
|---|
| 832 | case '\t': |
|---|
| 833 | atoms.push_back(trim(atom)); |
|---|
| 834 | atom.clear(); |
|---|
| 835 | break; |
|---|
| 836 | |
|---|
| 837 | case ',': |
|---|
| 838 | if (csv) { |
|---|
| 839 | atoms.push_back(trim(atom)); |
|---|
| 840 | atom.clear(); |
|---|
| 841 | break; |
|---|
| 842 | } |
|---|
| 843 | // else fallthrough |
|---|
| 844 | |
|---|
| 845 | case ' ': |
|---|
| 846 | atom += c; |
|---|
| 847 | break; |
|---|
| 848 | |
|---|
| 849 | case '\\': |
|---|
| 850 | if (escapeSpaces) { |
|---|
| 851 | c = fgetc(fei.file); |
|---|
| 852 | if (c != ' ') |
|---|
| 853 | atom += '\\'; |
|---|
| 854 | } |
|---|
| 855 | |
|---|
| 856 | default: |
|---|
| 857 | // trim left |
|---|
| 858 | if ((c>=' ') || (c<0)) |
|---|
| 859 | atom += c; |
|---|
| 860 | }; |
|---|
| 861 | } |
|---|
| 862 | |
|---|
| 863 | if (ferror(fei.file)) |
|---|
| 864 | raiseErrorWho("TabDelimExampleGenerator", "error while reading line %i of file '%s'", fei.line, fei.filename.c_str()); |
|---|
| 865 | |
|---|
| 866 | if (atom.length() || atoms.size()) |
|---|
| 867 | atoms.push_back(csv ? trim(atom) : atom); |
|---|
| 868 | |
|---|
| 869 | return trimAtomsList(atoms); |
|---|
| 870 | } |
|---|
| 871 | |
|---|
| 872 | |
|---|
| 873 | |
|---|
| 874 | |
|---|
| 875 | // ********* Output ********* // |
|---|
| 876 | |
|---|
| 877 | |
|---|
/* Writes the delimiter in front of every field except the first one.
   Expands in place and uses three names from the surrounding scope: the bool flag `ho`
   (false until the macro has been expanded once), the char `delim` and the FILE* `file`. */
#define PUTDELIM { if (!ho) ho = true; else putc(delim, file); }
|---|
| 879 | |
|---|
/* The body is empty: a call writes nothing to `file` and ignores `ex` and `delim`.
   NOTE(review): presumably a leftover stub -- the data rows are emitted by
   tabDelim_writeExamples; confirm whether anything still calls this. */
void tabDelim_writeExample(FILE *file, const TExample &ex, char delim)
{
}
|---|
| 883 | |
|---|
| 884 | |
|---|
| 885 | inline const char *checkCtrl(const char *c) { |
|---|
| 886 | for(const char *cc = c; *cc; cc++) |
|---|
| 887 | if ((const unsigned char)(*cc) < 32) |
|---|
| 888 | raiseErrorWho("write", "string '%s' cannot be written to a file since it contains invalid characters", c); |
|---|
| 889 | return c; |
|---|
| 890 | } |
|---|
| 891 | |
|---|
| 892 | void tabDelim_writeExamples(FILE *file, PExampleGenerator rg, char delim, const char *DK, const char *DC) |
|---|
| 893 | { |
|---|
| 894 | const TDomain domain = rg->domain.getReference(); |
|---|
| 895 | TVarList::const_iterator vb(domain.variables->begin()), vi, ve(domain.variables->end()); |
|---|
| 896 | |
|---|
| 897 | PEITERATE(ex, rg) { |
|---|
| 898 | vi = vb; |
|---|
| 899 | TExample::const_iterator ri((*ex).begin()); |
|---|
| 900 | string st; |
|---|
| 901 | bool ho = false; |
|---|
| 902 | |
|---|
| 903 | for(; vi!=ve; vi++, ri++) { |
|---|
| 904 | PUTDELIM; |
|---|
| 905 | if (DK && ((*ri).valueType == valueDK)) |
|---|
| 906 | fprintf(file, DK); |
|---|
| 907 | else if (DC && ((*ri).valueType == valueDC)) |
|---|
| 908 | fprintf(file, DC); |
|---|
| 909 | else { |
|---|
| 910 | (*vi)->val2filestr(*ri, st, *ex); |
|---|
| 911 | fprintf(file, checkCtrl(st.c_str())); |
|---|
| 912 | } |
|---|
| 913 | } |
|---|
| 914 | |
|---|
| 915 | TMetaVector::const_iterator mb((*ex).domain->metas.begin()), mi, me((*ex).domain->metas.end()); |
|---|
| 916 | |
|---|
| 917 | for(mi = mb; mi != me; mi++) { |
|---|
| 918 | if (!(*mi).optional) { |
|---|
| 919 | PUTDELIM; |
|---|
| 920 | if (DK && ((*ri).valueType == valueDK)) |
|---|
| 921 | fprintf(file, DK); |
|---|
| 922 | else if (DC && ((*ri).valueType == valueDC)) |
|---|
| 923 | fprintf(file, DC); |
|---|
| 924 | else { |
|---|
| 925 | (*mi).variable->val2filestr((*ex)[(*mi).id], st, *ex); |
|---|
| 926 | fprintf(file, "%s", checkCtrl(st.c_str())); |
|---|
| 927 | } |
|---|
| 928 | } |
|---|
| 929 | } |
|---|
| 930 | |
|---|
| 931 | bool first = true; |
|---|
| 932 | for(mi = mb; mi != me; mi++) { |
|---|
| 933 | if ((*mi).optional) { |
|---|
| 934 | const TVariable &var = (*mi).variable.getReference(); |
|---|
| 935 | if ((var.varType == TValue::FLOATVAR) && (*ex).hasMeta((*mi).id)) { |
|---|
| 936 | const TValue &mval = (*ex).getMeta((*mi).id); |
|---|
| 937 | if (!mval.isSpecial()) { |
|---|
| 938 | if (first) { |
|---|
| 939 | PUTDELIM; |
|---|
| 940 | first = false; |
|---|
| 941 | } |
|---|
| 942 | else |
|---|
| 943 | fprintf(file, " "); |
|---|
| 944 | |
|---|
| 945 | if (mval.floatV == 1.0) |
|---|
| 946 | fprintf(file, checkCtrl(var.get_name().c_str())); |
|---|
| 947 | else { |
|---|
| 948 | var.val2filestr(mval, st, *ex); |
|---|
| 949 | fprintf(file, "%s=%s", checkCtrl(var.get_name().c_str()), checkCtrl(st.c_str())); |
|---|
| 950 | } |
|---|
| 951 | } |
|---|
| 952 | } |
|---|
| 953 | } |
|---|
| 954 | } |
|---|
| 955 | fprintf(file, "\n"); |
|---|
| 956 | } |
|---|
| 957 | } |
|---|
| 958 | |
|---|
/* Returns a copy of the string in which every space is preceded by a backslash. */
string escSpaces(const string &s)
{
  string escaped;
  for (string::size_type pos = 0; pos < s.size(); pos++) {
    const char ch = s[pos];
    if (ch == ' ')
      escaped += '\\';
    escaped += ch;
  }
  return escaped;
}
|---|
| 968 | |
|---|
| 969 | extern TOrangeType PyOrPythonVariable_Type; |
|---|
| 970 | |
|---|
| 971 | void printVarType(FILE *file, PVariable var, bool listDiscreteValues) |
|---|
| 972 | { |
|---|
| 973 | TEnumVariable *enumv = var.AS(TEnumVariable); |
|---|
| 974 | if (enumv) { |
|---|
| 975 | TValue val; |
|---|
| 976 | string sval; |
|---|
| 977 | if (!enumv->firstValue(val) || !listDiscreteValues) |
|---|
| 978 | fprintf(file, "d"); |
|---|
| 979 | else { |
|---|
| 980 | enumv->val2str(val, sval); |
|---|
| 981 | fprintf(file, checkCtrl(escSpaces(sval).c_str())); |
|---|
| 982 | while(enumv->nextValue(val)) { |
|---|
| 983 | enumv->val2str(val, sval); |
|---|
| 984 | fprintf(file, " %s", checkCtrl(escSpaces(sval).c_str())); |
|---|
| 985 | } |
|---|
| 986 | } |
|---|
| 987 | } |
|---|
| 988 | else if (var.is_derived_from(TFloatVariable)) |
|---|
| 989 | fprintf(file, "continuous"); |
|---|
| 990 | else if (var.is_derived_from(TStringVariable)) |
|---|
| 991 | fprintf(file, "string"); |
|---|
| 992 | else if (var.is_derived_from(TPythonVariable)) { |
|---|
| 993 | if (var.counter->ob_type == (PyTypeObject *)&PyOrPythonVariable_Type) |
|---|
| 994 | fprintf(file, "python"); |
|---|
| 995 | else { |
|---|
| 996 | PyObject *pyclassname = PyObject_GetAttrString((PyObject *)(var.counter)->ob_type, "__name__"); |
|---|
| 997 | fprintf(file, "python:%s", checkCtrl(PyString_AsString(pyclassname))); |
|---|
| 998 | Py_DECREF(pyclassname); |
|---|
| 999 | } |
|---|
| 1000 | } |
|---|
| 1001 | else |
|---|
| 1002 | raiseErrorWho("tabDelim_writeDomain", "tabDelim format supports only discrete, continuous and string variables"); |
|---|
| 1003 | } |
|---|
| 1004 | |
|---|
| 1005 | |
|---|
| 1006 | void tabDelim_printAttributes(FILE *file, PVariable var, bool needsSpace) { |
|---|
| 1007 | TPyOrange *bvar = (TPyOrange *)(var.counter); |
|---|
| 1008 | PyObject *attrdict = bvar->orange_dict ? PyDict_GetItemString(bvar->orange_dict, "attributes") : NULL; |
|---|
| 1009 | if (attrdict) { |
|---|
| 1010 | PyObject *key, *value; |
|---|
| 1011 | Py_ssize_t pos = 0; |
|---|
| 1012 | while (PyDict_Next(attrdict, &pos, &key, &value)) { |
|---|
| 1013 | if (PyString_Check(key)) |
|---|
| 1014 | Py_INCREF(key); |
|---|
| 1015 | else |
|---|
| 1016 | key = PyObject_Repr(key); |
|---|
| 1017 | if (PyString_Check(value)) |
|---|
| 1018 | Py_INCREF(value); |
|---|
| 1019 | else |
|---|
| 1020 | value = PyObject_Repr(value); |
|---|
| 1021 | fprintf(file, (pos>1) || needsSpace ? " %s=%s" : "%s=%s", PyString_AsString(key), PyString_AsString(value)); |
|---|
| 1022 | Py_DECREF(value); |
|---|
| 1023 | Py_DECREF(key); |
|---|
| 1024 | } |
|---|
| 1025 | } |
|---|
| 1026 | } |
|---|
| 1027 | |
|---|
| 1028 | void tabDelim_writeDomainWithoutDetection(FILE *file, PDomain dom, char delim, bool listDiscreteValues) |
|---|
| 1029 | { |
|---|
| 1030 | TVarList::const_iterator vi, vb(dom->variables->begin()), ve(dom->variables->end()); |
|---|
| 1031 | TMetaVector::const_iterator mi, mb(dom->metas.begin()), me(dom->metas.end()); |
|---|
| 1032 | |
|---|
| 1033 | bool ho = false; |
|---|
| 1034 | bool hasOptionalFloats = false; |
|---|
| 1035 | |
|---|
| 1036 | // First line: attribute names |
|---|
| 1037 | for(vi = vb; vi!=ve; vi++) { |
|---|
| 1038 | PUTDELIM; |
|---|
| 1039 | fprintf(file, "%s", checkCtrl((*vi)->get_name().c_str())); |
|---|
| 1040 | } |
|---|
| 1041 | for(mi = mb; mi!=me; mi++) { |
|---|
| 1042 | if (mi->optional) { |
|---|
| 1043 | if ((*mi).variable->varType == TValue::FLOATVAR) |
|---|
| 1044 | hasOptionalFloats = true; |
|---|
| 1045 | } |
|---|
| 1046 | else { |
|---|
| 1047 | PUTDELIM; |
|---|
| 1048 | fprintf(file, "%s", checkCtrl((*mi).variable->get_name().c_str())); |
|---|
| 1049 | } |
|---|
| 1050 | } |
|---|
| 1051 | |
|---|
| 1052 | if (hasOptionalFloats) { |
|---|
| 1053 | PUTDELIM; |
|---|
| 1054 | fprintf(file, "__basket_foo"); |
|---|
| 1055 | } |
|---|
| 1056 | |
|---|
| 1057 | fprintf(file, "\n"); |
|---|
| 1058 | |
|---|
| 1059 | |
|---|
| 1060 | // Second line: types |
|---|
| 1061 | ho = false; |
|---|
| 1062 | for(vi = vb; vi!=ve; vi++) { |
|---|
| 1063 | PUTDELIM; |
|---|
| 1064 | printVarType(file, *vi, listDiscreteValues); |
|---|
| 1065 | } |
|---|
| 1066 | for(mi = mb; mi!=me; mi++) { |
|---|
| 1067 | if (mi->optional) |
|---|
| 1068 | continue; |
|---|
| 1069 | PUTDELIM; |
|---|
| 1070 | printVarType(file, (*mi).variable, listDiscreteValues); |
|---|
| 1071 | } |
|---|
| 1072 | |
|---|
| 1073 | if (hasOptionalFloats) { |
|---|
| 1074 | PUTDELIM; |
|---|
| 1075 | fprintf(file, "basket"); |
|---|
| 1076 | } |
|---|
| 1077 | |
|---|
| 1078 | fprintf(file, "\n"); |
|---|
| 1079 | |
|---|
| 1080 | |
|---|
| 1081 | // Third line: "meta" and "-ordered" |
|---|
| 1082 | ho = false; |
|---|
| 1083 | for(vb = vi = dom->attributes->begin(), ve = dom->attributes->end(); vi!=ve; vi++) { |
|---|
| 1084 | PUTDELIM; |
|---|
| 1085 | bool isOrdered = ((*vi)->varType == TValue::INTVAR) && (*vi)->ordered; |
|---|
| 1086 | if (isOrdered) |
|---|
| 1087 | fprintf(file, "-ordered"); |
|---|
| 1088 | tabDelim_printAttributes(file, *vi, isOrdered); |
|---|
| 1089 | } |
|---|
| 1090 | if (dom->classVar) { |
|---|
| 1091 | PUTDELIM; |
|---|
| 1092 | fprintf(file, "class"); |
|---|
| 1093 | tabDelim_printAttributes(file, dom->classVar, true); |
|---|
| 1094 | } |
|---|
| 1095 | for(mi = mb; mi!=me; mi++) { |
|---|
| 1096 | if (mi->optional) |
|---|
| 1097 | continue; |
|---|
| 1098 | PUTDELIM; |
|---|
| 1099 | fprintf(file, "meta"); |
|---|
| 1100 | if (((*mi).variable->varType == TValue::INTVAR) && (*mi).variable->ordered) |
|---|
| 1101 | fprintf(file, " -ordered"); |
|---|
| 1102 | tabDelim_printAttributes(file, (*mi).variable, true); |
|---|
| 1103 | } |
|---|
| 1104 | |
|---|
| 1105 | if (hasOptionalFloats) |
|---|
| 1106 | PUTDELIM; |
|---|
| 1107 | |
|---|
| 1108 | fprintf(file, "\n"); |
|---|
| 1109 | } |
|---|
| 1110 | |
|---|
| 1111 | |
|---|
| 1112 | /* If discrete value can be mistakenly read as continuous, we need to add the prefix. |
|---|
| 1113 | This needs to be checked. */ |
|---|
| 1114 | bool tabDelim_checkNeedsD(PVariable var) |
|---|
| 1115 | { |
|---|
| 1116 | bool floated = false; |
|---|
| 1117 | TEnumVariable *enumv = var.AS(TEnumVariable); |
|---|
| 1118 | if (enumv) { |
|---|
| 1119 | TValue val; |
|---|
| 1120 | string sval; |
|---|
| 1121 | char svalc[65]; |
|---|
| 1122 | |
|---|
| 1123 | if (!enumv->firstValue(val)) |
|---|
| 1124 | return true; |
|---|
| 1125 | |
|---|
| 1126 | do { |
|---|
| 1127 | enumv->val2str(val, sval); |
|---|
| 1128 | if (sval.size()>63) |
|---|
| 1129 | return false; |
|---|
| 1130 | |
|---|
| 1131 | if ((sval.size()==1) && (sval[0]>='0') && (sval[0]<='9')) |
|---|
| 1132 | continue; |
|---|
| 1133 | |
|---|
| 1134 | // Convert commas into dots |
|---|
| 1135 | char *sc = svalc; |
|---|
| 1136 | ITERATE(string, si, sval) { |
|---|
| 1137 | *(sc++) = *si==',' ? '.' : *si; |
|---|
| 1138 | *sc = 0; |
|---|
| 1139 | |
|---|
| 1140 | char *eptr; |
|---|
| 1141 | strtod(svalc, &eptr); |
|---|
| 1142 | if (*eptr) |
|---|
| 1143 | return false; |
|---|
| 1144 | else |
|---|
| 1145 | floated = true; |
|---|
| 1146 | } |
|---|
| 1147 | } while (enumv->nextValue(val)); |
|---|
| 1148 | } |
|---|
| 1149 | |
|---|
| 1150 | // All values were either one digit or successfully interpreted as continuous |
|---|
| 1151 | // We need to return true if there were some that were not one-digit... |
|---|
| 1152 | return floated; |
|---|
| 1153 | } |
|---|
| 1154 | |
|---|
| 1155 | |
|---|
| 1156 | void tabDelim_writeDomainWithDetection(FILE *file, PDomain dom, char delim) |
|---|
| 1157 | { |
|---|
| 1158 | bool ho = false; |
|---|
| 1159 | const_PITERATE(TVarList, vi, dom->attributes) { |
|---|
| 1160 | PUTDELIM; |
|---|
| 1161 | fprintf(file, "%s%s", (tabDelim_checkNeedsD(*vi) ? "D#" : ""), checkCtrl((*vi)->get_name().c_str())); |
|---|
| 1162 | } |
|---|
| 1163 | |
|---|
| 1164 | if (dom->classVar) { |
|---|
| 1165 | PUTDELIM; |
|---|
| 1166 | fprintf(file, "%s%s", (tabDelim_checkNeedsD(dom->classVar) ? "cD#" : "c#"), checkCtrl(dom->classVar->get_name().c_str())); |
|---|
| 1167 | } |
|---|
| 1168 | |
|---|
| 1169 | |
|---|
| 1170 | bool hasOptionalFloats = false; |
|---|
| 1171 | |
|---|
| 1172 | const_ITERATE(TMetaVector, mi, dom->metas) { |
|---|
| 1173 | if (mi->optional) { |
|---|
| 1174 | if ((*mi).variable->varType == TValue::FLOATVAR) |
|---|
| 1175 | hasOptionalFloats = true; |
|---|
| 1176 | } |
|---|
| 1177 | else { |
|---|
| 1178 | PUTDELIM; |
|---|
| 1179 | fprintf(file, "%s%s", (tabDelim_checkNeedsD((*mi).variable) ? "mD#" : "m#"), checkCtrl((*mi).variable->get_name().c_str())); |
|---|
| 1180 | } |
|---|
| 1181 | } |
|---|
| 1182 | |
|---|
| 1183 | if (hasOptionalFloats) { |
|---|
| 1184 | PUTDELIM; |
|---|
| 1185 | fprintf(file, "B#__basket_foo"); |
|---|
| 1186 | } |
|---|
| 1187 | |
|---|
| 1188 | fprintf(file, "\n"); |
|---|
| 1189 | } |
|---|
| 1190 | |
|---|
| 1191 | |
|---|
| 1192 | void tabDelim_writeDomain(FILE *file, PDomain dom, bool autodetect, char delim, bool listDiscreteValues) |
|---|
| 1193 | { if (autodetect) |
|---|
| 1194 | tabDelim_writeDomainWithDetection(file, dom, delim); |
|---|
| 1195 | else |
|---|
| 1196 | tabDelim_writeDomainWithoutDetection(file, dom, delim, listDiscreteValues); |
|---|
| 1197 | } |
|---|