Ignore:
Timestamp:
07/01/13 16:27:24 (10 months ago)
Author:
Ales Erjavec <ales.erjavec@…>
Branch:
default
Message:

Cleanup of TLinearLearner/Classifier.

The training examples are sorted before training so that the labels in the
LIBLINEAR model match the order of class_var.values.

TLinearClassifier no longer has the 'examples' member, has a changed
constructor and (un)pickle signature.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • source/orange/liblinear_interface.cpp

    r11016 r11610  
    2727#include <string.h> 
    2828#include <stdarg.h> 
     29#include <assert.h> 
     30 
    2931#include "liblinear_interface.ppp" 
    3032 
     
    292294 
    293295void destroy_problem(problem *prob){ 
    294     for (int i=0; i<prob->l; i++) 
     296    for (int i = 0; i < prob->l; i++) 
    295297        delete[] prob->x[i]; 
    296298    delete[] prob->x; 
     
    299301 
    300302static void dont_print_string(const char *s){} 
     303 
     304 
     305/* 
     306 * Extract feature weights from a LIBLINEAR model. 
     307 * The number of class values must be provided. 
     308 */ 
     309 
     310TFloatListList * extract_feature_weights(model * model, int nr_class_values) { 
     311    /* Number of liblinear classifiers. 
     312     * 
     313     * NOTE: If some class values do not have any data instances in 
     314     * the training set they are not present in the liblinear model 
      315     * so this number might be different from nr_class_values. 
     316     */ 
     317    int nr_classifier = model->nr_class; 
     318    if (model->nr_class == 2 && model->param.solver_type != MCSVM_CS) { 
     319        // model contains a single weight vector 
     320        nr_classifier = 1; 
     321    } 
     322 
     323    // Number of weight vectors to return. 
     324    int nr_orange_weights = nr_class_values; 
     325    if (nr_class_values == 2 && model->param.solver_type != MCSVM_CS) { 
     326        nr_orange_weights = 1; 
     327    } 
     328 
     329    assert(nr_orange_weights >= nr_classifier); 
     330 
     331    int nr_feature = model->nr_feature; 
     332 
     333    if (model->bias >= 0.0){ 
     334        nr_feature++; 
     335    } 
     336 
     337    int* labels = new int[model->nr_class]; 
     338    get_labels(model, labels); 
     339 
     340    // Initialize the weight matrix (nr_orange_weights x nr_features). 
     341    TFloatListList * weights = mlnew TFloatListList(nr_orange_weights); 
     342    for (int i = 0; i < nr_orange_weights; i++){ 
     343        weights->at(i) = mlnew TFloatList(nr_feature, 0.0f); 
     344    } 
     345 
     346    if (nr_classifier > 1) { 
     347        /* 
     348         * NOTE: If some class was missing from the training data set 
     349         * (had no instances) its weight vector will be left initialized 
     350         * to 0 
     351         */ 
     352        for (int i = 0; i < nr_classifier; i++) { 
     353            for (int j = 0; j < nr_feature; j++) { 
     354                weights->at(labels[i])->at(j) = model->w[j * nr_classifier + i]; 
     355            } 
     356        } 
     357    } else { 
     358        for (int j = 0; j < nr_feature; j++) { 
     359            if (nr_orange_weights > 1) { 
     360                /* There were more than 2 orange class values. This means 
     361                 * there were no instances for one or more classed in the 
     362                 * training data set. We cannot simply leave the 'negative' 
     363                 * class vector as zero because we would lose information 
     364                 * which class was used (i.e. we could not make a proper 
     365                 * negative classification using the weights). 
     366                 */ 
     367                weights->at(labels[0])->at(j) = model->w[j]; 
     368                weights->at(labels[1])->at(j) = - model->w[j]; 
     369            } else { 
     370                weights->at(0)->at(j) = model->w[j]; 
     371            } 
     372        } 
     373    } 
     374 
     375    delete[] labels; 
     376 
     377    return weights; 
     378} 
     379 
    301380 
    302381TLinearLearner::TLinearLearner(){ 
     
    307386    set_print_string_function(&dont_print_string); 
    308387} 
     388 
    309389 
    310390PClassifier TLinearLearner::operator()(PExampleGenerator examples, const int &weight){ 
     
    317397    param->weight = NULL; 
    318398 
    319     PVariable classVar = examples->domain->classVar; 
    320     if (!classVar) 
     399    PDomain domain = examples->domain; 
     400 
     401    if (!domain->classVar) 
    321402        raiseError("classVar expected"); 
    322     if (classVar->varType != TValue::INTVAR) 
     403 
     404    if (domain->classVar->varType != TValue::INTVAR) 
    323405        raiseError("Discrete class expected"); 
    324406 
    325     problem *prob = problemFromExamples(examples, bias); 
     407    // Shallow copy of examples. 
     408    PExampleTable train_data = mlnew TExampleTable(examples, /* owns= */ false); 
     409 
     410    /* 
     411     * Sort the training instances by class. 
     412     * This is necessary because LIBLINEAR's class/label/weight order 
     413     * is defined by the order of labels encountered in the training 
     414     * data. By sorting we make sure it matches classVar.values order. 
     415     */ 
     416    vector<int> sort_column(domain->variables->size() - 1); 
     417    train_data->sort(sort_column); 
     418 
     419    problem *prob = problemFromExamples(train_data, bias); 
    326420 
    327421    const char * error_msg = check_parameter(prob, param); 
     
    333427    /* The solvers in liblinear use rand() function. 
    334428     * To make the results reproducible we set the seed from the data table's 
    335      * crc 
     429     * crc. 
    336430     */ 
    337     PExampleTable extable(examples); 
    338     srand(extable->checkSum(false)); 
     431    srand(train_data->checkSum(false)); 
    339432 
    340433    model *model = train(prob, param); 
    341434    destroy_problem(prob); 
    342435 
    343     return PClassifier(mlnew TLinearClassifier(examples->domain->classVar, examples, model)); 
    344 } 
    345  
    346 TLinearClassifier::TLinearClassifier(const PVariable &var, PExampleTable _examples, struct model *_model){ 
    347     classVar = var; 
    348     domain = _examples->domain; 
    349     examples = _examples; 
    350     linmodel = _model; 
    351     bias = _model->bias; 
    352     dbias = _model->bias; 
     436    return PClassifier(mlnew TLinearClassifier(domain, model)); 
     437} 
     438 
     439 
     440/* 
     441 * Construct a TLinearClassifer given a domain and a trained LIBLINEAR 
     442 * constructed model. 
     443 */ 
     444 
     445TLinearClassifier::TLinearClassifier(PDomain domain, struct model * model) : TClassifierFD(domain) { 
     446    linmodel = model; 
     447    bias = model->bias; 
     448    dbias = model->bias; 
    353449 
    354450    computesProbabilities = check_probability_model(linmodel) != 0; 
    355     // Number of class values 
    356     int nr_values = this->get_nr_values(); 
    357  
    358     /* Number of liblinear classifiers (if some class values are missing 
    359      * from the training set they are not present in the liblinear model). 
    360      */ 
    361     int nr_classifier = linmodel->nr_class; 
    362     if (linmodel->nr_class == 2 && linmodel->param.solver_type != MCSVM_CS) 
    363     { 
    364         nr_classifier = 1; 
    365     } 
    366  
    367     // Number of weight vectors exposed in orange. 
    368     int nr_orange_weights = nr_values; 
    369     if (nr_values == 2 && linmodel->param.solver_type != MCSVM_CS) 
    370     { 
    371         nr_orange_weights = 1; 
    372     } 
    373  
    374     int nr_feature = linmodel->nr_feature; 
    375  
    376     if (linmodel->bias >= 0.0) 
    377     { 
    378         nr_feature++; 
    379     } 
    380  
    381     int* labels = new int[linmodel->nr_class]; 
    382     get_labels(linmodel, labels); 
    383  
    384     // Initialize nr_orange_weights vectors 
    385     weights = mlnew TFloatListList(nr_orange_weights); 
    386     for (int i = 0; i < nr_orange_weights; i++) 
    387     { 
    388         weights->at(i) = mlnew TFloatList(nr_feature, 0.0f); 
    389     } 
    390  
    391     if (nr_classifier > 1) 
    392     { 
    393         for (int i = 0; i < nr_classifier; i++) 
    394         { 
    395             for (int j = 0; j < nr_feature; j++) 
    396             { 
    397                 weights->at(labels[i])->at(j) = \ 
    398                         linmodel->w[j*nr_classifier + i]; 
    399             } 
    400         } 
    401 } 
    402     else 
    403     { 
    404         /* If the order of the liblinear internaly stored classes 
    405          * is different from the order of orange's class values, 
    406          * we reverse the weight vector. 
    407          */ 
    408         float factor = (labels[0] == 0)? 1.0f : -1.0f; 
    409  
    410         for (int j = 0; j < nr_feature; j++) 
    411         { 
    412             if (nr_orange_weights > 1) 
    413             { 
    414                /* There are more than 2 orange class values. This means 
    415                 * there were no instances for one or more classed in the training 
    416                 * data set. 
    417                 */ 
    418                 weights->at(labels[0])->at(j) = linmodel->w[j]; 
    419                 weights->at(labels[1])->at(j) = - linmodel->w[j]; 
    420             } 
    421             else 
    422             { 
    423                 weights->at(0)->at(j) = factor * linmodel->w[j]; 
    424             } 
    425         } 
    426     } 
    427     delete[] labels; 
    428 } 
    429  
    430 TLinearClassifier::~TLinearClassifier(){ 
     451 
     452    weights = extract_feature_weights(model, get_nr_values()); 
     453} 
     454 
     455 
     456TLinearClassifier::~TLinearClassifier() { 
    431457    if (linmodel) 
    432458        free_and_destroy_model(&linmodel); 
     
    441467    TEnumVariable * enum_var = NULL; 
    442468    enum_var = dynamic_cast<TEnumVariable*>(classVar.getUnwrappedPtr()); 
    443     if (enum_var) 
    444     { 
     469    if (enum_var) { 
    445470        nr_values = enum_var->noOfValues(); 
    446471    } 
    447     else 
    448     { 
     472    else { 
    449473        raiseError("Discrete class expected."); 
    450474    } 
     
    455479    TExample new_example(domain, example); 
    456480    int numClass = get_nr_class(linmodel); 
    457     map<int, int> indexMap; 
     481 
    458482    feature_node *x = feature_nodeFromExample(new_example, bias); 
    459483 
     
    477501    TExample new_example(domain, example); 
    478502    int numClass = get_nr_class(linmodel); 
    479     map<int, int> indexMap; 
     503 
    480504    feature_node *x = feature_nodeFromExample(new_example, bias); 
    481505 
     
    484508    return TValue(predict_label); 
    485509} 
    486  
Note: See TracChangeset for help on using the changeset viewer.