Changeset 9500:39a2fa79d715 in orange


Ignore:
Timestamp:
12/28/11 22:33:52 (2 years ago)
Author:
matija <matija.polajnar@…>
Branch:
default
Convert:
42a19eee71bc50fb142cd3ab95017611414171c4
Message:

Converting MLC methods to the new file format. Also, some major code revision due to poor code quality.

Location:
orange
Files:
1 deleted
8 edited

Legend:

Unmodified
Added
Removed
  • orange/Orange/multilabel/__init__.py

    r9474 r9500  
     1import Orange 
     2 
    13from multibase import MultiLabelLearner 
    24from multibase import MultiLabelClassifier 
     
    1618from brknn import BRkNNLearner 
    1719from brknn import BRkNNClassifier 
     20 
     21def is_multilabel(data): 
     22    if not data.domain.class_vars: 
     23        return False 
     24    for c in data.domain.class_vars: 
     25        if type(c) is not Orange.data.variable.Discrete or sorted(c.values) != ['0', '1']: 
     26            return False 
     27    return True 
  • orange/Orange/multilabel/br.py

    r9478 r9500  
    1818International Journal of Data Warehousing and Mining, 3(3):1-13, 2007. 
    1919 
     20Note that a copy of the table is made for each label to enable construction of 
     21a classifier. Due to technical limitations, that is currently unavoidable and 
     22should be remedied in Orange 3. 
     23 
    2024.. index:: Binary Relevance Learner 
    2125.. autoclass:: Orange.multilabel.BinaryRelevanceLearner 
     
    2630   BinaryRelevanceLearner Constructor 
    2731    
    28    :param instances: a table of instances, covered by the rule. 
     32   :param instances: a table of instances. 
    2933   :type instances: :class:`Orange.data.Table` 
    3034       
     
    3943   .. method:: __call__(self, example, result_type) 
    4044   :rtype: a list of :class:`Orange.data.Value`,  
    41               :class:`Orange.statistics.Distribution` or a tuple with both  
     45              a list of :class:`Orange.statistics.Distribution` or a tuple 
     46              with both  
    4247    
    4348Examples 
     
    5661 
    5762import Orange 
    58 from Orange.core import BayesLearner as _BayesLearner 
    59 import label 
     63from Orange.classification.bayes import NaiveLearner as _BayesLearner 
    6064import multibase as _multibase 
    6165 
     
    7377        if instances: 
    7478            self.__init__(**argkw) 
    75             return self.__call__(instances,base_learner,weight_id) 
     79            return self.__call__(instances, weight_id) 
    7680        else: 
    7781            return self 
    7882         
    79     def __call__(self, instances, base_learner = None, weight_id = 0, **kwds): 
     83    def __call__(self, instances, weight_id = 0, **kwds): 
     84        if not Orange.multilabel.is_multilabel(instances): 
     85            raise TypeError("The given data set is not a multi-label data set.") 
     86         
    8087        for k in kwds.keys(): 
    8188            self.__dict__[k] = kwds[k] 
    8289 
    83         num_labels = label.get_num_labels(instances) 
    84         label_indices = label.get_label_indices(instances) 
    85          
    8690        classifiers = [] 
    8791             
    88         for i in range(num_labels): 
    89             # Indices of attributes to remove 
    90             #abtain the labels and use a string to represent it and store the classvalues 
    91             new_class = Orange.data.variable.Discrete(instances.domain[ label_indices[i] ].name, values = ['0','1']) 
    92              
    93             #remove the label attributes 
    94             indices_remove = [var for index, var in enumerate(label_indices)] 
    95             new_domain = label.remove_indices(instances,indices_remove) 
    96              
    97             #add the class attribute 
    98             new_domain = Orange.data.Domain(new_domain,new_class) 
     92        for c in instances.domain.class_vars: 
     93            new_domain = Orange.data.Domain(instances.domain.attributes, c) 
    9994             
    10095            #build the instances 
    101             new_table = Orange.data.Table(new_domain) 
    102             for e in instances: 
    103                 new_row = Orange.data.Instance( 
    104                   new_domain,  
    105                   [v.value for v in e if v.variable.attributes.has_key('label') <> 1] + 
    106                         [e[label_indices[i]].value]) 
    107                 new_table.append(new_row) 
    108              
     96            new_table = Orange.data.Table(new_domain, instances) 
    10997            classifer = self.base_learner(new_table) 
    11098            classifiers.append(classifer) 
     
    112100        #Learn from the given table of data instances. 
    113101        return BinaryRelevanceClassifier(instances = instances,  
    114                                          label_indices = label_indices, 
    115102                                         classifiers = classifiers, 
    116103                                         weight_id = weight_id) 
     
    121108        self.__dict__.update(kwds) 
    122109         
    123     def __call__(self, example, result_type=Orange.classification.Classifier.GetValue): 
    124         num_labels = len(self.label_indices) 
     110    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue): 
    125111        domain = self.instances.domain 
    126112        labels = [] 
    127         prob = [] 
    128         if num_labels == 0: 
    129             raise ValueError, "has no label attribute: 'the multilabel data should have at last one label attribute' " 
     113        dists = [] 
    130114         
    131         for i in range(num_labels): 
    132             c,p = self.classifiers[i](example,Orange.classification.Classifier.GetBoth) 
    133             #get the index of label value that = 1, so as to locate the index of label in prob  
    134             label_index = -1 
    135             values = domain[ self.label_indices[i] ].values 
    136             if len(values) > 2: 
    137                 raise ValueError, "invalid label value: 'the label value in instances should be only 0 or 1' " 
    138              
    139             if values[0] == '1' : 
    140                 label_index = 0 
    141             elif values[1] == '1': 
    142                 label_index = 1 
    143             else: 
    144                 raise ValueError, "invalid label value: 'the label value in instances should be only 0 or 1' " 
     115        for c in self.classifiers: 
     116            v, p = c(instance, Orange.classification.Classifier.GetBoth) 
    145117                 
    146             prob.append(p[label_index]) 
    147             labels.append(c) 
    148              
    149             disc = Orange.statistics.distribution.Discrete(prob) 
    150             disc.variable = Orange.core.EnumVariable(values = [domain[val].name for index,val in enumerate(self.label_indices)]) 
     118            labels.append(v) 
     119            dists.append(p) 
    151120             
    152121        if result_type == Orange.classification.Classifier.GetValue: 
    153122            return labels 
    154123        if result_type == Orange.classification.Classifier.GetProbabilities: 
    155             return disc 
    156         return labels,disc 
     124            return dists 
     125        return labels, dists 
    157126         
    158127######################################################################################### 
    159 # Test the code, run from DOS prompt 
    160 # assume the data file is in proper directory 
     128# A quick test/example. 
    161129 
    162130if __name__ == "__main__": 
  • orange/Orange/multilabel/brknn.py

    r9495 r9500  
    2121   BRkNNLearner Constructor 
    2222    
    23    :param instances: a table of instances, covered by the rule. 
     23   :param instances: a table of instances. 
    2424   :type instances: :class:`Orange.data.Table` 
    2525 
     
    4747""" 
    4848import random 
     49import math 
     50 
    4951import Orange 
    50 import label 
    5152import multiknn as _multiknn 
    52 import math 
    5353 
    5454class BRkNNLearner(_multiknn.MultikNNLearner): 
    5555    """ 
    56     Class implementing the BR-kNN algorithm.  
     56    Class implementing the BR-kNN learner.  
    5757     
    5858    .. attribute:: k 
     
    6363    .. attribute:: ext 
    6464     
    65         Extension type. The default is None, means 'Standard BR'; 'a' means Predict top ranked label in case of empty prediction set; 
    66         'b' means Predict top n ranked labels based on size of labelset in neighbours  
     65        Extension type. The default is None, means 'Standard BR'; 'a' means 
     66        predicting top ranked label in case of empty prediction set; 'b' means 
     67        predicting top n ranked labels based on size of labelset in neighbours.  
    6768     
    6869    .. attribute:: knn 
     
    7576        Constructor of BRkNNLearner 
    7677         
    77         :param instances: a table of instances, covered by the rule. 
     78        :param instances: a table of instances. 
    7879        :type instances: :class:`Orange.data.Table` 
    7980         
     
    8182        :type k: int 
    8283         
    83         :param ext:  Extension type (Default value is set to '0' which yields the Standard BR). 
    84         :type smooth: string 
     84        :param ext: extension type (default value is None which yields 
     85            the Standard BR), values 'a' and 'b' are also possible. 
     86        :type ext: string 
    8587         
    8688        :rtype: :class:`BRkNNLearner` 
     
    8991        self = _multiknn.MultikNNLearner.__new__(cls, k, **argkw) 
    9092         
    91         if ext and ext <>'a' and ext <> 'b': 
    92             raise ValueError, "invalid ext value: 'the extension value should be only None, 'a' or 'b' " 
     93        if ext not in [None, 'a', 'b']: 
     94            raise ValueError, "Invalid ext value: should be None, 'a' or 'b'." 
    9395        self.ext = ext 
    9496         
     
    101103 
    102104    def __call__(self, instances, weight_id = 0, **kwds): 
     105        if not Orange.multilabel.is_multilabel(instances): 
     106            raise TypeError("The given data set is not a multi-label data set.") 
     107 
    103108        for k in kwds.keys(): 
    104109            self.__dict__[k] = kwds[k] 
    105  
    106         _multiknn.MultikNNLearner.transfor_table(self,instances) 
    107          
    108         return BRkNNClassifier(instances = instances, label_indices = self.label_indices, 
     110        self._build_knn(instances) 
     111 
     112        labeling_f = [BRkNNClassifier.get_labels, BRkNNClassifier.get_labels_a, 
     113                      BRkNNClassifier.get_labels_b][ [None, 'a', 'b'].index(self.ext) ] 
     114         
     115        return BRkNNClassifier(instances = instances, 
    109116                               ext = self.ext, 
    110117                               knn = self.knn, 
    111                                weight_id = self.weight_id, 
    112                                k = self.k) 
    113  
    114 def max(x,y): 
    115     if x > y: 
    116         return x 
    117     else: 
    118         return y 
    119  
    120 class BRkNNClassifier(_multiknn.MultikNNClassifier):     
    121     def __call__(self, example, result_type=Orange.classification.Classifier.GetValue): 
    122         self.num_labels = len(self.label_indices) 
     118                               k = self.k, 
     119                               labeling_f = labeling_f) 
     120 
     121class BRkNNClassifier(_multiknn.MultikNNClassifier): 
     122    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue): 
    123123        domain = self.instances.domain 
    124         labels = [] 
    125         if self.num_labels == 0: 
    126             raise ValueError, "has no label attribute: 'the multilabel data should have at last one label attribute' " 
    127  
    128         neighbours = self.knn(example, self.k) 
    129         distances = [inst.get_weight(self.weight_id) for i,inst in enumerate(neighbours)] 
    130          
    131         prob = self.get_prob(neighbours, distances) 
    132          
    133         if self.ext == None: 
    134             labels = self.get_label(prob, 0.5) 
    135         elif self.ext == 'a': 
    136             labels = self.get_label_a(prob) 
    137         elif self.ext == 'b': 
    138             labels = self.get_label_b(prob) 
    139          
    140         disc = Orange.statistics.distribution.Discrete(prob) 
    141         disc.variable = Orange.core.EnumVariable( 
    142             values = [domain[val].name for index,val in enumerate(self.label_indices)]) 
     124 
     125        neighbours = self.knn(instance, self.k) 
     126         
     127        prob = self.get_prob(neighbours) 
     128         
     129        labels = self.labeling_f(self, prob, neighbours) 
    143130         
    144131        if result_type == Orange.classification.Classifier.GetValue: 
    145132            return labels 
     133 
     134        dists = [Orange.statistics.distribution.Discrete([1-p, p]) for p in prob] 
     135        for v, d in zip(self.instances.domain.class_vars, dists): 
     136            d.variable = v 
     137 
    146138        if result_type == Orange.classification.Classifier.GetProbabilities: 
    147             return disc 
    148         return labels,disc 
    149      
    150     def get_prob(self, neighbours, distances): 
    151         """ 
    152         Calculates the probabilities of the labels, based on the neighboring instances 
     139            return dists 
     140        return labels, dists 
     141     
     142    def get_prob(self, neighbours): 
     143        """ 
     144        Calculates the probabilities of the labels, based on the neighboring 
     145        instances. 
    153146      
    154         :param neighbours: a list of nearest neighboring instances 
     147        :param neighbours: a list of nearest neighboring instances. 
    155148        :type neighbours: list of :class:`Orange.data.Instance` 
    156149         
    157         :param distances: distance of the neighbours 
    158         :type distances: list of double 
    159          
    160150        :rtype: the prob of the labels 
    161151         
    162152        """ 
    163153        total = 0 
    164         weight = 0 
    165         neighbor_labels = 0 
    166         confidences = [0.0]* self.num_labels 
    167  
    168         #Set up a correction to the estimator 
    169         for i  in range(self.num_labels): 
    170             confidences[i] = 1.0 / max(1, len(self.instances)) 
    171          
    172         total = self.num_labels / max(1, len(self.instances)) 
    173          
    174         for i in range(self.k): 
    175             #Collect class counts 
    176             current = neighbours[i] 
    177             distances[i] = distances[i] * distances[i] 
    178             distances[i] = math.sqrt(distances[i] / (len(self.instances.domain.variables) - self.num_labels)) 
    179             
    180             weight = 1.0 
    181             #weight *= current.weight(); 
    182  
    183             for j in range(self.num_labels): 
    184                 value = current.get_class().value[j] 
     154        label_count = len(self.instances.domain.class_vars) 
     155        confidences = [1.0 / max(1, len(self.instances))] * label_count 
     156 
     157        total = float(label_count) / max(1, len(self.instances)) 
     158         
     159        for neigh in neighbours: 
     160            vals = neigh.get_classes() 
     161            for j, value in enumerate(vals): 
    185162                if value == '1': 
    186                     confidences[j] += weight 
    187                     neighbor_labels += weight 
    188             total += weight 
    189  
    190         self.avg_predicted_labels = int(math.ceil(neighbor_labels / total)) 
    191          
    192         #Normalise distribution 
     163                    confidences[j] += 1 
     164            total += 1 
     165 
     166        #Normalize distribution 
    193167        if total > 0: 
    194168            confidences = [con/total for con in confidences] 
     
    196170        return confidences 
    197171     
    198     def get_label(self, prob, thresh): 
    199         labels = [] 
    200         for i in range(self.num_labels): 
    201             if prob[i] >= thresh: 
    202                 labels.append(Orange.data.Value(self.instances.domain[self.label_indices[i]],'1')) 
    203             else: 
    204                 labels.append(Orange.data.Value(self.instances.domain[self.label_indices[i]],'0')) 
    205             
    206         return labels 
    207      
    208     def get_label_a(self, prob): 
     172    def get_labels(self, prob, _neighs=None, thresh=0.5): 
     173        return [Orange.data.Value(lvar, str(int(p>thresh))) 
     174                for p, lvar in zip(prob, self.instances.domain.class_vars)] 
     175     
     176    def get_labels_a(self, prob, _neighs=None): 
    209177        """ 
    210178        used for BRknn-a 
     
    215183        :rtype: the list label value 
    216184        """ 
    217         labels = [] 
    218         flag = False; #check the case that no label is true 
    219  
    220         for i in range(self.num_labels): 
    221             if prob[i] >= 0.5: 
    222                 labels.append(Orange.data.Value(self.instances.domain[self.label_indices[i]],'1')) 
    223             else: 
    224                 labels.append(Orange.data.Value(self.instances.domain[self.label_indices[i]],'0')) 
     185        labels = self.get_labels(prob) 
    225186             
    226         #assign the class with the greater confidence 
    227         if flag == False: 
    228             max_p = -1 
    229             index = -1 
    230             for i in range(len(prob)): 
    231                 if max_p <= prob[i]: 
    232                     max_p = prob[i] 
    233                     index = i 
    234             if index <> -1: 
    235                 labels[index].value = '1' 
     187        #assign the class with the greatest confidence 
     188        if all(l.value=='0' for l in labels): 
     189            index = max((v,i) for i,v in enumerate(prob))[1] 
     190            labels[index].value = '1' 
    236191         
    237192        return labels 
    238193     
    239     def get_label_b(self, prob): 
     194    def get_labels_b(self, prob, neighs): 
    240195        """ 
    241196        used for BRknn-b 
     
    247202        """ 
    248203         
    249         labels = [] 
    250         for i in range(self.num_labels): 
    251             labels.append(Orange.data.Value(self.instances.domain[self.label_indices[i]],'0')) 
    252          
    253         prob_copy = prob 
    254         prob_copy.sort() 
    255          
    256         indices = [] 
    257         counter = 0 
    258  
    259         for i in range(self.num_labels): 
    260             if prob[i] > prob[self.num_labels - self.avg_predicted_labels]: 
    261                 labels[i].value = '1' 
    262                 counter = counter + 1 
    263             elif prob[i] == prob[self.num_labels - self.avg_predicted_labels]: 
    264                 indices.append(i) 
    265  
    266         size = len(indices) 
    267  
    268         j = self.avg_predicted_labels - counter 
    269         while j > 0: 
    270             next = random.randint(0,size-1) 
    271             if labels[indices[next]] <> '1': 
    272                 labels[indices[next]] = '1' 
    273                 j = j - 1 
    274          
    275         return labels        
     204        labels = [Orange.data.Value(lvar, '0') 
     205                  for p, lvar in zip(prob, self.instances.domain.class_vars)] 
     206         
     207        avg_label_cnt = sum(sum(l.value=='1' for l in n.get_classes()) 
     208                            for n in neighs) / float(len(neighs)) 
     209        avg_label_cnt = int(round(avg_label_cnt)) 
     210         
     211        for p, lval in sorted(zip(prob, labels), reverse=True)[:avg_label_cnt]: 
     212            lval.value = '1' 
     213 
     214        return labels 
    276215     
    277216######################################################################################### 
  • orange/Orange/multilabel/lp.py

    r9494 r9500  
    2222   LabelPowersetLearner Constructor 
    2323    
    24    :param instances: a table of instances, covered by the rule. 
     24   :param instances: a table of instances. 
    2525   :type instances: :class:`Orange.data.Table` 
    2626       
     
    5353import Orange 
    5454from Orange.core import BayesLearner as _BayesLearner 
    55 import label 
    5655import multibase as _multibase 
     56 
     57def get_label_bitstream(e): 
     58    return ''.join(lv.value for lv in e.get_classes()) 
     59 
     60def transform_to_powerset(instances): 
     61    new_class = Orange.data.variable.Discrete("label") 
     62     
     63    for e in instances: 
     64        class_value = get_label_bitstream(e) 
     65        new_class.add_value(class_value) 
     66     
     67    new_domain = Orange.data.Domain(instances.domain.attributes, new_class) 
     68     
     69    #build the instances 
     70    new_table = Orange.data.Table(new_domain) 
     71    for e in instances: 
     72        new_row = Orange.data.Instance( 
     73          new_domain, 
     74          [e[a].value for a in instances.domain.attributes] + 
     75          [get_label_bitstream(e)]) 
     76         
     77        new_table.append(new_row) 
     78     
     79    return new_table 
    5780 
    5881class LabelPowersetLearner(_multibase.MultiLabelLearner): 
     
    6285    def __new__(cls, instances = None, base_learner = None, weight_id = 0, **argkw): 
    6386        self = _multibase.MultiLabelLearner.__new__(cls, **argkw) 
    64         if base_learner: 
    65             self.base_learner = base_learner 
    66         else: 
    67             self.base_learner = _BayesLearner 
    6887         
    6988        if instances: 
    7089            self.__init__(**argkw) 
    71             return self.__call__(instances,base_learner,weight_id) 
     90            return self.__call__(instances, base_learner, weight_id) 
    7291        else: 
    7392            return self 
    7493                 
    7594    def __call__(self, instances, base_learner = None, weight_id = 0, **kwds): 
    76         for k in kwds.keys(): 
    77             self.__dict__[k] = kwds[k] 
     95        if not Orange.multilabel.is_multilabel(instances): 
     96            raise TypeError("The given data set is not a multi-label data set.") 
    7897 
    79         num_labels = label.get_num_labels(instances) 
    80         label_indices = label.get_label_indices(instances) 
     98        self.__dict__.update(kwds) 
     99 
     100        new_table = transform_to_powerset(instances) 
    81101         
    82         #abtain the labels and use a string to represent it and store the classvalues 
    83         new_class = Orange.data.variable.Discrete("label") 
    84          
    85         for e in instances: 
    86             class_value = label.get_label_bitstream(instances,e) 
    87             new_class.add_value(class_value) 
    88          
    89         #remove the label attributes 
    90         indices_remove = [var for index, var in enumerate(label_indices)] 
    91         new_domain = label.remove_indices(instances,indices_remove) 
    92          
    93         #add the class attribute 
    94         new_domain = Orange.data.Domain(new_domain,new_class) 
    95          
    96         #build the instances 
    97         new_table = Orange.data.Table(new_domain) 
    98         for e in instances: 
    99             new_row = Orange.data.Instance( 
    100               new_domain,  
    101               [v.value for v in e if v.variable.attributes.has_key('label') <> 1] + 
    102                     [label.get_label_bitstream(instances,e)]) 
    103              
    104             new_table.append(new_row) 
    105               
    106102        #store the classifier 
    107         classifier = self.base_learner(new_table) 
     103        base_learner = base_learner if base_learner else _BayesLearner 
     104        classifier = base_learner(new_table) 
    108105         
    109106        #Learn from the given table of data instances. 
    110107        return LabelPowersetClassifier(instances = instances,  
    111                                        label_indices = label_indices, 
    112108                                       classifier = classifier, 
    113109                                       weight_id = weight_id) 
    114110 
    115 class LabelPowersetClassifier(_multibase.MultiLabelClassifier):       
    116     def __call__(self, example, result_type=Orange.classification.Classifier.GetValue): 
    117         num_labels = len(self.label_indices) 
    118         domain = self.instances.domain 
     111class LabelPowersetClassifier(_multibase.MultiLabelClassifier): 
     112    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue): 
    119113        labels = [] 
    120114        prob = [] 
    121         if num_labels == 0: 
    122             raise ValueError, "has no label attribute: 'the multilabel data should have at last one label attribute' " 
    123115         
    124         c,p = self.classifier(example,Orange.classification.Classifier.GetBoth) 
    125         str = c.value 
    126         for i in range(len(str)): 
    127             if str[i] == '0': 
    128                 labels.append(Orange.data.Value(domain[self.label_indices[i]],'0')) 
    129                 prob.append(0.0) 
    130             elif str[i] == '1': 
    131                 labels.append(Orange.data.Value(domain[self.label_indices[i]],'1')) 
    132                 prob.append(1.0) 
    133             else: 
    134                 #raise ValueError, "invalid label value: 'the label value in instances should be only 0 or 1' " 
    135                 labels.append(Orange.data.Value(domain[self.label_indices[i]],'?')) 
    136                 prob.append(0.0) 
    137          
    138         disc = Orange.statistics.distribution.Discrete(prob) 
    139         disc.variable = Orange.core.EnumVariable(values = [domain[val].name for index,val in enumerate(self.label_indices)]) 
     116        c = self.classifier(instance) 
     117        for bit, lvar in zip(c.value, self.instances.domain.class_vars): 
     118            labels.append(Orange.data.Value(lvar, bit)) 
     119            prob.append(float(bit == '1')) 
    140120         
    141121        if result_type == Orange.classification.Classifier.GetValue: 
    142122            return labels 
     123         
     124        disc = [Orange.statistics.distribution.Discrete([1-p, p]) for p in prob] 
     125        for v, d in zip(self.instances.domain.class_vars, disc): 
     126            d.variable = v 
     127         
    143128        if result_type == Orange.classification.Classifier.GetProbabilities: 
    144129            return disc 
  • orange/Orange/multilabel/mlknn.py

    r9495 r9500  
    66*************************************** 
    77 
    8 ML-kNN Classification is a kind of adaptation method for multi-label classification.  
    9 It is an adaptation of the kNN lazy learning algorithm for multi-label data.  
     8ML-kNN Classification is a kind of adaptation method for multi-label classification. 
     9It is an adaptation of the kNN lazy learning algorithm for multi-label data. 
    1010In essence, ML-kNN uses the kNN algorithm independently for each label :math:`l`. 
    11 It finds the k nearest examples to the test instance and considers those that are  
    12 labelled at least with :math:`l` as positive and the rest as negative.  
    13 Actually this method follows the paradigm of Binary Relevance (BR). What mainly  
     11It finds the k nearest examples to the test instance and considers those that are 
     12labelled at least with :math:`l` as positive and the rest as negative. 
     13Actually this method follows the paradigm of Binary Relevance (BR). What mainly 
    1414differentiates this method from BR is the use of prior probabilities. ML-kNN has also 
    1515the capability of producing a ranking of the labels as an output. 
    16 For more information, see Zhang, M. and Zhou, Z. 2007. `ML-KNN: A lazy learning  
     16For more information, see Zhang, M. and Zhou, Z. 2007. `ML-KNN: A lazy learning 
    1717approach to multi-label learning <http://dx.doi.org/10.1016/j.patcog.2006.12.019>`_.  
    1818Pattern Recogn. 40, 7 (Jul. 2007), 2038-2048.   
     
    2626   MLkNNLearner Constructor 
    2727    
    28    :param instances: a table of instances, covered by the rule. 
     28   :param instances: a table of instances. 
    2929   :type instances: :class:`Orange.data.Table` 
    3030 
     
    5353import random 
    5454import Orange 
    55 import label 
    5655import multiknn as _multiknn 
     56 
     57from lp import transform_to_powerset 
    5758 
    5859class MLkNNLearner(_multiknn.MultikNNLearner): 
     
    125126        Constructor of MLkNNLearner 
    126127         
    127         :param instances: a table of instances, covered by the rule. 
     128        :param instances: a table of instances. 
    128129        :type instances: :class:`Orange.data.Table` 
    129130         
     
    148149 
    149150    def __call__(self, instances, weight_id = 0, **kwds): 
    150         for k in kwds.keys(): 
    151             self.__dict__[k] = kwds[k] 
    152  
    153         _multiknn.MultikNNLearner.transfor_table(self,instances) 
    154  
    155         num_labels = self.num_labels 
     151        if not Orange.multilabel.is_multilabel(instances): 
     152            raise TypeError("The given data set is not a multi-label data set.") 
     153         
     154        self.__dict__.update(kwds) 
     155        self._build_knn(instances) 
     156 
     157        #Computing the prior probabilities P(H_b^l) 
     158        prior_prob = self.compute_prior(instances) 
     159         
     160        #Computing the posterior probabilities P(E_j^l|H_b^l) 
     161        cond_prob = list(self.compute_cond(instances)) 
     162         
     163        return MLkNNClassifier(instances = instances, 
     164                               prior_prob = prior_prob,  
     165                               cond_prob = cond_prob, 
     166                               knn = self.knn, 
     167                               k = self.k) 
     168 
     169    def compute_prior(self, instances): 
     170        """ Computing Prior Probabilities for each label of the training set """ 
     171        prior_prob = [] 
     172        for lvar in instances.domain.class_vars: 
     173            freq = sum(inst[lvar].value == '1' for inst in instances) 
     174            prior_prob.append( float(self.smooth + freq) / (self.smooth * 2 + len(instances)) ) 
     175        return prior_prob 
     176             
     177    def compute_cond(self, instances): 
     178        """ Compute the posterior probabilities P(E_j^l|H_b^l) for each label of the training set. """ 
    156179        k = self.k 
    157180         
    158         #A table holding the prior probability for an instance to belong in each class 
    159         self.prior_probabilities  = [0.] * num_labels 
    160          
    161         #A table holding the prior probability for an instance not to belong in each class 
    162         self.prior_nprobabilities = [0.] * num_labels 
    163          
    164         #A table holding the probability for an instance to belong in each class given that i:0..k of its neighbors belong to that class 
    165         self.cond_probabilities   = [ [0.] * (k + 1) ] * num_labels 
    166          
    167         #A table holding the probability for an instance not to belong in each class given that i:0..k of its neighbors belong to that class 
    168         self.cond_nprobabilities  = [ [0.] * (k + 1) ] * num_labels 
    169          
    170         #Computing the prior probabilities P(H_b^l) 
    171         self.compute_prior(instances) 
    172          
    173         #Computing the posterior probabilities P(E_j^l|H_b^l) 
    174         self.compute_cond(instances) 
    175          
    176         return MLkNNClassifier(instances = instances, label_indices = self.label_indices,  
    177                                prior_probabilities = self.prior_probabilities,  
    178                                prior_nprobabilities = self.prior_nprobabilities, 
    179                                cond_probabilities = self.cond_probabilities, 
    180                                cond_nprobabilities = self.cond_nprobabilities, 
    181                                knn = self.knn, 
    182                                k = self.k, 
    183                                weight_id = weight_id) 
    184  
    185     def compute_prior(self,instances): 
    186         """ Computing Prior and PriorN Probabilities for each class of the training set """ 
    187         num_instances = len(instances) 
    188         for i in range(self.num_labels): 
    189             temp_ci = 0 
    190             for j in range(num_instances): 
    191                 value = instances[j][self.label_indices[i]].value 
    192                 if value == '1': 
    193                     temp_ci = temp_ci+1 
    194             self.prior_probabilities[i] = (self.smooth + temp_ci) / (self.smooth * 2 + num_instances) 
    195             self.prior_nprobabilities[i] = 1 - self.prior_probabilities[i] 
    196              
    197     def compute_cond(self,instances): 
    198         """ Computing Cond and CondN Probabilities for each class of the training set """ 
    199         num_labels = self.num_labels 
    200         label_indices = self.label_indices 
    201         k = self.k 
    202         num_instances = len(instances) 
    203          
    204         temp_ci  = [ [0] * (k + 1) ] * num_labels 
    205         temp_nci = [ [0] * (k + 1) ] * num_labels 
    206  
    207         for i  in range(num_instances): 
    208             neighbors = self.knn(instances[i], k) 
    209                   
    210             # now compute values of temp_ci and temp_nci for every class label 
    211             for j in range(num_labels): 
    212                 aces = 0 # num of aces in Knn for j 
    213                 for m in range(k): 
    214                     value = neighbors[m].get_class().value[j] 
    215                     if value == '1': 
    216                         aces = aces + 1 
    217                       
    218                 #raise the counter of temp_ci[j][aces] and temp_nci[j][aces] by 1 
    219                 if instances[i][label_indices[j]].value == '1': 
    220                     temp_ci[j][aces] = temp_ci[j][aces] + 1 
    221                 else: 
    222                     temp_nci[j][aces] = temp_nci[j][aces] + 1 
     181        def _remove_identical(table, inst): 
     182            try: 
     183                i = [inst1.get_classes() == inst.get_classes() for inst1 in table].index(1) 
     184            except: 
     185                i = -1 
     186            del table[i] 
     187            return table 
     188             
     189             
     190        neighbor_lists = [_remove_identical(self.knn(inst, k+1), inst) for inst in instances] 
     191        p1 = [[0]*(k+1) for lvar in instances.domain.class_vars] 
     192        p0 = [[0]*(k+1) for lvar in instances.domain.class_vars] 
     193 
     194        for li, lvar in enumerate(instances.domain.class_vars): 
     195            c  = [0] * (k + 1) 
     196            cn = [0] * (k + 1) 
     197             
     198            for inst, neighbors in zip(instances, neighbor_lists): 
     199                delta = sum(n[lvar].value=='1' for n in neighbors) 
    223200                 
    224         # compute cond_probabilities[i][..] for labels based on temp_ci[] 
    225         for i in range(num_labels): 
    226             temp1 = 0 
    227             temp2 = 0 
    228             for j in range(k + 1): 
    229                 temp1 += temp_ci[i][j] 
    230                 temp2 += temp_nci[i][j] 
    231             for j in range(k + 1): 
    232                 self.cond_probabilities[i][j] = (self.smooth + temp_ci[i][j]) / (self.smooth * (k + 1) + temp1) 
    233                 self.cond_nprobabilities[i][j] = (self.smooth + temp_nci[i][j]) / (self.smooth * (k + 1) + temp2) 
     201                (c if inst[lvar].value == '1' else cn)[delta] += 1 
     202                 
     203            for j in range(k+1): 
     204                p1[li][j] = float(self.smooth + c[j]) / (self.smooth * (k+1) + sum(c)) 
     205                p0[li][j] = float(self.smooth + cn[j]) / (self.smooth * (k+1) + sum(cn)) 
     206         
     207        return p0, p1 
    234208  
    235209class MLkNNClassifier(_multiknn.MultikNNClassifier):       
    236     def __call__(self, example, result_type=Orange.classification.Classifier.GetValue): 
    237         num_labels = len(self.label_indices) 
    238         domain = self.instances.domain 
     210    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue): 
     211        neighbors = self.knn(instance, self.k) 
     212         
    239213        labels = [] 
    240         prob = [] 
    241         if num_labels == 0: 
    242             raise ValueError, "has no label attribute: 'the multilabel data should have at last one label attribute' " 
    243          
    244         #Computing y_t and r_t 
    245         neighbors = self.knn(example, self.k) 
    246         for i in range(num_labels): 
    247             # compute sum of aces in KNN 
    248             aces = 0  #num of aces in Knn for i 
    249             for m in range(self.k): 
    250                 value = neighbors[m].get_class().value[i] 
    251                 if value == '1': 
    252                     aces = aces + 1 
    253      
    254             prob_in = self.prior_probabilities[i] * self.cond_probabilities[i][aces] 
    255             prob_out = self.prior_nprobabilities[i] * self.cond_nprobabilities[i][aces] 
    256                  
    257             if prob_in > prob_out: 
    258                 labels.append(Orange.data.Value(domain[self.label_indices[i]],'1')) 
    259             elif prob_in < prob_out: 
    260                 labels.append(Orange.data.Value(domain[self.label_indices[i]],'0')) 
    261             else: 
    262                 rnd = random.randint(0,1) 
    263                 if rnd == 0: 
    264                     labels.append(Orange.data.Value(domain[self.label_indices[i]],'0')) 
    265                 else: 
    266                     labels.append(Orange.data.Value(domain[self.label_indices[i]],'1')) 
    267              
    268             #ranking function 
    269             prob.append( prob_in / (prob_in + prob_out) ) 
     214        dists = [] 
     215         
     216        for li, lvar in enumerate(self.instances.domain.class_vars): 
     217            delta = sum(n[lvar].value=='1' for n in neighbors) 
     218     
     219            p1 = self.prior_prob[li] * self.cond_prob[1][li][delta] 
     220            p0 = (1-self.prior_prob[li]) * self.cond_prob[0][li][delta] 
     221            y = (p1 >= p0) 
     222            labels.append(Orange.data.Value(lvar, str(int(y)))) 
     223             
     224            r = p1 / (p0+p1) 
     225            dists.append( Orange.statistics.distribution.Discrete([1-r, r]) ) 
    270226        
    271         disc = Orange.statistics.distribution.Discrete(prob) 
    272         disc.variable = Orange.core.EnumVariable( 
    273             values = [domain[val].name for index,val in enumerate(self.label_indices)]) 
     227        for d, lvar in zip(dists, self.instances.domain.class_vars): 
     228            d.variable = lvar 
    274229         
    275230        if result_type == Orange.classification.Classifier.GetValue: 
    276231            return labels 
    277232        if result_type == Orange.classification.Classifier.GetProbabilities: 
    278             return disc 
    279         return labels,disc 
     233            return dists 
     234        return labels, dists 
    280235         
    281236######################################################################################### 
  • orange/Orange/multilabel/mulan.py

    r9489 r9500  
    44 
    55def trans_mulan_data(xml_name,arff_name, create_on_new = Orange.data.variable.Variable.MakeStatus.Incompatible, **kwargs): 
    6     """ transform the mulan data format to Tab file 
     6    """ Transform the Mulan data format into a Tab file. 
    77     
    88        :param xml: a text file in XML format, specifying the labels and any hierarchical relationship among them.  
     
    2828     
    2929    #remove class tag 
    30     domain = Orange.data.Domain(domain,False) 
    31      
    32     for i, var in enumerate(domain.variables): 
    33         if var.name in labels: 
    34             domain[i].attributes["label"] = 1 
     30    features = [v for v in domain.attributes if v.name not in labels] 
     31    class_vars = [v for v in domain.attributes if v.name in labels] 
     32    domain = Orange.data.Domain(features, None, class_vars = class_vars) 
    3533     
    3634    table = arff_table.translate(domain) 
  • orange/Orange/multilabel/multiknn.py

    r9470 r9500  
    99*************************************** 
    1010 
    11 MultikNN Classification is the base class of kNN method based multi-label classification.  
     11MultikNN Classification is the base class of kNN method based multi-label 
     12classification.  
    1213It is an adaptation of the kNN lazy learning algorithm for multi-label data.  
    13 For more information, see Zhang, M. and Zhou, Z. 2007. `ML-KNN: A lazy learning approach to multi-label learning <http://dx.doi.org/10.1016/j.patcog.2006.12.019>`_.  
     14For more information, see Zhang, M. and Zhou, Z. 2007. `ML-KNN: A lazy learning 
     15approach to multi-label learning <http://dx.doi.org/10.1016/j.patcog.2006.12.019>`_.  
    1416Pattern Recogn. 40, 7 (Jul. 2007), 2038-2048.   
    1517 
     
    2224   MLkNNLearner Constructor 
    2325    
    24    :param instances: a table of instances, covered by the rule. 
     26   :param instances: a table of instances. 
    2527   :type instances: :class:`Orange.data.Table` 
    2628 
     
    4850""" 
    4951import random 
     52 
    5053import Orange 
    51 import label 
    5254import multibase as _multibase 
    5355 
     
    8688        self.k = k 
    8789        return self 
    88  
    89     def transfor_table(self, instances): 
    90         """ build the instances table using power set transfor method  
    91          
    92         :param instances: a table of instances, covered by the rule. 
    93         :type instances: :class:`Orange.data.Table` 
    94          
    95         :rtype: :class:`Orange.data.Table` 
    96          
    97         """ 
    98          
    99         self.num_labels = label.get_num_labels(instances) 
    100         self.label_indices = label.get_label_indices(instances) 
    101          
    102         num_labels = self.num_labels 
    103         label_indices = self.label_indices 
    104         k = self.k 
    105          
    106         #build a kNNLearner 
    107         #remove labels 
    108         indices_remove = [var for index, var in enumerate(label_indices)] 
    109         new_domain = label.remove_indices(instances,indices_remove)  
    110          
    111         new_class = Orange.data.variable.Discrete("label") 
    112         for e in instances: 
    113             class_value = label.get_label_bitstream(instances,e) 
    114             new_class.add_value(class_value) 
    115          
    116         new_domain = Orange.data.Domain(new_domain,new_class) 
    117          
    118         new_table = Orange.data.Table(new_domain) 
    119         for e in instances: 
    120             new_row = Orange.data.Instance( 
    121               new_domain,  
    122               [v.value for v in e if v.variable.attributes.has_key('label') <> 1] + 
    123                     [label.get_label_bitstream(instances,e)]) 
    124              
    125             new_table.append(new_row) 
    126          
     90     
     91    def _build_knn(self, instances): 
    12792        nnc = Orange.classification.knn.FindNearestConstructor() 
    12893        nnc.distanceConstructor = Orange.core.ExamplesDistanceConstructor_Euclidean() 
    12994         
    13095        weight_id = Orange.core.newmetaid() 
    131         self.knn = nnc(new_table, 0, weight_id) 
     96        self.knn = nnc(instances, 0, weight_id) 
    13297        self.weight_id = weight_id 
    133          
    134         return new_table 
    135          
     98 
    13699class MultikNNClassifier(_multibase.MultiLabelClassifier):    
    137100    pass 
  • orange/doc/Orange/rst/code/mlc-classify.py

    r9470 r9500  
    11import Orange 
    22 
    3 data = Orange.data.Table("multidata.tab") 
     3def test_mlc(data, learners): 
     4    for l in learners: 
     5        c = l(data) 
     6        for e in data: 
     7            labels, probs = c(e, Orange.classification.Classifier.GetBoth) 
     8            print [val.value for val in labels], "[%s]" % ", ".join("(%.4f, %.4f)" % (p['0'], p['1']) for p in probs) 
     9        print 
    410 
    5 classifier = Orange.multilabel.BinaryRelevanceLearner(data) 
    6  
    7 for e in data: 
    8     c,p = classifier(e,Orange.classification.Classifier.GetBoth) 
    9     print c,p 
    10     #prints [<orange.Value 'Sports'='1'>, <orange.Value 'Politics'='1'>] <1.000, 0.000, 0.000, 1.000> 
    11     #prints [<orange.Value 'SCience'='1'>, <orange.Value 'Politics'='1'>] <0.000, 0.000, 1.000, 1.000> 
    12     #prints [<orange.Value 'Sports'='1'>] <1.000, 0.000, 0.000, 0.000> 
    13     #prints [<orange.Value 'Religion'='1'>, <orange.Value 'SCience'='1'>] <0.000, 1.000, 1.000, 0.000> 
    14      
    15 powerset_cliassifer = Orange.multilabel.LabelPowersetLearner(data) 
    16 for e in data: 
    17     c,p = powerset_cliassifer(e,Orange.classification.Classifier.GetBoth) 
    18     print c,p 
    19     #prints [<orange.Value 'Sports'='1'>, <orange.Value 'Politics'='1'>] <1.000, 0.000, 0.000, 1.000> 
    20     #prints [<orange.Value 'SCience'='1'>, <orange.Value 'Politics'='1'>] <0.000, 0.000, 1.000, 1.000> 
    21     #prints [<orange.Value 'Sports'='1'>] <1.000, 0.000, 0.000, 0.000> 
    22     #prints [<orange.Value 'Religion'='1'>, <orange.Value 'SCience'='1'>] <0.000, 1.000, 1.000, 0.000> 
    23  
    24 mlknn_cliassifer = Orange.multilabel.MLkNNLearner(data,k=1) 
    25 for e in data: 
    26     c,p = mlknn_cliassifer(e,Orange.classification.Classifier.GetBoth) 
    27     print c,p 
    28      
    29 br_cliassifer = Orange.multilabel.BRkNNLearner(data,k=1) 
    30 for e in data: 
    31     c,p = br_cliassifer(e,Orange.classification.Classifier.GetBoth) 
    32     print c,p 
     11learners = [Orange.multilabel.BinaryRelevanceLearner(), 
     12            Orange.multilabel.LabelPowersetLearner(), 
     13            Orange.multilabel.MLkNNLearner(k=1), 
     14            Orange.multilabel.MLkNNLearner(k=5), 
     15            Orange.multilabel.BRkNNLearner(k=1), 
     16            Orange.multilabel.BRkNNLearner(k=5), 
     17            Orange.multilabel.BRkNNLearner(k=5,ext='a'), 
     18            Orange.multilabel.BRkNNLearner(k=5,ext='b') 
     19            ] 
     20             
     21test_mlc(Orange.data.Table("multidata.tab"), learners) 
     22test_mlc(Orange.data.Table("emotions.tab"), learners) 
Note: See TracChangeset for help on using the changeset viewer.