multilabel/classify-split.py at master · mendozamanu/multilabel · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Script to classify the large datasets, which were partitioned with an iterative train-test split (70-30)

import numpy as np
import scipy.sparse as sp
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
import datetime

def readDataFromFile(fileName):
    """Read a multilabel dataset file into a feature matrix and a label matrix.

    The file starts with four header lines. Only the second
    whitespace-separated token of each one is used:

    1. format marker: the file is parsed as sparse when that token, minus
       its last character, equals ``'SPARSE'``; anything else means dense;
    2. number of instances;
    3. number of attributes;
    4. number of labels.

    Every following line is one instance. Tokens before a standalone ``[``
    are features, tokens after it are labels, and a standalone ``]`` is
    ignored. In sparse mode each feature token is ``index:value`` with a
    1-based column index; in dense mode the features are positional.

    Args:
        fileName: path of the dataset file to read.

    Returns:
        A tuple ``(X, y)``: ``X`` is a ``scipy.sparse.csr_matrix`` of floats
        with shape ``(instances, attributes)`` and ``y`` is a NumPy integer
        array with shape ``(instances, labels)``.

    Raises:
        IndexError: if a header line has fewer than two tokens or the data
            does not fit the declared dimensions.
        ValueError: if a count or a data token is not numeric.
    """
    # The context manager guarantees the file is closed even when parsing fails.
    with open(fileName, 'r') as handle:
        sparse = handle.readline().split()[1][:-1] == 'SPARSE'
        instances = int(handle.readline().split()[1])
        attributes = int(handle.readline().split()[1])
        labels = int(handle.readline().split()[1])

        # Both matrices start out zero-filled, so only the values that are
        # actually present in the file have to be written.
        X = np.zeros((instances, attributes), dtype=float)
        y = np.zeros((instances, labels), dtype=int)

        for numberLine, line in enumerate(handle):
            putToX = True    # switches to False once '[' has been seen
            numberData = 0   # position of the current token within the line
            numberY = 0      # next label column to fill
            for data in line.split():
                if data == '[':
                    putToX = False
                elif data == ']':
                    pass
                elif putToX:
                    if sparse:
                        sparseArray = data.split(':')
                        # Store the value directly. The former explicit
                        # zero-fill between indices was off by one and
                        # erased features already stored on this row.
                        X[numberLine, int(sparseArray[0]) - 1] = float(sparseArray[1])
                    else:
                        X[numberLine, numberData] = float(data)
                else:
                    # Labels may be written as floats (e.g. "1.0").
                    y[numberLine, numberY] = int(float(data))
                    numberY += 1
                numberData += 1

    return sp.csr_matrix(X), y

# Datasets to process. A tuple (rather than a set) keeps the processing
# order stable from one run to the next.
dataset = (
    'delicious',
    'bookmarks',
    'mediamill',
    'tmc2007',
)

def timeStamp(fn, fmt='{fname}_%Y-%m-%d-%H-%M-%S.report'):
    """Build a name from ``fn`` and the current local date and time.

    ``fmt`` is first expanded with ``strftime`` using the current time, then
    its ``{fname}`` placeholder is filled with ``fn``.

    Args:
        fn: text substituted for ``{fname}``.
        fmt: template holding ``strftime`` directives and ``{fname}``.

    Returns:
        The expanded string.
    """
    now = datetime.datetime.now()
    dated_template = now.strftime(fmt)
    return dated_template.format(fname=fn)

# For every dataset: read the train/test partitions, fit a Binary Relevance
# model backed by random forests, and write the metrics to a timestamped
# report file next to the data.
for s in dataset:
    base = './datasets/'+s+'/'+s

    # The context manager closes the report even if reading, training or
    # scoring raises part-way through.
    with open(timeStamp(base), 'w') as fp:
        # Announce each file before reading it, so a failing read still
        # shows which file was being processed.
        print('Reading: '+base+'.strain')
        X_train, y_train = readDataFromFile(base+'.strain')
        print('Reading: '+base+'.stest')
        X_test, y_test = readDataFromFile(base+'.stest')

        classif = BinaryRelevance(classifier=RandomForestClassifier(n_estimators=10), require_dense=[False, True])
        classif.fit(X_train, y_train)
        # Hard label predictions; every metric below is computed from these.
        y_score = classif.predict(X_test)

        #y_prob = classif.predict_proba(X_test)

        #-----------------------------------------#
        # Metrics: sklearn.metrics...(true, predicted, ...)
        acc = sklearn.metrics.accuracy_score(y_test, y_score)
        fp.write("Accuracy: %0.5f\n"%acc)
        #-----------------------------------------#
        hl = sklearn.metrics.hamming_loss(y_test, y_score)
        fp.write("Hamming loss: %0.5f\n"%hl)
        #-----------------------------------------#
        # Coverage (disabled: depends on y_prob, which is not computed)
        #c=sklearn.metrics.coverage_error(y_test, y_prob.toarray(), sample_weight=None)
        #print ("Coverage: %0.5f - 1"%c)
        #-----------------------------------------#
        # Ranking loss (disabled: depends on y_prob, which is not computed)
        #rl=sklearn.metrics.label_ranking_loss(y_test, y_prob.toarray(), sample_weight=None)
        #print("Ranking loss: %0.5f"%rl)
        #-----------------------------------------#
        # F1 score (disabled)
        #f1= sklearn.metrics.f1_score(y_test, y_score)
        #print ("Accuracy: %0.5f"%f1)
        #-----------------------------------------#
        # Mean average precision (macro-averaged over labels)
        m = sklearn.metrics.average_precision_score(y_test, y_score.toarray(), average='macro', pos_label=1, sample_weight=None)
        fp.write("Mean average precision: %0.5f\n"%m)
        #-----------------------------------------#
        # Micro-averaged ROC AUC
        rmi = sklearn.metrics.roc_auc_score(y_test, y_score.toarray(), average='micro', sample_weight=None, max_fpr=None)
        fp.write("ROC AUC micro: %0.5f\n"%rmi)

# NOTE: coverage and ranking loss are not calculated; computing them ran into problems with the probability estimates they require.