-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdiscretize.pl
More file actions
executable file
·62 lines (44 loc) · 1.47 KB
/
discretize.pl
File metadata and controls
executable file
·62 lines (44 loc) · 1.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/perl -w
# Discretize one numeric field of the input data file into rank-based bins;
# rows whose value in that field is NaN are removed, other fields are written as read.
# Enforce declared variables and enable warnings for the rest of this script.
use strict;
use warnings;

# Require three or four command-line arguments; otherwise report the expected
# invocation on standard error (it is an error report, not program output)
# and exit with status 1.
if(scalar(@ARGV) != 3 && scalar(@ARGV) != 4) {
    print STDERR "Usage: ~ <input.csv> <field_index> <num_categories> [output.csv]\n";
    exit(1);
}
use Flat;
use math;
# Load the input table and resolve the command-line arguments.
#   $ARGV[0]  input file, opened through Flat->new1
#   $ARGV[1]  field to discretize, resolved to an index by getFieldIndex
#   $ARGV[2]  number of categories; must be a positive integer
#   $ARGV[3]  optional output file; when omitted the input path is reused
my($in) = Flat->new1($ARGV[0]);
my($fldIndex) = $in->getFieldIndex($ARGV[1]);
my($numOfCat) = $ARGV[2];
# The category width is later obtained by dividing by $numOfCat, so zero,
# negative or non-numeric values have to be rejected before any work is done.
if($numOfCat !~ /\A\d+\z/ || $numOfCat < 1) {
    die "Number of categories must be a positive integer, got '$numOfCat'\n";
}
my($out) = (scalar(@ARGV) == 4) ? $ARGV[3] : $ARGV[0];
# Only a field that the table reports as numeric can be discretized.
if(!$in->fieldIsNumeric($fldIndex)) {
    die "Cannot discretize a discrete field $fldIndex\n";
}
# Find the positions whose value in the chosen field is NaN and hand them to
# removeRowsByIndice (presumably deleting those rows -- confirm in Flat),
# then fetch the column again so @fldData reflects the table afterwards.
my(@fldData) = $in->getColumnData($fldIndex);
my(@nanIndice);
foreach my $row (0 .. $#fldData) {
    next unless math::util::NaN($fldData[$row]);
    push(@nanIndice, $row);
}
$in->removeRowsByIndice(@nanIndice);
@fldData = $in->getColumnData($fldIndex);
# Rank the remaining values and derive the length of one category in rank
# units. The extra 0.001 makes that length slightly larger than
# count / categories; presumably so the largest rank still lands in the last
# category -- confirm against math::util::getRanks.
my @ranks = @{ math::util::getRanks([ @fldData ]) };
my $section = (@ranks + 0.001) / $numOfCat;
# Diagnostics on standard output: element counts, category length, rank range.
print "fldData size = ", scalar @fldData, " rank size = ", scalar @ranks, "\n";
print "section = $section, rank[0] = min rank = ", math::util::getMin(@ranks), " max rank = ", math::util::getMax(@ranks), "\n";
# Write the field-name row followed by every data row, with the chosen field
# replaced by its category index int(rank / section).
my(@data) = $in->getDataArray();
# Three-argument open with low-precedence "or": with "||" the die would attach
# to the (always true) file-name string and a failed open would go unnoticed.
open(my $outFh, '>', $out) or die "Cannot open $out for writing: $!\n";
print {$outFh} Flat::dataRowToString($in->getFieldNames()), "\n";
# Row $row of @data is paired with $ranks[$row]; both are taken after the NaN
# rows were removed.
for my $row (0 .. $#data) {
    $data[$row][$fldIndex] = int($ranks[$row] / $section);
    print {$outFh} Flat::dataRowToString(@{$data[$row]}), "\n";
}
# Buffered write errors (for example a full disk) only surface at close.
close($outFh) or die "Cannot close $out: $!\n";