-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfindUniqueVals.pl
More file actions
executable file
·115 lines (90 loc) · 3.58 KB
/
findUniqueVals.pl
File metadata and controls
executable file
·115 lines (90 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/perl -w
use Flat;
use math;
use Util;
use Getopt::Std;
# Command-line handling: collect the switches into %options (-d and -e take
# an argument, -v/-s/-w are plain flags), then require that an input file and
# at least one key field are left in @ARGV.
my %options;
getopts("vswd:e:", \%options);

unless (@ARGV >= 2) {
    # Too few positional arguments: print the usage text and exit with status 1.
    print "Usage: ~ [-e RE] [-v|s] [-w] [-d \"fname1 ... fnamen>\"] <input.csv> <fld_no1|fld_name1> .. <fld_non|fld_namen>\n",
          "\t-e -- exclude keys with the specified regular expression pattern. e.g. 'NA'\n",
          "\t-s -- semi verbose mode\n",
          "\t-v -- verbose\n",
          "\t-w -- sort by keyword instead of count\n",
          "\t-d -- fields to display after unique counting. Default is the key fields\n";
    exit(1);
}

# Giving -d switches the semi-verbose listing on as well.
my $semi       = exists $options{"s"} || exists $options{"d"};
my $verbose    = exists $options{"v"};
my $sortByWord = exists $options{"w"};

# Exclusion pattern taken from -e; stays an empty string when -e is absent.
my $exRE = "";
$exRE = $options{"e"} if exists $options{"e"};
# Load the input table named by the first positional argument; every remaining
# argument identifies a key field (by number or by name, per the usage text).
my ($in)      = Flat->new1(shift @ARGV);
my @fldIndice = $in->getFieldIndice([@ARGV]);
my @fldNames  = $in->getFieldNames(@ARGV);
my @displayFldIndice;    # field indices used to build the displayed value

# Header line: the key field names, tab separated.
print join("\t", @fldNames), "\n";

{
    # The display fields are the -d list when given, otherwise the key fields.
    my $useDisplayList = exists $options{"d"};
    my @requested = $useDisplayList ? split(/\s+/, $options{"d"}) : @ARGV;

    @displayFldIndice = $useDisplayList
        ? $in->getFieldIndice([@requested])
        : @fldIndice;

    # Fewer indices than requested names means some field could not be resolved.
    if (@displayFldIndice < @requested) {
        Util::dieIt("Not all specified fields exist in ", $in->getFileName(), ": @requested\n");
    }
}

# An index of -1 is treated as "field not found".
# NOTE(review): presumably getFieldIndice() returns -1 for unknown fields -- confirm in Flat.
for my $pos (0 .. $#displayFldIndice) {
    next unless $displayFldIndice[$pos] == -1;
    Util::dieIt("Display field $pos does not exist in ", $in->getFileName());
}
# Group the rows by the combined value of the key fields and, for every group,
# derive the display value that is counted in %displayVal2count.
#
# NOTE(review): getIndiceOfFieldValues() is used below as a hash from key value
# to an array ref of row indices into getDataArray() -- confirm against Flat.
my(%uniqueValIndice) = $in->getIndiceOfFieldValues(@fldIndice);
my(%displayVal2count); # display value => number of key values that produced it
my(@data) = $in->getDataArray();

# The key list is taken up front, so deleting entries inside the loop is safe.
foreach my $fldVals (sort keys %uniqueValIndice) {
    # Drop keys matching the -e pattern. The pattern is tested for being
    # non-empty rather than for truth, so that a pattern such as "0" is
    # honoured instead of being silently ignored.
    if (length($exRE) && $fldVals =~ /$exRE/) {
        print "Skipping field value $fldVals\n";
        delete $uniqueValIndice{$fldVals};
        next;
    }

    my @indice = @{ $uniqueValIndice{$fldVals} };

    # For each display field (by position) collect the distinct values seen
    # across all rows of this group.
    my %uniqueFldVals;
    for my $i (0 .. $#displayFldIndice) {
        my $col = $displayFldIndice[$i];
        $uniqueFldVals{$i}{ $data[$_][$col] } = 1 for @indice;
    }

    # Display value: distinct values of one field joined with "/", fields
    # joined with "," in display-field order.
    my $dval = join(",",
        map  { join("/", sort keys %{ $uniqueFldVals{$_} }) }
        sort { $a <=> $b } keys %uniqueFldVals);
    $displayVal2count{$dval}++;

    # Full verbose mode only (-v without -s/-d): list every row of the group.
    if ($verbose && !$semi) {
        for my $rowIdx (@indice) {
            my @row = @{ $data[$rowIdx] };
            print "Duplicated: ", Flat::dataRowToString(@row[@fldIndice]), "\n";
        }
    }
}
# Report: optional per-value listing followed by the overall totals.
my $accuPct = 0;    # running sum of the fractions printed so far

# Total number of rows over all key values that were not excluded.
my $total = math::util::getSum(map { scalar(@{$uniqueValIndice{$_}}) } keys %uniqueValIndice);

if ($semi || $verbose) {
    # Count per listed value: with -d the number of key values behind each
    # display value, otherwise the number of rows behind each key value.
    # NOTE(review): with -d the counts are key values while $total is rows, so
    # the printed fractions need not add up to 1 -- confirm this is intended.
    my %countOf = exists $options{"d"}
        ? %displayVal2count
        : map { ($_, scalar @{ $uniqueValIndice{$_} }) } keys %uniqueValIndice;

    # Order by keyword (-w) or by descending count. The counts are plain
    # numbers and are compared as such; the previous -d comparator
    # dereferenced them as arrays, which always compared equal and left the
    # list unsorted. Ties fall back to the keyword so the output is stable
    # between runs.
    my @order;
    if ($sortByWord) {
        @order = sort { $a cmp $b } keys %countOf;
    }
    else {
        @order = sort { $countOf{$b} <=> $countOf{$a} or $a cmp $b } keys %countOf;
    }

    # One line per value: value, count, fraction of total, cumulative fraction.
    foreach my $v (@order) {
        my $pct = $countOf{$v} / $total;
        $accuPct += $pct;
        print "$v\t$countOf{$v}\t$pct\t$accuPct\n";
    }
}

print "\nTotal unique values: ", scalar(keys %uniqueValIndice), "\n";
print "\nTotal cases: ", $total, "\n\n";