-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathequalSeqChunk.pl
More file actions
executable file
·118 lines (90 loc) · 2.24 KB
/
equalSeqChunk.pl
File metadata and controls
executable file
·118 lines (90 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/perl -w
# Print the command-line synopsis and option descriptions to STDOUT, then
# terminate the program with exit status 1.
#
# Takes no arguments and never returns to the caller.
sub printUsage {
    print "Allocate equal sized un-broken chunks into output files\n";
    print "Usage: ~ [-s] [-h y|n] [-r total_num_of_rows] <in1.csv> <num_of_files> <out_stem>\n";
    # -h and -r appear in the synopsis above, so describe them as well.
    print " -h\tHeader flag for the input file, y or n. Default is unspecified.\n";
    print " -r\tTotal number of rows in the input. When omitted, the rows are counted first.\n";
    print " -s\tSkip checking row data format. Default is no.\n\n";
    exit(1);
}
# ---------------------------------------------------------------------------
# Main script.
#
# Reads the rows of <in1.csv> through the project's Flat reader and writes
# them, in input order, into consecutive chunks of equal size. Chunk k is
# written to "<out_stem>.k.csv". The chunk size is ceil(numRows / num_of_files),
# so the last chunk may be shorter than the others.
#
# Options:
#   -s      passed to Flat's readNextRow() as the "skip" flag
#   -h y|n  header flag handed to Flat->new(); without it Flat->new1() is used
#   -r N    total number of rows; when absent the reader is asked to count them
# ---------------------------------------------------------------------------
use strict;
use warnings;

use Getopt::Std;
use Util;
use Flat;
use math;
use Fcntl ':flock';    # only needed if output-file locking is re-enabled

my %options;
getopts("sh:r:", \%options);
my $skip = exists $options{"s"};

if (scalar(@ARGV) != 3) {
    printUsage();
}
my ($inFile, $num, $outStem) = @ARGV;

# <num_of_files> is a count, not a field name: use it directly and reject
# anything that is not a positive integer (this also rules out division by 0).
if ($num !~ /\A[1-9]\d*\z/) {
    print "<num_of_files> should be a positive integer\n";
    printUsage();
}

# Open the input. The three cases are mutually exclusive, so that omitting -h
# no longer falls through to the error branch.
my $in;
if (!exists $options{"h"}) {
    $in = Flat->new1($inFile);
}
elsif ($options{"h"} eq "y") {
    $in = Flat->new($inFile, 1);
}
elsif ($options{"h"} eq "n") {
    $in = Flat->new($inFile, 0);
}
else {
    print "-h should be followed by either 'y' or 'n'\n";
    printUsage();
}

# Total number of rows: trust -r when given, otherwise let the reader count.
my $numRows;
if (exists $options{"r"}) {
    $numRows = $options{"r"};
    if ($numRows !~ /\A\d+\z/) {
        print "-r should be followed by a non-negative integer\n";
        printUsage();
    }
}
else {
    $numRows = $in->getNumOfRows();
    # presumably rewinds the reader after the counting pass -- confirm in Flat
    $in->reset();
}

# Chunk size is numRows / num rounded up.
my $chunkSize = int($numRows / $num);
if ($numRows % $num != 0) {
    $chunkSize++;
}
# Never let the chunk size drop to 0 (e.g. "-r 0"): it is used as a modulus.
if ($chunkSize < 1) {
    $chunkSize = 1;
}
print "numRows = $numRows, num = $num, chunkSize = $chunkSize\n";

my @fieldNames = $in->getFieldNames();

# Rows arrive in order and the output index only ever increases, so a single
# open handle is enough: the previous chunk is closed before the next one is
# opened. This keeps the number of open files at one however many chunks
# are requested.
my $outFileIndex = 0;
my $openIndex;    # index of the chunk file that $out currently refers to
my $outName;      # name of that file, for error messages
my $out;

while (my $row = $in->readNextRow($skip)) {
    # A new chunk starts on row indices 1, chunkSize + 1, 2 * chunkSize + 1, ...
    # Written as (index - 1) % size == 0 so that it also works when the chunk
    # size is 1; "index % size == 1" can never be true in that case.
    if (($in->getRowIndex() - 1) % $chunkSize == 0) {
        $outFileIndex++;
        print "outFileIndex = $outFileIndex\n";
    }

    if (!defined $openIndex || $openIndex != $outFileIndex) {
        if (defined $out) {
            close($out) or die "Cannot close $outName: $!";
        }
        $outName = "$outStem.$outFileIndex.csv";
        open(my $newOut, '>', $outName)
            or die "Cannot open $outName for writing: $!";
        # Output-file locking is disabled for now: flock($newOut, LOCK_EX);
        if ($in->hasHeader()) {
            # Pass a copy of the field names, as the original code did.
            my @fldNames = @fieldNames;
            print {$newOut} Flat::dataRowToString(@fldNames), "\n";
        }
        $out       = $newOut;
        $openIndex = $outFileIndex;
    }

    # NOTE(review): data rows are tab-joined while the header goes through
    # Flat::dataRowToString(); confirm both produce the same delimiter.
    print {$out} join("\t", @{$row}), "\n";
}

# Close the last chunk; buffered write errors only surface here.
if (defined $out) {
    close($out) or die "Cannot close $outName: $!";
}

print "DONE equalSeqChunk.pl at ", `date`;