Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added foo
Binary file not shown.
2 changes: 1 addition & 1 deletion src/sickle.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,6 @@ typedef struct __cutsites_ {
/* Function Prototypes */
int single_main (int argc, char *argv[]);
int paired_main (int argc, char *argv[]);
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug);
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int trunc_size, int debug);

#endif /*SICKLE_H*/
25 changes: 23 additions & 2 deletions src/sliding.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,11 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) {
}


cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) {

cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int trunc_size, int debug) {
if (trunc_size >= 0 && trunc_size < length_threshold) {
trunc_size = length_threshold;
}

int window_size = (int) (0.1 * fqrec->seq.l);
int i,j;
int window_start=0;
Expand Down Expand Up @@ -130,6 +133,24 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int

if (debug) printf ("\n\n");

/* if trunc_size is less than read length, remove equal number of bases from each side */
/* TODO: this should acutally remove bases based on quality */


if (trunc_size >= 0) {
int side = 0;
while (trunc_size < three_prime_cut - five_prime_cut) {
if (side == 0) {
five_prime_cut++;
side = 1;
}
else {
three_prime_cut--;
side = 0;
}
}
}

retvals = (cutsites*) malloc (sizeof(cutsites));
retvals->three_prime_cut = three_prime_cut;
retvals->five_prime_cut = five_prime_cut;
Expand Down
178 changes: 139 additions & 39 deletions src/trim_paired.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ __KS_GETUNTIL(gzread, BUFFER_SIZE)
__KSEQ_READ

int paired_qual_threshold = 20;
int orphan_qual_threshold = -1;
int paired_length_threshold = 20;
int orphan_length_threshold = -1;
int recheck_orphans = 1;

static struct option paired_long_options[] = {
{"qual-type", required_argument, 0, 't'},
Expand All @@ -23,12 +26,16 @@ static struct option paired_long_options[] = {
{"pe-combo", optional_argument, 0, 'c'},
{"output-pe1", optional_argument, 0, 'o'},
{"output-pe2", optional_argument, 0, 'p'},
{"output-all", optional_argument, 0, 'a'},
{"output-single", optional_argument, 0, 's'},
{"output-combo", optional_argument, 0, 'm'},
{"qual-threshold", optional_argument, 0, 'q'},
{"orphan-qual-threshold", optional_argument, 0, 'Q'},
{"length-threshold", optional_argument, 0, 'l'},
{"orphan-length-threshold", optional_argument, 0, 'L'},
{"no-fiveprime", optional_argument, 0, 'x'},
{"truncate-n", optional_argument, 0, 'n'},
{"truncate-size", optional_argument, 0, 'N'},
{"gzip-output", optional_argument, 0, 'g'},
{"output-combo-all", optional_argument, 0, 'M'},
{"quiet", optional_argument, 0, 'z'},
Expand All @@ -51,7 +58,8 @@ Paired-end separated reads\n\
-f, --pe-file1, Input paired-end forward fastq file (Input files must have same number of records)\n\
-r, --pe-file2, Input paired-end reverse fastq file\n\
-o, --output-pe1, Output trimmed forward fastq file\n\
-p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\n\
-p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\
-a, --output-all, Output all pairs with at least one surviving reads, with a discarded read written to output file as a single N.\n\n\
Paired-end interleaved reads\n\
----------------------------\n");
fprintf(stderr,"-c, --pe-combo, Combined (interleaved) input paired-end fastq\n\
Expand All @@ -62,9 +70,12 @@ Global options\n\
-t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n");
fprintf(stderr, "-s, --output-single, Output trimmed singles fastq file\n\
-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\
-Q, --orphan-qual-threshold, Quality threshold to use for retaining a single mate if one (but not both) mate fails the paired-end quality threshold. Defaults to the value specified for --qual-threshold.\n\
-l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\
-L, --orphan-length-threshold, Length threshold to use for retaining a single mate if one (but not both) mate fails the paired-end length threshold. Defaults to the value specified for --length-threshold.\n\
-x, --no-fiveprime, Don't do five prime trimming.\n\
-n, --truncate-n, Truncate sequences at position of first N.\n");
-n, --truncate-n, Truncate sequences at position of first N.\n\
-N, --truncate-size, Truncate sequences at a maximum length.\n");


fprintf(stderr, "-g, --gzip-output, Output gzipped files.\n--quiet, do not output trimming info\n\
Expand Down Expand Up @@ -114,14 +125,16 @@ int paired_main(int argc, char *argv[]) {
int quiet = 0;
int no_fiveprime = 0;
int trunc_n = 0;
int trunc_size = -1;
int gzip_output = 0;
int split_all = 0;
int combo_all=0;
int combo_s=0;
int total=0;

while (1) {
int option_index = 0;
optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:xng", paired_long_options, &option_index);
optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:Q:L:xnga", paired_long_options, &option_index);

if (optc == -1)
break;
Expand Down Expand Up @@ -164,7 +177,11 @@ int paired_main(int argc, char *argv[]) {
outfn2 = (char *) malloc(strlen(optarg) + 1);
strcpy(outfn2, optarg);
break;


case 'a':
split_all = 1;
break;

case 'm':
outfnc = (char *) malloc(strlen(optarg) + 1);
strcpy(outfnc, optarg);
Expand All @@ -189,7 +206,15 @@ int paired_main(int argc, char *argv[]) {
return EXIT_FAILURE;
}
break;


case 'Q':
orphan_qual_threshold = atoi(optarg);
if (orphan_qual_threshold < 0) {
fprintf(stderr, "Orphan quality threshold must be >= 0\n");
return EXIT_FAILURE;
}
break;

case 'l':
paired_length_threshold = atoi(optarg);
if (paired_length_threshold < 0) {
Expand All @@ -198,14 +223,24 @@ int paired_main(int argc, char *argv[]) {
}
break;

case 'L':
orphan_length_threshold = atoi(optarg);
if (orphan_length_threshold < 0) {
fprintf(stderr, "Orphan length threshold must be >= 0\n");
return EXIT_FAILURE;
}
break;

case 'x':
no_fiveprime = 1;
break;

case 'n':
trunc_n = 1;
break;

case 'N':
trunc_size = atoi(optarg);
break;
case 'g':
gzip_output = 1;
break;
Expand Down Expand Up @@ -241,6 +276,24 @@ int paired_main(int argc, char *argv[]) {
paired_usage(EXIT_FAILURE, "****Error: Must have either -f OR -c argument.");
}

/* set orphan length and quality thresholds to paired thresholds if unset */
if ((orphan_qual_threshold == -1 || orphan_qual_threshold == paired_qual_threshold) &&
(orphan_length_threshold == -1 || orphan_length_threshold == paired_length_threshold)) {
recheck_orphans = 0;
}
else {
if (orphan_qual_threshold == -1) {
orphan_qual_threshold = paired_qual_threshold;
}
if (orphan_length_threshold == -1) {
orphan_length_threshold = paired_length_threshold;
}
}

if (split_all) {
sfn = "";
}

if (infnc) { /* using combined input file */

if (infn1 || infn2 || outfn1 || outfn2) {
Expand Down Expand Up @@ -284,7 +337,7 @@ int paired_main(int argc, char *argv[]) {

} else { /* using forward and reverse input files */

if (infn1 && (!infn2 || !outfn1 || !outfn2 || !sfn)) {
if (infn1 && (!infn2 || !outfn1 || !outfn2 || (!sfn && !split_all))) {
paired_usage(EXIT_FAILURE, "****Error: Using the -f option means you must have the -r, -o, -p, and -s options.");
}

Expand Down Expand Up @@ -341,7 +394,7 @@ int paired_main(int argc, char *argv[]) {
}

/* get singles output file handle */
if (sfn && !combo_all) {
if (sfn && !combo_all && !split_all) {
if (!gzip_output) {
single = fopen(sfn, "w");
if (!single) {
Expand Down Expand Up @@ -374,8 +427,8 @@ int paired_main(int argc, char *argv[]) {
break;
}

p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug);
p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug);
p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug);
p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug);
total += 2;

if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);
Expand Down Expand Up @@ -411,45 +464,92 @@ int paired_main(int argc, char *argv[]) {
/* if only one sequence passed filter, then put its record in singles and discard the other */
/* or put an "N" record in if that option was chosen. */
else if (p1cut->three_prime_cut >= 0 && p2cut->three_prime_cut < 0) {
if (!gzip_output) {
if (combo_all) {
print_record (combo, fqrec1, p1cut);
print_record_N (combo, fqrec2, qualtype);
} else {
print_record (single, fqrec1, p1cut);
int write_orphan = 1;

if (recheck_orphans) {
p1cut = sliding_window(fqrec1, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug);
if (p1cut->three_prime_cut < 0) {
if (debug) printf("p1 orhpan discarded");
write_orphan = 0;
}
} else {
if (combo_all) {
print_record_gzip (combo_gzip, fqrec1, p1cut);
print_record_N_gzip (combo_gzip, fqrec2, qualtype);
else if (debug) printf("p1 orhpan retained");
}

if (write_orphan) {
if (!gzip_output) {
if (combo_all) {
print_record (combo, fqrec1, p1cut);
print_record_N (combo, fqrec2, qualtype);
} else if (split_all) {
print_record(outfile1, fqrec1, p1cut);
print_record_N(outfile2, fqrec2, qualtype);
}
else {
print_record (single, fqrec1, p1cut);
}
} else {
print_record_gzip (single_gzip, fqrec1, p1cut);
if (combo_all) {
print_record_gzip (combo_gzip, fqrec1, p1cut);
print_record_N_gzip (combo_gzip, fqrec2, qualtype);
} else if (split_all) {
print_record_gzip(outfile1, fqrec1, p1cut);
print_record_N_gzip(outfile2, fqrec2, qualtype);
}
else {
print_record_gzip (single_gzip, fqrec1, p1cut);
}
}
}

kept_s1++;
discard_s2++;
kept_s1++;
discard_s2++;
}
else {
discard_s2 += 2;
}
}

else if (p1cut->three_prime_cut < 0 && p2cut->three_prime_cut >= 0) {
if (!gzip_output) {
if (combo_all) {
print_record_N (combo, fqrec1, qualtype);
print_record (combo, fqrec2, p2cut);
} else {
print_record (single, fqrec2, p2cut);
int write_orphan = 1;

if (recheck_orphans) {
p2cut = sliding_window(fqrec2, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug);
if (p2cut->three_prime_cut < 0) {
if (debug) printf("p2 orhpan discarded");
write_orphan = 0;
}
} else {
if (combo_all) {
print_record_N_gzip (combo_gzip, fqrec1, qualtype);
print_record_gzip (combo_gzip, fqrec2, p2cut);
else if (debug) printf("p2 orhpan retained");
}

if (write_orphan) {
if (!gzip_output) {
if (combo_all) {
print_record_N (combo, fqrec1, qualtype);
print_record (combo, fqrec2, p2cut);
} else if (split_all) {
print_record_N(outfile1, fqrec1, qualtype);
print_record(outfile2, fqrec2, p2cut);
}
else {
print_record (single, fqrec2, p2cut);
}
} else {
print_record_gzip (single_gzip, fqrec2, p2cut);
if (combo_all) {
print_record_N_gzip (combo_gzip, fqrec1, qualtype);
print_record_gzip (combo_gzip, fqrec2, p2cut);
} else if (split_all) {
print_record_N_gzip(outfile1, fqrec1, qualtype);
print_record_gzip(outfile2, fqrec2, p2cut);
}
else {
print_record_gzip (single_gzip, fqrec2, p2cut);
}
}
kept_s2++;
discard_s1++;
}
else {
discard_s2 += 2;
}

kept_s2++;
discard_s1++;

} else {

Expand Down
9 changes: 7 additions & 2 deletions src/trim_single.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ static struct option single_long_options[] = {
{"length-threshold", optional_argument, 0, 'l'},
{"no-fiveprime", optional_argument, 0, 'x'},
{"discard-n", optional_argument, 0, 'n'},
{"trunc-size", optional_argument, 0, 'N'},
{"gzip-output", optional_argument, 0, 'g'},
{"quiet", optional_argument, 0, 'z'},
{GETOPT_HELP_OPTION_DECL},
Expand All @@ -43,6 +44,7 @@ Options:\n\
-l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\
-x, --no-fiveprime, Don't do five prime trimming.\n\
-n, --trunc-n, Truncate sequences at position of first N.\n\
-N, --truncate-size, Truncate sequences at a maximum length.\n\
-g, --gzip-output, Output gzipped files.\n\
--quiet, Don't print out any trimming information\n\
--help, display this help and exit\n\
Expand Down Expand Up @@ -71,6 +73,7 @@ int single_main(int argc, char *argv[]) {
int quiet = 0;
int no_fiveprime = 0;
int trunc_n = 0;
int trunc_size = -1;
int gzip_output = 0;
int total=0;

Expand Down Expand Up @@ -131,7 +134,9 @@ int single_main(int argc, char *argv[]) {
case 'n':
trunc_n = 1;
break;

case 'N':
trunc_size = atoi(optarg);
break;
case 'g':
gzip_output = 1;
break;
Expand Down Expand Up @@ -192,7 +197,7 @@ int single_main(int argc, char *argv[]) {

while ((l = kseq_read(fqrec)) >= 0) {

p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug);
p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug);
total++;

if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);
Expand Down