diff --git a/foo b/foo new file mode 100755 index 0000000..736b207 Binary files /dev/null and b/foo differ diff --git a/src/sickle.h b/src/sickle.h index e7b679c..d30df14 100644 --- a/src/sickle.h +++ b/src/sickle.h @@ -96,6 +96,6 @@ typedef struct __cutsites_ { /* Function Prototypes */ int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug); +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int trunc_size, int debug); #endif /*SICKLE_H*/ diff --git a/src/sliding.c b/src/sliding.c index 7573106..056b7a8 100644 --- a/src/sliding.c +++ b/src/sliding.c @@ -32,8 +32,11 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) { } -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) { - +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int trunc_size, int debug) { + if (trunc_size >= 0 && trunc_size < length_threshold) { + trunc_size = length_threshold; + } + int window_size = (int) (0.1 * fqrec->seq.l); int i,j; int window_start=0; @@ -130,6 +133,24 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int if (debug) printf ("\n\n"); + /* if trunc_size is less than read length, remove equal number of bases from each side */ + /* TODO: this should acutally remove bases based on quality */ + + + if (trunc_size >= 0) { + int side = 0; + while (trunc_size < three_prime_cut - five_prime_cut) { + if (side == 0) { + five_prime_cut++; + side = 1; + } + else { + three_prime_cut--; + side = 0; + } + } + } + retvals = (cutsites*) malloc (sizeof(cutsites)); retvals->three_prime_cut = three_prime_cut; retvals->five_prime_cut = five_prime_cut; diff --git a/src/trim_paired.c b/src/trim_paired.c index f63b877..c138830 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -14,7 +14,10 @@ __KS_GETUNTIL(gzread, BUFFER_SIZE) __KSEQ_READ int paired_qual_threshold = 20; +int orphan_qual_threshold = -1; int paired_length_threshold = 20; +int orphan_length_threshold = -1; +int recheck_orphans = 1; static struct option paired_long_options[] = { {"qual-type", required_argument, 0, 't'}, @@ -23,12 +26,16 @@ static struct option paired_long_options[] = { {"pe-combo", optional_argument, 0, 'c'}, {"output-pe1", optional_argument, 0, 'o'}, {"output-pe2", optional_argument, 0, 'p'}, + {"output-all", optional_argument, 0, 'a'}, {"output-single", optional_argument, 0, 's'}, {"output-combo", optional_argument, 0, 'm'}, {"qual-threshold", optional_argument, 0, 'q'}, + {"orphan-qual-threshold", optional_argument, 0, 'Q'}, {"length-threshold", optional_argument, 0, 'l'}, + {"orphan-length-threshold", optional_argument, 0, 'L'}, {"no-fiveprime", optional_argument, 0, 'x'}, {"truncate-n", optional_argument, 0, 'n'}, + {"truncate-size", optional_argument, 0, 'N'}, {"gzip-output", optional_argument, 0, 'g'}, {"output-combo-all", optional_argument, 0, 'M'}, {"quiet", optional_argument, 0, 'z'}, @@ -51,7 +58,8 @@ Paired-end separated reads\n\ -f, --pe-file1, Input paired-end forward fastq file (Input files must have same number of records)\n\ -r, --pe-file2, Input paired-end reverse fastq file\n\ -o, --output-pe1, Output trimmed forward fastq file\n\ --p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\n\ +-p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\ +-a, --output-all, Output all pairs with at least one surviving reads, with a discarded read written to output file as a single N.\n\n\ Paired-end interleaved reads\n\ ----------------------------\n"); fprintf(stderr,"-c, --pe-combo, Combined (interleaved) input paired-end fastq\n\ @@ -62,9 +70,12 @@ Global options\n\ -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n"); fprintf(stderr, "-s, --output-single, Output trimmed singles fastq file\n\ -q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ +-Q, --orphan-qual-threshold, Quality threshold to use for retaining a single mate if one (but not both) mate fails the paired-end quality threshold. Defaults to the value specified for --qual-threshold.\n\ -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ +-L, --orphan-length-threshold, Length threshold to use for retaining a single mate if one (but not both) mate fails the paired-end length threshold. Defaults to the value specified for --length-threshold.\n\ -x, --no-fiveprime, Don't do five prime trimming.\n\ --n, --truncate-n, Truncate sequences at position of first N.\n"); +-n, --truncate-n, Truncate sequences at position of first N.\n\ +-N, --truncate-size, Truncate sequences at a maximum length.\n"); fprintf(stderr, "-g, --gzip-output, Output gzipped files.\n--quiet, do not output trimming info\n\ @@ -114,14 +125,16 @@ int paired_main(int argc, char *argv[]) { int quiet = 0; int no_fiveprime = 0; int trunc_n = 0; + int trunc_size = -1; int gzip_output = 0; + int split_all = 0; int combo_all=0; int combo_s=0; int total=0; - + while (1) { int option_index = 0; - optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:xng", paired_long_options, &option_index); + optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:Q:L:xnga", paired_long_options, &option_index); if (optc == -1) break; @@ -164,7 +177,11 @@ int paired_main(int argc, char *argv[]) { outfn2 = (char *) malloc(strlen(optarg) + 1); strcpy(outfn2, optarg); break; - + + case 'a': + split_all = 1; + break; + case 'm': outfnc = (char *) malloc(strlen(optarg) + 1); strcpy(outfnc, optarg); @@ -189,7 +206,15 @@ int paired_main(int argc, char *argv[]) { return EXIT_FAILURE; } break; - + + case 'Q': + orphan_qual_threshold = atoi(optarg); + if (orphan_qual_threshold < 0) { + fprintf(stderr, "Orphan quality threshold must be >= 0\n"); + return EXIT_FAILURE; + } + break; + case 'l': paired_length_threshold = atoi(optarg); if (paired_length_threshold < 0) { @@ -198,6 +223,14 @@ int paired_main(int argc, char *argv[]) { } break; + case 'L': + orphan_length_threshold = atoi(optarg); + if (orphan_length_threshold < 0) { + fprintf(stderr, "Orphan length threshold must be >= 0\n"); + return EXIT_FAILURE; + } + break; + case 'x': no_fiveprime = 1; break; @@ -205,7 +238,9 @@ int paired_main(int argc, char *argv[]) { case 'n': trunc_n = 1; break; - + case 'N': + trunc_size = atoi(optarg); + break; case 'g': gzip_output = 1; break; @@ -241,6 +276,24 @@ int paired_main(int argc, char *argv[]) { paired_usage(EXIT_FAILURE, "****Error: Must have either -f OR -c argument."); } + /* set orphan length and quality thresholds to paired thresholds if unset */ + if ((orphan_qual_threshold == -1 || orphan_qual_threshold == paired_qual_threshold) && + (orphan_length_threshold == -1 || orphan_length_threshold == paired_length_threshold)) { + recheck_orphans = 0; + } + else { + if (orphan_qual_threshold == -1) { + orphan_qual_threshold = paired_qual_threshold; + } + if (orphan_length_threshold == -1) { + orphan_length_threshold = paired_length_threshold; + } + } + + if (split_all) { + sfn = ""; + } + if (infnc) { /* using combined input file */ if (infn1 || infn2 || outfn1 || outfn2) { @@ -284,7 +337,7 @@ int paired_main(int argc, char *argv[]) { } else { /* using forward and reverse input files */ - if (infn1 && (!infn2 || !outfn1 || !outfn2 || !sfn)) { + if (infn1 && (!infn2 || !outfn1 || !outfn2 || (!sfn && !split_all))) { paired_usage(EXIT_FAILURE, "****Error: Using the -f option means you must have the -r, -o, -p, and -s options."); } @@ -341,7 +394,7 @@ int paired_main(int argc, char *argv[]) { } /* get singles output file handle */ - if (sfn && !combo_all) { + if (sfn && !combo_all && !split_all) { if (!gzip_output) { single = fopen(sfn, "w"); if (!single) { @@ -374,8 +427,8 @@ int paired_main(int argc, char *argv[]) { break; } - p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); - p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); + p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); total += 2; if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); @@ -411,45 +464,92 @@ int paired_main(int argc, char *argv[]) { /* if only one sequence passed filter, then put its record in singles and discard the other */ /* or put an "N" record in if that option was chosen. */ else if (p1cut->three_prime_cut >= 0 && p2cut->three_prime_cut < 0) { - if (!gzip_output) { - if (combo_all) { - print_record (combo, fqrec1, p1cut); - print_record_N (combo, fqrec2, qualtype); - } else { - print_record (single, fqrec1, p1cut); + int write_orphan = 1; + + if (recheck_orphans) { + p1cut = sliding_window(fqrec1, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); + if (p1cut->three_prime_cut < 0) { + if (debug) printf("p1 orhpan discarded"); + write_orphan = 0; } - } else { - if (combo_all) { - print_record_gzip (combo_gzip, fqrec1, p1cut); - print_record_N_gzip (combo_gzip, fqrec2, qualtype); + else if (debug) printf("p1 orhpan retained"); + } + + if (write_orphan) { + if (!gzip_output) { + if (combo_all) { + print_record (combo, fqrec1, p1cut); + print_record_N (combo, fqrec2, qualtype); + } else if (split_all) { + print_record(outfile1, fqrec1, p1cut); + print_record_N(outfile2, fqrec2, qualtype); + } + else { + print_record (single, fqrec1, p1cut); + } } else { - print_record_gzip (single_gzip, fqrec1, p1cut); + if (combo_all) { + print_record_gzip (combo_gzip, fqrec1, p1cut); + print_record_N_gzip (combo_gzip, fqrec2, qualtype); + } else if (split_all) { + print_record_gzip(outfile1, fqrec1, p1cut); + print_record_N_gzip(outfile2, fqrec2, qualtype); + } + else { + print_record_gzip (single_gzip, fqrec1, p1cut); + } } - } - kept_s1++; - discard_s2++; + kept_s1++; + discard_s2++; + } + else { + discard_s2 += 2; + } } else if (p1cut->three_prime_cut < 0 && p2cut->three_prime_cut >= 0) { - if (!gzip_output) { - if (combo_all) { - print_record_N (combo, fqrec1, qualtype); - print_record (combo, fqrec2, p2cut); - } else { - print_record (single, fqrec2, p2cut); + int write_orphan = 1; + + if (recheck_orphans) { + p2cut = sliding_window(fqrec2, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); + if (p2cut->three_prime_cut < 0) { + if (debug) printf("p2 orhpan discarded"); + write_orphan = 0; } - } else { - if (combo_all) { - print_record_N_gzip (combo_gzip, fqrec1, qualtype); - print_record_gzip (combo_gzip, fqrec2, p2cut); + else if (debug) printf("p2 orhpan retained"); + } + + if (write_orphan) { + if (!gzip_output) { + if (combo_all) { + print_record_N (combo, fqrec1, qualtype); + print_record (combo, fqrec2, p2cut); + } else if (split_all) { + print_record_N(outfile1, fqrec1, qualtype); + print_record(outfile2, fqrec2, p2cut); + } + else { + print_record (single, fqrec2, p2cut); + } } else { - print_record_gzip (single_gzip, fqrec2, p2cut); + if (combo_all) { + print_record_N_gzip (combo_gzip, fqrec1, qualtype); + print_record_gzip (combo_gzip, fqrec2, p2cut); + } else if (split_all) { + print_record_N_gzip(outfile1, fqrec1, qualtype); + print_record_gzip(outfile2, fqrec2, p2cut); + } + else { + print_record_gzip (single_gzip, fqrec2, p2cut); + } } + kept_s2++; + discard_s1++; + } + else { + discard_s2 += 2; } - - kept_s2++; - discard_s1++; } else { diff --git a/src/trim_single.c b/src/trim_single.c index 44fe013..e182c04 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -23,6 +23,7 @@ static struct option single_long_options[] = { {"length-threshold", optional_argument, 0, 'l'}, {"no-fiveprime", optional_argument, 0, 'x'}, {"discard-n", optional_argument, 0, 'n'}, + {"trunc-size", optional_argument, 0, 'N'}, {"gzip-output", optional_argument, 0, 'g'}, {"quiet", optional_argument, 0, 'z'}, {GETOPT_HELP_OPTION_DECL}, @@ -43,6 +44,7 @@ Options:\n\ -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ -x, --no-fiveprime, Don't do five prime trimming.\n\ -n, --trunc-n, Truncate sequences at position of first N.\n\ +-N, --truncate-size, Truncate sequences at a maximum length.\n\ -g, --gzip-output, Output gzipped files.\n\ --quiet, Don't print out any trimming information\n\ --help, display this help and exit\n\ @@ -71,6 +73,7 @@ int single_main(int argc, char *argv[]) { int quiet = 0; int no_fiveprime = 0; int trunc_n = 0; + int trunc_size = -1; int gzip_output = 0; int total=0; @@ -131,7 +134,9 @@ int single_main(int argc, char *argv[]) { case 'n': trunc_n = 1; break; - + case 'N': + trunc_size = atoi(optarg); + break; case 'g': gzip_output = 1; break; @@ -192,7 +197,7 @@ int single_main(int argc, char *argv[]) { while ((l = kseq_read(fqrec)) >= 0) { - p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); total++; if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);