From 225dfbb729f76a819f5679862a3480aee2cbc4d2 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Fri, 15 Aug 2014 11:35:31 -0700 Subject: [PATCH 1/7] Fix option argument requirements to avoid segfaults Many options which required an argument were declared as taking an optional argument. The option handling loop, however, did not expect optional arguments. For example, passing "--qual-threshold 30" rather than "--qual-threshold=30" or "-q 30" led to a segfault when atoi() was passed NULL (optarg). There were other affected options as well. This commit correctly declares the required/optional/none status of option arguments and matches that short-form option spec. Presumably this was confusion between the entire option being optional and the option's argument being optional. The required option checking is done outside of getopt. --- src/trim_paired.c | 37 +++++++++++++++++++------------------ src/trim_single.c | 24 ++++++++++++------------ 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/trim_paired.c b/src/trim_paired.c index f63b877..00063b3 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -17,24 +17,25 @@ int paired_qual_threshold = 20; int paired_length_threshold = 20; static struct option paired_long_options[] = { - {"qual-type", required_argument, 0, 't'}, - {"pe-file1", optional_argument, 0, 'f'}, - {"pe-file2", optional_argument, 0, 'r'}, - {"pe-combo", optional_argument, 0, 'c'}, - {"output-pe1", optional_argument, 0, 'o'}, - {"output-pe2", optional_argument, 0, 'p'}, - {"output-single", optional_argument, 0, 's'}, - {"output-combo", optional_argument, 0, 'm'}, - {"qual-threshold", optional_argument, 0, 'q'}, - {"length-threshold", optional_argument, 0, 'l'}, - {"no-fiveprime", optional_argument, 0, 'x'}, - {"truncate-n", optional_argument, 0, 'n'}, - {"gzip-output", optional_argument, 0, 'g'}, - {"output-combo-all", optional_argument, 0, 'M'}, - {"quiet", optional_argument, 0, 'z'}, - {GETOPT_HELP_OPTION_DECL}, - {GETOPT_VERSION_OPTION_DECL}, - {NULL, 0, NULL, 0} + { "qual-type", required_argument, NULL, 't' }, + { "pe-file1", required_argument, NULL, 'f' }, + { "pe-file2", required_argument, NULL, 'r' }, + { "pe-combo", required_argument, NULL, 'c' }, + { "output-pe1", required_argument, NULL, 'o' }, + { "output-pe2", required_argument, NULL, 'p' }, + { "output-single", required_argument, NULL, 's' }, + { "output-combo", required_argument, NULL, 'm' }, + { "qual-threshold", required_argument, NULL, 'q' }, + { "length-threshold", required_argument, NULL, 'l' }, + { "no-fiveprime", no_argument, NULL, 'x' }, + { "truncate-n", no_argument, NULL, 'n' }, + { "gzip-output", no_argument, NULL, 'g' }, + { "output-combo-all", required_argument, NULL, 'M' }, + { "quiet", no_argument, NULL, 'z' }, + { "debug", no_argument, NULL, 'd' }, + { GETOPT_HELP_OPTION_DECL }, + { GETOPT_VERSION_OPTION_DECL }, + { NULL, 0, NULL, 0 } }; void paired_usage (int status, char *msg) { diff --git a/src/trim_single.c b/src/trim_single.c index 44fe013..cd6ca01 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -16,18 +16,18 @@ int single_qual_threshold = 20; int single_length_threshold = 20; static struct option single_long_options[] = { - {"fastq-file", required_argument, 0, 'f'}, - {"output-file", required_argument, 0, 'o'}, - {"qual-type", required_argument, 0, 't'}, - {"qual-threshold", optional_argument, 0, 'q'}, - {"length-threshold", optional_argument, 0, 'l'}, - {"no-fiveprime", optional_argument, 0, 'x'}, - {"discard-n", optional_argument, 0, 'n'}, - {"gzip-output", optional_argument, 0, 'g'}, - {"quiet", optional_argument, 0, 'z'}, - {GETOPT_HELP_OPTION_DECL}, - {GETOPT_VERSION_OPTION_DECL}, - {NULL, 0, NULL, 0} + { "fastq-file", required_argument, NULL, 'f' }, + { "output-file", required_argument, NULL, 'o' }, + { "qual-type", required_argument, NULL, 't' }, + { "qual-threshold", required_argument, NULL, 'q' }, + { "length-threshold", required_argument, NULL, 'l' }, + { "no-fiveprime", no_argument, NULL, 'x' }, + { "truncate-n", no_argument, NULL, 'n' }, + { "gzip-output", no_argument, NULL, 'g' }, + { "quiet", no_argument, NULL, 'z' }, + { GETOPT_HELP_OPTION_DECL }, + { GETOPT_VERSION_OPTION_DECL }, + { NULL, 0, NULL, 0 } }; void single_usage(int status, char *msg) { From 0dfe78ee587560fdd43999839ae8cd6f29b04074 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Fri, 15 Aug 2014 11:40:26 -0700 Subject: [PATCH 2/7] Move short option spec closer to long option spec so they don't drift This will help keep them in sync when updating options. --- src/trim_paired.c | 3 ++- src/trim_single.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/trim_paired.c b/src/trim_paired.c index 00063b3..7226a12 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -16,6 +16,7 @@ __KSEQ_READ int paired_qual_threshold = 20; int paired_length_threshold = 20; +static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:xng"; static struct option paired_long_options[] = { { "qual-type", required_argument, NULL, 't' }, { "pe-file1", required_argument, NULL, 'f' }, @@ -122,7 +123,7 @@ int paired_main(int argc, char *argv[]) { while (1) { int option_index = 0; - optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:xng", paired_long_options, &option_index); + optc = getopt_long(argc, argv, paired_short_options, paired_long_options, &option_index); if (optc == -1) break; diff --git a/src/trim_single.c b/src/trim_single.c index cd6ca01..85febd6 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -15,6 +15,7 @@ __KSEQ_READ int single_qual_threshold = 20; int single_length_threshold = 20; +static const char *single_short_options = "df:t:o:q:l:zxng"; static struct option single_long_options[] = { { "fastq-file", required_argument, NULL, 'f' }, { "output-file", required_argument, NULL, 'o' }, @@ -76,7 +77,7 @@ int single_main(int argc, char *argv[]) { while (1) { int option_index = 0; - optc = getopt_long(argc, argv, "df:t:o:q:l:zxng", single_long_options, &option_index); + optc = getopt_long(argc, argv, single_short_options, single_long_options, &option_index); if (optc == -1) break; From 705ae6c83a997fd036ee76a789c1b9c52130948c Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Fri, 15 Aug 2014 11:41:12 -0700 Subject: [PATCH 3/7] Overhaul usage formatting, both in code and when displayed The usage docs now indicate option arguments and are easier to read. If the usage was specifically requested with --help, then it is printed to stdout instead of stderr. This is useful for the common idiom of asking for help and piping to a pager like less or more (without redirecting stderr). --- src/sickle.h | 2 + src/trim_paired.c | 104 +++++++++++++++++++++++++++++++--------------- src/trim_single.c | 54 ++++++++++++++++-------- 3 files changed, 110 insertions(+), 50 deletions(-) diff --git a/src/sickle.h b/src/sickle.h index e7b679c..e78e667 100644 --- a/src/sickle.h +++ b/src/sickle.h @@ -61,6 +61,8 @@ exit(EXIT_SUCCESS); \ break; /* end code drawn from system.h */ +#define STDERR_OR_OUT(status) (status == EXIT_SUCCESS ? stdout : stderr) + typedef enum { PHRED, SANGER, diff --git a/src/trim_paired.c b/src/trim_paired.c index 7226a12..6a42251 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -40,40 +40,78 @@ static struct option paired_long_options[] = { }; void paired_usage (int status, char *msg) { + static const char *usage_format = + "\n" + "If you have separate files for forward and reverse reads:\n" + " Usage: %1$s pe [options] -f \n" + " %2$*3$s -r \n" + " %2$*3$s -t \n" + " %2$*3$s -o \n" + " %2$*3$s -p \n" + " %2$*3$s -s \n" + "\n" + "If you have one file with interleaved forward and reverse reads:\n" + " Usage: %1$s pe [options] -c \n" + " %2$*3$s -t \n" + " %2$*3$s -m \n" + " %2$*3$s -s \n" + "\n" + "If you have one file with interleaved reads as input and you want\n" + "ONLY one interleaved file as output:\n" + " Usage: %1$s pe [options] -c \n" + " %2$*3$s -t \n" + " %2$*3$s -M \n" + "\n" + "Options:\n" + "\n" + "Paired-end separated reads\n" + "--------------------------\n" + "-f, --pe-file1 Input paired-end forward fastq file\n" + "-r, --pe-file2 Input paired-end reverse fastq file\n" + " (input files must have same number of records)\n" + "-o, --output-pe1 Output trimmed forward fastq file\n" + "-p, --output-pe2 Output trimmed reverse fastq file. Must use -s option.\n" + "\n" + "Paired-end interleaved reads\n" + "----------------------------\n" + "-c, --pe-combo Combined (interleaved) input paired-end fastq\n" + "-m, --output-combo Output combined (interleaved) paired-end fastq file.\n" + " Must use -s option.\n" + "-M, --output-combo-all Output combined (interleaved) paired-end fastq file\n" + " with any discarded read written to output file as a\n" + " single N. Cannot be used with the -s option.\n" + "\n" + "Global options\n" + "--------------\n" + "-t TYPE, --qual-type TYPE Type of quality values, one of:\n" + " solexa (CASAVA < 1.3)\n" + " illumina (CASAVA 1.3 to 1.7)\n" + " sanger (which is CASAVA >= 1.8)\n" + " (required)\n" + "-s FILE, --output-single FILE Output trimmed singles fastq file\n" + "-q #, --qual-threshold # Threshold for trimming based on average quality\n" + " in a window. Default %3$d.\n" + "-l #, --length-threshold # Threshold to keep a read based on length after\n" + " trimming. Default %4$d.\n" + "-x, --no-fiveprime Don't do five prime trimming.\n" + "-n, --truncate-n Truncate sequences at position of first N.\n" + "-g, --gzip-output Output gzipped files.\n" + "--quiet Do not output trimming info\n" + "--help Display this help and exit\n" + "--version Output version information and exit\n" + ; + + if (msg) fprintf( STDERR_OR_OUT(status), "%s\n", msg ); + + fprintf( + STDERR_OR_OUT(status), + usage_format, + PROGRAM_NAME, + "", strlen(PROGRAM_NAME) + 3, // +3 makes it possible to keep text visually aligned in the format string itself + paired_qual_threshold, + paired_length_threshold + ); - fprintf(stderr, "\nIf you have separate files for forward and reverse reads:\n"); - fprintf(stderr, "Usage: %s pe [options] -f -r -t -o -p -s \n\n", PROGRAM_NAME); - fprintf(stderr, "If you have one file with interleaved forward and reverse reads:\n"); - fprintf(stderr, "Usage: %s pe [options] -c -t -m -s \n\n\ -If you have one file with interleaved reads as input and you want ONLY one interleaved file as output:\n\ -Usage: %s pe [options] -c -t -M \n\n", PROGRAM_NAME, PROGRAM_NAME); - fprintf(stderr, "Options:\n\ -Paired-end separated reads\n\ ---------------------------\n\ --f, --pe-file1, Input paired-end forward fastq file (Input files must have same number of records)\n\ --r, --pe-file2, Input paired-end reverse fastq file\n\ --o, --output-pe1, Output trimmed forward fastq file\n\ --p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\n\ -Paired-end interleaved reads\n\ -----------------------------\n"); - fprintf(stderr,"-c, --pe-combo, Combined (interleaved) input paired-end fastq\n\ --m, --output-combo, Output combined (interleaved) paired-end fastq file. Must use -s option.\n\ --M, --output-combo-all, Output combined (interleaved) paired-end fastq file with any discarded read written to output file as a single N. Cannot be used with the -s option.\n\n\ -Global options\n\ ---------------\n\ --t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n"); - fprintf(stderr, "-s, --output-single, Output trimmed singles fastq file\n\ --q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ --l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ --x, --no-fiveprime, Don't do five prime trimming.\n\ --n, --truncate-n, Truncate sequences at position of first N.\n"); - - - fprintf(stderr, "-g, --gzip-output, Output gzipped files.\n--quiet, do not output trimming info\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n"); - - if (msg) fprintf(stderr, "%s\n\n", msg); exit(status); } diff --git a/src/trim_single.c b/src/trim_single.c index 85febd6..afedd8b 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -32,24 +32,44 @@ static struct option single_long_options[] = { }; void single_usage(int status, char *msg) { + static const char *usage_format = + "\n" + "Usage: %1$s se [options] -f \n" + " %2$*3$s -t \n" + " %2$*3$s -o \n" + "\n" + "Options:\n" + "\n" + "-f FILE, --fastq-file FILE Input fastq file (required)\n" + "-o FILE, --output-file FILE Output trimmed fastq file (required)\n" + "-t TYPE, --qual-type TYPE Type of quality values, one of:\n" + " solexa (CASAVA < 1.3)\n" + " illumina (CASAVA 1.3 to 1.7)\n" + " sanger (which is CASAVA >= 1.8)\n" + " (required)\n" + "-q #, --qual-threshold # Threshold for trimming based on average quality\n" + " in a window. Default %3$d.\n" + "-l #, --length-threshold # Threshold to keep a read based on length after\n" + " trimming. Default %4$d.\n" + "-x, --no-fiveprime Don't do five prime trimming.\n" + "-n, --truncate-n Truncate sequences at position of first N.\n" + "-g, --gzip-output Output gzipped files.\n" + "--quiet Do not output trimming info\n" + "--help Display this help and exit\n" + "--version Output version information and exit\n" + ; + + if (msg) fprintf( STDERR_OR_OUT(status), "%s\n", msg ); + + fprintf( + STDERR_OR_OUT(status), + usage_format, + PROGRAM_NAME, + "", strlen(PROGRAM_NAME) + 3, // +3 makes it possible to keep text visually aligned in the format string itself + single_qual_threshold, + single_length_threshold + ); - fprintf(stderr, "\nUsage: %s se [options] -f -t -o \n\ -\n\ -Options:\n\ --f, --fastq-file, Input fastq file (required)\n\ --t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n\ --o, --output-file, Output trimmed fastq file (required)\n", PROGRAM_NAME); - - fprintf(stderr, "-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ --l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ --x, --no-fiveprime, Don't do five prime trimming.\n\ --n, --trunc-n, Truncate sequences at position of first N.\n\ --g, --gzip-output, Output gzipped files.\n\ ---quiet, Don't print out any trimming information\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n"); - - if (msg) fprintf(stderr, "%s\n\n", msg); exit(status); } From 0425b63b65a68c59e92a1abccedb55612116ce98 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 24 Sep 2014 15:43:52 -0700 Subject: [PATCH 4/7] Optionally use a fixed window size instead of 0.1 of the read length A fixed window size provides a stricter control on average base qualities for datasets with a wide range of read lengths. --- src/sickle.h | 2 +- src/sliding.c | 11 ++++++----- src/trim_paired.c | 18 +++++++++++++++--- src/trim_single.c | 16 ++++++++++++++-- 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/sickle.h b/src/sickle.h index e78e667..a6b9e1f 100644 --- a/src/sickle.h +++ b/src/sickle.h @@ -98,6 +98,6 @@ typedef struct __cutsites_ { /* Function Prototypes */ int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug); +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int debug); #endif /*SICKLE_H*/ diff --git a/src/sliding.c b/src/sliding.c index 7573106..d087dd6 100644 --- a/src/sliding.c +++ b/src/sliding.c @@ -32,9 +32,10 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) { } -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) { +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int debug) { - int window_size = (int) (0.1 * fqrec->seq.l); + if (!window_size) + window_size = (int) (0.1 * fqrec->seq.l); int i,j; int window_start=0; int window_total=0; @@ -53,9 +54,9 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int return (retvals); } - /* if the seq length is less then 10bp, */ + /* if the seq length is less then 10bp or the fixed window size is larger than the sequence length, */ /* then make the window size the length of the seq */ - if (window_size == 0) window_size = fqrec->seq.l; + if (window_size == 0 || window_size > fqrec->seq.l) window_size = fqrec->seq.l; for (i=0; iqual.s[i], qualtype, fqrec, i); @@ -65,7 +66,7 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int window_avg = (double)window_total / (double)window_size; - if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg); + if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f, window_size: %d\n", no_fiveprime, found_five_prime, window_avg, window_size); /* Finding the 5' cutoff */ /* Find when the average quality in the window goes above the threshold starting from the 5' end */ diff --git a/src/trim_paired.c b/src/trim_paired.c index 6a42251..5e7be72 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -16,7 +16,7 @@ __KSEQ_READ int paired_qual_threshold = 20; int paired_length_threshold = 20; -static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:xng"; +static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:w:xng"; static struct option paired_long_options[] = { { "qual-type", required_argument, NULL, 't' }, { "pe-file1", required_argument, NULL, 'f' }, @@ -28,6 +28,7 @@ static struct option paired_long_options[] = { { "output-combo", required_argument, NULL, 'm' }, { "qual-threshold", required_argument, NULL, 'q' }, { "length-threshold", required_argument, NULL, 'l' }, + { "window", required_argument, NULL, 'w' }, { "no-fiveprime", no_argument, NULL, 'x' }, { "truncate-n", no_argument, NULL, 'n' }, { "gzip-output", no_argument, NULL, 'g' }, @@ -93,6 +94,8 @@ void paired_usage (int status, char *msg) { " in a window. Default %3$d.\n" "-l #, --length-threshold # Threshold to keep a read based on length after\n" " trimming. Default %4$d.\n" + "-w #, --window # Fixed window size to use. Default is a dynamic\n" + " window size of 0.1 of the read length.\n" "-x, --no-fiveprime Don't do five prime trimming.\n" "-n, --truncate-n Truncate sequences at position of first N.\n" "-g, --gzip-output Output gzipped files.\n" @@ -158,6 +161,7 @@ int paired_main(int argc, char *argv[]) { int combo_all=0; int combo_s=0; int total=0; + int window_size = 0; while (1) { int option_index = 0; @@ -238,6 +242,14 @@ int paired_main(int argc, char *argv[]) { } break; + case 'w': + window_size = atoi(optarg); + if (window_size < 1) { + fprintf(stderr, "Fixed window size must be >= 1\n"); + return EXIT_FAILURE; + } + break; + case 'x': no_fiveprime = 1; break; @@ -414,8 +426,8 @@ int paired_main(int argc, char *argv[]) { break; } - p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); - p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, debug); + p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, debug); total += 2; if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); diff --git a/src/trim_single.c b/src/trim_single.c index afedd8b..009afb0 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -15,13 +15,14 @@ __KSEQ_READ int single_qual_threshold = 20; int single_length_threshold = 20; -static const char *single_short_options = "df:t:o:q:l:zxng"; +static const char *single_short_options = "df:t:o:q:l:w:zxng"; static struct option single_long_options[] = { { "fastq-file", required_argument, NULL, 'f' }, { "output-file", required_argument, NULL, 'o' }, { "qual-type", required_argument, NULL, 't' }, { "qual-threshold", required_argument, NULL, 'q' }, { "length-threshold", required_argument, NULL, 'l' }, + { "window", required_argument, NULL, 'w' }, { "no-fiveprime", no_argument, NULL, 'x' }, { "truncate-n", no_argument, NULL, 'n' }, { "gzip-output", no_argument, NULL, 'g' }, @@ -51,6 +52,8 @@ void single_usage(int status, char *msg) { " in a window. Default %3$d.\n" "-l #, --length-threshold # Threshold to keep a read based on length after\n" " trimming. Default %4$d.\n" + "-w #, --window # Fixed window size to use. Default is a dynamic\n" + " window size of 0.1 of the read length.\n" "-x, --no-fiveprime Don't do five prime trimming.\n" "-n, --truncate-n Truncate sequences at position of first N.\n" "-g, --gzip-output Output gzipped files.\n" @@ -94,6 +97,7 @@ int single_main(int argc, char *argv[]) { int trunc_n = 0; int gzip_output = 0; int total=0; + int window_size = 0; while (1) { int option_index = 0; @@ -145,6 +149,14 @@ int single_main(int argc, char *argv[]) { } break; + case 'w': + window_size = atoi(optarg); + if (window_size < 1) { + fprintf(stderr, "Fixed window size must be >= 1\n"); + return EXIT_FAILURE; + } + break; + case 'x': no_fiveprime = 1; break; @@ -213,7 +225,7 @@ int single_main(int argc, char *argv[]) { while ((l = kseq_read(fqrec)) >= 0) { - p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, window_size, no_fiveprime, trunc_n, debug); total++; if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); From d5ecfa12d17121ebf8a4a71e6bcd3c147f82ab40 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Thu, 14 Aug 2014 12:22:37 -0700 Subject: [PATCH 5/7] Add a --drop-n / -N option to drop any sequence containing an N This is a stricter version of --truncate-n which may use sequence fragments up until the first N provided they pass the length filter. --- src/sickle.h | 2 +- src/sliding.c | 14 +++++++++----- src/trim_paired.c | 20 ++++++++++++++++---- src/trim_single.c | 16 ++++++++++++++-- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/sickle.h b/src/sickle.h index a6b9e1f..128fd88 100644 --- a/src/sickle.h +++ b/src/sickle.h @@ -98,6 +98,6 @@ typedef struct __cutsites_ { /* Function Prototypes */ int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int debug); +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int drop_n, int debug); #endif /*SICKLE_H*/ diff --git a/src/sliding.c b/src/sliding.c index d087dd6..02f992f 100644 --- a/src/sliding.c +++ b/src/sliding.c @@ -32,7 +32,7 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) { } -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int debug) { +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int drop_n, int debug) { if (!window_size) window_size = (int) (0.1 * fqrec->seq.l); @@ -113,10 +113,14 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int } - /* If truncate N option is selected, and sequence has Ns, then */ - /* change 3' cut site to be the base before the first N */ - if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) { - three_prime_cut = npos - fqrec->seq.s; + /* If truncate N option is selected, and sequence has Ns, then + * change 3' cut site to be the base before the first N. + * If drop N option is selected, omit the sequence. */ + if ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n"))) { + if (trunc_n) + three_prime_cut = npos - fqrec->seq.s; + else if (drop_n) + three_prime_cut = five_prime_cut = -1; } /* if cutting length is less than threshold then return -1 for both */ diff --git a/src/trim_paired.c b/src/trim_paired.c index 5e7be72..97fd3eb 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -16,7 +16,7 @@ __KSEQ_READ int paired_qual_threshold = 20; int paired_length_threshold = 20; -static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:w:xng"; +static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:w:xnNg"; static struct option paired_long_options[] = { { "qual-type", required_argument, NULL, 't' }, { "pe-file1", required_argument, NULL, 'f' }, @@ -31,6 +31,7 @@ static struct option paired_long_options[] = { { "window", required_argument, NULL, 'w' }, { "no-fiveprime", no_argument, NULL, 'x' }, { "truncate-n", no_argument, NULL, 'n' }, + { "drop-n", no_argument, NULL, 'N' }, { "gzip-output", no_argument, NULL, 'g' }, { "output-combo-all", required_argument, NULL, 'M' }, { "quiet", no_argument, NULL, 'z' }, @@ -98,6 +99,7 @@ void paired_usage (int status, char *msg) { " window size of 0.1 of the read length.\n" "-x, --no-fiveprime Don't do five prime trimming.\n" "-n, --truncate-n Truncate sequences at position of first N.\n" + "-N, --drop-n Discard sequences containing an N.\n" "-g, --gzip-output Output gzipped files.\n" "--quiet Do not output trimming info\n" "--help Display this help and exit\n" @@ -157,6 +159,7 @@ int paired_main(int argc, char *argv[]) { int quiet = 0; int no_fiveprime = 0; int trunc_n = 0; + int drop_n = 0; int gzip_output = 0; int combo_all=0; int combo_s=0; @@ -258,6 +261,10 @@ int paired_main(int argc, char *argv[]) { trunc_n = 1; break; + case 'N': + drop_n = 1; + break; + case 'g': gzip_output = 1; break; @@ -288,6 +295,11 @@ int paired_main(int argc, char *argv[]) { paired_usage(EXIT_FAILURE, "****Error: Quality type is required."); } + if (trunc_n && drop_n) { + fprintf(stderr, "****Error: cannot specify both --truncate-n and --drop-n\n\n"); + return EXIT_FAILURE; + } + /* make sure minimum input filenames are specified */ if (!infn1 && !infnc) { paired_usage(EXIT_FAILURE, "****Error: Must have either -f OR -c argument."); @@ -426,8 +438,8 @@ int paired_main(int argc, char *argv[]) { break; } - p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, debug); - p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, drop_n, debug); + p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, drop_n, debug); total += 2; if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); @@ -532,7 +544,7 @@ int paired_main(int argc, char *argv[]) { } if (!quiet) { - if (infn1 && infn2) fprintf(stdout, "\nPE forwrd file: %s\nPE reverse file: %s\n", infn1, infn2); + if (infn1 && infn2) fprintf(stdout, "\nPE forward file: %s\nPE reverse file: %s\n", infn1, infn2); if (infnc) fprintf(stdout, "\nPE interleaved file: %s\n", infnc); fprintf(stdout, "\nTotal input FastQ records: %d (%d pairs)\n", total, (total / 2)); fprintf(stdout, "\nFastQ paired records kept: %d (%d pairs)\n", kept_p, (kept_p / 2)); diff --git a/src/trim_single.c b/src/trim_single.c index 009afb0..7f76d8a 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -15,7 +15,7 @@ __KSEQ_READ int single_qual_threshold = 20; int single_length_threshold = 20; -static const char *single_short_options = "df:t:o:q:l:w:zxng"; +static const char *single_short_options = "df:t:o:q:l:w:zxnNg"; static struct option single_long_options[] = { { "fastq-file", required_argument, NULL, 'f' }, { "output-file", required_argument, NULL, 'o' }, @@ -25,6 +25,7 @@ static struct option single_long_options[] = { { "window", required_argument, NULL, 'w' }, { "no-fiveprime", no_argument, NULL, 'x' }, { "truncate-n", no_argument, NULL, 'n' }, + { "drop-n", no_argument, NULL, 'N' }, { "gzip-output", no_argument, NULL, 'g' }, { "quiet", no_argument, NULL, 'z' }, { GETOPT_HELP_OPTION_DECL }, @@ -56,6 +57,7 @@ void single_usage(int status, char *msg) { " window size of 0.1 of the read length.\n" "-x, --no-fiveprime Don't do five prime trimming.\n" "-n, --truncate-n Truncate sequences at position of first N.\n" + "-N, --drop-n Discard sequences containing an N.\n" "-g, --gzip-output Output gzipped files.\n" "--quiet Do not output trimming info\n" "--help Display this help and exit\n" @@ -95,6 +97,7 @@ int single_main(int argc, char *argv[]) { int quiet = 0; int no_fiveprime = 0; int trunc_n = 0; + int drop_n = 0; int gzip_output = 0; int total=0; int window_size = 0; @@ -165,6 +168,10 @@ int single_main(int argc, char *argv[]) { trunc_n = 1; break; + case 'N': + drop_n = 1; + break; + case 'g': gzip_output = 1; break; @@ -220,12 +227,17 @@ int single_main(int argc, char *argv[]) { } } + if (trunc_n && drop_n) { + fprintf(stderr, "****Error: cannot specify both --truncate-n and --drop-n\n\n"); + return EXIT_FAILURE; + } + fqrec = kseq_init(se); while ((l = kseq_read(fqrec)) >= 0) { - p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, window_size, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, window_size, no_fiveprime, trunc_n, drop_n, debug); total++; if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); From ccbb38a6a038cc0909eb42f4b048af32019619f2 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Mon, 6 Oct 2014 13:59:13 -0700 Subject: [PATCH 6/7] -n / -N options: Only consider Ns between the sliding window cut sites Otherwise Ns that might otherwise be trimmed off will cause unnecessary read dropping. For example using --truncate-n, if an N occurs early in the sequence, it's possible for the 3' cut site to be set closer to the 5' end than the 5' cut site, which results in a dropped read due to negative length. --- src/sliding.c | 25 +++++++++++++++++-------- src/trim_paired.c | 7 +++++-- src/trim_single.c | 7 +++++-- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/sliding.c b/src/sliding.c index 02f992f..a93412a 100644 --- a/src/sliding.c +++ b/src/sliding.c @@ -113,14 +113,23 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int } - /* If truncate N option is selected, and sequence has Ns, then - * change 3' cut site to be the base before the first N. - * If drop N option is selected, omit the sequence. */ - if ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n"))) { - if (trunc_n) - three_prime_cut = npos - fqrec->seq.s; - else if (drop_n) - three_prime_cut = five_prime_cut = -1; + /* If truncate N option is selected, and the trimmed sequence has Ns, + * then change 3' cut site to be the base before the first N. + * If drop N option is selected, omit the sequence. + */ + if (five_prime_cut >= 0 && three_prime_cut >= 0 && three_prime_cut > five_prime_cut) { + int cut_len = three_prime_cut - five_prime_cut; + char *trimmed = malloc(cut_len + 1); + strncpy(trimmed, fqrec->seq.s + five_prime_cut, cut_len); + trimmed[cut_len] = '\0'; + + if ((npos = strstr(trimmed, "N")) || (npos = strstr(trimmed, "n"))) { + if (trunc_n) + three_prime_cut = npos - trimmed + five_prime_cut; + else if (drop_n) + three_prime_cut = five_prime_cut = -1; + } + free(trimmed); } /* if cutting length is less than threshold then return -1 for both */ diff --git a/src/trim_paired.c b/src/trim_paired.c index 97fd3eb..4cd1794 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -98,8 +98,11 @@ void paired_usage (int status, char *msg) { "-w #, --window # Fixed window size to use. Default is a dynamic\n" " window size of 0.1 of the read length.\n" "-x, --no-fiveprime Don't do five prime trimming.\n" - "-n, --truncate-n Truncate sequences at position of first N.\n" - "-N, --drop-n Discard sequences containing an N.\n" + "-n, --truncate-n Truncate sequences at position of first N between\n" + " the 5' and 3' trim sites (i.e. moves the 3' trim site\n" + " to the N closest to the 5' end).\n" + "-N, --drop-n Discard sequences containing an N between the 5'\n" + " and 3' trim sites.\n" "-g, --gzip-output Output gzipped files.\n" "--quiet Do not output trimming info\n" "--help Display this help and exit\n" diff --git a/src/trim_single.c b/src/trim_single.c index 7f76d8a..d11a776 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -56,8 +56,11 @@ void single_usage(int status, char *msg) { "-w #, --window # Fixed window size to use. Default is a dynamic\n" " window size of 0.1 of the read length.\n" "-x, --no-fiveprime Don't do five prime trimming.\n" - "-n, --truncate-n Truncate sequences at position of first N.\n" - "-N, --drop-n Discard sequences containing an N.\n" + "-n, --truncate-n Truncate sequences at position of first N between\n" + " the 5' and 3' trim sites (i.e. moves the 3' trim site\n" + " to the N closest to the 5' end).\n" + "-N, --drop-n Discard sequences containing an N between the 5'\n" + " and 3' trim sites.\n" "-g, --gzip-output Output gzipped files.\n" "--quiet Do not output trimming info\n" "--help Display this help and exit\n" From 5e7072519bb215d6a8df9b171b6604f25c28641e Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Mon, 6 Oct 2014 14:06:09 -0700 Subject: [PATCH 7/7] Basic test infrastructure and a few tests of -n/-N Perl's testing infrastructure is mature, well proven, and easy to extend. I've listed all the dependencies for completeness in test/cpanfile, used by `make install-test-deps`, but only one dep (Capture::Tiny) is not distributed with core Perl. --- .gitignore | 2 +- Makefile | 13 ++++++++- test/N-handling.fastq | 16 ++++++++++++ test/N-handling.t | 58 +++++++++++++++++++++++++++++++++++++++++ test/cpanfile | 5 ++++ test/lib/Sickle/Test.pm | 37 ++++++++++++++++++++++++++ 6 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 test/N-handling.fastq create mode 100644 test/N-handling.t create mode 100644 test/cpanfile create mode 100644 test/lib/Sickle/Test.pm diff --git a/.gitignore b/.gitignore index 3de57c9..9bd79a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ *.o -sickle +/sickle diff --git a/Makefile b/Makefile index 24ce4ac..427413b 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ LDFLAGS= LIBS = -lz SDIR = src -.PHONY: clean default build distclean dist debug +.PHONY: clean default build distclean dist debug test default: build @@ -43,3 +43,14 @@ build: sliding.o trim_single.o trim_paired.o sickle.o print_record.o debug: $(MAKE) build "CFLAGS=-Wall -pedantic -g -DDEBUG" +test: build + @prove -Itest/lib -w test/*.t + +install-test-deps: + @if ! which cpanm >/dev/null 2>&1; then \ + echo "You don't appear to have cpanm installed. Try:"; \ + echo " curl https://raw.githubusercontent.com/miyagawa/cpanminus/master/cpanm | perl - App::cpanminus --sudo"; \ + echo; \ + exit 1; \ + fi + (cd test; cpanm --installdeps .) diff --git a/test/N-handling.fastq b/test/N-handling.fastq new file mode 100644 index 0000000..6cc71ac --- /dev/null +++ b/test/N-handling.fastq @@ -0,0 +1,16 @@ +@middle +ABCDEFGHIJKLMNOPQRSTUVWXYZ ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~ +@start_no_middle +ANBCDEFGHIJKLMOPQRSTUVWXYZ ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~ +@start_trimmed_and_middle +ANBCDEFGHIJKLMNOPQRSTUVWXY ++ +!!!~~~~~~~~~~~~~~~~~~~~~!! +@start_trimmed_no_middle_end +ABNCDEFGHIJKLMOPQRSTUVWXNZ ++ +!!!~~~~~~~~~~~~~~~~~~~~~!! diff --git a/test/N-handling.t b/test/N-handling.t new file mode 100644 index 0000000..cdb88d0 --- /dev/null +++ b/test/N-handling.t @@ -0,0 +1,58 @@ +use strict; +use warnings; +use 5.014; +use Test::More; +use Sickle::Test; + +my $fastq = $0 =~ s/\.t$/.fastq/r; + +my @se = (qw(se -t sanger -l 1 -w 1 --quiet -f), $fastq, qw(-o /dev/stdout)); + +my ($output, $err); + +($output, $err) = sickle_ok(@se); +is $output, <<'', "default; no -n or -N"; +@middle +ABCDEFGHIJKLMNOPQRSTUVWXYZ ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~ +@start_no_middle +ANBCDEFGHIJKLMOPQRSTUVWXYZ ++ +~~~~~~~~~~~~~~~~~~~~~~~~~~ +@start_trimmed_and_middle +CDEFGHIJKLMNOPQRSTUVW ++ +~~~~~~~~~~~~~~~~~~~~~ +@start_trimmed_no_middle_end +CDEFGHIJKLMOPQRSTUVWX ++ +~~~~~~~~~~~~~~~~~~~~~ + +($output, $err) = sickle_ok(@se, "-n"); +is $output, <<'', "with -n"; +@middle +ABCDEFGHIJKLM ++ +~~~~~~~~~~~~~ +@start_no_middle +A ++ +~ +@start_trimmed_and_middle +CDEFGHIJKLM ++ +~~~~~~~~~~~ +@start_trimmed_no_middle_end +CDEFGHIJKLMOPQRSTUVWX ++ +~~~~~~~~~~~~~~~~~~~~~ + +($output, $err) = sickle_ok(@se, "-N"); +is $output, <<'', "with -N"; +@start_trimmed_no_middle_end +CDEFGHIJKLMOPQRSTUVWX ++ +~~~~~~~~~~~~~~~~~~~~~ + +done_testing; diff --git a/test/cpanfile b/test/cpanfile new file mode 100644 index 0000000..1d7777a --- /dev/null +++ b/test/cpanfile @@ -0,0 +1,5 @@ +requires 'perl', '5.014'; + +requires 'File::Basename'; +requires 'Capture::Tiny'; +requires 'Test::More'; diff --git a/test/lib/Sickle/Test.pm b/test/lib/Sickle/Test.pm new file mode 100644 index 0000000..51351f8 --- /dev/null +++ b/test/lib/Sickle/Test.pm @@ -0,0 +1,37 @@ +use strict; +use warnings; + +package Sickle::Test; +use base 'Exporter'; + +use File::Basename qw< dirname >; +use Capture::Tiny qw< capture >; +use Test::More; + +our @EXPORT = qw( sickle sickle_ok ); +our $SICKLE = dirname(__FILE__) . "/../../../sickle"; + +# Linux libc and glibc tunable malloc(); on detected heap corruption, this will +# cause a message to stderr and immediate abort. It's a poor-man's valgrind +# that you can enable always. See malloc(3) for more details. +$ENV{MALLOC_CHECK_} = 2; + +sub sickle { + my ($command, @opts) = @_; + my ($stdout, $stderr, $exit) = capture { + system $SICKLE, $command, @opts + }; + return ($stdout, $stderr, $exit); +} + +sub sickle_ok { + local $Test::Builder::Level = $Test::Builder::Level + 1; + my ($stdout, $stderr, $exit) = sickle(@_); + ok $exit >> 8 == 0, "exit code" + or diag "error running command: " . join(" ", "sickle", @_) . "\n" + ."exit code = " . ($exit >> 8) . " (system() return value = $exit)\n" + ."stderr = \n$stderr"; + return ($stdout, $stderr); +} + +1;