diff --git a/Makefile b/Makefile index 24ce4ac..0326696 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PROGRAM_NAME = sickle VERSION = 1.33 CC = gcc -CFLAGS = -Wall -pedantic -DVERSION=$(VERSION) +CFLAGS = -Wall -pedantic -std=c99 -DVERSION=$(VERSION) DEBUG = -g OPT = -O3 ARCHIVE = $(PROGRAM_NAME)_$(VERSION) diff --git a/src/sickle.h b/src/sickle.h index e7b679c..a6b9e1f 100644 --- a/src/sickle.h +++ b/src/sickle.h @@ -61,6 +61,8 @@ exit(EXIT_SUCCESS); \ break; /* end code drawn from system.h */ +#define STDERR_OR_OUT(status) (status == EXIT_SUCCESS ? stdout : stderr) + typedef enum { PHRED, SANGER, @@ -96,6 +98,6 @@ typedef struct __cutsites_ { /* Function Prototypes */ int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug); +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int debug); #endif /*SICKLE_H*/ diff --git a/src/sliding.c b/src/sliding.c index 7573106..d087dd6 100644 --- a/src/sliding.c +++ b/src/sliding.c @@ -32,9 +32,10 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) { } -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) { +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int debug) { - int window_size = (int) (0.1 * fqrec->seq.l); + if (!window_size) + window_size = (int) (0.1 * fqrec->seq.l); int i,j; int window_start=0; int window_total=0; @@ -53,9 +54,9 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int return (retvals); } - /* if the seq length is less then 10bp, */ + /* if the seq length is less then 10bp or the fixed window size is larger than the sequence length, */ /* then make the window size the length of the seq */ - if (window_size == 0) window_size = fqrec->seq.l; + if (window_size == 0 || window_size > fqrec->seq.l) window_size = fqrec->seq.l; for (i=0; iqual.s[i], qualtype, fqrec, i); @@ -65,7 +66,7 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int window_avg = (double)window_total / (double)window_size; - if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg); + if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f, window_size: %d\n", no_fiveprime, found_five_prime, window_avg, window_size); /* Finding the 5' cutoff */ /* Find when the average quality in the window goes above the threshold starting from the 5' end */ diff --git a/src/trim_paired.c b/src/trim_paired.c index dcf2eb1..ede1072 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -16,62 +16,105 @@ __KSEQ_READ int paired_qual_threshold = 20; int paired_length_threshold = 20; +static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:w:xng"; static struct option paired_long_options[] = { - {"qual-type", required_argument, 0, 't'}, - {"pe-file1", required_argument, 0, 'f'}, - {"pe-file2", required_argument, 0, 'r'}, - {"pe-combo", required_argument, 0, 'c'}, - {"output-pe1", required_argument, 0, 'o'}, - {"output-pe2", required_argument, 0, 'p'}, - {"output-single", required_argument, 0, 's'}, - {"output-combo", required_argument, 0, 'm'}, - {"qual-threshold", required_argument, 0, 'q'}, - {"length-threshold", required_argument, 0, 'l'}, - {"no-fiveprime", no_argument, 0, 'x'}, - {"truncate-n", no_argument, 0, 'n'}, - {"gzip-output", no_argument, 0, 'g'}, - {"output-combo-all", required_argument, 0, 'M'}, - {"quiet", no_argument, 0, 'z'}, + { "qual-type", required_argument, 0, 't' }, + { "pe-file1", required_argument, 0, 'f' }, + { "pe-file2", required_argument, 0, 'r' }, + { "pe-combo", required_argument, 0, 'c' }, + { "output-pe1", required_argument, 0, 'o' }, + { "output-pe2", required_argument, 0, 'p' }, + { "output-single", required_argument, 0, 's' }, + { "output-combo", required_argument, 0, 'm' }, + { "qual-threshold", required_argument, 0, 'q' }, + { "length-threshold", required_argument, 0, 'l' }, + { "window", required_argument, 0, 'w' }, + { "no-fiveprime", no_argument, 0, 'x' }, + { "truncate-n", no_argument, 0, 'n' }, + { "gzip-output", no_argument, 0, 'g' }, + { "output-combo-all", required_argument, 0, 'M' }, + { "quiet", no_argument, 0, 'z' }, + { "debug", no_argument, 0, 'd' }, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} }; void paired_usage (int status, char *msg) { + static const char *usage_format = + "\n" + "If you have separate files for forward and reverse reads:\n" + " Usage: %1$s pe [options] -f \n" + " %2$*3$s -r \n" + " %2$*3$s -t \n" + " %2$*3$s -o \n" + " %2$*3$s -p \n" + " %2$*3$s -s \n" + "\n" + "If you have one file with interleaved forward and reverse reads:\n" + " Usage: %1$s pe [options] -c \n" + " %2$*3$s -t \n" + " %2$*3$s -m \n" + " %2$*3$s -s \n" + "\n" + "If you have one file with interleaved reads as input and you want\n" + "ONLY one interleaved file as output:\n" + " Usage: %1$s pe [options] -c \n" + " %2$*3$s -t \n" + " %2$*3$s -M \n" + "\n" + "Options:\n" + "\n" + "Paired-end separated reads\n" + "--------------------------\n" + "-f, --pe-file1 Input paired-end forward fastq file\n" + "-r, --pe-file2 Input paired-end reverse fastq file\n" + " (input files must have same number of records)\n" + "-o, --output-pe1 Output trimmed forward fastq file\n" + "-p, --output-pe2 Output trimmed reverse fastq file. Must use -s option.\n" + "\n" + "Paired-end interleaved reads\n" + "----------------------------\n" + "-c, --pe-combo Combined (interleaved) input paired-end fastq\n" + "-m, --output-combo Output combined (interleaved) paired-end fastq file.\n" + " Must use -s option.\n" + "-M, --output-combo-all Output combined (interleaved) paired-end fastq file\n" + " with any discarded read written to output file as a\n" + " single N. Cannot be used with the -s option.\n" + "\n" + "Global options\n" + "--------------\n" + "-t TYPE, --qual-type TYPE Type of quality values, one of:\n" + " solexa (CASAVA < 1.3)\n" + " illumina (CASAVA 1.3 to 1.7)\n" + " sanger (which is CASAVA >= 1.8)\n" + " (required)\n" + "-s FILE, --output-single FILE Output trimmed singles fastq file\n" + "-q #, --qual-threshold # Threshold for trimming based on average quality\n" + " in a window. Default %3$d.\n" + "-l #, --length-threshold # Threshold to keep a read based on length after\n" + " trimming. Default %4$d.\n" + "-w #, --window # Fixed window size to use. Default is a dynamic\n" + " window size of 0.1 of the read length.\n" + "-x, --no-fiveprime Don't do five prime trimming.\n" + "-n, --truncate-n Truncate sequences at position of first N.\n" + "-g, --gzip-output Output gzipped files.\n" + "--quiet Do not output trimming info\n" + "--help Display this help and exit\n" + "--version Output version information and exit\n" + ; + + if (msg) fprintf( STDERR_OR_OUT(status), "%s\n", msg ); + + fprintf( + STDERR_OR_OUT(status), + usage_format, + PROGRAM_NAME, + "", strlen(PROGRAM_NAME) + 3, // +3 makes it possible to keep text visually aligned in the format string itself + paired_qual_threshold, + paired_length_threshold + ); - fprintf(stderr, "\nIf you have separate files for forward and reverse reads:\n"); - fprintf(stderr, "Usage: %s pe [options] -f -r -t -o -p -s \n\n", PROGRAM_NAME); - fprintf(stderr, "If you have one file with interleaved forward and reverse reads:\n"); - fprintf(stderr, "Usage: %s pe [options] -c -t -m -s \n\n\ -If you have one file with interleaved reads as input and you want ONLY one interleaved file as output:\n\ -Usage: %s pe [options] -c -t -M \n\n", PROGRAM_NAME, PROGRAM_NAME); - fprintf(stderr, "Options:\n\ -Paired-end separated reads\n\ ---------------------------\n\ --f, --pe-file1, Input paired-end forward fastq file (Input files must have same number of records)\n\ --r, --pe-file2, Input paired-end reverse fastq file\n\ --o, --output-pe1, Output trimmed forward fastq file\n\ --p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\n\ -Paired-end interleaved reads\n\ -----------------------------\n"); - fprintf(stderr,"-c, --pe-combo, Combined (interleaved) input paired-end fastq\n\ --m, --output-combo, Output combined (interleaved) paired-end fastq file. Must use -s option.\n\ --M, --output-combo-all, Output combined (interleaved) paired-end fastq file with any discarded read written to output file as a single N. Cannot be used with the -s option.\n\n\ -Global options\n\ ---------------\n\ --t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n"); - fprintf(stderr, "-s, --output-single, Output trimmed singles fastq file\n\ --q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ --l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ --x, --no-fiveprime, Don't do five prime trimming.\n\ --n, --truncate-n, Truncate sequences at position of first N.\n"); - - - fprintf(stderr, "-g, --gzip-output, Output gzipped files.\n--quiet, do not output trimming info\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n"); - - if (msg) fprintf(stderr, "%s\n\n", msg); exit(status); } @@ -118,10 +161,11 @@ int paired_main(int argc, char *argv[]) { int combo_all=0; int combo_s=0; int total=0; + int window_size = 0; while (1) { int option_index = 0; - optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:xng", paired_long_options, &option_index); + optc = getopt_long(argc, argv, paired_short_options, paired_long_options, &option_index); if (optc == -1) break; @@ -198,6 +242,14 @@ int paired_main(int argc, char *argv[]) { } break; + case 'w': + window_size = atoi(optarg); + if (window_size < 1) { + fprintf(stderr, "Fixed window size must be >= 1\n"); + return EXIT_FAILURE; + } + break; + case 'x': no_fiveprime = 1; break; @@ -374,8 +426,8 @@ int paired_main(int argc, char *argv[]) { break; } - p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); - p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, debug); + p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, debug); total += 2; if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); diff --git a/src/trim_single.c b/src/trim_single.c index 2f59921..f0d6001 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -15,40 +15,66 @@ __KSEQ_READ int single_qual_threshold = 20; int single_length_threshold = 20; +static const char *single_short_options = "df:t:o:q:l:w:zxng"; static struct option single_long_options[] = { - {"fastq-file", required_argument, 0, 'f'}, - {"output-file", required_argument, 0, 'o'}, - {"qual-type", required_argument, 0, 't'}, - {"qual-threshold", required_argument, 0, 'q'}, - {"length-threshold", required_argument, 0, 'l'}, - {"no-fiveprime", no_argument, 0, 'x'}, - {"discard-n", no_argument, 0, 'n'}, - {"gzip-output", no_argument, 0, 'g'}, - {"quiet", no_argument, 0, 'z'}, + { "fastq-file", required_argument, 0, 'f' }, + { "output-file", required_argument, 0, 'o' }, + { "qual-type", required_argument, 0, 't' }, + { "qual-threshold", required_argument, 0, 'q' }, + { "length-threshold", required_argument, 0, 'l' }, + { "no-fiveprime", no_argument, 0, 'x' }, + { "window", required_argument, 0, 'w' }, + { "trunc-n", no_argument, 0, 'n' }, + { "truncate-n", no_argument, 0, 'n' }, + { "gzip-output", no_argument, 0, 'g' }, + { "quiet", no_argument, 0, 'z' }, + { "debug", no_argument, 0, 'd' }, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} }; void single_usage(int status, char *msg) { + static const char *usage_format = + "\n" + "Usage: %1$s se [options] -f \n" + " %2$*3$s -t \n" + " %2$*3$s -o \n" + "\n" + "Options:\n" + "\n" + "-f FILE, --fastq-file FILE Input fastq file (required)\n" + "-o FILE, --output-file FILE Output trimmed fastq file (required)\n" + "-t TYPE, --qual-type TYPE Type of quality values, one of:\n" + " solexa (CASAVA < 1.3)\n" + " illumina (CASAVA 1.3 to 1.7)\n" + " sanger (which is CASAVA >= 1.8)\n" + " (required)\n" + "-q #, --qual-threshold # Threshold for trimming based on average quality\n" + " in a window. Default %3$d.\n" + "-l #, --length-threshold # Threshold to keep a read based on length after\n" + " trimming. Default %4$d.\n" + "-w #, --window # Fixed window size to use. Default is a dynamic\n" + " window size of 0.1 of the read length.\n" + "-x, --no-fiveprime Don't do five prime trimming.\n" + "-n, --truncate-n Truncate sequences at position of first N.\n" + "-g, --gzip-output Output gzipped files.\n" + "--quiet Do not output trimming info\n" + "--help Display this help and exit\n" + "--version Output version information and exit\n" + ; + + if (msg) fprintf( STDERR_OR_OUT(status), "%s\n", msg ); + + fprintf( + STDERR_OR_OUT(status), + usage_format, + PROGRAM_NAME, + "", strlen(PROGRAM_NAME) + 3, // +3 makes it possible to keep text visually aligned in the format string itself + single_qual_threshold, + single_length_threshold + ); - fprintf(stderr, "\nUsage: %s se [options] -f -t -o \n\ -\n\ -Options:\n\ --f, --fastq-file, Input fastq file (required)\n\ --t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n\ --o, --output-file, Output trimmed fastq file (required)\n", PROGRAM_NAME); - - fprintf(stderr, "-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ --l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ --x, --no-fiveprime, Don't do five prime trimming.\n\ --n, --trunc-n, Truncate sequences at position of first N.\n\ --g, --gzip-output, Output gzipped files.\n\ ---quiet, Don't print out any trimming information\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n"); - - if (msg) fprintf(stderr, "%s\n\n", msg); exit(status); } @@ -73,10 +99,11 @@ int single_main(int argc, char *argv[]) { int trunc_n = 0; int gzip_output = 0; int total=0; + int window_size = 0; while (1) { int option_index = 0; - optc = getopt_long(argc, argv, "df:t:o:q:l:zxng", single_long_options, &option_index); + optc = getopt_long(argc, argv, single_short_options, single_long_options, &option_index); if (optc == -1) break; @@ -124,6 +151,14 @@ int single_main(int argc, char *argv[]) { } break; + case 'w': + window_size = atoi(optarg); + if (window_size < 1) { + fprintf(stderr, "Fixed window size must be >= 1\n"); + return EXIT_FAILURE; + } + break; + case 'x': no_fiveprime = 1; break; @@ -192,7 +227,7 @@ int single_main(int argc, char *argv[]) { while ((l = kseq_read(fqrec)) >= 0) { - p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, window_size, no_fiveprime, trunc_n, debug); total++; if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);