Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
*.o
sickle
/sickle
13 changes: 12 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ LDFLAGS=
LIBS = -lz
SDIR = src

.PHONY: clean default build distclean dist debug
.PHONY: clean default build distclean dist debug test

default: build

Expand Down Expand Up @@ -43,3 +43,14 @@ build: sliding.o trim_single.o trim_paired.o sickle.o print_record.o
debug:
$(MAKE) build "CFLAGS=-Wall -pedantic -g -DDEBUG"

test: build
@prove -Itest/lib -w test/*.t

install-test-deps:
@if ! which cpanm >/dev/null 2>&1; then \
echo "You don't appear to have cpanm installed. Try:"; \
echo " curl https://raw.githubusercontent.com/miyagawa/cpanminus/master/cpanm | perl - App::cpanminus --sudo"; \
echo; \
exit 1; \
fi
(cd test; cpanm --installdeps .)
4 changes: 3 additions & 1 deletion src/sickle.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ exit(EXIT_SUCCESS); \
break;
/* end code drawn from system.h */

#define STDERR_OR_OUT(status) (status == EXIT_SUCCESS ? stdout : stderr)

typedef enum {
PHRED,
SANGER,
Expand Down Expand Up @@ -96,6 +98,6 @@ typedef struct __cutsites_ {
/* Function Prototypes */
int single_main (int argc, char *argv[]);
int paired_main (int argc, char *argv[]);
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug);
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int drop_n, int debug);

#endif /*SICKLE_H*/
32 changes: 23 additions & 9 deletions src/sliding.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) {
}


cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) {
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int window_size, int no_fiveprime, int trunc_n, int drop_n, int debug) {

int window_size = (int) (0.1 * fqrec->seq.l);
if (!window_size)
window_size = (int) (0.1 * fqrec->seq.l);
int i,j;
int window_start=0;
int window_total=0;
Expand All @@ -53,9 +54,9 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int
return (retvals);
}

/* if the seq length is less then 10bp, */
/* if the seq length is less then 10bp or the fixed window size is larger than the sequence length, */
/* then make the window size the length of the seq */
if (window_size == 0) window_size = fqrec->seq.l;
if (window_size == 0 || window_size > fqrec->seq.l) window_size = fqrec->seq.l;

for (i=0; i<window_size; i++) {
window_total += get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i);
Expand All @@ -65,7 +66,7 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int

window_avg = (double)window_total / (double)window_size;

if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg);
if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f, window_size: %d\n", no_fiveprime, found_five_prime, window_avg, window_size);

/* Finding the 5' cutoff */
/* Find when the average quality in the window goes above the threshold starting from the 5' end */
Expand Down Expand Up @@ -112,10 +113,23 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int
}


/* If truncate N option is selected, and sequence has Ns, then */
/* change 3' cut site to be the base before the first N */
if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) {
three_prime_cut = npos - fqrec->seq.s;
/* If truncate N option is selected, and the trimmed sequence has Ns,
* then change 3' cut site to be the base before the first N.
* If drop N option is selected, omit the sequence.
*/
if (five_prime_cut >= 0 && three_prime_cut >= 0 && three_prime_cut > five_prime_cut) {
int cut_len = three_prime_cut - five_prime_cut;
char *trimmed = malloc(cut_len + 1);
strncpy(trimmed, fqrec->seq.s + five_prime_cut, cut_len);
trimmed[cut_len] = '\0';

if ((npos = strstr(trimmed, "N")) || (npos = strstr(trimmed, "n"))) {
if (trunc_n)
three_prime_cut = npos - trimmed + five_prime_cut;
else if (drop_n)
three_prime_cut = five_prime_cut = -1;
}
free(trimmed);
}

/* if cutting length is less than threshold then return -1 for both */
Expand Down
177 changes: 122 additions & 55 deletions src/trim_paired.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,62 +16,110 @@ __KSEQ_READ
int paired_qual_threshold = 20;
int paired_length_threshold = 20;

static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:w:xnNg";
static struct option paired_long_options[] = {
{"qual-type", required_argument, 0, 't'},
{"pe-file1", optional_argument, 0, 'f'},
{"pe-file2", optional_argument, 0, 'r'},
{"pe-combo", optional_argument, 0, 'c'},
{"output-pe1", optional_argument, 0, 'o'},
{"output-pe2", optional_argument, 0, 'p'},
{"output-single", optional_argument, 0, 's'},
{"output-combo", optional_argument, 0, 'm'},
{"qual-threshold", optional_argument, 0, 'q'},
{"length-threshold", optional_argument, 0, 'l'},
{"no-fiveprime", optional_argument, 0, 'x'},
{"truncate-n", optional_argument, 0, 'n'},
{"gzip-output", optional_argument, 0, 'g'},
{"output-combo-all", optional_argument, 0, 'M'},
{"quiet", optional_argument, 0, 'z'},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
{ "qual-type", required_argument, NULL, 't' },
{ "pe-file1", required_argument, NULL, 'f' },
{ "pe-file2", required_argument, NULL, 'r' },
{ "pe-combo", required_argument, NULL, 'c' },
{ "output-pe1", required_argument, NULL, 'o' },
{ "output-pe2", required_argument, NULL, 'p' },
{ "output-single", required_argument, NULL, 's' },
{ "output-combo", required_argument, NULL, 'm' },
{ "qual-threshold", required_argument, NULL, 'q' },
{ "length-threshold", required_argument, NULL, 'l' },
{ "window", required_argument, NULL, 'w' },
{ "no-fiveprime", no_argument, NULL, 'x' },
{ "truncate-n", no_argument, NULL, 'n' },
{ "drop-n", no_argument, NULL, 'N' },
{ "gzip-output", no_argument, NULL, 'g' },
{ "output-combo-all", required_argument, NULL, 'M' },
{ "quiet", no_argument, NULL, 'z' },
{ "debug", no_argument, NULL, 'd' },
{ GETOPT_HELP_OPTION_DECL },
{ GETOPT_VERSION_OPTION_DECL },
{ NULL, 0, NULL, 0 }
};

void paired_usage (int status, char *msg) {
static const char *usage_format =
"\n"
"If you have separate files for forward and reverse reads:\n"
" Usage: %1$s pe [options] -f <paired-end forward fastq file>\n"
" %2$*3$s -r <paired-end reverse fastq file>\n"
" %2$*3$s -t <quality type>\n"
" %2$*3$s -o <trimmed PE forward file>\n"
" %2$*3$s -p <trimmed PE reverse file>\n"
" %2$*3$s -s <trimmed singles file>\n"
"\n"
"If you have one file with interleaved forward and reverse reads:\n"
" Usage: %1$s pe [options] -c <interleaved input file>\n"
" %2$*3$s -t <quality type>\n"
" %2$*3$s -m <interleaved trimmed paired-end output>\n"
" %2$*3$s -s <trimmed singles file>\n"
"\n"
"If you have one file with interleaved reads as input and you want\n"
"ONLY one interleaved file as output:\n"
" Usage: %1$s pe [options] -c <interleaved input file>\n"
" %2$*3$s -t <quality type>\n"
" %2$*3$s -M <interleaved trimmed output>\n"
"\n"
"Options:\n"
"\n"
"Paired-end separated reads\n"
"--------------------------\n"
"-f, --pe-file1 Input paired-end forward fastq file\n"
"-r, --pe-file2 Input paired-end reverse fastq file\n"
" (input files must have same number of records)\n"
"-o, --output-pe1 Output trimmed forward fastq file\n"
"-p, --output-pe2 Output trimmed reverse fastq file. Must use -s option.\n"
"\n"
"Paired-end interleaved reads\n"
"----------------------------\n"
"-c, --pe-combo Combined (interleaved) input paired-end fastq\n"
"-m, --output-combo Output combined (interleaved) paired-end fastq file.\n"
" Must use -s option.\n"
"-M, --output-combo-all Output combined (interleaved) paired-end fastq file\n"
" with any discarded read written to output file as a\n"
" single N. Cannot be used with the -s option.\n"
"\n"
"Global options\n"
"--------------\n"
"-t TYPE, --qual-type TYPE Type of quality values, one of:\n"
" solexa (CASAVA < 1.3)\n"
" illumina (CASAVA 1.3 to 1.7)\n"
" sanger (which is CASAVA >= 1.8)\n"
" (required)\n"
"-s FILE, --output-single FILE Output trimmed singles fastq file\n"
"-q #, --qual-threshold # Threshold for trimming based on average quality\n"
" in a window. Default %3$d.\n"
"-l #, --length-threshold # Threshold to keep a read based on length after\n"
" trimming. Default %4$d.\n"
"-w #, --window # Fixed window size to use. Default is a dynamic\n"
" window size of 0.1 of the read length.\n"
"-x, --no-fiveprime Don't do five prime trimming.\n"
"-n, --truncate-n Truncate sequences at position of first N between\n"
" the 5' and 3' trim sites (i.e. moves the 3' trim site\n"
" to the N closest to the 5' end).\n"
"-N, --drop-n Discard sequences containing an N between the 5'\n"
" and 3' trim sites.\n"
"-g, --gzip-output Output gzipped files.\n"
"--quiet Do not output trimming info\n"
"--help Display this help and exit\n"
"--version Output version information and exit\n"
;

if (msg) fprintf( STDERR_OR_OUT(status), "%s\n", msg );

fprintf(
STDERR_OR_OUT(status),
usage_format,
PROGRAM_NAME,
"", strlen(PROGRAM_NAME) + 3, // +3 makes it possible to keep text visually aligned in the format string itself
paired_qual_threshold,
paired_length_threshold
);

fprintf(stderr, "\nIf you have separate files for forward and reverse reads:\n");
fprintf(stderr, "Usage: %s pe [options] -f <paired-end forward fastq file> -r <paired-end reverse fastq file> -t <quality type> -o <trimmed PE forward file> -p <trimmed PE reverse file> -s <trimmed singles file>\n\n", PROGRAM_NAME);
fprintf(stderr, "If you have one file with interleaved forward and reverse reads:\n");
fprintf(stderr, "Usage: %s pe [options] -c <interleaved input file> -t <quality type> -m <interleaved trimmed paired-end output> -s <trimmed singles file>\n\n\
If you have one file with interleaved reads as input and you want ONLY one interleaved file as output:\n\
Usage: %s pe [options] -c <interleaved input file> -t <quality type> -M <interleaved trimmed output>\n\n", PROGRAM_NAME, PROGRAM_NAME);
fprintf(stderr, "Options:\n\
Paired-end separated reads\n\
--------------------------\n\
-f, --pe-file1, Input paired-end forward fastq file (Input files must have same number of records)\n\
-r, --pe-file2, Input paired-end reverse fastq file\n\
-o, --output-pe1, Output trimmed forward fastq file\n\
-p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\n\
Paired-end interleaved reads\n\
----------------------------\n");
fprintf(stderr,"-c, --pe-combo, Combined (interleaved) input paired-end fastq\n\
-m, --output-combo, Output combined (interleaved) paired-end fastq file. Must use -s option.\n\
-M, --output-combo-all, Output combined (interleaved) paired-end fastq file with any discarded read written to output file as a single N. Cannot be used with the -s option.\n\n\
Global options\n\
--------------\n\
-t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n");
fprintf(stderr, "-s, --output-single, Output trimmed singles fastq file\n\
-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\
-l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\
-x, --no-fiveprime, Don't do five prime trimming.\n\
-n, --truncate-n, Truncate sequences at position of first N.\n");


fprintf(stderr, "-g, --gzip-output, Output gzipped files.\n--quiet, do not output trimming info\n\
--help, display this help and exit\n\
--version, output version information and exit\n\n");

if (msg) fprintf(stderr, "%s\n\n", msg);
exit(status);
}

Expand Down Expand Up @@ -114,14 +162,16 @@ int paired_main(int argc, char *argv[]) {
int quiet = 0;
int no_fiveprime = 0;
int trunc_n = 0;
int drop_n = 0;
int gzip_output = 0;
int combo_all=0;
int combo_s=0;
int total=0;
int window_size = 0;

while (1) {
int option_index = 0;
optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:xng", paired_long_options, &option_index);
optc = getopt_long(argc, argv, paired_short_options, paired_long_options, &option_index);

if (optc == -1)
break;
Expand Down Expand Up @@ -198,6 +248,14 @@ int paired_main(int argc, char *argv[]) {
}
break;

case 'w':
window_size = atoi(optarg);
if (window_size < 1) {
fprintf(stderr, "Fixed window size must be >= 1\n");
return EXIT_FAILURE;
}
break;

case 'x':
no_fiveprime = 1;
break;
Expand All @@ -206,6 +264,10 @@ int paired_main(int argc, char *argv[]) {
trunc_n = 1;
break;

case 'N':
drop_n = 1;
break;

case 'g':
gzip_output = 1;
break;
Expand Down Expand Up @@ -236,6 +298,11 @@ int paired_main(int argc, char *argv[]) {
paired_usage(EXIT_FAILURE, "****Error: Quality type is required.");
}

if (trunc_n && drop_n) {
fprintf(stderr, "****Error: cannot specify both --truncate-n and --drop-n\n\n");
return EXIT_FAILURE;
}

/* make sure minimum input filenames are specified */
if (!infn1 && !infnc) {
paired_usage(EXIT_FAILURE, "****Error: Must have either -f OR -c argument.");
Expand Down Expand Up @@ -374,8 +441,8 @@ int paired_main(int argc, char *argv[]) {
break;
}

p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug);
p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug);
p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, drop_n, debug);
p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, window_size, no_fiveprime, trunc_n, drop_n, debug);
total += 2;

if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);
Expand Down Expand Up @@ -480,7 +547,7 @@ int paired_main(int argc, char *argv[]) {
}

if (!quiet) {
if (infn1 && infn2) fprintf(stdout, "\nPE forwrd file: %s\nPE reverse file: %s\n", infn1, infn2);
if (infn1 && infn2) fprintf(stdout, "\nPE forward file: %s\nPE reverse file: %s\n", infn1, infn2);
if (infnc) fprintf(stdout, "\nPE interleaved file: %s\n", infnc);
fprintf(stdout, "\nTotal input FastQ records: %d (%d pairs)\n", total, (total / 2));
fprintf(stdout, "\nFastQ paired records kept: %d (%d pairs)\n", kept_p, (kept_p / 2));
Expand Down
Loading