From 177430b8651e2ea24acde033c504e6715f10001e Mon Sep 17 00:00:00 2001 From: serine Date: Tue, 6 Mar 2018 18:39:19 +1100 Subject: [PATCH 01/55] added gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..68be19f --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +# ignore C object files +*.o +# ignore compiled executable +sabre + From 2ab43e7a7637b14706342c86642ab8a47226ab80 Mon Sep 17 00:00:00 2001 From: serine Date: Tue, 6 Mar 2018 18:40:05 +1100 Subject: [PATCH 02/55] made barcode to be appended to the fastq header --- src/demulti_paired.c | 4 ++-- src/demulti_single.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index 2acdb11..2a66928 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -224,7 +224,7 @@ int paired_main (int argc, char *argv[]) { if (curr != NULL) { - fprintf (curr->bcfile1, "@%s", fqrec1->name.s); + fprintf (curr->bcfile1, "@%s:%s", fqrec1->name.s, curr->bc); if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); else fprintf (curr->bcfile1, "\n"); @@ -237,7 +237,7 @@ int paired_main (int argc, char *argv[]) { fprintf (curr->bcfile1, "%s\n", (fqrec1->qual.s)+strlen(curr->bc)); - fprintf (curr->bcfile2, "@%s", fqrec2->name.s); + fprintf (curr->bcfile2, "@%s:%s", fqrec2->name.s, curr->bc); if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); else fprintf (curr->bcfile2, "\n"); diff --git a/src/demulti_single.c b/src/demulti_single.c index d304410..2a28b14 100644 --- a/src/demulti_single.c +++ b/src/demulti_single.c @@ -171,7 +171,7 @@ int single_main (int argc, char *argv[]) { /* If barcode data is found, output to demultiplexed file, else output to unknown file */ if (curr != NULL) { - fprintf (curr->bcfile, "@%s", fqrec->name.s); + fprintf (curr->bcfile, "@%s:%s", fqrec->name.s, curr->bc); if (fqrec->comment.l) 
fprintf (curr->bcfile, " %s\n", fqrec->comment.s); else fprintf (curr->bcfile, "\n"); From 10c1f474b61e10c30f62d0c2f3fae6fdadb6fe65 Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 7 Mar 2018 13:27:58 +1100 Subject: [PATCH 03/55] added utils.c for general utils, right now it holds _mkdir function that recursivelly makes directories if output file is a path of nested directories. Also added a few new command line options: --min-umi-len allowing to filter reads that have too short UMIs --stats allowing user to specify a file instead of spitting into stdout, this could be slightly broken if user tries to filter reads out with --min-umi-len, needs more checking. --no-comment which I'm not sure is good idea/needed. I needed it for compatability with donwstream tools, basically this strips anything from the FASTQ header that is deliminated by space, i.e everything to the right of the white space regarded as comment --- Makefile | 8 ++- src/demulti_paired.c | 156 ++++++++++++++++++++++++++++++------------- src/sabre.c | 24 ++++--- src/sabre.h | 2 + src/utils.c | 51 ++++++++++++++ 5 files changed, 185 insertions(+), 56 deletions(-) create mode 100644 src/utils.c diff --git a/Makefile b/Makefile index 2ff3141..6f3f376 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PROGRAM_NAME = sabre VERSION = 1.00 CC = gcc -CFLAGS = -Wall -pedantic -DVERSION=$(VERSION) +CFLAGS = -Wall -std=c99 -O2 -pedantic -DVERSION=$(VERSION) DEBUG = -g OPT = -O3 ARCHIVE = $(PROGRAM_NAME)_$(VERSION) @@ -21,9 +21,13 @@ demulti_single.o: $(SDIR)/demulti_single.c $(SDIR)/sabre.h $(SDIR)/kseq.h demulti_paired.o: $(SDIR)/demulti_paired.c $(SDIR)/sabre.h $(SDIR)/kseq.h $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c +utils.o: $(SDIR)/utils.c $(SDIR)/sabre.h + $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c + sabre.o: $(SDIR)/sabre.c $(SDIR)/sabre.h $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c + clean: rm -rf *.o $(SDIR)/*.gch ./sabre @@ -33,7 +37,7 @@ distclean: clean dist: tar -zcf $(ARCHIVE).tar.gz src Makefile -build: 
barcode.o demulti_single.o demulti_paired.o sabre.o +build: barcode.o demulti_single.o demulti_paired.o sabre.o utils.o $(CC) $(CFLAGS) $(OPT) $? -o $(PROGRAM_NAME) $(LDFLAGS) debug: diff --git a/src/demulti_paired.c b/src/demulti_paired.c index 2a66928..a21adb6 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include "sabre.h" @@ -11,16 +12,19 @@ KSEQ_INIT(gzFile, gzread) - +//more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 static struct option paired_long_options[] = { - {"pe-file1", required_argument, 0, 'f'}, - {"pe-file2", required_argument, 0, 'r'}, - {"barcode-file", required_argument, 0, 'b'}, - {"unknown-output1", required_argument, 0, 'u'}, - {"unknown-output2", required_argument, 0, 'w'}, - {"both-barcodes", optional_argument, 0, 'c'}, - {"max-mismatch", optional_argument, 0, 'm'}, - {"quiet", optional_argument, 0, 'z'}, + {"pe-file1", required_argument, NULL, 'f'}, + {"pe-file2", required_argument, NULL, 'r'}, + {"barcode-file", required_argument, NULL, 'b'}, + {"unknown-output1", required_argument, NULL, 'u'}, + {"unknown-output2", required_argument, NULL, 'w'}, + {"both-barcodes", optional_argument, NULL, 'c'}, + {"max-mismatch", required_argument, 0, 'm'}, + {"min-umi-len", required_argument, 0, 'l'}, + {"stats", required_argument, NULL, 's'}, + {"no-comment", no_argument, 0, 'n'}, + //{"quiet", no_argument, 0, 'z'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -37,7 +41,10 @@ Options:\n\ fprintf (stderr, "-u, --unknown-output1, Output paired-end file 1 that contains records with no barcodes found. (required)\n\ -w, --unknown-output2, Output paired-end file 2 that contains records with no barcodes found. 
(required)\n\ -c, --both-barcodes, Optional flag that indicates that both fastq files have barcodes.\n\ --m , --max-mismatch , Optional argument that is the maximum number of mismatches allowed in a barcode. Default 0.\n"); +-m , --max-mismatch , Optional argument that is the maximum number of mismatches allowed in a barcode. Default 0.\n\ +-l , --min-umi-len , Optional argument that is the minimum UMI length to keep. Default [0].\n\ +-n, --no-comment, Optional argument to drop extra comments from FASTQ header. Default [NULL].\n\ +-s , --stats , Optional argument to write logs into a file instead of STDOUT. Default [STDOUT].\n"); fprintf (stderr, "--quiet, don't print barcode matching info\n\ --help, display this help and exit\n\ @@ -56,6 +63,7 @@ int paired_main (int argc, char *argv[]) { FILE* barfile = NULL; FILE* unknownfile1=NULL; FILE* unknownfile2=NULL; + FILE* log_file=NULL; int debug=0; int optc; extern char *optarg; @@ -72,12 +80,16 @@ int paired_main (int argc, char *argv[]) { int num_unknown=0; int total=0; int mismatch=0; - int quiet=0; + //int quiet=0; + + int min_umi_len=0; + char *log_fn=NULL; + int no_comment=-1; while (1) { int option_index = 0; - optc = getopt_long (argc, argv, "dcf:r:b:u:w:m:z", paired_long_options, &option_index); + optc = getopt_long (argc, argv, "dcf:r:b:u:w:m:s:l:n:z", paired_long_options, &option_index); if (optc == -1) break; @@ -117,8 +129,21 @@ int paired_main (int argc, char *argv[]) { mismatch = atoi (optarg); break; + case 's': + log_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (log_fn, optarg); + break; + + case 'l': + min_umi_len = atoi (optarg); + break; + + case 'n': + no_comment = 1; + break; + case 'z': - quiet=1; + //quiet=1; break; case 'd': @@ -138,7 +163,6 @@ int paired_main (int argc, char *argv[]) { } } - if (!infn1 || !infn2 || !unknownfn1 || !unknownfn2 || !barfn) { paired_usage (EXIT_FAILURE); } @@ -182,6 +206,28 @@ int paired_main (int argc, char *argv[]) { return EXIT_FAILURE; } + 
fprintf(stderr, "\n\ + \n Running: %s\ + \n Command line args:\ + \n --pe-file1 %s\ + \n --pe-file2 %s\ + \n --barcode-file %s\ + \n --unknown-output1 %s\ + \n --unknown-output2 %s\ + \n --both-barcodes %d\ + \n --max-mismatch %d\ + \n --min-umi-len %d\ + \n --stats %s\ + \n --no-comment %d\ + \n\ + \n In Progess...\ + \n", PROGRAM_NAME,\ + infn1, infn2,\ + barfn,\ + unknownfn1, unknownfn2,\ + both_have_barcodes,\ + mismatch, min_umi_len, log_fn, no_comment); + /* Creating linked list of barcode data */ head = NULL; @@ -190,8 +236,8 @@ int paired_main (int argc, char *argv[]) { curr->bc = (char*) malloc (strlen(barcode) + 1); strcpy (curr->bc, barcode); - curr->bcfile1 = fopen (baroutfn1, "w"); - curr->bcfile2 = fopen (baroutfn2, "w"); + curr->bcfile1 = fopen (_mkdir(baroutfn1), "w"); + curr->bcfile2 = fopen (_mkdir(baroutfn2), "w"); curr->num_records = 0; curr->next = head; @@ -224,34 +270,42 @@ int paired_main (int argc, char *argv[]) { if (curr != NULL) { - fprintf (curr->bcfile1, "@%s:%s", fqrec1->name.s, curr->bc); - if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); - else fprintf (curr->bcfile1, "\n"); + // if UMI is shorter then 10, discard the reads + if(strlen((fqrec1->seq.s)+strlen(curr->bc)) >= min_umi_len) { + //@READNAME:BACRCODE:UMI + fprintf (curr->bcfile1, "@%s:%s:%s", fqrec1->name.s, curr->bc, (fqrec1->seq.s)+strlen(curr->bc)); + if (fqrec1->comment.l && no_comment == -1) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); + else fprintf (curr->bcfile1, "\n"); - fprintf (curr->bcfile1, "%s\n", (fqrec1->seq.s)+strlen(curr->bc)); + //fprintf (curr->bcfile1, "%s\n", (fqrec1->seq.s)+strlen(curr->bc)); + //This tmp hack knowning that data is single end, and R2 is simply a string of BARCODE+UMI + fprintf (curr->bcfile1, "N\n"); - fprintf (curr->bcfile1, "+%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); - else fprintf (curr->bcfile1, "\n"); + fprintf (curr->bcfile1, "+%s", 
fqrec1->name.s); + if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); + else fprintf (curr->bcfile1, "\n"); - fprintf (curr->bcfile1, "%s\n", (fqrec1->qual.s)+strlen(curr->bc)); + fprintf (curr->bcfile1, "%s\n", (fqrec1->qual.s)+strlen(curr->bc)); - fprintf (curr->bcfile2, "@%s:%s", fqrec2->name.s, curr->bc); - if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); - else fprintf (curr->bcfile2, "\n"); + //fprintf (curr->bcfile2, "@%s:%s", fqrec2->name.s, curr->bc); + //@READNAME:BACRCODE:UMI + fprintf (curr->bcfile2, "@%s:%s:%s", fqrec2->name.s, curr->bc, (fqrec1->seq.s)+strlen(curr->bc)); + if (fqrec2->comment.l && no_comment == -1) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); + else fprintf (curr->bcfile2, "\n"); - if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->seq.s); - else fprintf (curr->bcfile2, "%s\n", (fqrec2->seq.s)+strlen(curr->bc)); + if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->seq.s); + else fprintf (curr->bcfile2, "%s\n", (fqrec2->seq.s)+strlen(curr->bc)); - fprintf (curr->bcfile2, "+%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); - else fprintf (curr->bcfile2, "\n"); + fprintf (curr->bcfile2, "+%s", fqrec2->name.s); + if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); + else fprintf (curr->bcfile2, "\n"); - if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->qual.s); - else fprintf (curr->bcfile2, "%s\n", (fqrec2->qual.s)+strlen(curr->bc)); + if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->qual.s); + else fprintf (curr->bcfile2, "%s\n", (fqrec2->qual.s)+strlen(curr->bc)); - curr->num_records += 2; + curr->num_records += 2; + } } else { @@ -294,17 +348,25 @@ int paired_main (int argc, char *argv[]) { } - if (!quiet) { - fprintf (stdout, "\nTotal FastQ records: %d (%d pairs)\n\n", total, total/2); - curr = head; - while (curr) { - fprintf (stdout, "FastQ 
records for barcode %s: %d (%d pairs)\n", curr->bc, curr->num_records, curr->num_records/2); - curr = curr->next; - } - fprintf (stdout, "\nFastQ records with no barcode match: %d (%d pairs)\n", num_unknown, num_unknown/2); - fprintf (stdout, "\nNumber of mismatches allowed: %d\n\n", mismatch); - } - + //if (!quiet) { + //if (!log_fn) { is this better? + if (log_fn == NULL) { + log_file = stdout; + } + else { + log_file = fopen(log_fn, "w"); + } + + fprintf (log_file, "\nTotal FastQ records: %d (%d pairs)\n\n", total, total/2); + curr = head; + while (curr) { + fprintf (log_file, "FastQ records for barcode %s: %d (%d pairs)\n", curr->bc, curr->num_records, curr->num_records/2); + curr = curr->next; + } + fprintf (log_file, "\nFastQ records with no barcode match: %d (%d pairs)\n", num_unknown, num_unknown/2); + fprintf (log_file, "\nNumber of mismatches allowed: %d\n\n", mismatch); + + fprintf (stderr, "\n All done :)!"); kseq_destroy (fqrec1); kseq_destroy (fqrec2); @@ -313,12 +375,14 @@ int paired_main (int argc, char *argv[]) { fclose (unknownfile1); fclose (unknownfile2); fclose (barfile); + fclose (log_file); free (infn1); free (infn2); free (barfn); free (unknownfn1); free (unknownfn2); + free (log_fn); curr = head; while (curr) { diff --git a/src/sabre.c b/src/sabre.c index 4e066bb..24f3b10 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -10,14 +10,22 @@ void main_usage (int status) { - fprintf (stdout, "\nUsage: %s [options]\n\ -\n\ -Command:\n\ -pe\tpaired-end barcode de-multiplexing\n\ -se\tsingle-end barcode de-multiplexing\n\ -\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n", PROGRAM_NAME); + fprintf (stdout, "\n\ + \n Usage: %s [options]\ + \n\ + \n Command:\ + \n\ + \n se\tsingle-end barcode de-multiplexing\ + \n pe\tpaired-end barcode de-multiplexing\ + \n\ + \n --help, display this help and exit\ + \n --version, output version information and exit\ + \n\ + \n Info: Sabre is a heavy cavalry sword with a 
curved blade and a single cutting edge\ + \n Not sure though if the meaning was intended by original author...\ + \n\ + \n", + PROGRAM_NAME); exit (status); } diff --git a/src/sabre.h b/src/sabre.h index 693af37..4eb20ef 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -64,4 +64,6 @@ int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch); +const char * _mkdir (const char *dir); + #endif /*SABRE_H*/ diff --git a/src/utils.c b/src/utils.c new file mode 100644 index 0000000..524a10a --- /dev/null +++ b/src/utils.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include +#include +#include +#include "sabre.h" + +// https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692 +// https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c +const char * _mkdir(const char *file_path) { + // return straigth away if a file_path is not nested file path + if(strstr(file_path, "/") == NULL) { + return file_path; + } + //TODO check if directory already exists or not + //struct stat st = {0}; + // + //if (stat(dir, &st) == -1) { + // mkdir(tmp, S_IRWXU); + //} + + //char tmp[PATH_MAX]; // can't get this to work.. 
+ char tmp[256]; + char *p = NULL; + char *dirc; + dirc = strdup(file_path); + size_t len; + + const char *dir = dirname(dirc); + snprintf(tmp, sizeof(tmp),"%s", dir); + len = strlen(tmp); + + if(tmp[len - 1] == '/') { + tmp[len - 1] = 0; + } + + for(p = tmp + 1; *p; p++) { + if(*p == '/') { + *p = 0; + mkdir(tmp, S_IRWXU); + *p = '/'; + } + } + + mkdir(tmp, S_IRWXU); + free(dirc); + + return file_path; +} From c0b280e89911fc79abda73a3bbd01fb69eaed3fd Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 12 Mar 2018 14:35:35 +1100 Subject: [PATCH 04/55] refactored help menu --- src/demulti_paired.c | 45 +++++++++++++++++++++++++------------------- src/sabre.c | 4 ++-- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index a21adb6..4ca6bf9 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -32,25 +32,32 @@ static struct option paired_long_options[] = { void paired_usage (int status) { - fprintf (stderr, "\nUsage: %s pe -f -r -b -u -w \n\n\ -Options:\n\ --f, --pe-file1, Input paired-end fastq file 1 (required, must have same number of records as pe2)\n\ --r, --pe-file2, Input paired-end fastq file 2 (required, must have same number of records as pe1)\n\ --b, --barcode-file, File with barcode and two output file names per line (required)\n", PROGRAM_NAME); - -fprintf (stderr, "-u, --unknown-output1, Output paired-end file 1 that contains records with no barcodes found. (required)\n\ --w, --unknown-output2, Output paired-end file 2 that contains records with no barcodes found. (required)\n\ --c, --both-barcodes, Optional flag that indicates that both fastq files have barcodes.\n\ --m , --max-mismatch , Optional argument that is the maximum number of mismatches allowed in a barcode. Default 0.\n\ --l , --min-umi-len , Optional argument that is the minimum UMI length to keep. Default [0].\n\ --n, --no-comment, Optional argument to drop extra comments from FASTQ header. 
Default [NULL].\n\ --s , --stats , Optional argument to write logs into a file instead of STDOUT. Default [STDOUT].\n"); - -fprintf (stderr, "--quiet, don't print barcode matching info\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n"); - - exit (status); + fprintf (stderr, "\n Usage: %s pe [OPTIONS] -f -r -b -u -w \ + \n\ + \n\ + \n Options:\ + \n\ + \n Required:\ + \n\ + \n -f, --pe-file1 FILE Input FASTQ R1 read\ + \n -r, --pe-file2 FILE Input FASTQ R2 reads\ + \n -b, --barcode-file FILE Barcodes files, one barcode per line, e.g B\\tR1\\tR2\ + \n -u, --unknown-output1 FILE Output unassigned R1 reads\ + \n -w, --unknown-output2 FILE Output unassigned R2 reads\ + \n\ + \n Other:\ + \n\ + \n -c, --both-barcodes INT Indicates that both FASTQ files have barcodes [0]\ + \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\ + \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ + \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\ + \n -s, --stats FILE Write stats to file instead of STDOUT [STDOUT]\ + \n\ + \n", + PROGRAM_NAME); + + + exit (status); } int paired_main (int argc, char *argv[]) { diff --git a/src/sabre.c b/src/sabre.c index 24f3b10..5eafe11 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -18,8 +18,8 @@ void main_usage (int status) { \n se\tsingle-end barcode de-multiplexing\ \n pe\tpaired-end barcode de-multiplexing\ \n\ - \n --help, display this help and exit\ - \n --version, output version information and exit\ + \n --help\tto get more help\ + \n --version\tprint current version to stdout\ \n\ \n Info: Sabre is a heavy cavalry sword with a curved blade and a single cutting edge\ \n Not sure though if the meaning was intended by original author...\ From de51ec27f2bf8d6ceca3684a2a6741e752a56fc2 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 12 Mar 2018 14:38:45 +1100 Subject: [PATCH 05/55] updated README, included note about fork --- README.md | 2 ++ 1 file changed, 
2 insertions(+) diff --git a/README.md b/README.md index 5e84da3..82bb098 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +> This is a fork of the [original repo](https://github.com/najoshi/sabre). I might be taking this into slightly different direction + # sabre - A barcode demultiplexing and trimming tool for FastQ files ## About From 5ac6256451f31dc4da93d111bc9b7f969fccfe56 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 12 Mar 2018 14:59:24 +1100 Subject: [PATCH 06/55] very annoying commit, could be a deal breaker here... changed tabs to spaces, made tabs = 4 spaces instead of 8 reindented the code :| universe please forgive me... --- src/barcode.c | 29 +- src/demulti_paired.c | 720 +++++++++++++++++++++---------------------- 2 files changed, 376 insertions(+), 373 deletions(-) diff --git a/src/barcode.c b/src/barcode.c index a260281..972abec 100644 --- a/src/barcode.c +++ b/src/barcode.c @@ -5,20 +5,23 @@ int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch) { - register unsigned char u1, u2; - int cnt=0; + register unsigned char u1, u2; + int cnt=0; - while (n-- > 0) { - u1 = (unsigned char) *s1++; - u2 = (unsigned char) *s2++; + while (n-- > 0) { + u1 = (unsigned char) *s1++; + u2 = (unsigned char) *s2++; - if (u1 != u2) { - cnt++; - if (cnt > mismatch) return u1 - u2; - } + if (u1 != u2) { + cnt++; + if (cnt > mismatch) { + return u1 - u2; + } + } - if (u1 == '\0') return 0; - } - - return 0; + if (u1 == '\0') { + return 0; + } + } + return 0; } diff --git a/src/demulti_paired.c b/src/demulti_paired.c index 4ca6bf9..a1c6ca6 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -12,48 +12,48 @@ KSEQ_INIT(gzFile, gzread) -//more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 -static struct option paired_long_options[] = { - {"pe-file1", required_argument, NULL, 'f'}, - {"pe-file2", required_argument, NULL, 'r'}, - {"barcode-file", required_argument, NULL, 'b'}, 
- {"unknown-output1", required_argument, NULL, 'u'}, - {"unknown-output2", required_argument, NULL, 'w'}, - {"both-barcodes", optional_argument, NULL, 'c'}, - {"max-mismatch", required_argument, 0, 'm'}, - {"min-umi-len", required_argument, 0, 'l'}, - {"stats", required_argument, NULL, 's'}, - {"no-comment", no_argument, 0, 'n'}, - //{"quiet", no_argument, 0, 'z'}, - {GETOPT_HELP_OPTION_DECL}, - {GETOPT_VERSION_OPTION_DECL}, - {NULL, 0, NULL, 0} -}; + //more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 + static struct option paired_long_options[] = { + {"pe-file1", required_argument, NULL, 'f'}, + {"pe-file2", required_argument, NULL, 'r'}, + {"barcode-file", required_argument, NULL, 'b'}, + {"unknown-output1", required_argument, NULL, 'u'}, + {"unknown-output2", required_argument, NULL, 'w'}, + {"both-barcodes", optional_argument, NULL, 'c'}, + {"max-mismatch", required_argument, 0, 'm'}, + {"min-umi-len", required_argument, 0, 'l'}, + {"stats", required_argument, NULL, 's'}, + {"no-comment", no_argument, 0, 'n'}, + //{"quiet", no_argument, 0, 'z'}, + {GETOPT_HELP_OPTION_DECL}, + {GETOPT_VERSION_OPTION_DECL}, + {NULL, 0, NULL, 0} + }; void paired_usage (int status) { fprintf (stderr, "\n Usage: %s pe [OPTIONS] -f -r -b -u -w \ - \n\ - \n\ - \n Options:\ - \n\ - \n Required:\ - \n\ - \n -f, --pe-file1 FILE Input FASTQ R1 read\ - \n -r, --pe-file2 FILE Input FASTQ R2 reads\ - \n -b, --barcode-file FILE Barcodes files, one barcode per line, e.g B\\tR1\\tR2\ - \n -u, --unknown-output1 FILE Output unassigned R1 reads\ - \n -w, --unknown-output2 FILE Output unassigned R2 reads\ - \n\ - \n Other:\ - \n\ - \n -c, --both-barcodes INT Indicates that both FASTQ files have barcodes [0]\ - \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\ - \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ - \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\ - \n -s, --stats FILE Write stats to file instead 
of STDOUT [STDOUT]\ - \n\ - \n", + \n\ + \n\ + \n Options:\ + \n\ + \n Required:\ + \n\ + \n -f, --pe-file1 FILE Input FASTQ R1 read\ + \n -r, --pe-file2 FILE Input FASTQ R2 reads\ + \n -b, --barcode-file FILE Barcodes files, one barcode per line, e.g B\\tR1\\tR2\ + \n -u, --unknown-output1 FILE Output unassigned R1 reads\ + \n -w, --unknown-output2 FILE Output unassigned R2 reads\ + \n\ + \n Other:\ + \n\ + \n -c, --both-barcodes INT Indicates that both FASTQ files have barcodes [0]\ + \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\ + \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ + \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\ + \n -s, --stats FILE Write stats to file instead of STDOUT [STDOUT]\ + \n\ + \n", PROGRAM_NAME); @@ -62,344 +62,344 @@ void paired_usage (int status) { int paired_main (int argc, char *argv[]) { - gzFile pe1=NULL; - gzFile pe2=NULL; - kseq_t *fqrec1; - kseq_t *fqrec2; - int l1,l2; - FILE* barfile = NULL; - FILE* unknownfile1=NULL; - FILE* unknownfile2=NULL; - FILE* log_file=NULL; - int debug=0; - int optc; - extern char *optarg; - char *infn1=NULL; - char *infn2=NULL; - char *barfn=NULL; - char *unknownfn1=NULL; - char *unknownfn2=NULL; - int both_have_barcodes=0; - barcode_data_paired *curr, *head, *temp; - char barcode [MAX_BARCODE_LENGTH]; - char baroutfn1 [MAX_FILENAME_LENGTH]; - char baroutfn2 [MAX_FILENAME_LENGTH]; - int num_unknown=0; - int total=0; - int mismatch=0; - //int quiet=0; - - int min_umi_len=0; - char *log_fn=NULL; - int no_comment=-1; - - - while (1) { - int option_index = 0; - optc = getopt_long (argc, argv, "dcf:r:b:u:w:m:s:l:n:z", paired_long_options, &option_index); - - if (optc == -1) break; - - switch (optc) { - if (paired_long_options[option_index].flag != 0) break; - - case 'f': - infn1 = (char*) malloc (strlen (optarg) + 1); - strcpy (infn1, optarg); - break; - - case 'r': - infn2 = (char*) malloc (strlen (optarg) + 1); - strcpy (infn2, optarg); - 
break; - - case 'b': - barfn = (char*) malloc (strlen (optarg) + 1); - strcpy (barfn, optarg); - break; - - case 'u': - unknownfn1 = (char*) malloc (strlen (optarg) + 1); - strcpy (unknownfn1, optarg); - break; - - case 'w': - unknownfn2 = (char*) malloc (strlen (optarg) + 1); - strcpy (unknownfn2, optarg); - break; - - case 'c': - both_have_barcodes=1; - break; - - case 'm': - mismatch = atoi (optarg); - break; - - case 's': - log_fn = (char*) malloc (strlen (optarg) + 1); - strcpy (log_fn, optarg); - break; - - case 'l': - min_umi_len = atoi (optarg); - break; - - case 'n': - no_comment = 1; - break; - - case 'z': - //quiet=1; - break; - - case 'd': - debug = 1; - break; - - case_GETOPT_HELP_CHAR(paired_usage); - case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); - - case '?': - paired_usage (EXIT_FAILURE); - break; - - default: - paired_usage (EXIT_FAILURE); - break; - } - } - - if (!infn1 || !infn2 || !unknownfn1 || !unknownfn2 || !barfn) { - paired_usage (EXIT_FAILURE); - } - - if (!strcmp (infn1, infn2) || !strcmp (infn1, unknownfn1) || !strcmp (infn1, unknownfn2) || - !strcmp (infn1, barfn) || !strcmp (infn2, unknownfn1) || !strcmp (infn2, unknownfn2) || - !strcmp (infn2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || - !strcmp (unknownfn2, barfn)) { - - fprintf (stderr, "Error: Duplicate input and/or output file names.\n"); - return EXIT_FAILURE; - } - - pe1 = gzopen (infn1, "r"); - if (!pe1) { - fprintf (stderr, "Could not open input file 1 '%s'.\n", infn1); - return EXIT_FAILURE; - } - - pe2 = gzopen (infn2, "r"); - if (!pe2) { - fprintf (stderr, "Could not open input file 2 '%s'.\n", infn2); - return EXIT_FAILURE; - } - - unknownfile1 = fopen (unknownfn1, "w"); - if (!unknownfile1) { - fprintf (stderr, "Could not open unknown output file 1 '%s'.\n", unknownfn1); - return EXIT_FAILURE; - } - - unknownfile2 = fopen (unknownfn2, "w"); - if (!unknownfile2) { - fprintf (stderr, "Could not open unknown output file 2 
'%s'.\n", unknownfn2); - return EXIT_FAILURE; - } - - barfile = fopen (barfn, "r"); - if (!barfile) { - fprintf (stderr, "Could not open barcode file '%s'.\n", barfn); - return EXIT_FAILURE; - } - - fprintf(stderr, "\n\ - \n Running: %s\ - \n Command line args:\ - \n --pe-file1 %s\ - \n --pe-file2 %s\ - \n --barcode-file %s\ - \n --unknown-output1 %s\ - \n --unknown-output2 %s\ - \n --both-barcodes %d\ - \n --max-mismatch %d\ - \n --min-umi-len %d\ - \n --stats %s\ - \n --no-comment %d\ - \n\ - \n In Progess...\ - \n", PROGRAM_NAME,\ - infn1, infn2,\ - barfn,\ - unknownfn1, unknownfn2,\ - both_have_barcodes,\ - mismatch, min_umi_len, log_fn, no_comment); - - - /* Creating linked list of barcode data */ - head = NULL; - while (fscanf (barfile, "%s%s%s", barcode, baroutfn1, baroutfn2) != EOF) { - curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired)); - curr->bc = (char*) malloc (strlen(barcode) + 1); - strcpy (curr->bc, barcode); - - curr->bcfile1 = fopen (_mkdir(baroutfn1), "w"); - curr->bcfile2 = fopen (_mkdir(baroutfn2), "w"); - curr->num_records = 0; - - curr->next = head; - head = curr; - } - - - fqrec1 = kseq_init (pe1); - fqrec2 = kseq_init (pe2); - - while ((l1 = kseq_read (fqrec1)) >= 0) { - - l2 = kseq_read (fqrec2); - if (l2 < 0) { - fprintf (stderr, "Error: PE file 2 is shorter than PE file 1. 
Disregarding rest of PE file 1.\n"); - break; - } - - - /* Go through all barcode data and check if any match to beginning of read */ - /* If it does then put read in that barcode's file, otherwise put in unknown file */ - curr = head; - while (curr) { - if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, strlen (curr->bc), mismatch) == 0) { - break; - } - - curr = curr->next; - } - - - if (curr != NULL) { - // if UMI is shorter then 10, discard the reads - if(strlen((fqrec1->seq.s)+strlen(curr->bc)) >= min_umi_len) { - //@READNAME:BACRCODE:UMI - fprintf (curr->bcfile1, "@%s:%s:%s", fqrec1->name.s, curr->bc, (fqrec1->seq.s)+strlen(curr->bc)); - if (fqrec1->comment.l && no_comment == -1) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); - else fprintf (curr->bcfile1, "\n"); - - //fprintf (curr->bcfile1, "%s\n", (fqrec1->seq.s)+strlen(curr->bc)); - //This tmp hack knowning that data is single end, and R2 is simply a string of BARCODE+UMI - fprintf (curr->bcfile1, "N\n"); - - fprintf (curr->bcfile1, "+%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); - else fprintf (curr->bcfile1, "\n"); - - fprintf (curr->bcfile1, "%s\n", (fqrec1->qual.s)+strlen(curr->bc)); - - - //fprintf (curr->bcfile2, "@%s:%s", fqrec2->name.s, curr->bc); - //@READNAME:BACRCODE:UMI - fprintf (curr->bcfile2, "@%s:%s:%s", fqrec2->name.s, curr->bc, (fqrec1->seq.s)+strlen(curr->bc)); - if (fqrec2->comment.l && no_comment == -1) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); - else fprintf (curr->bcfile2, "\n"); - - if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->seq.s); - else fprintf (curr->bcfile2, "%s\n", (fqrec2->seq.s)+strlen(curr->bc)); - - fprintf (curr->bcfile2, "+%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); - else fprintf (curr->bcfile2, "\n"); - - if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->qual.s); - else fprintf (curr->bcfile2, "%s\n", 
(fqrec2->qual.s)+strlen(curr->bc)); - - curr->num_records += 2; - } - } + gzFile pe1=NULL; + gzFile pe2=NULL; + kseq_t *fqrec1; + kseq_t *fqrec2; + int l1,l2; + FILE* barfile = NULL; + FILE* unknownfile1=NULL; + FILE* unknownfile2=NULL; + FILE* log_file=NULL; + int debug=0; + int optc; + extern char *optarg; + char *infn1=NULL; + char *infn2=NULL; + char *barfn=NULL; + char *unknownfn1=NULL; + char *unknownfn2=NULL; + int both_have_barcodes=0; + barcode_data_paired *curr, *head, *temp; + char barcode [MAX_BARCODE_LENGTH]; + char baroutfn1 [MAX_FILENAME_LENGTH]; + char baroutfn2 [MAX_FILENAME_LENGTH]; + int num_unknown=0; + int total=0; + int mismatch=0; + //int quiet=0; + + int min_umi_len=0; + char *log_fn=NULL; + int no_comment=-1; + + + while (1) { + int option_index = 0; + optc = getopt_long (argc, argv, "dcf:r:b:u:w:m:s:l:n:z", paired_long_options, &option_index); + + if (optc == -1) break; + + switch (optc) { + if (paired_long_options[option_index].flag != 0) break; + + case 'f': + infn1 = (char*) malloc (strlen (optarg) + 1); + strcpy (infn1, optarg); + break; + + case 'r': + infn2 = (char*) malloc (strlen (optarg) + 1); + strcpy (infn2, optarg); + break; + + case 'b': + barfn = (char*) malloc (strlen (optarg) + 1); + strcpy (barfn, optarg); + break; + + case 'u': + unknownfn1 = (char*) malloc (strlen (optarg) + 1); + strcpy (unknownfn1, optarg); + break; + + case 'w': + unknownfn2 = (char*) malloc (strlen (optarg) + 1); + strcpy (unknownfn2, optarg); + break; + + case 'c': + both_have_barcodes=1; + break; + + case 'm': + mismatch = atoi (optarg); + break; + + case 's': + log_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (log_fn, optarg); + break; + + case 'l': + min_umi_len = atoi (optarg); + break; + + case 'n': + no_comment = 1; + break; + + case 'z': + //quiet=1; + break; + + case 'd': + debug = 1; + break; + + case_GETOPT_HELP_CHAR(paired_usage); + case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); + + case '?': + paired_usage 
(EXIT_FAILURE); + break; + + default: + paired_usage (EXIT_FAILURE); + break; + } + } + + if (!infn1 || !infn2 || !unknownfn1 || !unknownfn2 || !barfn) { + paired_usage (EXIT_FAILURE); + } + + if (!strcmp (infn1, infn2) || !strcmp (infn1, unknownfn1) || !strcmp (infn1, unknownfn2) || + !strcmp (infn1, barfn) || !strcmp (infn2, unknownfn1) || !strcmp (infn2, unknownfn2) || + !strcmp (infn2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || + !strcmp (unknownfn2, barfn)) { + + fprintf (stderr, "Error: Duplicate input and/or output file names.\n"); + return EXIT_FAILURE; + } + + pe1 = gzopen (infn1, "r"); + if (!pe1) { + fprintf (stderr, "Could not open input file 1 '%s'.\n", infn1); + return EXIT_FAILURE; + } + + pe2 = gzopen (infn2, "r"); + if (!pe2) { + fprintf (stderr, "Could not open input file 2 '%s'.\n", infn2); + return EXIT_FAILURE; + } + + unknownfile1 = fopen (unknownfn1, "w"); + if (!unknownfile1) { + fprintf (stderr, "Could not open unknown output file 1 '%s'.\n", unknownfn1); + return EXIT_FAILURE; + } + + unknownfile2 = fopen (unknownfn2, "w"); + if (!unknownfile2) { + fprintf (stderr, "Could not open unknown output file 2 '%s'.\n", unknownfn2); + return EXIT_FAILURE; + } + + barfile = fopen (barfn, "r"); + if (!barfile) { + fprintf (stderr, "Could not open barcode file '%s'.\n", barfn); + return EXIT_FAILURE; + } + + fprintf(stderr, "\n\ + \n Running: %s\ + \n Command line args:\ + \n --pe-file1 %s\ + \n --pe-file2 %s\ + \n --barcode-file %s\ + \n --unknown-output1 %s\ + \n --unknown-output2 %s\ + \n --both-barcodes %d\ + \n --max-mismatch %d\ + \n --min-umi-len %d\ + \n --stats %s\ + \n --no-comment %d\ + \n\ + \n In Progess...\ + \n", PROGRAM_NAME,\ + infn1, infn2,\ + barfn,\ + unknownfn1, unknownfn2,\ + both_have_barcodes,\ + mismatch, min_umi_len, log_fn, no_comment); + + + /* Creating linked list of barcode data */ + head = NULL; + while (fscanf (barfile, "%s%s%s", barcode, baroutfn1, baroutfn2) != EOF) { + curr = 
(barcode_data_paired*) malloc (sizeof (barcode_data_paired)); + curr->bc = (char*) malloc (strlen(barcode) + 1); + strcpy (curr->bc, barcode); + + curr->bcfile1 = fopen (_mkdir(baroutfn1), "w"); + curr->bcfile2 = fopen (_mkdir(baroutfn2), "w"); + curr->num_records = 0; + + curr->next = head; + head = curr; + } + + + fqrec1 = kseq_init (pe1); + fqrec2 = kseq_init (pe2); + + while ((l1 = kseq_read (fqrec1)) >= 0) { + + l2 = kseq_read (fqrec2); + if (l2 < 0) { + fprintf (stderr, "Error: PE file 2 is shorter than PE file 1. Disregarding rest of PE file 1.\n"); + break; + } - else { - fprintf (unknownfile1, "@%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s); - else fprintf (unknownfile1, "\n"); - fprintf (unknownfile1, "%s\n", fqrec1->seq.s); + /* Go through all barcode data and check if any match to beginning of read */ + /* If it does then put read in that barcode's file, otherwise put in unknown file */ + curr = head; + while (curr) { + if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, strlen (curr->bc), mismatch) == 0) { + break; + } - fprintf (unknownfile1, "+%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s); - else fprintf (unknownfile1, "\n"); + curr = curr->next; + } - fprintf (unknownfile1, "%s\n", fqrec1->qual.s); + if (curr != NULL) { + // if UMI is shorter then 10, discard the reads + if(strlen((fqrec1->seq.s)+strlen(curr->bc)) >= min_umi_len) { + //@READNAME:BACRCODE:UMI + fprintf (curr->bcfile1, "@%s:%s:%s", fqrec1->name.s, curr->bc, (fqrec1->seq.s)+strlen(curr->bc)); + if (fqrec1->comment.l && no_comment == -1) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); + else fprintf (curr->bcfile1, "\n"); - fprintf (unknownfile2, "@%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s); - else fprintf (unknownfile2, "\n"); + //fprintf (curr->bcfile1, "%s\n", (fqrec1->seq.s)+strlen(curr->bc)); + //This tmp hack knowning 
that data is single end, and R2 is simply a string of BARCODE+UMI + fprintf (curr->bcfile1, "N\n"); - fprintf (unknownfile2, "%s\n", fqrec2->seq.s); + fprintf (curr->bcfile1, "+%s", fqrec1->name.s); + if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); + else fprintf (curr->bcfile1, "\n"); - fprintf (unknownfile2, "+%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s); - else fprintf (unknownfile2, "\n"); + fprintf (curr->bcfile1, "%s\n", (fqrec1->qual.s)+strlen(curr->bc)); - fprintf (unknownfile2, "%s\n", fqrec2->qual.s); - num_unknown += 2; - } + //fprintf (curr->bcfile2, "@%s:%s", fqrec2->name.s, curr->bc); + //@READNAME:BACRCODE:UMI + fprintf (curr->bcfile2, "@%s:%s:%s", fqrec2->name.s, curr->bc, (fqrec1->seq.s)+strlen(curr->bc)); + if (fqrec2->comment.l && no_comment == -1) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); + else fprintf (curr->bcfile2, "\n"); - total += 2; - } + if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->seq.s); + else fprintf (curr->bcfile2, "%s\n", (fqrec2->seq.s)+strlen(curr->bc)); - if (l1 < 0) { - l2 = kseq_read (fqrec2); - if (l2 >= 0) { - fprintf (stderr, "Error: PE file 1 is shorter than PE file 2. Disregarding rest of PE file 2.\n"); - } - } + fprintf (curr->bcfile2, "+%s", fqrec2->name.s); + if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); + else fprintf (curr->bcfile2, "\n"); + if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->qual.s); + else fprintf (curr->bcfile2, "%s\n", (fqrec2->qual.s)+strlen(curr->bc)); - //if (!quiet) { - //if (!log_fn) { is this better? 
- if (log_fn == NULL) { - log_file = stdout; + curr->num_records += 2; + } } + else { - log_file = fopen(log_fn, "w"); + fprintf (unknownfile1, "@%s", fqrec1->name.s); + if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s); + else fprintf (unknownfile1, "\n"); + + fprintf (unknownfile1, "%s\n", fqrec1->seq.s); + + fprintf (unknownfile1, "+%s", fqrec1->name.s); + if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s); + else fprintf (unknownfile1, "\n"); + + fprintf (unknownfile1, "%s\n", fqrec1->qual.s); + + + fprintf (unknownfile2, "@%s", fqrec2->name.s); + if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s); + else fprintf (unknownfile2, "\n"); + + fprintf (unknownfile2, "%s\n", fqrec2->seq.s); + + fprintf (unknownfile2, "+%s", fqrec2->name.s); + if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s); + else fprintf (unknownfile2, "\n"); + + fprintf (unknownfile2, "%s\n", fqrec2->qual.s); + + num_unknown += 2; } - - fprintf (log_file, "\nTotal FastQ records: %d (%d pairs)\n\n", total, total/2); - curr = head; - while (curr) { - fprintf (log_file, "FastQ records for barcode %s: %d (%d pairs)\n", curr->bc, curr->num_records, curr->num_records/2); - curr = curr->next; + + total += 2; + } + + if (l1 < 0) { + l2 = kseq_read (fqrec2); + if (l2 >= 0) { + fprintf (stderr, "Error: PE file 1 is shorter than PE file 2. 
Disregarding rest of PE file 2.\n"); } - fprintf (log_file, "\nFastQ records with no barcode match: %d (%d pairs)\n", num_unknown, num_unknown/2); - fprintf (log_file, "\nNumber of mismatches allowed: %d\n\n", mismatch); - - fprintf (stderr, "\n All done :)!"); - - kseq_destroy (fqrec1); - kseq_destroy (fqrec2); - gzclose (pe1); - gzclose (pe2); - fclose (unknownfile1); - fclose (unknownfile2); - fclose (barfile); - fclose (log_file); - - free (infn1); - free (infn2); - free (barfn); - free (unknownfn1); - free (unknownfn2); - free (log_fn); - - curr = head; - while (curr) { - fclose (curr->bcfile1); - fclose (curr->bcfile2); - free (curr->bc); - temp = curr; - curr = curr->next; - free (temp); - } - - return EXIT_SUCCESS; + } + + + //if (!quiet) { + //if (!log_fn) { is this better? + if (log_fn == NULL) { + log_file = stdout; + } + else { + log_file = fopen(log_fn, "w"); + } + + fprintf (log_file, "\nTotal FastQ records: %d (%d pairs)\n\n", total, total/2); + curr = head; + while (curr) { + fprintf (log_file, "FastQ records for barcode %s: %d (%d pairs)\n", curr->bc, curr->num_records, curr->num_records/2); + curr = curr->next; + } + fprintf (log_file, "\nFastQ records with no barcode match: %d (%d pairs)\n", num_unknown, num_unknown/2); + fprintf (log_file, "\nNumber of mismatches allowed: %d\n\n", mismatch); + + fprintf (stderr, "\n All done :)!"); + + kseq_destroy (fqrec1); + kseq_destroy (fqrec2); + gzclose (pe1); + gzclose (pe2); + fclose (unknownfile1); + fclose (unknownfile2); + fclose (barfile); + fclose (log_file); + + free (infn1); + free (infn2); + free (barfn); + free (unknownfn1); + free (unknownfn2); + free (log_fn); + + curr = head; + while (curr) { + fclose (curr->bcfile1); + fclose (curr->bcfile2); + free (curr->bc); + temp = curr; + curr = curr->next; + free (temp); + } + + return EXIT_SUCCESS; } From 35bf35b2e4c3954605c11158653b5d3d6c6d8728 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 12 Mar 2018 17:11:23 +1100 Subject: [PATCH 07/55] Moved 
barcode.c code into utils and removed that file Changed strncmp_with_mismatch function to now also take an extra param --max_5prime_crop that will attempt to trim bases, one at a time from 5prime of the read this should improve assignment rates if there is a short overhang, like one or two base. I should also set a TODO to assert that max_5prime_crop <= strlen(read) and/or strlen(barcode) --- src/barcode.c | 27 ----------------------- src/demulti_paired.c | 26 +++++++++++++++++----- src/demulti_single.c | 5 +++-- src/sabre.h | 2 +- src/utils.c | 52 ++++++++++++++++++++++++++++++++++++++------ 5 files changed, 69 insertions(+), 43 deletions(-) delete mode 100644 src/barcode.c diff --git a/src/barcode.c b/src/barcode.c deleted file mode 100644 index 972abec..0000000 --- a/src/barcode.c +++ /dev/null @@ -1,27 +0,0 @@ -#include -#include -#include -#include - -int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch) { - - register unsigned char u1, u2; - int cnt=0; - - while (n-- > 0) { - u1 = (unsigned char) *s1++; - u2 = (unsigned char) *s2++; - - if (u1 != u2) { - cnt++; - if (cnt > mismatch) { - return u1 - u2; - } - } - - if (u1 == '\0') { - return 0; - } - } - return 0; -} diff --git a/src/demulti_paired.c b/src/demulti_paired.c index a1c6ca6..d4b7e94 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -22,6 +22,7 @@ KSEQ_INIT(gzFile, gzread) {"both-barcodes", optional_argument, NULL, 'c'}, {"max-mismatch", required_argument, 0, 'm'}, {"min-umi-len", required_argument, 0, 'l'}, + {"max-5prime-crop", required_argument, 0, 'a'}, {"stats", required_argument, NULL, 's'}, {"no-comment", no_argument, 0, 'n'}, //{"quiet", no_argument, 0, 'z'}, @@ -50,6 +51,7 @@ void paired_usage (int status) { \n -c, --both-barcodes INT Indicates that both FASTQ files have barcodes [0]\ \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\ \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ + \n 
-a, --max-5prime-crop INT Maximum number of possible bases cropped from 5prime [0]\ \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\ \n -s, --stats FILE Write stats to file instead of STDOUT [STDOUT]\ \n\ @@ -77,8 +79,8 @@ int paired_main (int argc, char *argv[]) { char *infn1=NULL; char *infn2=NULL; char *barfn=NULL; - char *unknownfn1=NULL; - char *unknownfn2=NULL; + char *unknownfn1=strdup("unassigned_R1.fastq.gz"); + char *unknownfn2=strdup("unassigned_R2.fastq.gz"); int both_have_barcodes=0; barcode_data_paired *curr, *head, *temp; char barcode [MAX_BARCODE_LENGTH]; @@ -90,13 +92,15 @@ int paired_main (int argc, char *argv[]) { //int quiet=0; int min_umi_len=0; + int max_5prime_crop=0; char *log_fn=NULL; int no_comment=-1; while (1) { int option_index = 0; - optc = getopt_long (argc, argv, "dcf:r:b:u:w:m:s:l:n:z", paired_long_options, &option_index); + //colon after a flag means should have arguments and no colon means just a flag i.e bool, no args after it + optc = getopt_long (argc, argv, "dcnf:r:b:u:w:m:s:l:z:a:", paired_long_options, &option_index); if (optc == -1) break; @@ -119,11 +123,17 @@ int paired_main (int argc, char *argv[]) { break; case 'u': + if(unknownfn1) { + free(unknownfn1); + } unknownfn1 = (char*) malloc (strlen (optarg) + 1); strcpy (unknownfn1, optarg); break; case 'w': + if(unknownfn2) { + free(unknownfn2); + } unknownfn2 = (char*) malloc (strlen (optarg) + 1); strcpy (unknownfn2, optarg); break; @@ -145,6 +155,10 @@ int paired_main (int argc, char *argv[]) { min_umi_len = atoi (optarg); break; + case 'a': + max_5prime_crop = atoi (optarg); + break; + case 'n': no_comment = 1; break; @@ -224,6 +238,7 @@ int paired_main (int argc, char *argv[]) { \n --both-barcodes %d\ \n --max-mismatch %d\ \n --min-umi-len %d\ + \n --max-5prime-crop %d\ \n --stats %s\ \n --no-comment %d\ \n\ @@ -233,7 +248,7 @@ int paired_main (int argc, char *argv[]) { barfn,\ unknownfn1, unknownfn2,\ both_have_barcodes,\ - mismatch, min_umi_len, 
log_fn, no_comment); + mismatch, min_umi_len, max_5prime_crop, log_fn, no_comment); /* Creating linked list of barcode data */ @@ -268,10 +283,9 @@ int paired_main (int argc, char *argv[]) { /* If it does then put read in that barcode's file, otherwise put in unknown file */ curr = head; while (curr) { - if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, strlen (curr->bc), mismatch) == 0) { + if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, strlen (curr->bc), mismatch, max_5prime_crop) == 0) { break; } - curr = curr->next; } diff --git a/src/demulti_single.c b/src/demulti_single.c index 2a28b14..cd978fd 100644 --- a/src/demulti_single.c +++ b/src/demulti_single.c @@ -60,6 +60,8 @@ int single_main (int argc, char *argv[]) { int mismatch=0; int quiet=0; + int max_5prime_crop=0; + while (1) { int option_index = 0; optc = getopt_long (argc, argv, "df:b:u:m:z", single_long_options, &option_index); @@ -161,10 +163,9 @@ int single_main (int argc, char *argv[]) { /* with the sequence until a match is found or no match is found for any */ curr = head; while (curr) { - if (strncmp_with_mismatch (curr->bc, fqrec->seq.s, strlen (curr->bc), mismatch) == 0) { + if (strncmp_with_mismatch (curr->bc, fqrec->seq.s, strlen (curr->bc), mismatch, max_5prime_crop) == 0) { break; } - curr = curr->next; } diff --git a/src/sabre.h b/src/sabre.h index 4eb20ef..6ffb972 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -62,7 +62,7 @@ typedef struct listel_p { /* Function Prototypes */ int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); -int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch); +int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch, int max_5prime_crop); const char * _mkdir (const char *dir); diff --git a/src/utils.c b/src/utils.c index 524a10a..756f2d7 100644 --- a/src/utils.c +++ b/src/utils.c @@ -6,13 +6,15 @@ #include #include #include "sabre.h" 
+#include +#include // https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692 // https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c const char * _mkdir(const char *file_path) { // return straigth away if a file_path is not nested file path if(strstr(file_path, "/") == NULL) { - return file_path; + return file_path; } //TODO check if directory already exists or not //struct stat st = {0}; @@ -33,15 +35,15 @@ const char * _mkdir(const char *file_path) { len = strlen(tmp); if(tmp[len - 1] == '/') { - tmp[len - 1] = 0; + tmp[len - 1] = 0; } for(p = tmp + 1; *p; p++) { - if(*p == '/') { - *p = 0; - mkdir(tmp, S_IRWXU); - *p = '/'; - } + if(*p == '/') { + *p = 0; + mkdir(tmp, S_IRWXU); + *p = '/'; + } } mkdir(tmp, S_IRWXU); @@ -49,3 +51,39 @@ const char * _mkdir(const char *file_path) { return file_path; } + +//NOTE retuns zero on success +//strcmp can be used for sorting, returns pos, zero, neg +//BUT this new implementation can't be used as such just FYI +int strncmp_with_mismatch (const char *bc, const char *read, register size_t bc_len, register size_t mismatch, int max_5prime_crop) { + + register char u1, u2; + int cnt=0; + int n_crop=0; + + char *orig_bc = strdup(bc); + char *orig_read = strdup(read); + + while(n_crop <= max_5prime_crop) { + bc = orig_bc; + read = orig_read+n_crop; + + while (bc_len-- > 0) { + u1 = *bc++; + u2 = *read++; + + if (u1 != u2) { + cnt++; + if (cnt > mismatch) { + //return u1 - u2; + break; + } + } + if (u1 == '\0' || u2 == '\0') { + return 0; + } + } + n_crop++; + } + return 0; +} From d984204f4066afd8d7ed4269683691f67ad4faf8 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 12 Mar 2018 17:16:19 +1100 Subject: [PATCH 08/55] added exclusion of vim swap files into gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 68be19f..33305c7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ *.o # ignore compiled executable sabre +# 
ignore vim swap files +*.swp From c49b1184e97357999662f6fe9a01b74a51734fa2 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 12 Mar 2018 17:17:32 +1100 Subject: [PATCH 09/55] updated Makefile to reflect removal of barcode.c file --- Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 6f3f376..db08141 100644 --- a/Makefile +++ b/Makefile @@ -12,9 +12,6 @@ SDIR = src default: build -barcode.o: $(SDIR)/barcode.c - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - demulti_single.o: $(SDIR)/demulti_single.c $(SDIR)/sabre.h $(SDIR)/kseq.h $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c @@ -27,7 +24,6 @@ utils.o: $(SDIR)/utils.c $(SDIR)/sabre.h sabre.o: $(SDIR)/sabre.c $(SDIR)/sabre.h $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - clean: rm -rf *.o $(SDIR)/*.gch ./sabre @@ -37,7 +33,7 @@ distclean: clean dist: tar -zcf $(ARCHIVE).tar.gz src Makefile -build: barcode.o demulti_single.o demulti_paired.o sabre.o utils.o +build: demulti_single.o demulti_paired.o sabre.o utils.o $(CC) $(CFLAGS) $(OPT) $? -o $(PROGRAM_NAME) $(LDFLAGS) debug: From fbf68a0ac2f33c256aad3497d67a4f9eb8b8a9ba Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 14 Mar 2018 13:25:30 +1100 Subject: [PATCH 10/55] fixed the bug in new strncp_with_mismatch and changed input parameters a little It now takes barcode and fastq reads, allowed mismatches and max 5prime crop, which is at most number of bases to attempt to crop from 5 prime of the read in order to find assignment of the barcode. 
--- src/demulti_paired.c | 20 +++++++++----------- src/demulti_single.c | 3 ++- src/sabre.h | 2 +- src/utils.c | 40 ++++++++++++++++++++++++++++------------ 4 files changed, 40 insertions(+), 25 deletions(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index d4b7e94..1a2788f 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -58,7 +58,6 @@ void paired_usage (int status) { \n", PROGRAM_NAME); - exit (status); } @@ -79,8 +78,8 @@ int paired_main (int argc, char *argv[]) { char *infn1=NULL; char *infn2=NULL; char *barfn=NULL; - char *unknownfn1=strdup("unassigned_R1.fastq.gz"); - char *unknownfn2=strdup("unassigned_R2.fastq.gz"); + char *unknownfn1=strdup("unassigned_R1.fastq"); + char *unknownfn2=strdup("unassigned_R2.fastq"); int both_have_barcodes=0; barcode_data_paired *curr, *head, *temp; char barcode [MAX_BARCODE_LENGTH]; @@ -96,7 +95,6 @@ int paired_main (int argc, char *argv[]) { char *log_fn=NULL; int no_comment=-1; - while (1) { int option_index = 0; //colon after a flag means should have arguments and no colon means just a flag i.e bool, no args after it @@ -250,8 +248,10 @@ int paired_main (int argc, char *argv[]) { both_have_barcodes,\ mismatch, min_umi_len, max_5prime_crop, log_fn, no_comment); - /* Creating linked list of barcode data */ + // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ + // where each node is represents one barcode from the barcode file + // number of nodes should equal to number of barcodes (lines) in the file head = NULL; while (fscanf (barfile, "%s%s%s", barcode, baroutfn1, baroutfn2) != EOF) { curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired)); @@ -266,10 +266,9 @@ int paired_main (int argc, char *argv[]) { head = curr; } - fqrec1 = kseq_init (pe1); fqrec2 = kseq_init (pe2); - + // loop over all the reads and for every read loop over all barcodes and look for a match while ((l1 = kseq_read (fqrec1)) >= 0) { l2 = kseq_read (fqrec2); @@ -278,18 
+277,17 @@ int paired_main (int argc, char *argv[]) { break; } - /* Go through all barcode data and check if any match to beginning of read */ /* If it does then put read in that barcode's file, otherwise put in unknown file */ curr = head; while (curr) { - if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, strlen (curr->bc), mismatch, max_5prime_crop) == 0) { + //zero means no mismatches found, that is barcode was found for that reads, therefore break and write it out + if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, mismatch, max_5prime_crop) == 0) { break; } curr = curr->next; } - if (curr != NULL) { // if UMI is shorter then 10, discard the reads if(strlen((fqrec1->seq.s)+strlen(curr->bc)) >= min_umi_len) { @@ -387,7 +385,7 @@ int paired_main (int argc, char *argv[]) { fprintf (log_file, "\nFastQ records with no barcode match: %d (%d pairs)\n", num_unknown, num_unknown/2); fprintf (log_file, "\nNumber of mismatches allowed: %d\n\n", mismatch); - fprintf (stderr, "\n All done :)!"); + fprintf (stderr, "\n All done :)! 
\n"); kseq_destroy (fqrec1); kseq_destroy (fqrec2); diff --git a/src/demulti_single.c b/src/demulti_single.c index cd978fd..f73cf69 100644 --- a/src/demulti_single.c +++ b/src/demulti_single.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "sabre.h" #include "kseq.h" @@ -163,7 +164,7 @@ int single_main (int argc, char *argv[]) { /* with the sequence until a match is found or no match is found for any */ curr = head; while (curr) { - if (strncmp_with_mismatch (curr->bc, fqrec->seq.s, strlen (curr->bc), mismatch, max_5prime_crop) == 0) { + if (strncmp_with_mismatch (curr->bc, fqrec->seq.s, mismatch, max_5prime_crop) == 0) { break; } curr = curr->next; diff --git a/src/sabre.h b/src/sabre.h index 6ffb972..03ed885 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -62,7 +62,7 @@ typedef struct listel_p { /* Function Prototypes */ int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); -int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch, int max_5prime_crop); +int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); const char * _mkdir (const char *dir); diff --git a/src/utils.c b/src/utils.c index 756f2d7..037165a 100644 --- a/src/utils.c +++ b/src/utils.c @@ -8,6 +8,7 @@ #include "sabre.h" #include #include +#include // https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692 // https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c @@ -26,8 +27,7 @@ const char * _mkdir(const char *file_path) { //char tmp[PATH_MAX]; // can't get this to work.. 
char tmp[256]; char *p = NULL; - char *dirc; - dirc = strdup(file_path); + char *dirc = strdup(file_path); size_t len; const char *dir = dirname(dirc); @@ -55,18 +55,28 @@ const char * _mkdir(const char *file_path) { //NOTE retuns zero on success //strcmp can be used for sorting, returns pos, zero, neg //BUT this new implementation can't be used as such just FYI -int strncmp_with_mismatch (const char *bc, const char *read, register size_t bc_len, register size_t mismatch, int max_5prime_crop) { +int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop) { - register char u1, u2; - int cnt=0; - int n_crop=0; + int orig_read_len = strlen(orig_read); + int orig_bc_len = strlen(orig_bc); + int n_crop = 0; - char *orig_bc = strdup(bc); - char *orig_read = strdup(read); + if(orig_bc_len > orig_read_len) { + fprintf (stderr, "Length of the barcode %d is greater than length of the reads %d.", orig_bc_len, orig_read_len); + return 1; + } while(n_crop <= max_5prime_crop) { - bc = orig_bc; - read = orig_read+n_crop; + + if(n_crop > orig_read_len) { + return 1; + } + + int cnt = 0; + char u1, u2; + const char *bc = orig_bc; + const char *read = orig_read+n_crop; + int bc_len = orig_bc_len; while (bc_len-- > 0) { u1 = *bc++; @@ -75,15 +85,21 @@ int strncmp_with_mismatch (const char *bc, const char *read, register size_t bc_ if (u1 != u2) { cnt++; if (cnt > mismatch) { - //return u1 - u2; break; } } + if (u1 == '\0' || u2 == '\0') { return 0; } } + + if(cnt <= mismatch) { + return 0; + } + n_crop++; } - return 0; + //this is in the case of error + return 1; } From 1790715aef752765e47a111b8aa1c609735cacb0 Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 14 Mar 2018 13:29:55 +1100 Subject: [PATCH 11/55] removed -std=c99 from make file --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index db08141..262da11 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PROGRAM_NAME = sabre 
VERSION = 1.00 CC = gcc -CFLAGS = -Wall -std=c99 -O2 -pedantic -DVERSION=$(VERSION) +CFLAGS = -Wall -O2 -pedantic -DVERSION=$(VERSION) DEBUG = -g OPT = -O3 ARCHIVE = $(PROGRAM_NAME)_$(VERSION) From 7ba39b98fecc7d62db4e2b646b50de49fccbf1f8 Mon Sep 17 00:00:00 2001 From: serine Date: Thu, 15 Mar 2018 00:01:06 +1100 Subject: [PATCH 12/55] Changed layout of output stats file into a tab separated table added new column of total percent of the library for each barcode --- src/demulti_paired.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index 1a2788f..e2e31d5 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "sabre.h" #include "kseq.h" @@ -62,6 +63,10 @@ void paired_usage (int status) { } int paired_main (int argc, char *argv[]) { + + //clock_t begin = clock(); + time_t start, end; + start = time(NULL); gzFile pe1=NULL; gzFile pe2=NULL; @@ -376,16 +381,30 @@ int paired_main (int argc, char *argv[]) { log_file = fopen(log_fn, "w"); } - fprintf (log_file, "\nTotal FastQ records: %d (%d pairs)\n\n", total, total/2); + + fprintf (log_file, "Barcode\tN_records\tN_pairs\tP_pairs\n"); curr = head; + int total_pairs = total/2; + while (curr) { - fprintf (log_file, "FastQ records for barcode %s: %d (%d pairs)\n", curr->bc, curr->num_records, curr->num_records/2); + + int n_pairs = curr->num_records/2; + float percent_pairs = (float) n_pairs/total_pairs; + + fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc, curr->num_records, n_pairs, percent_pairs); curr = curr->next; } - fprintf (log_file, "\nFastQ records with no barcode match: %d (%d pairs)\n", num_unknown, num_unknown/2); - fprintf (log_file, "\nNumber of mismatches allowed: %d\n\n", mismatch); - fprintf (stderr, "\n All done :)! 
\n"); + int unknown_pairs = num_unknown/2; + float percent_unknown = (float) unknown_pairs/total_pairs; + float tot_chk = (float) total_pairs/total_pairs; + + fprintf (log_file, "unassigned\t%d\t%d\t%.2f\n", num_unknown, unknown_pairs, percent_unknown); + fprintf (log_file, "total\t%d\t%d\t%.2f\n", total, total_pairs, tot_chk); + + end = time(NULL); + fprintf(stderr, "\n All done :) \ + \n It took %.2f minutes\n", difftime(end, start)/60); kseq_destroy (fqrec1); kseq_destroy (fqrec2); From 844ab176484498dcf9682fa9355a6c72f9240570 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 9 Apr 2018 09:15:38 +1000 Subject: [PATCH 13/55] updated error message about barcode length being greater than read length --- src/utils.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/utils.c b/src/utils.c index 037165a..c214331 100644 --- a/src/utils.c +++ b/src/utils.c @@ -62,7 +62,12 @@ int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mi int n_crop = 0; if(orig_bc_len > orig_read_len) { - fprintf (stderr, "Length of the barcode %d is greater than length of the reads %d.", orig_bc_len, orig_read_len); + fprintf (stderr, + "The length of the barcode %s is greater than the length of the reads %s, %d and %d\n", + orig_bc, + orig_read, + orig_bc_len, + orig_read_len); return 1; } From d4e96a9052c8827967d5665288f5b395583a8a62 Mon Sep 17 00:00:00 2001 From: serine Date: Thu, 12 Apr 2018 20:31:12 +1000 Subject: [PATCH 14/55] made new Makefile and added updated kseq.h file --- src/Makefile | 35 ++++++ src/kseq.h | 350 +++++++++++++++++++++++++++------------------------ 2 files changed, 220 insertions(+), 165 deletions(-) create mode 100644 src/Makefile diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..95a679d --- /dev/null +++ b/src/Makefile @@ -0,0 +1,35 @@ +# Source, Executable, Includes, Library Defines +VERSION = 1.00 +CC = gcc +INCL = kseq.h sabre.h +SRC = demulti_paired.c demulti_single.c sabre.c utils.c 
+OBJ = $(SRC:.c=.o) +DSRC=src + +CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION) +LDFLAGS = -lz +GPROF = -pg +EXE = sabre + +.PHONY: default + +default: build +# a smarter way to have an if statement here instead of explicit grpof target +# have a look at gcc -M + +%.o: %.c + $(CC) -c $(CFLAGS) $(SRC) + +demulti_single.o: kseq.h sabre.h +demulti_paired.o: kseq.h sabre.h +sabre.o: sabre.h + +build: $(OBJ) + $(CC) $(CFLAGS) $(OBJ) -o $(EXE) $(LDFLAGS) + ln -sf $(DSRC)/$(EXE) .. + +gprof: + $(CC) $(CFLAGS) $(GPROF) $(SRC) -o $(EXE).gprof $(LDFLAGS) + +clean: + $(RM) $(OBJ) $(EXE) $(EXE).gprof core gmon.out diff --git a/src/kseq.h b/src/kseq.h index 73600c4..8f9e498 100644 --- a/src/kseq.h +++ b/src/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,9 +23,7 @@ SOFTWARE. */ -/* Contact: Heng Li */ - -/* Last Modified: 12APR2009 */ +/* Last Modified: 2017-02-11 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -34,47 +32,50 @@ #include #include -#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */ -#define KS_SEP_TAB 1 /* isspace() && !' ' */ -#define KS_SEP_MAX 1 +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' 
' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - char *buf; \ - int begin, end, is_eof; \ - type_t f; \ - } kstream_t; +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; +#define ks_err(ks) ((ks)->end < 0) #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) -#define __KS_BASIC(type_t, __bufsize) \ - static inline kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; \ - ks->buf = (char*)malloc(__bufsize); \ - return ks; \ - } \ - static inline void ks_destroy(kstream_t *ks) \ - { \ - if (ks) { \ - free(ks->buf); \ - free(ks); \ - } \ - } - -#define __KS_GETC(__read, __bufsize) \ - static inline int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) return -1; \ - } \ - return (int)ks->buf[ks->begin++]; \ +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks_err(ks)) return -3; \ + if (ks_eof(ks)) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; return -1; } \ + else if (ks->end < 0) { ks->is_eof = 1; return -3; } \ + } \ + return 
(int)ks->buf[ks->begin++]; \ } #ifndef KSTRING_T @@ -89,135 +90,154 @@ typedef struct __kstring_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif -#define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { \ - if (dret) *dret = 0; \ - str->l = 0; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ - } else break; \ - } \ - if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; /* never come to here! 
*/ \ - if (str->m - str->l < i - ks->begin + 1) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (str->l == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ - } \ - str->s[str->l] = '\0'; \ - return str->l; \ - } - -#define KSTREAM_INIT(type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(type_t, __bufsize) \ - __KS_GETC(__read, __bufsize) \ - __KS_GETUNTIL(__read, __bufsize) - -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - static inline void kseq_rewind(kseq_t *ks) \ - { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ - free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ - } +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + int gotany = 0; \ + if (dret) *dret = 0; \ + str->l = append? 
str->l : 0; \ + for (;;) { \ + int i; \ + if (ks_err(ks)) return -3; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; break; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + gotany = 1; \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (!gotany && ks_eof(ks)) return -1; \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE 
kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } /* Return value: >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string -*/ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ - if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - 
seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ - seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - return seq->seq.l; \ - } - -#define __KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ - } kseq_t; - -#define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ + -3 error reading stream + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c,r; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + if (c < 0) return c; /* end of file or error*/ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = 
(char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + seq->is_fastq = (c == '+'); \ + if (!seq->is_fastq) return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \ + if (c == -3) return -3; /* stream error */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char, is_fastq; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); #endif From b1329545b6116bebc8c8376400436f77b248a716 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 30 Apr 2018 17:42:49 +1000 Subject: [PATCH 15/55] Setting myself up for mode feature. 
Planing to simplify sabre to be one command (no sub commands i.e se or pe) instead have -m, --mode flag with several different mode options, all described in docs/mode.md --- Makefile | 41 ------------------------- docs/modes.md | 70 ++++++++++++++++++++++++++++++++++++++++++ src/Makefile | 2 +- src/modes.c | 70 ++++++++++++++++++++++++++++++++++++++++++ src/utils.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++---- src/utils.h | 17 +++++++++++ 6 files changed, 237 insertions(+), 48 deletions(-) delete mode 100644 Makefile create mode 100644 docs/modes.md create mode 100644 src/modes.c create mode 100644 src/utils.h diff --git a/Makefile b/Makefile deleted file mode 100644 index 262da11..0000000 --- a/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -PROGRAM_NAME = sabre -VERSION = 1.00 -CC = gcc -CFLAGS = -Wall -O2 -pedantic -DVERSION=$(VERSION) -DEBUG = -g -OPT = -O3 -ARCHIVE = $(PROGRAM_NAME)_$(VERSION) -LDFLAGS = -lz -SDIR = src - -.PHONY: clean default build distclean dist debug - -default: build - -demulti_single.o: $(SDIR)/demulti_single.c $(SDIR)/sabre.h $(SDIR)/kseq.h - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - -demulti_paired.o: $(SDIR)/demulti_paired.c $(SDIR)/sabre.h $(SDIR)/kseq.h - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - -utils.o: $(SDIR)/utils.c $(SDIR)/sabre.h - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - -sabre.o: $(SDIR)/sabre.c $(SDIR)/sabre.h - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - -clean: - rm -rf *.o $(SDIR)/*.gch ./sabre - -distclean: clean - rm -rf *.tar.gz - -dist: - tar -zcf $(ARCHIVE).tar.gz src Makefile - -build: demulti_single.o demulti_paired.o sabre.o utils.o - $(CC) $(CFLAGS) $(OPT) $? 
-o $(PROGRAM_NAME) $(LDFLAGS) - -debug: - $(MAKE) build "CFLAGS=-Wall -pedantic -g -DDEBUG" - diff --git a/docs/modes.md b/docs/modes.md new file mode 100644 index 0000000..10982ad --- /dev/null +++ b/docs/modes.md @@ -0,0 +1,70 @@ +# Sabre + +## Different running modes + +DOCS: In each case BARCODE and/or UMI are trimed off and +put into FASTQ header: + +Not sure is I should have: + + BARCODE always has a precedent i.e BARCODE:UMI + OR + It follows the same structure as per experiment i.e + if BARCODE+UMI then BARCODE:UMI + else if UMI+BARCODE then UMI:BARCODE + +All modes that begin with 3 will return sinle, R1 file, mering +R1 read into R2 header and renaming R2 into R1 + +10 = sinle-end where R1 has the following structure: + + R1 --> + BARCODE+READ + +20 = paired-end where R1 and R2 have the following structure: + + R1 --> <--R2 + BARCODE+READ----READ+BARCODE + +this mode returns single file (R1) with barcode appended and into R1 header + +30 = paired-end where R1 and R2 have the following structure: + + R1 --> <-R2 + BARCODE----READ + +11 = sinle-end where R1 has the following structure: + + R1 --> + BARCODE+UMI+READ + +21 = paired-end where R1 and R2 have the following structure: + + R1 --> <--R2 + BARCODE+UMI+READ----READ+UMI+BARCODE + +this mode returns single file (R1) with barcode appended and into R1 header + +31 = paired-end where R1 and R2 have the following structure: + + R1 --> <-R2 + BARCODE+UMI----READ + +NOTE this gives me room for yet another mode e.g 12, 22, 32 + +12 = sinle-end where R1 has the following structure: + + R1 --> + UMI+READ + +22 = paired-end where R1 and R2 have the following structure: + + R1 --> <--R2 + UMI+READ----READ+UMI + +this mode returns single file (R1) with barcode appended and into R1 header + +32 = paired-end where R1 and R2 have the following structure: + + R1 --> <-R2 + UMI----READ diff --git a/src/Makefile b/src/Makefile index 95a679d..cb130fc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -26,7 +26,7 @@ sabre.o: 
sabre.h build: $(OBJ) $(CC) $(CFLAGS) $(OBJ) -o $(EXE) $(LDFLAGS) - ln -sf $(DSRC)/$(EXE) .. + #ln -sf $(DSRC)/$(EXE) .. gprof: $(CC) $(CFLAGS) $(GPROF) $(SRC) -o $(EXE).gprof $(LDFLAGS) diff --git a/src/modes.c b/src/modes.c new file mode 100644 index 0000000..a616cb5 --- /dev/null +++ b/src/modes.c @@ -0,0 +1,70 @@ + +/* + * Not valid C code, just some snipets for later use + * + * TODO build on get_fqread function in utils. + * I think that function should also take char *umi + * if umi == NULL, then none of 3? modes are true, return normal + * if umi != NULL then this must be one of 3? modes, merge R1 and R2 and return R1 only + */ + +typedef struct listel_p { + char* bc; + int num_records; + //FILE* bcfile1; + //FILE* bcfile2; + gzFile bcfile1; + gzFile bcfile2; + struct listel_p *next; +} barcode_data_paired; + + /* Creating linked list of barcode data */ + // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ + // where each node is represents one barcode from the barcode file + // number of nodes should equal to number of barcodes (lines) in the file + head = NULL; + char barcode [MAX_BARCODE_LENGTH]; + char s_name [MAX_SNAME_LENGTH]; + //while (fscanf (barfile, "%s%s%s", barcode, baroutfn1, baroutfn2) != EOF) { + while (fscanf (barfile, "%s\t%s", barcode, s_name) != EOF) { + char bcout_prefix [MAX_BARCODE_LENGTH+MAX_SNAME_LENGTH]; + char bcout_fn1 [MAX_FILENAME_LENGTH]; + char bcout_fn2 [MAX_FILENAME_LENGTH]; + + curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired)); + curr->bc = (char*) malloc (strlen(barcode) + 1); + strcpy (curr->bc, barcode); + + if(strlen(s_name) > MAX_FILENAME_LENGTH) { + fprintf (stderr, "ERROR: Too many characters in your sample name; %s:%d \n", s_name, strlen(s_name)); + } + //TODO make this into a function call later on. + //want a function in utils.c get_bc_fn(s_name, barcode, 1|2) to return + //a string = bcout_fn to... maybe this isn't worth a function call.. 
+ strcat(bcout_prefix, s_name); + strcat(bcout_prefix, "_"); + strcat(bcout_prefix, barcode); + + strcpy(bcout_fn1, bcout_prefix); + strcat(bcout_fn1, "_R1.fastq.gz"); + + strcpy(bcout_fn2, bcout_prefix); + strcat(bcout_fn2, "_R2.fastq.gz"); + + curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); + curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); + curr->num_records = 0; + + curr->next = head; + head = curr; + } + + while (curr) { + gzclose(curr->bcfile1); + gzclose(curr->bcfile2); + free (curr->bc); + temp = curr; + curr = curr->next; + free (temp); + } + diff --git a/src/utils.c b/src/utils.c index c214331..18d7b68 100644 --- a/src/utils.c +++ b/src/utils.c @@ -9,6 +9,7 @@ #include #include #include +#include "utils.h" // https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692 // https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c @@ -62,12 +63,7 @@ int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mi int n_crop = 0; if(orig_bc_len > orig_read_len) { - fprintf (stderr, - "The length of the barcode %s is greater than the length of the reads %s, %d and %d\n", - orig_bc, - orig_read, - orig_bc_len, - orig_read_len); + fprintf (stderr, "Length of the barcode %d is greater than length of the reads %d.", orig_bc_len, orig_read_len); return 1; } @@ -108,3 +104,80 @@ int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mi //this is in the case of error return 1; } + +// https://stackoverflow.com/questions/21880730/c-what-is-the-best-and-fastest-way-to-concatenate-strings +//TODO this is a fastq mystrcat function, that returns a pointer to the end of the string +char * get_fqread(kseq_t *fqrec, char *barcode, int no_comment, int remove_seq) { + + size_t fqread_size = 0; + + fqread_size += strlen(fqrec->seq.s); + fqread_size += (strlen(fqrec->name.s)*2); + fqread_size += strlen(fqrec->qual.s); + fqread_size += (strlen(fqrec->comment.s)*2); + fqread_size += 2;// header 
signs @ and + + fqread_size += 2;//two colons (:) + fqread_size += 4;//cariage returns + fqread_size += 2;//two spaces + + char *umi = NULL; + + if(barcode[0] != '\0') { + umi = (char*) malloc( strlen(fqrec->seq.s)-strlen(barcode) + 1 ); + strcpy(umi, (fqrec->seq.s)+strlen(barcode)); + fqread_size += strlen(umi); + } + + char *fqread = (char*) malloc(fqread_size + 1); + //makes it a zero length string + fqread[0] = '\0'; + + //@READNAME:BACRCODE:UMI + //1st line + strcat(fqread, "@"); + strcat(fqread, fqrec->name.s); + //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI + if(barcode[0] != '\0') { + strcat(fqread, ":"); + strcat(fqread, barcode); + + if(umi[0] == '\0') { + fprintf(stderr, "Error: This shouldn't happened.\n"); + exit(EXIT_FAILURE); + } + + strcat(fqread, ":"); + strcat(fqread, umi); + free(umi); + } + + if(fqrec->comment.l && no_comment == -1) { + strcat(fqread, " "); + strcat(fqread, fqrec->comment.s); + } + strcat(fqread, "\n"); + + //2nd line + if(remove_seq == 1) { + strcat(fqread, "N"); + } + else { + strcat(fqread, (fqrec->seq.s)+strlen(barcode)); + } + strcat(fqread, "\n"); + + //3rd line + strcat(fqread, "+"); + strcat(fqread, fqrec->name.s); + if(fqrec->comment.l && no_comment == -1) { + strcat(fqread, " "); + strcat(fqread, fqrec->comment.s); + } + strcat(fqread, "\n"); + + //4th line + strcat(fqread, fqrec->qual.s); + strcat(fqread, "\n"); + + return fqread; +} diff --git a/src/utils.h b/src/utils.h new file mode 100644 index 0000000..b6e0e56 --- /dev/null +++ b/src/utils.h @@ -0,0 +1,17 @@ +#ifndef UTILS_H +#define UTILS_H + +#include +#include "kseq.h" + +KSEQ_INIT(gzFile, gzread) + +//This is needed if compilling with -std=c99, read below for more +//https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with +char *strdup(const char*); + +int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mismatch, int 
max_5prime_crop); +const char * _mkdir (const char *dir); +char * get_fqread(kseq_t *fqrec, char *barcode, int no_comment, int remove_seq); + +#endif /*UTILS_H*/ From 0b4d59ed66a8ed8207cfb19af9edb404211018fd Mon Sep 17 00:00:00 2001 From: serine Date: Tue, 15 May 2018 13:44:33 +1000 Subject: [PATCH 16/55] Added some docs, more like ideas at this stage --- docs/definitions.md | 37 +++++++++++++++++++++++++++++++++++++ docs/modes.md | 10 ++++++---- 2 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 docs/definitions.md diff --git a/docs/definitions.md b/docs/definitions.md new file mode 100644 index 0000000..b0a8921 --- /dev/null +++ b/docs/definitions.md @@ -0,0 +1,37 @@ +define blocks along the read + +BARCODE +UMI +READ + +then set values to different block + +BARCODE = 8 +UMI = 10 + +## Andrew's suggestion + +``` +--input sample_A_R1.fastq.gz:i8{index1},r151{read1},i8{index2} +``` + +``` +--fq1 sample_A_R1.fastq.gz:i8{index1},r151{READ1},i8{index2} + +--fq2 sample_A_R2.fastq.gz:i8{index1},r151{READ1},i8{index2} +``` + +We need to check that BARCODE == index1 in both fq1 and fq2 but also check that index1_fq1 == index1_fq2 + +``` +--merge 12 merge R1 into R2 +--merge 21 merge R2 into R1 +``` + +either way resulting read is R1 + +``` +--fq1 sample_A_R1.fastq.gz:8index1,*index2 + +--fq2 sample_A_R2.fastq.gz:i8{index1},r151{read2},i8{index2} +``` diff --git a/docs/modes.md b/docs/modes.md index 10982ad..62bca89 100644 --- a/docs/modes.md +++ b/docs/modes.md @@ -5,7 +5,7 @@ DOCS: In each case BARCODE and/or UMI are trimed off and put into FASTQ header: -Not sure is I should have: +Not sure if I should have: BARCODE always has a precedent i.e BARCODE:UMI OR @@ -13,10 +13,10 @@ Not sure is I should have: if BARCODE+UMI then BARCODE:UMI else if UMI+BARCODE then UMI:BARCODE -All modes that begin with 3 will return sinle, R1 file, mering +All modes that begin with 3 will return single - R1 file, merging R1 read into R2 header and renaming R2 into R1 -10 = 
sinle-end where R1 has the following structure: +10 = single-end where R1 has the following structure: R1 --> BARCODE+READ @@ -33,7 +33,9 @@ this mode returns single file (R1) with barcode appended and into R1 header R1 --> <-R2 BARCODE----READ -11 = sinle-end where R1 has the following structure: +40 = paired-end where + +11 = single-end where R1 has the following structure: R1 --> BARCODE+UMI+READ From 5bce3cba82a085656d4ab084074fd77e5c26f879 Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 30 May 2018 16:18:12 +1000 Subject: [PATCH 17/55] Kind of forgotten what I was doing here, left in staging for a couple of week... From the diff and distance memory removed single_end menu options as I'm moving into sligntly different direction. One option is to have "mode" where single-end is just on of the modes. --- src/sabre.c | 87 ++++++++++++++++++++++++++++------------------------- src/sabre.h | 10 +++--- 2 files changed, 50 insertions(+), 47 deletions(-) diff --git a/src/sabre.c b/src/sabre.c index 5eafe11..d339c9c 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -10,53 +10,58 @@ void main_usage (int status) { - fprintf (stdout, "\n\ - \n Usage: %s [options]\ - \n\ - \n Command:\ - \n\ - \n se\tsingle-end barcode de-multiplexing\ - \n pe\tpaired-end barcode de-multiplexing\ - \n\ - \n --help\tto get more help\ - \n --version\tprint current version to stdout\ - \n\ - \n Info: Sabre is a heavy cavalry sword with a curved blade and a single cutting edge\ - \n Not sure though if the meaning was intended by original author...\ - \n\ - \n", - PROGRAM_NAME); - - exit (status); + fprintf (stdout, + "\n\ + \n Usage: %s [options]\ + \n\ + \n Command:\ + \n\ + \n se\tsingle-end barcode de-multiplexing\ + \n pe\tpaired-end barcode de-multiplexing\ + \n\ + \n --help\tto get more help\ + \n --version\tprint current version to stdout\ + \n\ + \n Info: Sabre is a heavy cavalry sword with a curved blade and a single cutting edge\ + \n Not sure though if the meaning was intended by 
original author...\ + \n\ + \n", + PROGRAM_NAME); + + exit (status); } int main (int argc, char *argv[]) { - int retval=0; - - if (argc < 2 || (strcmp (argv[1],"pe") != 0 && strcmp (argv[1],"se") != 0 && strcmp (argv[1],"--version") != 0 && strcmp (argv[1],"--help") != 0)) { - main_usage (EXIT_FAILURE); - } - - if (strcmp (argv[1],"--version") == 0) { - fprintf(stdout, "%s version %0.3f\nCopyright (c) 2011 The Regents of University of California, Davis Campus.\n%s is free software and comes with ABSOLUTELY NO WARRANTY.\nDistributed under the MIT License.\n\nWritten by %s\n", PROGRAM_NAME, VERSION, PROGRAM_NAME, AUTHORS); - - exit (EXIT_SUCCESS); + int retval=0; - } + if (argc < 2 || (strcmp (argv[1],"pe") != 0 && strcmp (argv[1],"se") != 0 && strcmp (argv[1],"--version") != 0 && strcmp (argv[1],"--help") != 0)) { + main_usage (EXIT_FAILURE); + } - else if (strcmp (argv[1],"--help") == 0) { - main_usage (EXIT_SUCCESS); - } + if (strcmp (argv[1],"--version") == 0) { + fprintf(stdout, + "\n\ + \n %s version %0.3f\ + \n\ + \n Copyright (c) 2011 The Regents of University of California, Davis Campus.\ + \n %s is free software and comes with ABSOLUTELY NO WARRANTY.\ + \n Distributed under the MIT License.\ + \n\ + \n Written by %s\ + \n\ + \n", + PROGRAM_NAME, VERSION, PROGRAM_NAME, AUTHORS); + exit (EXIT_SUCCESS); + } - else if (strcmp (argv[1],"pe") == 0) { - retval = paired_main (argc, argv); - return (retval); - } + else if (strcmp (argv[1],"--help") == 0) { + main_usage (EXIT_SUCCESS); + } - else if (strcmp (argv[1],"se") == 0) { - retval = single_main (argc, argv); - return (retval); - } + else if (strcmp (argv[1],"pe") == 0) { + retval = paired_main (argc, argv); + return (retval); + } - return 0; + return 0; } diff --git a/src/sabre.h b/src/sabre.h index 03ed885..0701cff 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -1,6 +1,8 @@ #ifndef SABRE_H #define SABRE_H +#include + #ifndef PROGRAM_NAME #define PROGRAM_NAME "sabre" #endif @@ -53,17 +55,13 @@ typedef struct 
listel { typedef struct listel_p { char* bc; int num_records; - FILE* bcfile1; - FILE* bcfile2; + gzFile bcfile1; + gzFile bcfile2; struct listel_p *next; } barcode_data_paired; /* Function Prototypes */ -int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); -int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); - -const char * _mkdir (const char *dir); #endif /*SABRE_H*/ From cb1563f7d99013c0fd2e44cba286963df73dc970 Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 30 May 2018 16:20:24 +1000 Subject: [PATCH 18/55] changed strncmp_with_mismatch to chk_bc_mtch function. This function no longer returns greater, equals or less then zero. Instead it simply looks for a barcode match, still allows 5 prime cropping, on match returns number of bases cropped, as this is important for downstream analysis in getting "actual barcode" sequence. Also remember that we are allowing mismatches in the barcode and perhaps we want that info later in the analysis somewhere. The other two important functions are get_fqread and get_merged_fqread both return a string that you can then write out. Originally I thought to just have get_fqread that then would take a mode and write "correct" fq string out. 
The idea behind get_merged_fqread function; in the case where R1 is just the barcode and R2 is just the read, we don't want to write out R1 since it holds no information, so simply merged two into one, appending barcode info into R2 header --- src/demulti_paired.c | 308 ++++++++++++++++++++++++------------------- src/demultiplex.c | 123 +++++++++++++++++ src/utils.c | 174 +++++++++++++++--------- src/utils.h | 7 +- 4 files changed, 416 insertions(+), 196 deletions(-) create mode 100644 src/demultiplex.c diff --git a/src/demulti_paired.c b/src/demulti_paired.c index e2e31d5..cd33031 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -6,21 +6,19 @@ #include #include #include -#include #include #include "sabre.h" -#include "kseq.h" - -KSEQ_INIT(gzFile, gzread) +#include "utils.h" //more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 static struct option paired_long_options[] = { - {"pe-file1", required_argument, NULL, 'f'}, - {"pe-file2", required_argument, NULL, 'r'}, - {"barcode-file", required_argument, NULL, 'b'}, - {"unknown-output1", required_argument, NULL, 'u'}, - {"unknown-output2", required_argument, NULL, 'w'}, - {"both-barcodes", optional_argument, NULL, 'c'}, + {"fq1", required_argument, NULL, 'f'}, + {"fq2", required_argument, NULL, 'r'}, + {"barcodes", required_argument, NULL, 'b'}, + {"unassinged1", required_argument, NULL, 'z'}, + {"unassinged2", required_argument, NULL, 'w'}, + {"combine", optional_argument, NULL, 'c'}, + {"umi", optional_argument, NULL, 'u'}, {"max-mismatch", required_argument, 0, 'm'}, {"min-umi-len", required_argument, 0, 'l'}, {"max-5prime-crop", required_argument, 0, 'a'}, @@ -41,15 +39,15 @@ void paired_usage (int status) { \n\ \n Required:\ \n\ - \n -f, --pe-file1 FILE Input FASTQ R1 read\ - \n -r, --pe-file2 FILE Input FASTQ R2 reads\ - \n -b, --barcode-file FILE Barcodes files, one barcode per line, e.g B\\tR1\\tR2\ - \n -u, --unknown-output1 FILE Output unassigned R1 reads\ - \n 
-w, --unknown-output2 FILE Output unassigned R2 reads\ + \n -f, --fq1 FILE Input FASTQ R1 read\ + \n -r, --fq2 FILE Input FASTQ R2 reads\ + \n -b, --barcodes FILE Barcodes files, one barcode per line, e.g BC\\tPREFIX\ + \n -w, --unassigned CHAR Unassigned prefix\ \n\ \n Other:\ \n\ - \n -c, --both-barcodes INT Indicates that both FASTQ files have barcodes [0]\ + \n -c, --combine Combine R1 and R2 [NULL]\ + \n -u, --umi Indicates that umi present in the R1 read [NULL]\ \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\ \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ \n -a, --max-5prime-crop INT Maximum number of possible bases cropped from 5prime [0]\ @@ -74,27 +72,28 @@ int paired_main (int argc, char *argv[]) { kseq_t *fqrec2; int l1,l2; FILE* barfile = NULL; - FILE* unknownfile1=NULL; - FILE* unknownfile2=NULL; + + gzFile unknownfile1=NULL; + gzFile unknownfile2=NULL; + char *unknownfn1=strdup("unassigned_R1.fastq.gz"); + char *unknownfn2=strdup("unassigned_R2.fastq.gz"); + FILE* log_file=NULL; - int debug=0; int optc; extern char *optarg; - char *infn1=NULL; - char *infn2=NULL; + char *fq1=NULL; + char *fq2=NULL; char *barfn=NULL; - char *unknownfn1=strdup("unassigned_R1.fastq"); - char *unknownfn2=strdup("unassigned_R2.fastq"); - int both_have_barcodes=0; + char s_name[MAX_FILENAME_LENGTH]; barcode_data_paired *curr, *head, *temp; char barcode [MAX_BARCODE_LENGTH]; - char baroutfn1 [MAX_FILENAME_LENGTH]; - char baroutfn2 [MAX_FILENAME_LENGTH]; int num_unknown=0; int total=0; int mismatch=0; - //int quiet=0; + int combine = -1; + int umi = -1; + int paired = -1; int min_umi_len=0; int max_5prime_crop=0; char *log_fn=NULL; @@ -103,7 +102,7 @@ int paired_main (int argc, char *argv[]) { while (1) { int option_index = 0; //colon after a flag means should have arguments and no colon means just a flag i.e bool, no args after it - optc = getopt_long (argc, argv, "dcnf:r:b:u:w:m:s:l:z:a:", paired_long_options, &option_index); + 
optc = getopt_long (argc, argv, "dnucf:r:b:z:w:m:s:l:z:a:", paired_long_options, &option_index); if (optc == -1) break; @@ -111,13 +110,13 @@ int paired_main (int argc, char *argv[]) { if (paired_long_options[option_index].flag != 0) break; case 'f': - infn1 = (char*) malloc (strlen (optarg) + 1); - strcpy (infn1, optarg); + fq1 = (char*) malloc (strlen (optarg) + 1); + strcpy (fq1, optarg); break; case 'r': - infn2 = (char*) malloc (strlen (optarg) + 1); - strcpy (infn2, optarg); + fq2 = (char*) malloc (strlen (optarg) + 1); + strcpy (fq2, optarg); break; case 'b': @@ -125,7 +124,7 @@ int paired_main (int argc, char *argv[]) { strcpy (barfn, optarg); break; - case 'u': + case 'z': if(unknownfn1) { free(unknownfn1); } @@ -142,7 +141,11 @@ int paired_main (int argc, char *argv[]) { break; case 'c': - both_have_barcodes=1; + combine=1; + break; + + case 'u': + umi=1; break; case 'm': @@ -166,14 +169,6 @@ int paired_main (int argc, char *argv[]) { no_comment = 1; break; - case 'z': - //quiet=1; - break; - - case 'd': - debug = 1; - break; - case_GETOPT_HELP_CHAR(paired_usage); case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); @@ -187,38 +182,38 @@ int paired_main (int argc, char *argv[]) { } } - if (!infn1 || !infn2 || !unknownfn1 || !unknownfn2 || !barfn) { + if (!fq1 || !fq2 || !unknownfn1 || !unknownfn2 || !barfn) { paired_usage (EXIT_FAILURE); } - if (!strcmp (infn1, infn2) || !strcmp (infn1, unknownfn1) || !strcmp (infn1, unknownfn2) || - !strcmp (infn1, barfn) || !strcmp (infn2, unknownfn1) || !strcmp (infn2, unknownfn2) || - !strcmp (infn2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || + if (!strcmp (fq1, fq2) || !strcmp (fq1, unknownfn1) || !strcmp (fq1, unknownfn2) || + !strcmp (fq1, barfn) || !strcmp (fq2, unknownfn1) || !strcmp (fq2, unknownfn2) || + !strcmp (fq2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || !strcmp (unknownfn2, barfn)) { fprintf (stderr, "Error: Duplicate input and/or 
output file names.\n"); return EXIT_FAILURE; } - pe1 = gzopen (infn1, "r"); + pe1 = gzopen (fq1, "r"); if (!pe1) { - fprintf (stderr, "Could not open input file 1 '%s'.\n", infn1); + fprintf (stderr, "Could not open input file 1 '%s'.\n", fq1); return EXIT_FAILURE; } - pe2 = gzopen (infn2, "r"); + pe2 = gzopen (fq2, "r"); if (!pe2) { - fprintf (stderr, "Could not open input file 2 '%s'.\n", infn2); + fprintf (stderr, "Could not open input file 2 '%s'.\n", fq2); return EXIT_FAILURE; } - unknownfile1 = fopen (unknownfn1, "w"); + unknownfile1 = gzopen(unknownfn1, "wb"); if (!unknownfile1) { fprintf (stderr, "Could not open unknown output file 1 '%s'.\n", unknownfn1); return EXIT_FAILURE; } - unknownfile2 = fopen (unknownfn2, "w"); + unknownfile2 = gzopen(unknownfn2, "wb"); if (!unknownfile2) { fprintf (stderr, "Could not open unknown output file 2 '%s'.\n", unknownfn2); return EXIT_FAILURE; @@ -230,6 +225,10 @@ int paired_main (int argc, char *argv[]) { return EXIT_FAILURE; } + if(fq2) { + paired = 1; + } + fprintf(stderr, "\n\ \n Running: %s\ \n Command line args:\ @@ -238,7 +237,7 @@ int paired_main (int argc, char *argv[]) { \n --barcode-file %s\ \n --unknown-output1 %s\ \n --unknown-output2 %s\ - \n --both-barcodes %d\ + \n --combine %d\ \n --max-mismatch %d\ \n --min-umi-len %d\ \n --max-5prime-crop %d\ @@ -247,121 +246,161 @@ int paired_main (int argc, char *argv[]) { \n\ \n In Progess...\ \n", PROGRAM_NAME,\ - infn1, infn2,\ + fq1, fq2,\ barfn,\ unknownfn1, unknownfn2,\ - both_have_barcodes,\ + combine,\ mismatch, min_umi_len, max_5prime_crop, log_fn, no_comment); + char *bcout_fn1 = NULL; + char *bcout_fn2 = NULL; /* Creating linked list of barcode data */ // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ // where each node is represents one barcode from the barcode file - // number of nodes should equal to number of barcodes (lines) in the file head = NULL; - while (fscanf (barfile, "%s%s%s", barcode, baroutfn1, 
baroutfn2) != EOF) { - curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired)); - curr->bc = (char*) malloc (strlen(barcode) + 1); - strcpy (curr->bc, barcode); + while (fscanf (barfile, "%s%s", barcode, s_name) != EOF) { + curr = (barcode_data_paired*) malloc(sizeof(barcode_data_paired)); + curr->bc = (char*) malloc(strlen(barcode) + 1); + strcpy(curr->bc, barcode); + + bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2); + bcout_fn1[0] = '\0'; + get_bc_fn(&bcout_fn1, s_name, curr->bc, 1); + //curr->bcfile1 = fopen (_mkdir(bcout_fn1), "w"); + curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); + + if(paired > 0 && combine < 0) { + bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); + bcout_fn2[0] = '\0'; + get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); + //curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); + curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); + } - curr->bcfile1 = fopen (_mkdir(baroutfn1), "w"); - curr->bcfile2 = fopen (_mkdir(baroutfn2), "w"); curr->num_records = 0; - curr->next = head; head = curr; + + //free(bcout_fn1); + //free(bcout_fn2); } fqrec1 = kseq_init (pe1); - fqrec2 = kseq_init (pe2); - // loop over all the reads and for every read loop over all barcodes and look for a match + + if(paired > 0) { + fqrec2 = kseq_init (pe2); + } + + /* Get reads, one at a time */ while ((l1 = kseq_read (fqrec1)) >= 0) { - l2 = kseq_read (fqrec2); - if (l2 < 0) { - fprintf (stderr, "Error: PE file 2 is shorter than PE file 1. 
Disregarding rest of PE file 1.\n"); - break; + int n_crop = 0; + + char *actl_bc = NULL; + char *umi_idx = NULL; + + char *fqread1 = NULL; + char *fqread2 = NULL; + + size_t fq_size = 0; + + fq_size += strlen(fqrec1->seq.s); + fq_size += (strlen(fqrec1->name.s)*2); + fq_size += strlen(fqrec1->qual.s); + fq_size += (strlen(fqrec1->comment.s)*2); + fq_size += 2;// header signs @ and + + fq_size += 2;//two colons (:) + fq_size += 4;//cariage returns + fq_size += 2;//two spaces + fq_size += 1000;//test + + if(paired > 0 || combine > 0) { + l2 = kseq_read (fqrec2); + if (l2 < 0) { + fprintf (stderr, "ERROR: R2 file is shorter than R1 file. Disregarding rest of R1 file \n"); + break; + } + fq_size += strlen(fqrec2->seq.s); } - /* Go through all barcode data and check if any match to beginning of read */ - /* If it does then put read in that barcode's file, otherwise put in unknown file */ + /* Find matching barcode */ curr = head; while (curr) { - //zero means no mismatches found, that is barcode was found for that reads, therefore break and write it out - if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, mismatch, max_5prime_crop) == 0) { + n_crop = chk_bc_mtch(curr->bc, fqrec1->seq.s, mismatch, max_5prime_crop); + if(n_crop >= 0) { + //found matching barcode + actl_bc = strndup( (fqrec1->seq.s)+n_crop, strlen(curr->bc) ); break; } curr = curr->next; } + /* Write read out into barcode specific file */ if (curr != NULL) { - // if UMI is shorter then 10, discard the reads - if(strlen((fqrec1->seq.s)+strlen(curr->bc)) >= min_umi_len) { - //@READNAME:BACRCODE:UMI - fprintf (curr->bcfile1, "@%s:%s:%s", fqrec1->name.s, curr->bc, (fqrec1->seq.s)+strlen(curr->bc)); - if (fqrec1->comment.l && no_comment == -1) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); - else fprintf (curr->bcfile1, "\n"); - - //fprintf (curr->bcfile1, "%s\n", (fqrec1->seq.s)+strlen(curr->bc)); - //This tmp hack knowning that data is single end, and R2 is simply a string of BARCODE+UMI - fprintf 
(curr->bcfile1, "N\n"); - - fprintf (curr->bcfile1, "+%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); - else fprintf (curr->bcfile1, "\n"); - - fprintf (curr->bcfile1, "%s\n", (fqrec1->qual.s)+strlen(curr->bc)); + //for now assume barcode and umi are in R1 raed + if(umi > 0) { + umi_idx = (char*) malloc( strlen(fqrec1->seq.s)-strlen(curr->bc) + 1 ); + strcpy(umi_idx, (fqrec1->seq.s)+strlen(curr->bc)+n_crop); + fq_size += strlen(umi_idx); + + if(strlen(umi_idx) < min_umi_len) { + break; + } + } + if(combine > 0) { + fqread1 = (char*) malloc(fq_size); + fqread1[0] = '\0'; - //fprintf (curr->bcfile2, "@%s:%s", fqrec2->name.s, curr->bc); - //@READNAME:BACRCODE:UMI - fprintf (curr->bcfile2, "@%s:%s:%s", fqrec2->name.s, curr->bc, (fqrec1->seq.s)+strlen(curr->bc)); - if (fqrec2->comment.l && no_comment == -1) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); - else fprintf (curr->bcfile2, "\n"); + get_merged_fqread(&fqread1, fqrec1, fqrec2, actl_bc, umi_idx, no_comment, n_crop); + gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); + free(fqread1); + } + else { + fqread1 = (char*) malloc(fq_size + 1); + fqread2 = (char*) malloc(fq_size + 1); - if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->seq.s); - else fprintf (curr->bcfile2, "%s\n", (fqrec2->seq.s)+strlen(curr->bc)); + fqread1[0] = '\0'; + fqread2[0] = '\0'; - fprintf (curr->bcfile2, "+%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); - else fprintf (curr->bcfile2, "\n"); + get_fqread(&fqread1, fqrec1, actl_bc, umi_idx, no_comment, n_crop); + gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); - if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->qual.s); - else fprintf (curr->bcfile2, "%s\n", (fqrec2->qual.s)+strlen(curr->bc)); + if(paired > 0) { + get_fqread(&fqread2, fqrec1, actl_bc, umi_idx, no_comment, n_crop); + //fprintf(curr->bcfile2, "%s", fqread2); + gzwrite(curr->bcfile2, 
fqread2, strlen(fqread2)); + curr->num_records += 1; + } - curr->num_records += 2; + free(fqread1); + free(fqread2); } + curr->num_records += 1; } - else { - fprintf (unknownfile1, "@%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s); - else fprintf (unknownfile1, "\n"); - - fprintf (unknownfile1, "%s\n", fqrec1->seq.s); - - fprintf (unknownfile1, "+%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s); - else fprintf (unknownfile1, "\n"); - - fprintf (unknownfile1, "%s\n", fqrec1->qual.s); - - - fprintf (unknownfile2, "@%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s); - else fprintf (unknownfile2, "\n"); + fqread1 = (char*) malloc(fq_size + 1); + fqread2 = (char*) malloc(fq_size + 1); - fprintf (unknownfile2, "%s\n", fqrec2->seq.s); + fqread1[0] = '\0'; + fqread2[0] = '\0'; - fprintf (unknownfile2, "+%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s); - else fprintf (unknownfile2, "\n"); + get_fqread(&fqread1, fqrec1, NULL, NULL, no_comment, 0); + gzwrite(unknownfile1, fqread1, strlen(fqread1)); + num_unknown += 1; - fprintf (unknownfile2, "%s\n", fqrec2->qual.s); + if(paired > 0) { + get_fqread(&fqread2, fqrec2, NULL, NULL, no_comment, 0); + gzwrite(unknownfile2, fqread2, strlen(fqread2)); + num_unknown += 1; + } - num_unknown += 2; + free(fqread1); + free(fqread2); } total += 2; + free(umi_idx); } if (l1 < 0) { @@ -371,7 +410,6 @@ int paired_main (int argc, char *argv[]) { } } - //if (!quiet) { //if (!log_fn) { is this better? 
if (log_fn == NULL) { @@ -381,7 +419,6 @@ int paired_main (int argc, char *argv[]) { log_file = fopen(log_fn, "w"); } - fprintf (log_file, "Barcode\tN_records\tN_pairs\tP_pairs\n"); curr = head; int total_pairs = total/2; @@ -392,6 +429,7 @@ int paired_main (int argc, char *argv[]) { float percent_pairs = (float) n_pairs/total_pairs; fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc, curr->num_records, n_pairs, percent_pairs); + curr = curr->next; } @@ -404,19 +442,20 @@ int paired_main (int argc, char *argv[]) { end = time(NULL); fprintf(stderr, "\n All done :) \ - \n It took %.2f minutes\n", difftime(end, start)/60); + \n It took %.2f minutes\n", + difftime(end, start)/60); kseq_destroy (fqrec1); kseq_destroy (fqrec2); gzclose (pe1); gzclose (pe2); - fclose (unknownfile1); - fclose (unknownfile2); + gzclose (unknownfile1); + gzclose (unknownfile2); fclose (barfile); fclose (log_file); - free (infn1); - free (infn2); + free (fq1); + free (fq2); free (barfn); free (unknownfn1); free (unknownfn2); @@ -424,8 +463,9 @@ int paired_main (int argc, char *argv[]) { curr = head; while (curr) { - fclose (curr->bcfile1); - fclose (curr->bcfile2); + gzclose(curr->bcfile1); + gzclose(curr->bcfile2); + free (curr->bc); temp = curr; curr = curr->next; diff --git a/src/demultiplex.c b/src/demultiplex.c new file mode 100644 index 0000000..63466f1 --- /dev/null +++ b/src/demultiplex.c @@ -0,0 +1,123 @@ + +/* + * set softtab=4 + * set shiftwidth=4 + * set expandtab + * + */ + +/* + * sabre FASTQ files demultiplexing + * demultiplex.c: FASTQ demultiplexing + * + */ + +int demulti() { + + barcode_data_paired *curr, *head, *temp; + char barcode [MAX_BARCODE_LENGTH]; + char s_name [MAX_FILENAME_LENGTH]; + + /* Creating linked list of barcode data */ + // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ + // where each node is represents one barcode from the barcode file + // number of nodes should equal to number of barcodes (lines) in the 
file + head = NULL; + while (fscanf (barfile, "%s%s", barcode, s_name) != EOF) { + curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired)); + curr->bc = (char*) malloc (strlen(barcode) + 1); + strcpy(curr->bc, barcode); + + char *bcout_fn1 = get_bc_fn(barcode, s_name, 1); + char *bcout_fn2 = get_bc_fn(barcode, s_name, 1); + + curr->bcfile1 = fopen (_mkdir(bcout_fn1), "w"); + curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); + curr->num_records = 0; + + curr->next = head; + head = curr; + } + + fqrec1 = kseq_init (pe1); + + if(paired > 0) { + fqrec2 = kseq_init (pe2); + } + + /* Get reads, one at a time */ + while((l1 = kseq_read (fqrec1)) >= 0) { + + int n_crop_fq1; + int n_crop_fq2; + char *actl_bc_fq1 = [MAX_BARCODE_LENGTH]; + char *actl_bc_fq2 = [MAX_BARCODE_LENGTH]; + + if(paired > 0) { + l2 = kseq_read (fqrec2); + if (l2 < 0) { + fprintf (stderr, "ERROR: R2 file is shorter than R1 file. Disregarding rest of R1 file \n"); + break; + } + } + + /* Find matching barcode */ + curr = head; + while (curr) { + + n_crop_fq1 = chk_bc_mtch(curr->bc, fqrec1->seq.s, mismatch, max_5prime_crop); + if (n_crop_fq1 >= 0) { + //found matching barcode + break; + } + + if(paired > 0) { + + n_crop_fq2 = chk_bc_mtch(curr->bc, fqrec2->seq.s, mismatch, max_5prime_crop); + + if (n_crop_fq2 < 0) { + // it is ok not to have matching barcode.. + fprintf (stderr, "ERROR: R2 didn't have a matching barcode. 
\n"); + } + + if (n_crop_fq1 != n_crop_fq2) { + // this will go heand in heand with previous check + // but can be stand along thing as well, when one read has an overhand + // and the other doesn't, shouldn't be the case though (I think) + fprintf (stderr, "ERROR: Number of cropped bases doesn't match between R1 and R2\n"); + } + + actl_bc_fq1 = strlen(fqrec1->seq.s)+strlen(curr->bc)+n_crop_fq1; + actl_bc_fq2 = strlen(fqrec2->seq.s)+strlen(curr->bc)+n_crop_fq2; + + // didn't match if != zero + if(strcmp(actl_bc_fq1, actl_bc_fq2) != 0) { + fprintf (stderr, "ERROR: Actual R1 and R2 barcodes didn't match, %s and %s. This is strange.. \n", + actl_bc_fq1, + actl_bc_fq2); + } + else { + //write read out to a matching barcode + break; + } + } + + curr = curr->next; + } + + /* Write read out into barcode specific file */ + if(curr != NULL) { + // if UMI is shorter then 10, discard the reads + //if(strlen((fqrec1->seq.s)+strlen(curr->bc)) >= min_umi_len) { + + fqrec1->name.s + fqrec1->seq.s + fqrec1->comment.l + fqrec1->qual.s + curr->bc + + char *trimed_fq1 = (fqrec1->seq.s)+strlen(curr->bc); + + //} + } +} diff --git a/src/utils.c b/src/utils.c index 18d7b68..2d9884c 100644 --- a/src/utils.c +++ b/src/utils.c @@ -14,6 +14,14 @@ // https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692 // https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c const char * _mkdir(const char *file_path) { + + if(!file_path) { + fprintf (stderr, + "ERROR: This shouldn't happend, file path == %s\n", + file_path); + exit(EXIT_FAILURE); + } + // return straigth away if a file_path is not nested file path if(strstr(file_path, "/") == NULL) { return file_path; @@ -56,21 +64,23 @@ const char * _mkdir(const char *file_path) { //NOTE retuns zero on success //strcmp can be used for sorting, returns pos, zero, neg //BUT this new implementation can't be used as such just FYI -int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, 
size_t mismatch, int max_5prime_crop) { +int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop) { int orig_read_len = strlen(orig_read); int orig_bc_len = strlen(orig_bc); int n_crop = 0; if(orig_bc_len > orig_read_len) { - fprintf (stderr, "Length of the barcode %d is greater than length of the reads %d.", orig_bc_len, orig_read_len); - return 1; + fprintf (stderr, + "WARNING: Length of the barcode %d is greater than length of the reads %d.", + orig_bc_len, orig_read_len); + return -1; } while(n_crop <= max_5prime_crop) { if(n_crop > orig_read_len) { - return 1; + return -1; } int cnt = 0; @@ -78,7 +88,7 @@ int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mi const char *bc = orig_bc; const char *read = orig_read+n_crop; int bc_len = orig_bc_len; - + while (bc_len-- > 0) { u1 = *bc++; u2 = *read++; @@ -91,93 +101,137 @@ int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mi } if (u1 == '\0' || u2 == '\0') { - return 0; + return n_crop; } } if(cnt <= mismatch) { - return 0; + return n_crop; } n_crop++; } //this is in the case of error - return 1; + return -1; } // https://stackoverflow.com/questions/21880730/c-what-is-the-best-and-fastest-way-to-concatenate-strings //TODO this is a fastq mystrcat function, that returns a pointer to the end of the string -char * get_fqread(kseq_t *fqrec, char *barcode, int no_comment, int remove_seq) { - - size_t fqread_size = 0; +void get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int no_comment, int n_crop) { - fqread_size += strlen(fqrec->seq.s); - fqread_size += (strlen(fqrec->name.s)*2); - fqread_size += strlen(fqrec->qual.s); - fqread_size += (strlen(fqrec->comment.s)*2); - fqread_size += 2;// header signs @ and + - fqread_size += 2;//two colons (:) - fqread_size += 4;//cariage returns - fqread_size += 2;//two spaces - - char *umi = NULL; - - if(barcode[0] != '\0') { - umi = (char*) malloc( 
strlen(fqrec->seq.s)-strlen(barcode) + 1 ); - strcpy(umi, (fqrec->seq.s)+strlen(barcode)); - fqread_size += strlen(umi); + if(n_crop < 0) { + fprintf(stderr, + "ERROR: n_crop set to %d. This can't happend\n", + n_crop); + exit(EXIT_FAILURE); } - - char *fqread = (char*) malloc(fqread_size + 1); - //makes it a zero length string - fqread[0] = '\0'; //@READNAME:BACRCODE:UMI //1st line - strcat(fqread, "@"); - strcat(fqread, fqrec->name.s); + strcat(*fqread, "@"); + strcat(*fqread, fqrec->name.s); //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI - if(barcode[0] != '\0') { - strcat(fqread, ":"); - strcat(fqread, barcode); - - if(umi[0] == '\0') { - fprintf(stderr, "Error: This shouldn't happened.\n"); - exit(EXIT_FAILURE); - } + if(barcode) { + strcat(*fqread, ":"); + strcat(*fqread, barcode); + } + else if(!barcode) { + barcode = ""; + } - strcat(fqread, ":"); - strcat(fqread, umi); - free(umi); + if(umi_idx) { + strcat(*fqread, ":"); + strcat(*fqread, umi_idx); } if(fqrec->comment.l && no_comment == -1) { - strcat(fqread, " "); - strcat(fqread, fqrec->comment.s); + strcat(*fqread, " "); + strcat(*fqread, fqrec->comment.s); } - strcat(fqread, "\n"); + strcat(*fqread, "\n"); //2nd line - if(remove_seq == 1) { - strcat(fqread, "N"); + strcat(*fqread, (fqrec->seq.s)+strlen(barcode)+n_crop); + strcat(*fqread, "\n"); + + //3rd line + strcat(*fqread, "+"); + strcat(*fqread, fqrec->name.s); + if(fqrec->comment.l && no_comment == -1) { + strcat(*fqread, " "); + strcat(*fqread, fqrec->comment.s); } - else { - strcat(fqread, (fqrec->seq.s)+strlen(barcode)); + strcat(*fqread, "\n"); + + //4th line + strcat(*fqread, (fqrec->qual.s)+strlen(barcode)+n_crop); + strcat(*fqread, "\n"); +} + +void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barcode, char *umi_idx, int no_comment, int n_crop) { + + //@READNAME:BACRCODE:UMI + //1st line + strcat(*fqread, "@"); + strcat(*fqread, fqrec1->name.s); + //TODO later can have 
conditional here depending on the the structure and/or BARCODE/UMI + if(barcode) { + strcat(*fqread, ":"); + strcat(*fqread, barcode); + } + + if(umi_idx) { + strcat(*fqread, ":"); + strcat(*fqread, umi_idx); + } + + if(fqrec1->comment.l && no_comment == -1) { + strcat(*fqread, " "); + strcat(*fqread, fqrec1->comment.s); } - strcat(fqread, "\n"); + strcat(*fqread, "\n"); + + //2nd line + strcat(*fqread, fqrec2->seq.s); + strcat(*fqread, "\n"); //3rd line - strcat(fqread, "+"); - strcat(fqread, fqrec->name.s); - if(fqrec->comment.l && no_comment == -1) { - strcat(fqread, " "); - strcat(fqread, fqrec->comment.s); + strcat(*fqread, "+"); + strcat(*fqread, fqrec2->name.s); + if(fqrec2->comment.l && no_comment == -1) { + strcat(*fqread, " "); + strcat(*fqread, fqrec2->comment.s); } - strcat(fqread, "\n"); + strcat(*fqread, "\n"); //4th line - strcat(fqread, fqrec->qual.s); - strcat(fqread, "\n"); + strcat(*fqread, (fqrec2->qual.s)+strlen(barcode)+n_crop); + strcat(*fqread, "\n"); +} + +void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type) { + + if(strlen(s_name) > MAX_FILENAME_LENGTH) { + fprintf (stderr, + "ERROR: Too many characters in your sample name; %s:%zd \n", + s_name, strlen(s_name)); + exit(EXIT_FAILURE); + } - return fqread; + strcat(*bcout_fn, s_name); + strcat(*bcout_fn, "_"); + strcat(*bcout_fn, barcode); + + if(read_type == 1) { + strcat(*bcout_fn, "_R1.fastq.gz"); + } + else if(read_type == 2) { + strcat(*bcout_fn, "_R2.fastq.gz"); + } + else { + fprintf (stderr, + "ERROR: This shouldn't happened, wrong read type was passed through -> %d\n", + read_type); + exit(EXIT_FAILURE); + } } diff --git a/src/utils.h b/src/utils.h index b6e0e56..7fe0c83 100644 --- a/src/utils.h +++ b/src/utils.h @@ -9,9 +9,12 @@ KSEQ_INIT(gzFile, gzread) //This is needed if compilling with -std=c99, read below for more //https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with char *strdup(const 
char*); +char *strndup(const char *s, size_t n); -int strncmp_with_mismatch (const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); const char * _mkdir (const char *dir); -char * get_fqread(kseq_t *fqrec, char *barcode, int no_comment, int remove_seq); +int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); +void get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); #endif /*UTILS_H*/ From cd0435ee20130466d33f566192c8445dd8585b4a Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 30 May 2018 16:38:02 +1000 Subject: [PATCH 19/55] updated Makefile that doesn't look at demulti_single.c file --- src/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index cb130fc..401a082 100644 --- a/src/Makefile +++ b/src/Makefile @@ -2,7 +2,8 @@ VERSION = 1.00 CC = gcc INCL = kseq.h sabre.h -SRC = demulti_paired.c demulti_single.c sabre.c utils.c +#SRC = demulti_paired.c demulti_single.c sabre.c utils.c +SRC = demulti_paired.c sabre.c utils.c OBJ = $(SRC:.c=.o) DSRC=src From e94e9edf6bd28458e1cad834294713ed30334df8 Mon Sep 17 00:00:00 2001 From: serine Date: Fri, 22 Jun 2018 10:32:06 +1000 Subject: [PATCH 20/55] making umis of uniform length based on max-5prime-crop. 
i.e removing n bases from the back of the umi, where n = max-5prime-crop value --- src/demulti_paired.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index cd33031..eca2bb4 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -340,7 +340,11 @@ int paired_main (int argc, char *argv[]) { //for now assume barcode and umi are in R1 raed if(umi > 0) { umi_idx = (char*) malloc( strlen(fqrec1->seq.s)-strlen(curr->bc) + 1 ); - strcpy(umi_idx, (fqrec1->seq.s)+strlen(curr->bc)+n_crop); + //strcpy(umi_idx, (fqrec1->seq.s)+strlen(curr->bc)+n_crop); + //TODO should probably also adjust umi_idx malloc to account for less space + strncpy(umi_idx, (fqrec1->seq.s)+strlen(curr->bc)+n_crop, strlen(umi_idx)-max_5prime_crop); + umi_idx[strlen(umi_idx)-max_5prime_crop+1] = '\0'; + fq_size += strlen(umi_idx); if(strlen(umi_idx) < min_umi_len) { From 321c2ffb32331864cb91b4ca792a778655eebe7c Mon Sep 17 00:00:00 2001 From: serine Date: Fri, 22 Jun 2018 13:49:55 +1000 Subject: [PATCH 21/55] attempting to fix memory leak --- src/demulti_paired.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index eca2bb4..dd48a49 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -61,7 +61,7 @@ void paired_usage (int status) { } int paired_main (int argc, char *argv[]) { - + //clock_t begin = clock(); time_t start, end; start = time(NULL); @@ -270,21 +270,22 @@ int paired_main (int argc, char *argv[]) { curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); if(paired > 0 && combine < 0) { - bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); + bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn2[0] = '\0'; - get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); - //curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); - curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); + get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); + 
//curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); + curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); } curr->num_records = 0; curr->next = head; head = curr; - //free(bcout_fn1); - //free(bcout_fn2); } + free(bcout_fn1); + free(bcout_fn2); + fqrec1 = kseq_init (pe1); if(paired > 0) { @@ -339,11 +340,9 @@ int paired_main (int argc, char *argv[]) { if (curr != NULL) { //for now assume barcode and umi are in R1 raed if(umi > 0) { - umi_idx = (char*) malloc( strlen(fqrec1->seq.s)-strlen(curr->bc) + 1 ); - //strcpy(umi_idx, (fqrec1->seq.s)+strlen(curr->bc)+n_crop); - //TODO should probably also adjust umi_idx malloc to account for less space - strncpy(umi_idx, (fqrec1->seq.s)+strlen(curr->bc)+n_crop, strlen(umi_idx)-max_5prime_crop); - umi_idx[strlen(umi_idx)-max_5prime_crop+1] = '\0'; + const char *actl_umi_idx = (fqrec1->seq.s)+strlen(curr->bc)+n_crop; + umi_idx = strdup(actl_umi_idx); + umi_idx[strlen(umi_idx)-(max_5prime_crop-n_crop)] = '\0'; fq_size += strlen(umi_idx); @@ -358,7 +357,6 @@ int paired_main (int argc, char *argv[]) { get_merged_fqread(&fqread1, fqrec1, fqrec2, actl_bc, umi_idx, no_comment, n_crop); gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); - free(fqread1); } else { fqread1 = (char*) malloc(fq_size + 1); @@ -376,9 +374,6 @@ int paired_main (int argc, char *argv[]) { gzwrite(curr->bcfile2, fqread2, strlen(fqread2)); curr->num_records += 1; } - - free(fqread1); - free(fqread2); } curr->num_records += 1; } @@ -398,12 +393,13 @@ int paired_main (int argc, char *argv[]) { gzwrite(unknownfile2, fqread2, strlen(fqread2)); num_unknown += 1; } - - free(fqread1); - free(fqread2); } total += 2; + + free(fqread1); + free(fqread2); + free(actl_bc); free(umi_idx); } @@ -425,7 +421,7 @@ int paired_main (int argc, char *argv[]) { fprintf (log_file, "Barcode\tN_records\tN_pairs\tP_pairs\n"); curr = head; - int total_pairs = total/2; + int total_pairs = total/2; while (curr) { From e7e18657718f5c5cedcb8d28343e96d9e2b9080e Mon Sep 17 00:00:00 2001 From: serine 
Date: Thu, 28 Jun 2018 16:21:48 +1000 Subject: [PATCH 22/55] fixed bug in getting quality string length when using combine mode --- src/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.c b/src/utils.c index 2d9884c..bc268a8 100644 --- a/src/utils.c +++ b/src/utils.c @@ -205,7 +205,7 @@ void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barc strcat(*fqread, "\n"); //4th line - strcat(*fqread, (fqrec2->qual.s)+strlen(barcode)+n_crop); + strcat(*fqread, (fqrec2->qual.s)); strcat(*fqread, "\n"); } From 73eb266aa3cfb55d1e41b2eb65ae654c7e647689 Mon Sep 17 00:00:00 2001 From: serine Date: Thu, 9 Aug 2018 14:55:12 +1000 Subject: [PATCH 23/55] fixed bug in skipping umis that are too short. Also added redirect to a file umis_too_short.txt that now holds read names that were discarded due too short --- src/demulti_paired.c | 64 +++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index dd48a49..a9a2e36 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -32,7 +32,7 @@ void paired_usage (int status) { - fprintf (stderr, "\n Usage: %s pe [OPTIONS] -f -r -b -u -w \ + fprintf (stderr, "\n Usage: %s pe [OPTIONS] -f -r -b \ \n\ \n\ \n Options:\ @@ -78,6 +78,9 @@ int paired_main (int argc, char *argv[]) { char *unknownfn1=strdup("unassigned_R1.fastq.gz"); char *unknownfn2=strdup("unassigned_R2.fastq.gz"); + FILE* umis_2_short_file=NULL; + char *umis_2_short_fn=strdup("umis_too_short.txt"); + FILE* log_file=NULL; int optc; extern char *optarg; @@ -191,25 +194,25 @@ int paired_main (int argc, char *argv[]) { !strcmp (fq2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || !strcmp (unknownfn2, barfn)) { - fprintf (stderr, "Error: Duplicate input and/or output file names.\n"); + fprintf (stderr, "ERROR: Duplicate input and/or output file names.\n"); return EXIT_FAILURE; } pe1 = gzopen (fq1, 
"r"); if (!pe1) { - fprintf (stderr, "Could not open input file 1 '%s'.\n", fq1); + fprintf (stderr, "ERROR: Could not open input file 1 '%s'.\n", fq1); return EXIT_FAILURE; } pe2 = gzopen (fq2, "r"); if (!pe2) { - fprintf (stderr, "Could not open input file 2 '%s'.\n", fq2); + fprintf (stderr, "ERROR: Could not open input file 2 '%s'.\n", fq2); return EXIT_FAILURE; } unknownfile1 = gzopen(unknownfn1, "wb"); if (!unknownfile1) { - fprintf (stderr, "Could not open unknown output file 1 '%s'.\n", unknownfn1); + fprintf (stderr, "ERROR: Could not open unknown output file 1 '%s'.\n", unknownfn1); return EXIT_FAILURE; } @@ -229,28 +232,33 @@ int paired_main (int argc, char *argv[]) { paired = 1; } + umis_2_short_file = fopen(umis_2_short_fn, "a"); + fprintf(umis_2_short_file, "name\tumi\tlen\tmin_len\n"); + fprintf(stderr, "\n\ \n Running: %s\ \n Command line args:\ - \n --pe-file1 %s\ - \n --pe-file2 %s\ - \n --barcode-file %s\ - \n --unknown-output1 %s\ - \n --unknown-output2 %s\ + \n --fq1 %s\ + \n --fq2 %s\ + \n --barcodes %s\ + \n --unassigned %s\ \n --combine %d\ + \n --umi %d\ \n --max-mismatch %d\ \n --min-umi-len %d\ \n --max-5prime-crop %d\ - \n --stats %s\ \n --no-comment %d\ + \n --stats %s\ \n\ \n In Progess...\ \n", PROGRAM_NAME,\ fq1, fq2,\ barfn,\ - unknownfn1, unknownfn2,\ - combine,\ - mismatch, min_umi_len, max_5prime_crop, log_fn, no_comment); + unknownfn1,\ + combine, umi,\ + mismatch, min_umi_len, + max_5prime_crop, no_comment,\ + log_fn); char *bcout_fn1 = NULL; char *bcout_fn2 = NULL; @@ -298,7 +306,6 @@ int paired_main (int argc, char *argv[]) { int n_crop = 0; char *actl_bc = NULL; - char *umi_idx = NULL; char *fqread1 = NULL; char *fqread2 = NULL; @@ -318,7 +325,12 @@ int paired_main (int argc, char *argv[]) { if(paired > 0 || combine > 0) { l2 = kseq_read (fqrec2); if (l2 < 0) { - fprintf (stderr, "ERROR: R2 file is shorter than R1 file. 
Disregarding rest of R1 file \n"); + fprintf (stderr, "\n\ + \n ERROR: R2 file is shorter than R1 file.\ + \n Stopping here:\ + \n %s\ + \n", + fqrec1 -> name.s); break; } fq_size += strlen(fqrec2->seq.s); @@ -337,6 +349,9 @@ int paired_main (int argc, char *argv[]) { } /* Write read out into barcode specific file */ + + char *umi_idx = NULL; + if (curr != NULL) { //for now assume barcode and umi are in R1 raed if(umi > 0) { @@ -347,7 +362,8 @@ int paired_main (int argc, char *argv[]) { fq_size += strlen(umi_idx); if(strlen(umi_idx) < min_umi_len) { - break; + fprintf(umis_2_short_file, "%s\t%s\t%zu\t%d\n", fqrec1->name.s, umi_idx, strlen(umi_idx), min_umi_len); + continue; } } @@ -403,14 +419,6 @@ int paired_main (int argc, char *argv[]) { free(umi_idx); } - if (l1 < 0) { - l2 = kseq_read (fqrec2); - if (l2 >= 0) { - fprintf (stderr, "Error: PE file 1 is shorter than PE file 2. Disregarding rest of PE file 2.\n"); - } - } - - //if (!quiet) { //if (!log_fn) { is this better? if (log_fn == NULL) { log_file = stdout; @@ -437,8 +445,8 @@ int paired_main (int argc, char *argv[]) { float percent_unknown = (float) unknown_pairs/total_pairs; float tot_chk = (float) total_pairs/total_pairs; - fprintf (log_file, "unassigned\t%d\t%d\t%.2f\n", num_unknown, unknown_pairs, percent_unknown); - fprintf (log_file, "total\t%d\t%d\t%.2f\n", total, total_pairs, tot_chk); + fprintf(log_file, "unassigned\t%d\t%d\t%.2f\n", num_unknown, unknown_pairs, percent_unknown); + fprintf(log_file, "total\t%d\t%d\t%.2f\n", total, total_pairs, tot_chk); end = time(NULL); fprintf(stderr, "\n All done :) \ @@ -453,12 +461,14 @@ int paired_main (int argc, char *argv[]) { gzclose (unknownfile2); fclose (barfile); fclose (log_file); + fclose(umis_2_short_file); free (fq1); free (fq2); free (barfn); free (unknownfn1); free (unknownfn2); + free(umis_2_short_fn); free (log_fn); curr = head; From 204dfe388bc514d99aa7ec74fe5100dcea57c2cf Mon Sep 17 00:00:00 2001 From: serine Date: Thu, 9 Aug 2018 14:57:08 
+1000 Subject: [PATCH 24/55] started working on metrics collection script. e.g number of different barcode in a demultiplexed fastq given one mismatch was allowed also want similar metrics for umis --- src/metrics.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 src/metrics.c diff --git a/src/metrics.c b/src/metrics.c new file mode 100644 index 0000000..bf0d2e2 --- /dev/null +++ b/src/metrics.c @@ -0,0 +1,91 @@ +#include +#include +#include "utils.h" + +/* +gcc -Wall -O2 -std=c99 -o metrics metrics.c -lz +*/ + +#define BARCODE_ARRAY 1000000 + +//int chk_bc_arr(char *arr, char *bc); +// +//int chk_bc_arr(char *arr, char *bc) { +// +// int i = 0; +// while(arr[i] != '\0') { +// +// if(strcmp(bc, arr[i]) == 0) { +// return i; +// } +// +// i += 1; +// } +// +// //for(int i; i < strlen(arr); i++) { +// // if(strcmp(bc, arr[i]) == 0) { +// // return i; +// // } +// //} +// +// return -1; +//} + +int main (int argc, char *argv[]) { + + gzFile pe1=NULL; + pe1 = gzopen(argv[1], "r"); + kseq_t *fqrec1; + int l1; + + fqrec1 = kseq_init(pe1); + + char barcodes[BARCODE_ARRAY]; + //barcodes[0] = '\0'; + int bc_cnts[BARCODE_ARRAY]; + //bc_cnts[0] = '\0'; + int loc = 0; + /* Get reads, one at a time */ + while ((l1 = kseq_read(fqrec1)) >= 0) { + //fprintf(stdout, "%s\n", fqrec1->name.s); + + char *last; + char *last2; + + char *p = strtok(fqrec1->name.s, ":"); + //fprintf(stdout, "check %s\n", p); + while (p != NULL) { + //fprintf(stdout, "%s\n", p); + last2 = last; + last = p; + p = strtok(NULL, ":"); + } + + //fprintf(stdout, "%s %zu\n", last2, strlen(last2)); + //int bc_idx = chk_bc_arr(barcodes, last2); + int idx = 0; + while(barcodes[idx] != '\0') { + fprintf(stdout, "%s %zu\n", barcodes[idx], strlen(barcodes[idx])); + + } + + //if(bc_idx >= 0) { + // bc_cnts[bc_idx] += 1; + //} + //else { + // barcodes[loc] = last2; + // loc += 1; + //} + + //fprintf(stdout, "%s\n", last2); + } + + //for(int i; i < 
strlen(barcodes); i++) { + // fprintf(stdout, "%s %d\n", barcodes[i], bc_cnts[i]); + //} + + gzclose(pe1); + kseq_destroy(fqrec1); + + return EXIT_SUCCESS; +} From db72db12c204d3351e6a514e83c4ab11ff26c57c Mon Sep 17 00:00:00 2001 From: serine Date: Fri, 10 Aug 2018 10:34:41 +1000 Subject: [PATCH 25/55] wrote functional metrics.c script that returns unsorted table of barcodes and counts --- src/metrics.c | 118 +++++++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 53 deletions(-) diff --git a/src/metrics.c b/src/metrics.c index bf0d2e2..75a1309 100644 --- a/src/metrics.c +++ b/src/metrics.c @@ -7,29 +7,34 @@ gcc -Wall -O2 -std=c99 -o metrics metrics.c -lz */ #define BARCODE_ARRAY 1000000 +//#define BARCODE 20 -//int chk_bc_arr(char *arr, char *bc); -// -//int chk_bc_arr(char *arr, char *bc) { -// -// int i = 0; -// while(arr[i] != '\0') { -// -// if(strcmp(bc, arr[i]) == 0) { -// return i; -// } -// -// i += 1; -// } -// -// //for(int i; i < strlen(arr); i++) { -// // if(strcmp(bc, arr[i]) == 0) { -// // return i; -// // } -// //} -// -// return -1; -//} +//int chk_bc_arr(const char *arr, char *bc); + +// char **arr and char *arr[] mean the same things +// but later one more informative +int chk_bc_arr(char *arr[], char *bc) { + + int i = 0; + while(arr[i] != 0) { + if(strcmp(bc, arr[i]) == 0) { + return i; + } + + i += 1; + } + + arr[i] = strdup(bc); + + // error handling + if(i == BARCODE_ARRAY-1) { + fprintf(stderr, "ERROR: gone too far\n"); + exit(1); + } + + //arr[i+1]=0] could also do that + return i; +} int main (int argc, char *argv[]) { @@ -40,52 +45,59 @@ int main (int argc, char *argv[]) { fqrec1 = kseq_init(pe1); - char barcodes[BARCODE_ARRAY]; - //barcodes[0] = '\0'; - int bc_cnts[BARCODE_ARRAY]; - //bc_cnts[0] = '\0'; - int loc = 0; + //char barcodes[BARCODE_ARRAY]; + // actual two different things + //char barcodes[BARCODE_ARRAY][BARCODE]; + + char **barcodes = calloc(BARCODE_ARRAY, sizeof(char*)); + int *bc_cnts = 
calloc(BARCODE_ARRAY, sizeof(int)); + /* Get reads, one at a time */ while ((l1 = kseq_read(fqrec1)) >= 0) { //fprintf(stdout, "%s\n", fqrec1->name.s); - char *last; - char *last2; + char *last; + char *last2; + + char *p = strtok(fqrec1->name.s, ":"); - char *p = strtok(fqrec1->name.s, ":"); - //fprintf(stdout, "check %s\n", p); while (p != NULL) { //fprintf(stdout, "%s\n", p); - last2 = last; - last = p; + last2 = last; + last = p; p = strtok(NULL, ":"); - } - - //fprintf(stdout, "%s %zu\n", last2, strlen(last2)); - //int bc_idx = chk_bc_arr(barcodes, last2); - int idx = 0; - while(barcodes[idx] != '\0') { - fprintf(stdout, "%s %zu\n", barcodes[idx], strlen(barcodes[idx])); - } - //if(bc_idx >= 0) { - // bc_cnts[bc_idx] += 1; - //} - //else { - // barcodes[loc] = last2; - // loc += 1; - //} - - //fprintf(stdout, "%s\n", last2); + //fprintf(stdout, "%s %zu\n", last2, strlen(last2)); + int bc_idx = chk_bc_arr(barcodes, last2); + bc_cnts[bc_idx] += 1; } - //for(int i; i < strlen(barcodes); i++) { - // fprintf(stdout, "%s %d\n", barcodes[i], bc_cnts[i]); - //} + { + // this is to limit the right scope for i + int i = 0; + while(barcodes[i] != 0) { + fprintf(stdout, "%s %d\n", barcodes[i], bc_cnts[i]); + i++; + } + } gzclose(pe1); kseq_destroy(fqrec1); return EXIT_SUCCESS; } +//TODO use struct instead +//used qsort +//write a function to sort e.g bubbleSort +//actually sorting function should be simpler then bubbleSort +//it should return -1, 0 or 1. OR -1 or 1. 
+//typedef struct barcodes_t { +// char *barcodes; +// int cnts; +//} barcodes_t; +// +// +//barcodes_t barcodes[size]; +// +//barcodes[i].cnts +=1; From fe1c1c6168e8adf8f27c2328c35c4a84b11965cf Mon Sep 17 00:00:00 2001 From: serine Date: Fri, 10 Aug 2018 11:10:46 +1000 Subject: [PATCH 26/55] cleaned metrics.c code a little --- src/metrics.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/metrics.c b/src/metrics.c index 75a1309..5768baa 100644 --- a/src/metrics.c +++ b/src/metrics.c @@ -7,12 +7,14 @@ gcc -Wall -O2 -std=c99 -o metrics metrics.c -lz */ #define BARCODE_ARRAY 1000000 -//#define BARCODE 20 //int chk_bc_arr(const char *arr, char *bc); -// char **arr and char *arr[] mean the same things -// but later one more informative +/* + WISDOM char **arr and char *arr[] mean the same things + but later one more informative + */ + int chk_bc_arr(char *arr[], char *bc) { int i = 0; @@ -28,11 +30,12 @@ int chk_bc_arr(char *arr[], char *bc) { // error handling if(i == BARCODE_ARRAY-1) { - fprintf(stderr, "ERROR: gone too far\n"); + fprintf(stderr, "ERROR: gone too far in the array\n"); exit(1); } - //arr[i+1]=0] could also do that + // could also do initialisation of the next element here + // arr[i+1]=0; return i; } @@ -45,16 +48,17 @@ int main (int argc, char *argv[]) { fqrec1 = kseq_init(pe1); - //char barcodes[BARCODE_ARRAY]; - // actual two different things - //char barcodes[BARCODE_ARRAY][BARCODE]; + /* + WISDOM these tow are actually different things + char barcodes[BARCODE_ARRAY][BARCODE]; + char *barcodes[]; + */ char **barcodes = calloc(BARCODE_ARRAY, sizeof(char*)); int *bc_cnts = calloc(BARCODE_ARRAY, sizeof(int)); /* Get reads, one at a time */ while ((l1 = kseq_read(fqrec1)) >= 0) { - //fprintf(stdout, "%s\n", fqrec1->name.s); char *last; char *last2; @@ -62,19 +66,20 @@ int main (int argc, char *argv[]) { char *p = strtok(fqrec1->name.s, ":"); while (p != NULL) { - //fprintf(stdout, "%s\n", p); last2 
= last; last = p; p = strtok(NULL, ":"); } - //fprintf(stdout, "%s %zu\n", last2, strlen(last2)); int bc_idx = chk_bc_arr(barcodes, last2); bc_cnts[bc_idx] += 1; } { - // this is to limit the right scope for i + /* + WISDOM this is to limit the right scope for i + */ + int i = 0; while(barcodes[i] != 0) { fprintf(stdout, "%s %d\n", barcodes[i], bc_cnts[i]); From 4f4f1540d51d3d5956736fcf0beb34cfb89c8ffa Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 20 Aug 2018 10:00:41 +1000 Subject: [PATCH 27/55] worked on metrics util, not it produce sorted list --- src/metrics.c | 44 ++++++++++++++++++++++++++++---------------- src/utils.h | 7 +++++++ 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/metrics.c b/src/metrics.c index 5768baa..71f3ae1 100644 --- a/src/metrics.c +++ b/src/metrics.c @@ -1,5 +1,3 @@ -#include -#include #include "utils.h" /* @@ -15,18 +13,18 @@ gcc -Wall -O2 -std=c99 -o metrics metrics.c -lz but later one more informative */ -int chk_bc_arr(char *arr[], char *bc) { +int chk_bc_arr(barcodes_t *arr, char *bc) { int i = 0; - while(arr[i] != 0) { - if(strcmp(bc, arr[i]) == 0) { + while(arr[i].bc != 0) { + if(strcmp(bc, arr[i].bc) == 0) { return i; } i += 1; } - arr[i] = strdup(bc); + arr[i].bc = strdup(bc); // error handling if(i == BARCODE_ARRAY-1) { @@ -39,6 +37,14 @@ int chk_bc_arr(char *arr[], char *bc) { return i; } +int bc_n_cmp(const void *a1, const void *b1) { + + const barcodes_t* a=a1; + const barcodes_t* b=b1; + + return a->cnts - b->cnts; +}; + int main (int argc, char *argv[]) { gzFile pe1=NULL; @@ -54,8 +60,11 @@ int main (int argc, char *argv[]) { char *barcodes[]; */ - char **barcodes = calloc(BARCODE_ARRAY, sizeof(char*)); - int *bc_cnts = calloc(BARCODE_ARRAY, sizeof(int)); + //char **barcodes = calloc(BARCODE_ARRAY, sizeof(char*)); + //int *bc_cnts = calloc(BARCODE_ARRAY, sizeof(int)); + + barcodes_t *barcodes; + barcodes = calloc(BARCODE_ARRAY, sizeof(barcodes_t)); /* Get reads, one at a time */ while ((l1 = 
kseq_read(fqrec1)) >= 0) { @@ -72,7 +81,15 @@ int main (int argc, char *argv[]) { } int bc_idx = chk_bc_arr(barcodes, last2); - bc_cnts[bc_idx] += 1; + barcodes[bc_idx].cnts += 1; + } + + { + int n = 0; + while(barcodes[n].bc != 0) { + n++; + } + qsort(barcodes, n, sizeof(barcodes_t), bc_n_cmp); } { @@ -81,8 +98,8 @@ int main (int argc, char *argv[]) { */ int i = 0; - while(barcodes[i] != 0) { - fprintf(stdout, "%s %d\n", barcodes[i], bc_cnts[i]); + while(barcodes[i].bc != 0) { + fprintf(stdout, "%s %d\n", barcodes[i].bc, barcodes[i].cnts); i++; } } @@ -97,11 +114,6 @@ int main (int argc, char *argv[]) { //write a function to sort e.g bubbleSort //actually sorting function should be simpler then bubbleSort //it should return -1, 0 or 1. OR -1 or 1. -//typedef struct barcodes_t { -// char *barcodes; -// int cnts; -//} barcodes_t; -// // //barcodes_t barcodes[size]; // diff --git a/src/utils.h b/src/utils.h index 7fe0c83..84522ad 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,9 +1,16 @@ #ifndef UTILS_H #define UTILS_H +#include +#include #include #include "kseq.h" +typedef struct barcodes_t { + char *bc; + int cnts; +} barcodes_t; + KSEQ_INIT(gzFile, gzread) //This is needed if compilling with -std=c99, read below for more From ac8e1d28bfd124d0b24f073bd356cbbec5a5f2c9 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 20 Aug 2018 10:05:46 +1000 Subject: [PATCH 28/55] changed umi trimming. 
If min-umi-len is set then all umis are ought to be that length, reads with shorted umis will be thrown away, reads which have longer umis will be kept, but umi barcode will be trimmed back to min-umi-len to make all umis of uniform length, otherwise downstream analysis breaks - can't deduplicate if umis are of variable length --- src/demulti_paired.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index a9a2e36..be915fd 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -326,11 +326,11 @@ int paired_main (int argc, char *argv[]) { l2 = kseq_read (fqrec2); if (l2 < 0) { fprintf (stderr, "\n\ - \n ERROR: R2 file is shorter than R1 file.\ - \n Stopping here:\ - \n %s\ - \n", - fqrec1 -> name.s); + \n ERROR: R2 file is shorter than R1 file.\ + \n Stopping here:\ + \n %s\ + \n", + fqrec1 -> name.s); break; } fq_size += strlen(fqrec2->seq.s); @@ -355,13 +355,32 @@ int paired_main (int argc, char *argv[]) { if (curr != NULL) { //for now assume barcode and umi are in R1 raed if(umi > 0) { + //TODO will have to check at some point that none of + // number in the square brackets are zero. It sort of implies + // but good be a potential bug const char *actl_umi_idx = (fqrec1->seq.s)+strlen(curr->bc)+n_crop; umi_idx = strdup(actl_umi_idx); umi_idx[strlen(umi_idx)-(max_5prime_crop-n_crop)] = '\0'; + /*TODO + * There was a note below from myself in the past. 
+ * I don't think I need to worry about that comment, but leaving it in for now + * + * Here need to add abit of code that pads out umi to a set length + * if min umi length is 9, make the umi 9 bases, padded up with N's to make the length + * + */ + + int fin_umi_len = strlen(umi_idx)-min_umi_len; + + if(fin_umi_len > 0) { + umi_idx[strlen(umi_idx)-fin_umi_len] = '\0'; + } + fq_size += strlen(umi_idx); - if(strlen(umi_idx) < min_umi_len) { + //if(strlen(umi_idx) < min_umi_len) { + if(fin_umi_len < 0) { fprintf(umis_2_short_file, "%s\t%s\t%zu\t%d\n", fqrec1->name.s, umi_idx, strlen(umi_idx), min_umi_len); continue; } From 22c994f819d02281cd05928b60c0f9631f5afa6d Mon Sep 17 00:00:00 2001 From: serine Date: Tue, 21 Aug 2018 08:22:30 +1000 Subject: [PATCH 29/55] updated gitignore and makefile now can build dev version with make dev --- .gitignore | 4 +++- src/Makefile | 11 ++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 33305c7..a8275f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ # ignore C object files *.o -# ignore compiled executable +# ignore executables sabre +sabre-dev +metrics # ignore vim swap files *.swp diff --git a/src/Makefile b/src/Makefile index 401a082..c9f16c3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,6 +8,8 @@ OBJ = $(SRC:.c=.o) DSRC=src CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION) +CFLAGSDEV = -Wall -O0 -ggdb -std=c99 -pedantic -DVERSION=$(VERSION) + LDFLAGS = -lz GPROF = -pg EXE = sabre @@ -26,11 +28,14 @@ demulti_paired.o: kseq.h sabre.h sabre.o: sabre.h build: $(OBJ) - $(CC) $(CFLAGS) $(OBJ) -o $(EXE) $(LDFLAGS) + $(CC) $(CFLAGS) $(OBJ) -o $(EXE) $(LDFLAGS) #ln -sf $(DSRC)/$(EXE) .. 
-gprof: - $(CC) $(CFLAGS) $(GPROF) $(SRC) -o $(EXE).gprof $(LDFLAGS) +dev: $(OBJ) + $(CC) $(CFLAGSDEV) $(OBJ) -o $(EXE)-dev $(LDFLAGS) + +gprof: + $(CC) $(CFLAGS) $(GPROF) $(SRC) -o $(EXE).gprof $(LDFLAGS) clean: $(RM) $(OBJ) $(EXE) $(EXE).gprof core gmon.out From ded2d9007d275bdabbca8dbd8b1a1f2647e5ac8c Mon Sep 17 00:00:00 2001 From: serine Date: Tue, 21 Aug 2018 08:24:23 +1000 Subject: [PATCH 30/55] fixed bug in making umi reads of a particular length I was removing one extra base from the end of the barcode Also refactored code for better redability and removed unwanted comments --- src/demulti_paired.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/src/demulti_paired.c b/src/demulti_paired.c index be915fd..4488c61 100644 --- a/src/demulti_paired.c +++ b/src/demulti_paired.c @@ -75,8 +75,8 @@ int paired_main (int argc, char *argv[]) { gzFile unknownfile1=NULL; gzFile unknownfile2=NULL; - char *unknownfn1=strdup("unassigned_R1.fastq.gz"); - char *unknownfn2=strdup("unassigned_R2.fastq.gz"); + char *unknownfn1=strdup("unassigned_R1.fq.gz"); + char *unknownfn2=strdup("unassigned_R2.fq.gz"); FILE* umis_2_short_file=NULL; char *umis_2_short_fn=strdup("umis_too_short.txt"); @@ -355,35 +355,18 @@ int paired_main (int argc, char *argv[]) { if (curr != NULL) { //for now assume barcode and umi are in R1 raed if(umi > 0) { - //TODO will have to check at some point that none of - // number in the square brackets are zero. It sort of implies - // but good be a potential bug - const char *actl_umi_idx = (fqrec1->seq.s)+strlen(curr->bc)+n_crop; - umi_idx = strdup(actl_umi_idx); - umi_idx[strlen(umi_idx)-(max_5prime_crop-n_crop)] = '\0'; - - /*TODO - * There was a note below from myself in the past. 
- * I don't think I need to worry about that comment, but leaving it in for now - * - * Here need to add abit of code that pads out umi to a set length - * if min umi length is 9, make the umi 9 bases, padded up with N's to make the length - * - */ - - int fin_umi_len = strlen(umi_idx)-min_umi_len; - - if(fin_umi_len > 0) { - umi_idx[strlen(umi_idx)-fin_umi_len] = '\0'; - } - fq_size += strlen(umi_idx); + const char *actl_umi_idx = (fqrec1->seq.s)+strlen(curr->bc)+n_crop; - //if(strlen(umi_idx) < min_umi_len) { - if(fin_umi_len < 0) { - fprintf(umis_2_short_file, "%s\t%s\t%zu\t%d\n", fqrec1->name.s, umi_idx, strlen(umi_idx), min_umi_len); + if(strlen(actl_umi_idx) < min_umi_len) { + fprintf(umis_2_short_file, "%s\t%s\t%zu\t%d\n", fqrec1->name.s, actl_umi_idx, strlen(actl_umi_idx), min_umi_len); continue; } + else { + umi_idx = strdup(actl_umi_idx); + umi_idx[min_umi_len] = '\0'; + fq_size += strlen(umi_idx); + } } if(combine > 0) { From 0a77e8a2b66817c3f449d2d25fc506b3272c3b63 Mon Sep 17 00:00:00 2001 From: serine Date: Tue, 21 Aug 2018 08:24:33 +1000 Subject: [PATCH 31/55] Added another mode to metrics, now one can either get metrics on sample barcodes OR metrics on umis. The implementation isn't robust, and there is some code duplication which will need to be refactored later on. Also because there are many more umis. 
I've seen upto 280k unique tags per sample, better memory management and samter search strategies will be needed --- src/metrics.c | 87 +++++++++++++++++++++++++++++++++++++++++++++------ src/utils.h | 6 ++++ 2 files changed, 83 insertions(+), 10 deletions(-) diff --git a/src/metrics.c b/src/metrics.c index 71f3ae1..0c58ae8 100644 --- a/src/metrics.c +++ b/src/metrics.c @@ -17,9 +17,10 @@ int chk_bc_arr(barcodes_t *arr, char *bc) { int i = 0; while(arr[i].bc != 0) { + if(strcmp(bc, arr[i].bc) == 0) { return i; - } + } i += 1; } @@ -37,20 +38,63 @@ int chk_bc_arr(barcodes_t *arr, char *bc) { return i; } +int chk_umi_arr(umis_t *arr, char *bc) { + + int i = 0; + while(arr[i].bc != 0) { + + if(strcmp(bc, arr[i].bc) == 0) { + return i; + } + + i += 1; + } + + arr[i].bc = strdup(bc); + arr[i].len = strlen(bc); + + if(i == BARCODE_ARRAY-1) { + fprintf(stderr, "ERROR: gone too far in the array\n"); + exit(1); + } + return i; +} + +int umi_n_cmp(const void *a1, const void *b1) { + + const umis_t* a=a1; + const umis_t* b=b1; + + return a->cnts - b->cnts; +} + int bc_n_cmp(const void *a1, const void *b1) { const barcodes_t* a=a1; const barcodes_t* b=b1; return a->cnts - b->cnts; -}; +} int main (int argc, char *argv[]) { + if (argc <= 2) { + fprintf(stderr, "\n\ + \n Usage: metrics \ + \n\ + \n Options:\ + \n\ + \n mode INT [0|1]; 0 = sample barcode, 1 = umis barcodes\ + \n\ + \n"); + exit (EXIT_SUCCESS); + } + gzFile pe1=NULL; pe1 = gzopen(argv[1], "r"); kseq_t *fqrec1; int l1; + int mode = atoi(argv[2]); fqrec1 = kseq_init(pe1); @@ -66,6 +110,9 @@ int main (int argc, char *argv[]) { barcodes_t *barcodes; barcodes = calloc(BARCODE_ARRAY, sizeof(barcodes_t)); + umis_t *umis; + umis = calloc(BARCODE_ARRAY, sizeof(umis_t)); + /* Get reads, one at a time */ while ((l1 = kseq_read(fqrec1)) >= 0) { @@ -80,26 +127,46 @@ int main (int argc, char *argv[]) { p = strtok(NULL, ":"); } - int bc_idx = chk_bc_arr(barcodes, last2); - barcodes[bc_idx].cnts += 1; + if(mode == 0) { + int 
bc_idx = chk_bc_arr(barcodes, last2); + barcodes[bc_idx].cnts += 1; + } + else if(mode == 1) { + + int umi_idx = chk_umi_arr(umis, last); + umis[umi_idx].cnts += 1; + } } - { + if(mode == 0) { + int n = 0; while(barcodes[n].bc != 0) { n++; } qsort(barcodes, n, sizeof(barcodes_t), bc_n_cmp); - } - { /* - WISDOM this is to limit the right scope for i - */ + WISDOM this is to limit the right scope for i + */ int i = 0; while(barcodes[i].bc != 0) { - fprintf(stdout, "%s %d\n", barcodes[i].bc, barcodes[i].cnts); + fprintf(stdout, "%s\t%d\n", barcodes[i].bc, barcodes[i].cnts); + i++; + } + } + else if(mode == 1) { + + int n = 0; + while(umis[n].bc != 0) { + n++; + } + qsort(umis, n, sizeof(umis_t), umi_n_cmp); + + int i = 0; + while(umis[i].bc != 0) { + fprintf(stdout, "%s\t%d\t%d\n", umis[i].bc, umis[i].len, umis[i].cnts); i++; } } diff --git a/src/utils.h b/src/utils.h index 84522ad..cdc0e20 100644 --- a/src/utils.h +++ b/src/utils.h @@ -11,6 +11,12 @@ typedef struct barcodes_t { int cnts; } barcodes_t; +typedef struct umis_t { + char *bc; + int len; + int cnts; +} umis_t; + KSEQ_INIT(gzFile, gzread) //This is needed if compilling with -std=c99, read below for more From 1a07501b87a7df330131316cb947be25b72bdc55 Mon Sep 17 00:00:00 2001 From: serine Date: Tue, 21 Aug 2018 08:40:47 +1000 Subject: [PATCH 32/55] Updated makefile --- src/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index c9f16c3..6d935c9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -34,8 +34,14 @@ build: $(OBJ) dev: $(OBJ) $(CC) $(CFLAGSDEV) $(OBJ) -o $(EXE)-dev $(LDFLAGS) +metrics: + $(CC) $(CFLAGSDEV) -o metrics metrics.c $(LDFLAGS) + gprof: $(CC) $(CFLAGS) $(GPROF) $(SRC) -o $(EXE).gprof $(LDFLAGS) clean: - $(RM) $(OBJ) $(EXE) $(EXE).gprof core gmon.out + $(RM) $(OBJ) $(EXE) core + +clean-all: + $(RM) $(OBJ) $(EXE) $(EXE)-dev $(EXE).gprof core gmon.out metrics From 9e56e4a27d7716986c5046d0b6d3840c1c68c4a0 Mon Sep 17 00:00:00 2001 From: 
serine Date: Tue, 11 Dec 2018 18:55:07 +1100 Subject: [PATCH 33/55] far out.. major revamp of sabre, just sabre code left and right this wont compile but need to checkpoint the code. I'm scared to run gcc -Wall (0_0) ... --- src/demulti_paired.c | 488 ------------------------------------------- src/demulti_single.c | 241 --------------------- src/demultiplex.c | 236 +++++++++++++-------- src/sabre.c | 392 +++++++++++++++++++++++++++++----- src/sabre.h | 121 ++++++++++- src/sanity_check.c | 50 +++++ src/usage.c | 78 +++++++ src/utils.c | 11 - 8 files changed, 726 insertions(+), 891 deletions(-) delete mode 100644 src/demulti_paired.c delete mode 100644 src/demulti_single.c create mode 100644 src/sanity_check.c create mode 100644 src/usage.c diff --git a/src/demulti_paired.c b/src/demulti_paired.c deleted file mode 100644 index 4488c61..0000000 --- a/src/demulti_paired.c +++ /dev/null @@ -1,488 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sabre.h" -#include "utils.h" - - //more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 - static struct option paired_long_options[] = { - {"fq1", required_argument, NULL, 'f'}, - {"fq2", required_argument, NULL, 'r'}, - {"barcodes", required_argument, NULL, 'b'}, - {"unassinged1", required_argument, NULL, 'z'}, - {"unassinged2", required_argument, NULL, 'w'}, - {"combine", optional_argument, NULL, 'c'}, - {"umi", optional_argument, NULL, 'u'}, - {"max-mismatch", required_argument, 0, 'm'}, - {"min-umi-len", required_argument, 0, 'l'}, - {"max-5prime-crop", required_argument, 0, 'a'}, - {"stats", required_argument, NULL, 's'}, - {"no-comment", no_argument, 0, 'n'}, - //{"quiet", no_argument, 0, 'z'}, - {GETOPT_HELP_OPTION_DECL}, - {GETOPT_VERSION_OPTION_DECL}, - {NULL, 0, NULL, 0} - }; - -void paired_usage (int status) { - - fprintf (stderr, "\n Usage: %s pe [OPTIONS] -f -r -b \ - \n\ - \n\ - \n Options:\ - \n\ - \n Required:\ - \n\ - \n 
-f, --fq1 FILE Input FASTQ R1 read\ - \n -r, --fq2 FILE Input FASTQ R2 reads\ - \n -b, --barcodes FILE Barcodes files, one barcode per line, e.g BC\\tPREFIX\ - \n -w, --unassigned CHAR Unassigned prefix\ - \n\ - \n Other:\ - \n\ - \n -c, --combine Combine R1 and R2 [NULL]\ - \n -u, --umi Indicates that umi present in the R1 read [NULL]\ - \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\ - \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ - \n -a, --max-5prime-crop INT Maximum number of possible bases cropped from 5prime [0]\ - \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\ - \n -s, --stats FILE Write stats to file instead of STDOUT [STDOUT]\ - \n\ - \n", - PROGRAM_NAME); - - exit (status); -} - -int paired_main (int argc, char *argv[]) { - - //clock_t begin = clock(); - time_t start, end; - start = time(NULL); - - gzFile pe1=NULL; - gzFile pe2=NULL; - kseq_t *fqrec1; - kseq_t *fqrec2; - int l1,l2; - FILE* barfile = NULL; - - gzFile unknownfile1=NULL; - gzFile unknownfile2=NULL; - char *unknownfn1=strdup("unassigned_R1.fq.gz"); - char *unknownfn2=strdup("unassigned_R2.fq.gz"); - - FILE* umis_2_short_file=NULL; - char *umis_2_short_fn=strdup("umis_too_short.txt"); - - FILE* log_file=NULL; - int optc; - extern char *optarg; - char *fq1=NULL; - char *fq2=NULL; - char *barfn=NULL; - char s_name[MAX_FILENAME_LENGTH]; - barcode_data_paired *curr, *head, *temp; - char barcode [MAX_BARCODE_LENGTH]; - int num_unknown=0; - int total=0; - int mismatch=0; - - int combine = -1; - int umi = -1; - int paired = -1; - int min_umi_len=0; - int max_5prime_crop=0; - char *log_fn=NULL; - int no_comment=-1; - - while (1) { - int option_index = 0; - //colon after a flag means should have arguments and no colon means just a flag i.e bool, no args after it - optc = getopt_long (argc, argv, "dnucf:r:b:z:w:m:s:l:z:a:", paired_long_options, &option_index); - - if (optc == -1) break; - - switch (optc) { - if 
(paired_long_options[option_index].flag != 0) break; - - case 'f': - fq1 = (char*) malloc (strlen (optarg) + 1); - strcpy (fq1, optarg); - break; - - case 'r': - fq2 = (char*) malloc (strlen (optarg) + 1); - strcpy (fq2, optarg); - break; - - case 'b': - barfn = (char*) malloc (strlen (optarg) + 1); - strcpy (barfn, optarg); - break; - - case 'z': - if(unknownfn1) { - free(unknownfn1); - } - unknownfn1 = (char*) malloc (strlen (optarg) + 1); - strcpy (unknownfn1, optarg); - break; - - case 'w': - if(unknownfn2) { - free(unknownfn2); - } - unknownfn2 = (char*) malloc (strlen (optarg) + 1); - strcpy (unknownfn2, optarg); - break; - - case 'c': - combine=1; - break; - - case 'u': - umi=1; - break; - - case 'm': - mismatch = atoi (optarg); - break; - - case 's': - log_fn = (char*) malloc (strlen (optarg) + 1); - strcpy (log_fn, optarg); - break; - - case 'l': - min_umi_len = atoi (optarg); - break; - - case 'a': - max_5prime_crop = atoi (optarg); - break; - - case 'n': - no_comment = 1; - break; - - case_GETOPT_HELP_CHAR(paired_usage); - case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); - - case '?': - paired_usage (EXIT_FAILURE); - break; - - default: - paired_usage (EXIT_FAILURE); - break; - } - } - - if (!fq1 || !fq2 || !unknownfn1 || !unknownfn2 || !barfn) { - paired_usage (EXIT_FAILURE); - } - - if (!strcmp (fq1, fq2) || !strcmp (fq1, unknownfn1) || !strcmp (fq1, unknownfn2) || - !strcmp (fq1, barfn) || !strcmp (fq2, unknownfn1) || !strcmp (fq2, unknownfn2) || - !strcmp (fq2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || - !strcmp (unknownfn2, barfn)) { - - fprintf (stderr, "ERROR: Duplicate input and/or output file names.\n"); - return EXIT_FAILURE; - } - - pe1 = gzopen (fq1, "r"); - if (!pe1) { - fprintf (stderr, "ERROR: Could not open input file 1 '%s'.\n", fq1); - return EXIT_FAILURE; - } - - pe2 = gzopen (fq2, "r"); - if (!pe2) { - fprintf (stderr, "ERROR: Could not open input file 2 '%s'.\n", fq2); - return 
EXIT_FAILURE; - } - - unknownfile1 = gzopen(unknownfn1, "wb"); - if (!unknownfile1) { - fprintf (stderr, "ERROR: Could not open unknown output file 1 '%s'.\n", unknownfn1); - return EXIT_FAILURE; - } - - unknownfile2 = gzopen(unknownfn2, "wb"); - if (!unknownfile2) { - fprintf (stderr, "Could not open unknown output file 2 '%s'.\n", unknownfn2); - return EXIT_FAILURE; - } - - barfile = fopen (barfn, "r"); - if (!barfile) { - fprintf (stderr, "Could not open barcode file '%s'.\n", barfn); - return EXIT_FAILURE; - } - - if(fq2) { - paired = 1; - } - - umis_2_short_file = fopen(umis_2_short_fn, "a"); - fprintf(umis_2_short_file, "name\tumi\tlen\tmin_len\n"); - - fprintf(stderr, "\n\ - \n Running: %s\ - \n Command line args:\ - \n --fq1 %s\ - \n --fq2 %s\ - \n --barcodes %s\ - \n --unassigned %s\ - \n --combine %d\ - \n --umi %d\ - \n --max-mismatch %d\ - \n --min-umi-len %d\ - \n --max-5prime-crop %d\ - \n --no-comment %d\ - \n --stats %s\ - \n\ - \n In Progess...\ - \n", PROGRAM_NAME,\ - fq1, fq2,\ - barfn,\ - unknownfn1,\ - combine, umi,\ - mismatch, min_umi_len, - max_5prime_crop, no_comment,\ - log_fn); - - char *bcout_fn1 = NULL; - char *bcout_fn2 = NULL; - /* Creating linked list of barcode data */ - // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ - // where each node is represents one barcode from the barcode file - head = NULL; - while (fscanf (barfile, "%s%s", barcode, s_name) != EOF) { - curr = (barcode_data_paired*) malloc(sizeof(barcode_data_paired)); - curr->bc = (char*) malloc(strlen(barcode) + 1); - strcpy(curr->bc, barcode); - - bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2); - bcout_fn1[0] = '\0'; - get_bc_fn(&bcout_fn1, s_name, curr->bc, 1); - //curr->bcfile1 = fopen (_mkdir(bcout_fn1), "w"); - curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); - - if(paired > 0 && combine < 0) { - bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); - bcout_fn2[0] = '\0'; - get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); 
- //curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); - curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); - } - - curr->num_records = 0; - curr->next = head; - head = curr; - - } - - free(bcout_fn1); - free(bcout_fn2); - - fqrec1 = kseq_init (pe1); - - if(paired > 0) { - fqrec2 = kseq_init (pe2); - } - - /* Get reads, one at a time */ - while ((l1 = kseq_read (fqrec1)) >= 0) { - - int n_crop = 0; - - char *actl_bc = NULL; - - char *fqread1 = NULL; - char *fqread2 = NULL; - - size_t fq_size = 0; - - fq_size += strlen(fqrec1->seq.s); - fq_size += (strlen(fqrec1->name.s)*2); - fq_size += strlen(fqrec1->qual.s); - fq_size += (strlen(fqrec1->comment.s)*2); - fq_size += 2;// header signs @ and + - fq_size += 2;//two colons (:) - fq_size += 4;//cariage returns - fq_size += 2;//two spaces - fq_size += 1000;//test - - if(paired > 0 || combine > 0) { - l2 = kseq_read (fqrec2); - if (l2 < 0) { - fprintf (stderr, "\n\ - \n ERROR: R2 file is shorter than R1 file.\ - \n Stopping here:\ - \n %s\ - \n", - fqrec1 -> name.s); - break; - } - fq_size += strlen(fqrec2->seq.s); - } - - /* Find matching barcode */ - curr = head; - while (curr) { - n_crop = chk_bc_mtch(curr->bc, fqrec1->seq.s, mismatch, max_5prime_crop); - if(n_crop >= 0) { - //found matching barcode - actl_bc = strndup( (fqrec1->seq.s)+n_crop, strlen(curr->bc) ); - break; - } - curr = curr->next; - } - - /* Write read out into barcode specific file */ - - char *umi_idx = NULL; - - if (curr != NULL) { - //for now assume barcode and umi are in R1 raed - if(umi > 0) { - - const char *actl_umi_idx = (fqrec1->seq.s)+strlen(curr->bc)+n_crop; - - if(strlen(actl_umi_idx) < min_umi_len) { - fprintf(umis_2_short_file, "%s\t%s\t%zu\t%d\n", fqrec1->name.s, actl_umi_idx, strlen(actl_umi_idx), min_umi_len); - continue; - } - else { - umi_idx = strdup(actl_umi_idx); - umi_idx[min_umi_len] = '\0'; - fq_size += strlen(umi_idx); - } - } - - if(combine > 0) { - fqread1 = (char*) malloc(fq_size); - fqread1[0] = '\0'; - - 
get_merged_fqread(&fqread1, fqrec1, fqrec2, actl_bc, umi_idx, no_comment, n_crop); - gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); - } - else { - fqread1 = (char*) malloc(fq_size + 1); - fqread2 = (char*) malloc(fq_size + 1); - - fqread1[0] = '\0'; - fqread2[0] = '\0'; - - get_fqread(&fqread1, fqrec1, actl_bc, umi_idx, no_comment, n_crop); - gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); - - if(paired > 0) { - get_fqread(&fqread2, fqrec1, actl_bc, umi_idx, no_comment, n_crop); - //fprintf(curr->bcfile2, "%s", fqread2); - gzwrite(curr->bcfile2, fqread2, strlen(fqread2)); - curr->num_records += 1; - } - } - curr->num_records += 1; - } - else { - fqread1 = (char*) malloc(fq_size + 1); - fqread2 = (char*) malloc(fq_size + 1); - - fqread1[0] = '\0'; - fqread2[0] = '\0'; - - get_fqread(&fqread1, fqrec1, NULL, NULL, no_comment, 0); - gzwrite(unknownfile1, fqread1, strlen(fqread1)); - num_unknown += 1; - - if(paired > 0) { - get_fqread(&fqread2, fqrec2, NULL, NULL, no_comment, 0); - gzwrite(unknownfile2, fqread2, strlen(fqread2)); - num_unknown += 1; - } - } - - total += 2; - - free(fqread1); - free(fqread2); - free(actl_bc); - free(umi_idx); - } - - //if (!log_fn) { is this better? 
- if (log_fn == NULL) { - log_file = stdout; - } - else { - log_file = fopen(log_fn, "w"); - } - - fprintf (log_file, "Barcode\tN_records\tN_pairs\tP_pairs\n"); - curr = head; - int total_pairs = total/2; - - while (curr) { - - int n_pairs = curr->num_records/2; - float percent_pairs = (float) n_pairs/total_pairs; - - fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc, curr->num_records, n_pairs, percent_pairs); - - curr = curr->next; - } - - int unknown_pairs = num_unknown/2; - float percent_unknown = (float) unknown_pairs/total_pairs; - float tot_chk = (float) total_pairs/total_pairs; - - fprintf(log_file, "unassigned\t%d\t%d\t%.2f\n", num_unknown, unknown_pairs, percent_unknown); - fprintf(log_file, "total\t%d\t%d\t%.2f\n", total, total_pairs, tot_chk); - - end = time(NULL); - fprintf(stderr, "\n All done :) \ - \n It took %.2f minutes\n", - difftime(end, start)/60); - - kseq_destroy (fqrec1); - kseq_destroy (fqrec2); - gzclose (pe1); - gzclose (pe2); - gzclose (unknownfile1); - gzclose (unknownfile2); - fclose (barfile); - fclose (log_file); - fclose(umis_2_short_file); - - free (fq1); - free (fq2); - free (barfn); - free (unknownfn1); - free (unknownfn2); - free(umis_2_short_fn); - free (log_fn); - - curr = head; - while (curr) { - gzclose(curr->bcfile1); - gzclose(curr->bcfile2); - - free (curr->bc); - temp = curr; - curr = curr->next; - free (temp); - } - - return EXIT_SUCCESS; -} diff --git a/src/demulti_single.c b/src/demulti_single.c deleted file mode 100644 index f73cf69..0000000 --- a/src/demulti_single.c +++ /dev/null @@ -1,241 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sabre.h" -#include "kseq.h" - -KSEQ_INIT(gzFile, gzread) - - -static struct option single_long_options[] = { - {"fastq-file", required_argument, 0, 'f'}, - {"barcode-file", required_argument, 0, 'b'}, - {"unknown-output", required_argument, 0, 'u'}, - {"max-mismatch", optional_argument, 0, 'm'}, - {"quiet", 
optional_argument, 0, 'z'}, - {GETOPT_HELP_OPTION_DECL}, - {GETOPT_VERSION_OPTION_DECL}, - {NULL, 0, NULL, 0} -}; - -void single_usage (int status) { - - fprintf (stderr, "\nUsage: %s se -f -b -u \n\ -\n\ -Options:\n\ --f, --fastq-file, Input fastq file (required)\n", PROGRAM_NAME); - - fprintf (stderr, "-b, --barcode-file, File with barcode and output file name per line (required)\n\ --u, --unknown-output, Output file that contains records with no barcodes found. (required)\n\ --m , --max-mismatch , Optional argument that is the maximum number of mismatches allowed in a barcode. Default 0.\n\ ---quiet, don't output matching info\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n"); - - exit (status); -} - -int single_main (int argc, char *argv[]) { - - gzFile se=NULL; - kseq_t *fqrec; - int debug=0; - int optc; - extern char *optarg; - FILE* barfile = NULL; - FILE* unknownfile=NULL; - char *barfn=NULL; - char *infn=NULL; - char *unknownfn=NULL; - barcode_data *curr, *head, *temp; - char barcode [MAX_BARCODE_LENGTH]; - char baroutfn [MAX_FILENAME_LENGTH]; - int num_unknown=0; - int total=0; - int mismatch=0; - int quiet=0; - - int max_5prime_crop=0; - - while (1) { - int option_index = 0; - optc = getopt_long (argc, argv, "df:b:u:m:z", single_long_options, &option_index); - - if (optc == -1) break; - - switch (optc) { - if (single_long_options[option_index].flag != 0) break; - - case 'f': - infn = (char*) malloc (strlen (optarg) + 1); - strcpy (infn, optarg); - break; - - case 'b': - barfn = (char*) malloc (strlen (optarg) + 1); - strcpy (barfn, optarg); - break; - - case 'u': - unknownfn = (char*) malloc (strlen (optarg) + 1); - strcpy (unknownfn, optarg); - break; - - case 'm': - mismatch = atoi (optarg); - break; - - case 'z': - quiet=1; - break; - - case 'd': - debug = 1; - break; - - case_GETOPT_HELP_CHAR(single_usage) - case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); - - case '?': - single_usage 
(EXIT_FAILURE); - break; - - default: - single_usage (EXIT_FAILURE); - break; - } - } - - - if (!infn || !barfn) { - single_usage (EXIT_FAILURE); - } - - if (!strcmp (infn, barfn)) { - fprintf (stderr, "Error: Input file is same as barcode file.\n"); - return EXIT_FAILURE; - } - - se = gzopen (infn, "r"); - if (!se) { - fprintf (stderr, "Could not open input file '%s'.\n", infn); - return EXIT_FAILURE; - } - - barfile = fopen (barfn, "r"); - if (!barfile) { - fprintf (stderr, "Could not open barcode file '%s'.\n", barfn); - return EXIT_FAILURE; - } - - unknownfile = fopen (unknownfn, "w"); - if (!unknownfile) { - fprintf (stderr, "Could not open unknown output file '%s'.\n", unknownfn); - return EXIT_FAILURE; - } - - - /* Set up a linked list of the barcode data */ - head = NULL; - while (fscanf (barfile, "%s%s", barcode, baroutfn) != EOF) { - curr = (barcode_data*) malloc (sizeof (barcode_data)); - curr->bc = (char*) malloc (strlen(barcode) + 1); - strcpy (curr->bc, barcode); - - curr->bcfile = fopen (baroutfn, "w"); - curr->num_records = 0; - - curr->next = head; - head = curr; - } - - - fqrec = kseq_init (se); - - while (kseq_read (fqrec) >= 0) { - - /* Go through linked list of barcode data and compare each barcode */ - /* with the sequence until a match is found or no match is found for any */ - curr = head; - while (curr) { - if (strncmp_with_mismatch (curr->bc, fqrec->seq.s, mismatch, max_5prime_crop) == 0) { - break; - } - curr = curr->next; - } - - - /* If barcode data is found, output to demultiplexed file, else output to unknown file */ - if (curr != NULL) { - fprintf (curr->bcfile, "@%s:%s", fqrec->name.s, curr->bc); - if (fqrec->comment.l) fprintf (curr->bcfile, " %s\n", fqrec->comment.s); - else fprintf (curr->bcfile, "\n"); - - fprintf (curr->bcfile, "%s\n", (fqrec->seq.s)+strlen(curr->bc)); - - fprintf (curr->bcfile, "+%s", fqrec->name.s); - if (fqrec->comment.l) fprintf (curr->bcfile, " %s\n", fqrec->comment.s); - else fprintf (curr->bcfile, "\n"); 
- - fprintf (curr->bcfile, "%s\n", (fqrec->qual.s)+strlen(curr->bc)); - - curr->num_records++; - } - - else { - fprintf (unknownfile, "@%s", fqrec->name.s); - if (fqrec->comment.l) fprintf (unknownfile, " %s\n", fqrec->comment.s); - else fprintf (unknownfile, "\n"); - - fprintf (unknownfile, "%s\n", fqrec->seq.s); - - fprintf (unknownfile, "+%s", fqrec->name.s); - if (fqrec->comment.l) fprintf (unknownfile, " %s\n", fqrec->comment.s); - else fprintf (unknownfile, "\n"); - - fprintf (unknownfile, "%s\n", fqrec->qual.s); - - num_unknown++; - } - - total++; - } - - - if (!quiet) { - fprintf (stdout, "\nTotal FastQ records: %d\n\n", total); - curr = head; - while (curr) { - fprintf (stdout, "FastQ records for barcode %s: %d\n", curr->bc, curr->num_records); - curr = curr->next; - } - fprintf (stdout, "\nFastQ records with no barcode match: %d\n", num_unknown); - fprintf (stdout, "\nNumber of mismatches allowed: %d\n\n", mismatch); - } - - kseq_destroy (fqrec); - gzclose (se); - fclose (barfile); - fclose (unknownfile); - - free (infn); - free (unknownfn); - free (barfn); - - curr = head; - while (curr) { - fclose (curr->bcfile); - free (curr->bc); - temp = curr; - curr = curr->next; - free (temp); - } - - return 0; -} diff --git a/src/demultiplex.c b/src/demultiplex.c index 63466f1..3a8871a 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -12,112 +12,176 @@ * */ -int demulti() { - - barcode_data_paired *curr, *head, *temp; - char barcode [MAX_BARCODE_LENGTH]; - char s_name [MAX_FILENAME_LENGTH]; - - /* Creating linked list of barcode data */ - // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ - // where each node is represents one barcode from the barcode file - // number of nodes should equal to number of barcodes (lines) in the file - head = NULL; - while (fscanf (barfile, "%s%s", barcode, s_name) != EOF) { - curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired)); - curr->bc = (char*) malloc 
(strlen(barcode) + 1); - strcpy(curr->bc, barcode); - - char *bcout_fn1 = get_bc_fn(barcode, s_name, 1); - char *bcout_fn2 = get_bc_fn(barcode, s_name, 1); - - curr->bcfile1 = fopen (_mkdir(bcout_fn1), "w"); - curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); - curr->num_records = 0; - - curr->next = head; - head = curr; - } +#include "sabre.h" + +void* demult_runner(void *arg) +{ + + kseq_t fqrec1; + kseq_t fqrec2; + + int l1, l2; + thread_data_t* thread_data = (thread_data*)arg; + int my_line_num; - fqrec1 = kseq_init (pe1); + fqrec1 = kseq_init(thread_data->fq1_fd); - if(paired > 0) { - fqrec2 = kseq_init (pe2); + if(thread_data->paired > 0) { + fqrec2 = kseq_init(thread_data->fq2_fd); } /* Get reads, one at a time */ - while((l1 = kseq_read (fqrec1)) >= 0) { - int n_crop_fq1; - int n_crop_fq2; - char *actl_bc_fq1 = [MAX_BARCODE_LENGTH]; - char *actl_bc_fq2 = [MAX_BARCODE_LENGTH]; + while(1) { + + // lock reading + pthread_mutex_lock(thread_data->in_lock); + + l1 = kseq_read(fqrec1); + + // sanity check no more reads + if(l1 < 0 ) { + pthread_mutex_unlock(thread_data->in_lock); + break; + } + + int n_crop = 0; + + char *actl_bc = NULL; - if(paired > 0) { - l2 = kseq_read (fqrec2); + char *fqread1 = NULL; + char *fqread2 = NULL; + + size_t fq_size = 0; + + fq_size += strlen(fqrec1->seq.s); + fq_size += (strlen(fqrec1->name.s)*2); + fq_size += strlen(fqrec1->qual.s); + fq_size += (strlen(fqrec1->comment.s)*2); + fq_size += 2;// header signs @ and + + fq_size += 2;//two colons (:) + fq_size += 4;//cariage returns + fq_size += 2;//two spaces + fq_size += 1000;//test + + if(paired > 0 || combine > 0) { + l2 = kseq_read(fqrec2); if (l2 < 0) { - fprintf (stderr, "ERROR: R2 file is shorter than R1 file. 
Disregarding rest of R1 file \n"); + fprintf (stderr, "\n\ + \n ERROR: R2 file is shorter than R1 file.\ + \n Stopping here:\ + \n %s\ + \n", + fqrec1->name.s); break; } + fq_size += strlen(fqrec2->seq.s); } - /* Find matching barcode */ - curr = head; - while (curr) { - - n_crop_fq1 = chk_bc_mtch(curr->bc, fqrec1->seq.s, mismatch, max_5prime_crop); - if (n_crop_fq1 >= 0) { + /* Step 1: Find matching barcode */ + thread_data->curr = head; + while(thread_data->curr) { + n_crop = chk_bc_mtch(thread_data->curr->bc, fqrec1->seq.s, thread_data->params->mismatch, thread_data->params->max_5prime_crop); + if(n_crop >= 0) { //found matching barcode + actl_bc = strndup( (fqrec1->seq.s)+n_crop, strlen(thread_data->curr->bc) ); break; } + thread_data->curr = thread_data->curr->next; + } + + // unlock reading + my_line_num = *(thread_data->line_num); + *thread_data->line_num += 1; + pthread_mutex_unlock(thread_data->in_lock); + + /* Step 2: Write read out into barcode specific file */ + + // lock writing + while(*(thread_data->out_line_num) != my_line_num) { + pthread_cond_wait(thread_data->cv, thread_data->out_lock); + } + *thread_data->out_line_num += 1; + + pthread_cond_broadcast(thread_data->cv); // Tell everyone it might be their turn! 
+ + char *umi_idx = NULL; + + if(thread_data->curr != NULL) { + //for now assume barcode and umi are in R1 read + if(thread_data->params->umi > 0) { + + const char *actl_umi_idx = (fqrec1->seq.s)+strlen(thread_data->curr->bc)+n_crop; + + if(strlen(actl_umi_idx) < thread_data->params->min_umi_len) { + //protect by mutex umis_2_short_file + fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fqrec1->name.s, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); + continue; + } + else { + umi_idx = strdup(actl_umi_idx); + umi_idx[min_umi_len] = '\0'; + fq_size += strlen(umi_idx); + } + } + + if(thread_data->params->combine > 0) { + fqread1 = (char*) malloc(fq_size); + fqread1[0] = '\0'; - if(paired > 0) { - - n_crop_fq2 = chk_bc_mtch(curr->bc, fqrec2->seq.s, mismatch, max_5prime_crop); - - if (n_crop_fq2 < 0) { - // it is ok not to have matching barcode.. - fprintf (stderr, "ERROR: R2 didn't have a matching barcode. \n"); - } - - if (n_crop_fq1 != n_crop_fq2) { - // this will go heand in heand with previous check - // but can be stand along thing as well, when one read has an overhand - // and the other doesn't, shouldn't be the case though (I think) - fprintf (stderr, "ERROR: Number of cropped bases doesn't match between R1 and R2\n"); - } - - actl_bc_fq1 = strlen(fqrec1->seq.s)+strlen(curr->bc)+n_crop_fq1; - actl_bc_fq2 = strlen(fqrec2->seq.s)+strlen(curr->bc)+n_crop_fq2; - - // didn't match if != zero - if(strcmp(actl_bc_fq1, actl_bc_fq2) != 0) { - fprintf (stderr, "ERROR: Actual R1 and R2 barcodes didn't match, %s and %s. This is strange.. 
\n", - actl_bc_fq1, - actl_bc_fq2); - } - else { - //write read out to a matching barcode - break; - } - } - - curr = curr->next; + get_merged_fqread(&fqread1, fqrec1, fqrec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + //protect by mutex umis_2_short_file + gzwrite(thread_data->curr->bcfile1, fqread1, strlen(fqread1)); + } + else { + fqread1 = (char*) malloc(fq_size + 1); + fqread2 = (char*) malloc(fq_size + 1); + + fqread1[0] = '\0'; + fqread2[0] = '\0'; + + get_fqread(&fqread1, fqrec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + gzwrite(thread_data->curr->bcfile1, fqread1, strlen(fqread1)); + + if(thread_data->params->paired > 0) { + get_fqread(&fqread2, fqrec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + //fprintf(curr->bcfile2, "%s", fqread2); + gzwrite(thread_data_.curr->bcfile2, fqread2, strlen(fqread2)); + *thread_data->curr->num_records += 1; + } + } + *thread_data->curr->num_records += 1; } + else { + fqread1 = (char*) malloc(fq_size + 1); + fqread2 = (char*) malloc(fq_size + 1); + + fqread1[0] = '\0'; + fqread2[0] = '\0'; + + get_fqread(&fqread1, fqrec1, NULL, NULL, thread_data->params->no_comment, 0); + gzwrite(thread_data->unassigned1_fd, fqread1, strlen(fqread1)); + *metrics->num_unknown += 1; - /* Write read out into barcode specific file */ - if(curr != NULL) { - // if UMI is shorter then 10, discard the reads - //if(strlen((fqrec1->seq.s)+strlen(curr->bc)) >= min_umi_len) { + if(paired > 0) { + get_fqread(&fqread2, thread_data->fqrec2, NULL, NULL, no_comment, 0); + gzwrite(thread_data->unassigned1_fd, fqread2, strlen(fqread2)); + *metrics->num_unknown += 1; + } + } - fqrec1->name.s - fqrec1->seq.s - fqrec1->comment.l - fqrec1->qual.s - curr->bc + *metrics->total += 2; - char *trimed_fq1 = (fqrec1->seq.s)+strlen(curr->bc); + // unlock writing + pthread_mutex_unlock(thread_data->out_lock); - //} + free(fqread1); + free(fqread2); + free(actl_bc); + free(umi_idx); } + + free(data); + + return NULL; 
} diff --git a/src/sabre.c b/src/sabre.c index d339c9c..e00b66b 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -1,67 +1,351 @@ -#include -#include -#include -#include -#include -#include -#include -#include #include "sabre.h" -void main_usage (int status) { - - fprintf (stdout, - "\n\ - \n Usage: %s [options]\ - \n\ - \n Command:\ - \n\ - \n se\tsingle-end barcode de-multiplexing\ - \n pe\tpaired-end barcode de-multiplexing\ - \n\ - \n --help\tto get more help\ - \n --version\tprint current version to stdout\ - \n\ - \n Info: Sabre is a heavy cavalry sword with a curved blade and a single cutting edge\ - \n Not sure though if the meaning was intended by original author...\ - \n\ - \n", - PROGRAM_NAME); - - exit (status); -} +int main(int argc, char *argv[]) { + + //more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 + static struct option paired_long_options[] = { + {"fq1", required_argument, NULL, 'f'}, + {"fq2", required_argument, NULL, 'r'}, + {"barcodes", required_argument, NULL, 'b'}, + {"unassinged1", required_argument, NULL, 'z'}, + {"unassinged2", required_argument, NULL, 'w'}, + {"combine", optional_argument, NULL, 'c'}, + {"umi", optional_argument, NULL, 'u'}, + {"max-mismatch", required_argument, 0, 'm'}, + {"min-umi-len", required_argument, 0, 'l'}, + {"max-5prime-crop", required_argument, 0, 'a'}, + {"stats", required_argument, NULL, 's'}, + {"no-comment", no_argument, 0, 'n'}, + {"threads", optional_argument, 0, 't'}, + {"version", optional_argument, NULL, 'v'}, + {"help", optional_argument, NULL, 'h'}, + {"story", optional_argument, NULL, 'o'}, + //{"quiet", no_argument, 0, 'z'}, + {GETOPT_HELP_OPTION_DECL}, + {GETOPT_VERSION_OPTION_DECL}, + {NULL, 0, NULL, 0} + }; + + // this is on the stack so no need to malloc + // stack gets cleaned when function exits, + // because of that no need to free either + param_t params; + + metrics_t metrics; + + //clock_t begin = clock(); + time_t start, end; + start = time(NULL); + + 
FILE* barfile = NULL; + + gzFile unassigned1_fd=NULL; + gzFile unassigned2_fd=NULL; + + char *unassigned1_fn=strdup("unassigned_R1.fq.gz"); + char *unassigned2_fn=strdup("unassigned_R2.fq.gz"); + char *umis_2_short_fn=strdup("umis_too_short.txt"); + + FILE* log_file=NULL; + int optc; + extern char *optarg; + + char *fq1_fn=NULL; + char *fq2_fn=NULL; + char *log_fn=NULL; + + char *barfn=NULL; + char s_name[MAX_FILENAME_LENGTH]; + barcode_data_t *curr, *head, *temp; + char barcode [MAX_BARCODE_LENGTH]; + + int threads=4; + + while (1) { + int option_index = 0; + //colon after a flag means should have arguments and no colon means just a flag i.e bool, no args after it + optc = getopt_long (argc, argv, "dnucvof:r:b:z:w:m:s:l:z:a:t:", paired_long_options, &option_index); + + if (optc == -1) break; + + switch (optc) { + if (paired_long_options[option_index].flag != 0) break; + + case 'f': + fq1 = (char*) malloc (strlen (optarg) + 1); + strcpy (fq1, optarg); + break; + + case 'r': + fq2 = (char*) malloc (strlen (optarg) + 1); + strcpy (fq2, optarg); + break; + + case 'b': + barfn = (char*) malloc (strlen (optarg) + 1); + strcpy (barfn, optarg); + break; + + case 'z': + if(unassigned1_fn) { + free(unassigned1_fn); + } + unassigned1_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (unassigned1_fn, optarg); + break; + + case 'w': + if(unassigned2_fn) { + free(unassigned2_fn); + } + unassigned2_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (unassigned2_fn, optarg); + break; + + case 'c': + params.combine=1; + break; + + case 'u': + params.umi=1; + break; + + case 'm': + params.mismatch = atoi (optarg); + break; + + case 'l': + params.min_umi_len = atoi (optarg); + break; + + case 'a': + params.max_5prime_crop = atoi (optarg); + break; + + case 'n': + params.no_comment = 1; + break; + + case 't': + threads = atoi (optarg); + break; + + case 's': + log_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (log_fn, optarg); + break; + + case 'v': + version(EXIT_SUCCESS); 
+ break; + + //NOTE if user requrested the help menu i.e --help then + //return success for all other cases below while help menu + //is printed it wasn't intended by user (or at least we don't know that) + //and therefore exit code - fail + case 'h': + version(EXIT_SUCCESS); + break; + + case 'o': + little_story(EXIT_SUCCESS); + break; + + case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION_MAJOR, AUTHORS); + + case '?': + usage(EXIT_FAILURE); + break; + + default: + usage(EXIT_FAILURE); + break; + } + } + + params.fq1_fd = gzopen(fq1_fn, "r"); + params.fq2_fd = gzopen(fq2_fn, "r"); + params.unassigned1_fd = gzopen(unassigned1_fn, "wb"); + params.unassigned2_fd = gzopen(unassigned1_fn, "wb"); + params.umis_2_short_fd = fopen(umis_2_short_fn, "a"); + + // ? where does this goes? + fprintf(umis_2_short_file, "name\tumi\tlen\tmin_len\n"); + + //TODO plugin sanity_chk here + + if(params.fq2_fd) { + params.paired = 1; + } + + fprintf(stderr, "\n\ + \n Running: %s\ + \n Command line args:\ + \n --fq1 %s\ + \n --fq2 %s\ + \n --barcodes %s\ + \n --unassigned_R1 %s\ + \n --unassigned_R2 %s\ + \n --combine %d\ + \n --umi %d\ + \n --max-mismatch %d\ + \n --min-umi-len %d\ + \n --max-5prime-crop %d\ + \n --no-comment %d\ + \n --stats %s\ + \n --threads %d\ + \n\ + \n In Progess...\ + \n", PROGRAM_NAME,\ + fq1_fn, fq2_fn,\ + barfn,\ + unassigned1_fn, unassigned2_fn,\ + params.combine, params.umi,\ + params.mismatch, params.min_umi_len, + params.max_5prime_crop, params.no_comment,\ + log_fn, threads); + + char *bcout_fn1 = NULL; + char *bcout_fn2 = NULL; + /* Creating linked list of barcode data */ + // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ + // where each node is represents one barcode from the barcode file + head = NULL; + while (fscanf (barfile, "%s%s", barcode, s_name) != EOF) { + curr = (barcode_data_t*) malloc(sizeof(barcode_data_t)); + curr->bc = (char*) malloc(strlen(barcode) + 1); + strcpy(curr->bc, barcode); + + 
bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2); + bcout_fn1[0] = '\0'; + get_bc_fn(&bcout_fn1, s_name, curr->bc, 1); + //curr->bcfile1 = fopen (_mkdir(bcout_fn1), "w"); + curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); + //curr->bcfile1 = popen(_mkdir(bcout_fn1), "wb"); + // popen returns file handler + + if(paired > 0 && combine < 0) { + bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); + bcout_fn2[0] = '\0'; + get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); + //curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); + curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); + } + + curr->num_records = 0; + curr->next = head; + head = curr; + + } + + free(bcout_fn1); + free(bcout_fn2); + free(fq1_fn); + free(fq2_fn); + free(barfn); + free(unassigned1_fn); + free(unassigned2_fn); + free(umis_2_short_fn); + + // Threading + pthread_t tid[threads]; + pthread_mutex_t in_lock; + pthread_mutex_t out_lock; + pthread_cond_t cv; + int line_num = 0; + int out_line_num = 0; + + pthread_mutex_init(&in_lock, NULL); + pthread_mutex_init(&out_lock, NULL); + pthread_cond_init(&cv, NULL); + + thread_data_t threads_data[threads]; -int main (int argc, char *argv[]) { - int retval=0; + for(int i=0; i < threads; i++) { - if (argc < 2 || (strcmp (argv[1],"pe") != 0 && strcmp (argv[1],"se") != 0 && strcmp (argv[1],"--version") != 0 && strcmp (argv[1],"--help") != 0)) { - main_usage (EXIT_FAILURE); + thread_data->params = ¶ms; + thread_data->barcode_data = &curr; + thread_data->metrics = &metrics; + thread_data->id = i; + thread_data->in_lock = &in_lock; + thread_data->out_lock = &out_lock; + thread_data->line_num = &line_num; + thread_data->out_line_num = &out_line_num; + thread_data->cv = &cv; + thread_data->fqrec1 = kseq_init(fq1_fd); + + if(paired > 0) { + thread_data->fqrec2 = kseq_init(fq2_fd); + } + + pthread_create(&(tid[i]), NULL, &demult_runner, arg); } - if (strcmp (argv[1],"--version") == 0) { - fprintf(stdout, - "\n\ - \n %s version %0.3f\ - \n\ - \n Copyright (c) 2011 The Regents of 
University of California, Davis Campus.\ - \n %s is free software and comes with ABSOLUTELY NO WARRANTY.\ - \n Distributed under the MIT License.\ - \n\ - \n Written by %s\ - \n\ - \n", - PROGRAM_NAME, VERSION, PROGRAM_NAME, AUTHORS); - exit (EXIT_SUCCESS); + for(int i=0; i < threads; i++) { + pthread_join(tid[i], NULL); } + printf("Threads all done\n"); + + pthread_mutex_destroy(&in_lock); + pthread_mutex_destroy(&out_lock); - else if (strcmp (argv[1],"--help") == 0) { - main_usage (EXIT_SUCCESS); + //if (!log_fn) { is this better? + if (log_fn == NULL) { + log_file = stdout; } + else { + log_file = fopen(log_fn, "w"); + } + + fprintf (log_file, "Barcode\tN_records\tN_pairs\tP_pairs\n"); + curr = head; + int total_pairs = metrics.total/2; + + while (curr) { + + int n_pairs = curr->num_records/2; + float percent_pairs = (float) n_pairs/total_pairs; + + fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc, curr->num_records, n_pairs, percent_pairs); + + curr = curr->next; + } + + int unknown_pairs = metrics.num_unknown/2; + float percent_unknown = (float) metrics.unknown_pairs/total_pairs; + float tot_chk = (float) total_pairs/total_pairs; + + fprintf(log_file, "unassigned\t%d\t%d\t%.2f\n", num_unknown, unknown_pairs, percent_unknown); + fprintf(log_file, "total\t%d\t%d\t%.2f\n", metrics.total, total_pairs, tot_chk); + + end = time(NULL); + fprintf(stderr, "\n All done :) \ + \n It took %.2f minutes\n", + difftime(end, start)/60); + + // good read :) + little_sotry(EXIT_SUCCESS); + + gzclose(fq1_fd); + gzclose(fq2_fd); + gzclose(unassigned1_fd); + gzclose(unassigned2_fd); + fclose(barfile); + fclose(log_file); + fclose(umis_2_short_file); + + free(log_fn); + + curr = head; + while (curr) { + gzclose(curr->bcfile1); + gzclose(curr->bcfile2); - else if (strcmp (argv[1],"pe") == 0) { - retval = paired_main (argc, argv); - return (retval); + free (curr->bc); + temp = curr; + curr = curr->next; + free (temp); } - return 0; + return EXIT_SUCCESS; } diff --git a/src/sabre.h 
b/src/sabre.h index 0701cff..01946b4 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -1,18 +1,37 @@ #ifndef SABRE_H #define SABRE_H +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kseq.h" #ifndef PROGRAM_NAME #define PROGRAM_NAME "sabre" #endif #ifndef AUTHORS -#define AUTHORS "Nikhil Joshi, UC Davis Bioinformatics Core\n" +#define AUTHORS "\n\ + \n Nikhil Joshi, UC Davis Bioinformatics Core\ + \n Kirill Tsyganov, Monash Bioinformatics Platform\ + \n" #endif #ifndef VERSION -#define VERSION 0.0 +//https://semver.org/ +#define VERSION_MAJOR 0 +#define VERSION_MINOR 3 +#define VERSION_PATCH 1 #endif /* Options drawn from GNU's coreutils/src/system.h */ @@ -39,29 +58,109 @@ fprintf(stdout, "%s version %0.3f\nCopyright (c) 2011 The Regents " \ Program_name, Version, Program_name, Authors); \ exit(EXIT_SUCCESS); \ break; -/* end code drawn from system.h */ +// end code drawn from system.h #define MAX_BARCODE_LENGTH 100 #define MAX_FILENAME_LENGTH 200 - -typedef struct listel { - char* bc; - int num_records; - FILE* bcfile; - struct listel *next; -} barcode_data; +//NOTE more info on struc and typedef +//https://stackoverflow.com/questions/1675351/typedef-struct-vs-struct-definitions +typedef struct actl_bc_cnt { + char *bc; + int cnt; + struct actl_bc_cnt *next; +} actl_bc_cnt; typedef struct listel_p { char* bc; int num_records; gzFile bcfile1; gzFile bcfile2; + struct actl_bc_cnt *actl_bc_cnt; struct listel_p *next; -} barcode_data_paired; +} barcode_data_t; +#define barcode_destroy(barcode_data_t *bc_data)\ + gzclose(bc_data->bcfile1)\ + gzclose(bc_data->bcfile2)\ + free(bc_data->bc) /* Function Prototypes */ int paired_main (int argc, char *argv[]); +typedef struct barcodes_t { + char *bc; + int cnts; +} barcodes_t; + +typedef struct umis_t { + char *bc; + int len; + int cnts; +} umis_t; + +KSEQ_INIT(gzFile, gzread) + +typedef struct { + gzFile fq1_fd=NULL; //gzFile is file 
descriptor - fd //https://en.wikipedia.org/wiki/File_descriptor + gzFile fq2_fd=NULL; + FILE* umis_2_short_fd=NULL; + int mismatch=0; + int threads=4; + int combine = -1; + int umi = -1; + int paired = -1; + int min_umi_len=0; + int max_5prime_crop=0; + int no_comment = -1; +} param_t; + +typedef struct { + const *param_t params; + const *barcode_data_t curr; + int id; + pthread_mutex_t *in_lock, *out_lock; + pthread_cond_t *cv; + volatile int *line_num; // Pointer to line number. Access to this protected by in_lock + kseq_t fqrec1; + kseq_t fqrec2; +} thread_data_t; + +#define param_destroy(param_t *param)\ + gzclose(param->fq1_fd)\ + gzclose(param->fq2_fd)\ + fclose(param->umis_2_short_fd) + +typedef struct { + int num_unknown=0; + int total=0; +} metrics_t; +//This is needed if compilling with -std=c99, read below for more +//https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with +char *strdup(const char*); +char *strndup(const char *s, size_t n); + +const char * _mkdir (const char *dir); +int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); +void get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); + +typedef struct { + int id; + pthread_mutex_t *in_lock, *out_lock; + pthread_cond_t *cv; + volatile int *line_num; // Pointer to line number. Access to this protected by in_lock + volatile int *out_line_num; // Pointer to line number output. 
Protected by out_lock + kseq_t *fqrec1; + kseq_t *fqrec2; +} thread_data; + +void* demult_runner(void *arg); + +// usage.c +void usage(int status); +void little_story(int status); +void version(int status); + #endif /*SABRE_H*/ diff --git a/src/sanity_check.c b/src/sanity_check.c new file mode 100644 index 0000000..0876d23 --- /dev/null +++ b/src/sanity_check.c @@ -0,0 +1,50 @@ + +void* sanity_chk(void *arg) + if (!fq1 || !fq2 || !unknownfn1 || !unknownfn2 || !barfn) { + paired_usage (EXIT_FAILURE); + } + + if (!strcmp (fq1, fq2) || !strcmp (fq1, unknownfn1) || !strcmp (fq1, unknownfn2) || + !strcmp (fq1, barfn) || !strcmp (fq2, unknownfn1) || !strcmp (fq2, unknownfn2) || + !strcmp (fq2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || + !strcmp (unknownfn2, barfn)) { + + fprintf (stderr, "ERROR: Duplicate input and/or output file names.\n"); + return EXIT_FAILURE; + } + + pe1 = gzopen (fq1, "r"); + if (!pe1) { + fprintf (stderr, "ERROR: Could not open input file 1 '%s'.\n", fq1); + return EXIT_FAILURE; + } + + pe2 = gzopen (fq2, "r"); + if (!pe2) { + fprintf (stderr, "ERROR: Could not open input file 2 '%s'.\n", fq2); + return EXIT_FAILURE; + } + + unknownfile1 = gzopen(unknownfn1, "wb"); + if (!unknownfile1) { + fprintf (stderr, "ERROR: Could not open unknown output file 1 '%s'.\n", unknownfn1); + return EXIT_FAILURE; + } + + unknownfile2 = gzopen(unknownfn2, "wb"); + if (!unknownfile2) { + fprintf (stderr, "Could not open unknown output file 2 '%s'.\n", unknownfn2); + return EXIT_FAILURE; + } + + barfile = fopen (barfn, "r"); + if (!barfile) { + fprintf (stderr, "Could not open barcode file '%s'.\n", barfn); + return EXIT_FAILURE; + } + + if(threads < 0) { + fprintf(stderr, "WARNING: Negative number of threads detected %d, setting threads to 1\n", threads); + threads = 1; + } +} diff --git a/src/usage.c b/src/usage.c new file mode 100644 index 0000000..96dc48b --- /dev/null +++ b/src/usage.c @@ -0,0 +1,78 @@ + +#include "sabre.h" + 
+void usage(int status) { + + fprintf(stderr, "\n Usage: %s [OPTIONS] -f -r -b \ + \n\ + \n\ + \n Options:\ + \n\ + \n Required:\ + \n\ + \n -f, --fq1 FILE Input FASTQ R1 read\ + \n -r, --fq2 FILE Input FASTQ R2 reads\ + \n -b, --barcodes FILE Barcodes files, one barcode per line, e.g BC\\tPREFIX\ + \n -w, --unassigned CHAR Unassigned prefix\ + \n\ + \n Other:\ + \n\ + \n -c, --combine Combine R1 and R2 [NULL]\ + \n -u, --umi Indicates that umi present in the R1 read [NULL]\ + \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\ + \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ + \n -a, --max-5prime-crop INT Maximum number of possible bases cropped from 5prime [0]\ + \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\ + \n -s, --stats FILE Write stats to file instead of STDOUT [STDOUT]\ + \n\ + \n Extras:\ + \n\ + \n -t, --threads INT specify number of threads to use [4]\ + \n -v, --version get current version\ + \n -h, --hel get help menu, exit status is zero\ + \n -o, --story little story about sabre tool\ + \n\ + \n", + PROGRAM_NAME); + + exit(status); +} + +void little_story(int status) { + + fprintf(stdout, "\n\ + \n Little story:\ + \n\ + \n Sabre is a heavy cavalry sword with a curved blade and a single cutting edge\ + \n Not sure though if the meaning was intended by original author...\ + \n\ + \n Later on I was pointed out to me that yes of course it was intended\ + \n since we are cutting off adaptors..\ + \n\ + \n to be continued...\ + \n\ + \n"); + + exit(status); +} + +void version(int status) { + + fprintf(stdout, "\n\ + \n %s\ + \n\ + \n version: %d.%d.%d\ + \n\ + \n Copyright (c) 2011 The Regents of University of California, Davis Campus.\ + \n %s is free software and comes with ABSOLUTELY NO WARRANTY.\ + \n Distributed under the MIT License.\ + \n\ + \n Written by: %s\ + \n\ + \n", + PROGRAM_NAME, + VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, + PROGRAM_NAME, AUTHORS); + + exit(status); +} diff 
--git a/src/utils.c b/src/utils.c index bc268a8..60e50b0 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1,15 +1,4 @@ -#include -#include -#include -#include -#include -#include -#include #include "sabre.h" -#include -#include -#include -#include "utils.h" // https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692 // https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c From 0233441228fcaf5d159d3e63fe8f3fb1ceb4f6de Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 12 Dec 2018 13:15:26 +1100 Subject: [PATCH 34/55] milestone, got all headers in order? --- src/demultiplex.h | 9 +++++ src/sabre.h | 95 ++++++++++++----------------------------------- src/usage.h | 10 +++++ src/utils.h | 9 ++--- 4 files changed, 45 insertions(+), 78 deletions(-) create mode 100644 src/demultiplex.h create mode 100644 src/usage.h diff --git a/src/demultiplex.h b/src/demultiplex.h new file mode 100644 index 0000000..355ca23 --- /dev/null +++ b/src/demultiplex.h @@ -0,0 +1,9 @@ +#ifndef _DEMULTIPLEX_H +#define _DEMULTIPLEX_H + +#include "sabre.h" +#include "utils.h" + +void* demult_runner(void *arg); + +#endif /*_DEMULTIPLEX_H*/ diff --git a/src/sabre.h b/src/sabre.h index 01946b4..bcc53e5 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -1,5 +1,5 @@ -#ifndef SABRE_H -#define SABRE_H +#ifndef _SABRE_H +#define _SABRE_H #include #include @@ -16,6 +16,8 @@ #include #include "kseq.h" +KSEQ_INIT(gzFile, gzread) + #ifndef PROGRAM_NAME #define PROGRAM_NAME "sabre" #endif @@ -80,87 +82,36 @@ typedef struct listel_p { struct listel_p *next; } barcode_data_t; -#define barcode_destroy(barcode_data_t *bc_data)\ - gzclose(bc_data->bcfile1)\ - gzclose(bc_data->bcfile2)\ - free(bc_data->bc) - -/* Function Prototypes */ -int paired_main (int argc, char *argv[]); - -typedef struct barcodes_t { - char *bc; - int cnts; -} barcodes_t; - -typedef struct umis_t { - char *bc; - int len; - int cnts; -} umis_t; - -KSEQ_INIT(gzFile, gzread) - typedef struct { - gzFile 
fq1_fd=NULL; //gzFile is file descriptor - fd //https://en.wikipedia.org/wiki/File_descriptor - gzFile fq2_fd=NULL; - FILE* umis_2_short_fd=NULL; - int mismatch=0; - int threads=4; - int combine = -1; - int umi = -1; - int paired = -1; - int min_umi_len=0; - int max_5prime_crop=0; - int no_comment = -1; + gzFile fq1_fd; //gzFile is file descriptor - fd //https://en.wikipedia.org/wiki/File_descriptor + gzFile fq2_fd; + gzFile unassigned1_fd; + gzFile unassigned2_fd; + FILE* umis_2_short_fd; + int mismatch; + int combine; + int umi; + int paired; + int min_umi_len; + int max_5prime_crop; + int no_comment; } param_t; typedef struct { - const *param_t params; - const *barcode_data_t curr; - int id; - pthread_mutex_t *in_lock, *out_lock; - pthread_cond_t *cv; - volatile int *line_num; // Pointer to line number. Access to this protected by in_lock - kseq_t fqrec1; - kseq_t fqrec2; -} thread_data_t; - -#define param_destroy(param_t *param)\ - gzclose(param->fq1_fd)\ - gzclose(param->fq2_fd)\ - fclose(param->umis_2_short_fd) - -typedef struct { - int num_unknown=0; - int total=0; + int num_unknown; + int total; } metrics_t; -//This is needed if compilling with -std=c99, read below for more -//https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with -char *strdup(const char*); -char *strndup(const char *s, size_t n); - -const char * _mkdir (const char *dir); -int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); -void get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int no_comment, int n_crop); -void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barcode, char *umi_idx, int no_comment, int n_crop); -void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); typedef struct { int id; + const param_t* params; + barcode_data_t* curr; + metrics_t* metrics; pthread_mutex_t *in_lock, *out_lock; pthread_cond_t *cv; 
volatile int *line_num; // Pointer to line number. Access to this protected by in_lock volatile int *out_line_num; // Pointer to line number output. Protected by out_lock - kseq_t *fqrec1; - kseq_t *fqrec2; -} thread_data; - -void* demult_runner(void *arg); +} thread_data_t; -// usage.c -void usage(int status); -void little_story(int status); -void version(int status); -#endif /*SABRE_H*/ +#endif /*_SABRE_H*/ diff --git a/src/usage.h b/src/usage.h new file mode 100644 index 0000000..4ab8f3f --- /dev/null +++ b/src/usage.h @@ -0,0 +1,10 @@ +#ifndef _USAGE_H +#define _USAGE_H + +#include "sabre.h" + +void usage(int status); +void little_story(int status); +void version(int status); + +#endif /*_USAGE_H*/ diff --git a/src/utils.h b/src/utils.h index cdc0e20..20bbf7c 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,10 +1,7 @@ #ifndef UTILS_H #define UTILS_H -#include -#include -#include -#include "kseq.h" +#include "sabre.h" typedef struct barcodes_t { char *bc; @@ -17,8 +14,6 @@ typedef struct umis_t { int cnts; } umis_t; -KSEQ_INIT(gzFile, gzread) - //This is needed if compilling with -std=c99, read below for more //https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with char *strdup(const char*); @@ -30,4 +25,6 @@ void get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barcode, char *umi_idx, int no_comment, int n_crop); void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); +void set_default_params(param_t *params); + #endif /*UTILS_H*/ From a9af68a1c462f139a63445e346d2e71af14b2afb Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 12 Dec 2018 13:16:43 +1100 Subject: [PATCH 35/55] work in progress, just another commit --- src/demultiplex.c | 59 ++++++++++++++++++++++++----------------------- src/sabre.c | 30 ++++++++++++------------ src/usage.c | 2 +- src/utils.c | 12 +++++++++- 4 files 
changed, 57 insertions(+), 46 deletions(-) diff --git a/src/demultiplex.c b/src/demultiplex.c index 3a8871a..934bad7 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -12,22 +12,23 @@ * */ -#include "sabre.h" +#include "demultiplex.h" void* demult_runner(void *arg) { - kseq_t fqrec1; - kseq_t fqrec2; + kseq_t *fqrec1; + kseq_t *fqrec2; + barcode_data_t *curr; int l1, l2; - thread_data_t* thread_data = (thread_data*)arg; + thread_data_t* thread_data = (thread_data_t*)arg; int my_line_num; - fqrec1 = kseq_init(thread_data->fq1_fd); + fqrec1 = kseq_init(thread_data->params->fq1_fd); - if(thread_data->paired > 0) { - fqrec2 = kseq_init(thread_data->fq2_fd); + if(thread_data->params->paired > 0) { + fqrec2 = kseq_init(thread_data->params->fq2_fd); } /* Get reads, one at a time */ @@ -64,7 +65,7 @@ void* demult_runner(void *arg) fq_size += 2;//two spaces fq_size += 1000;//test - if(paired > 0 || combine > 0) { + if(thread_data->params->paired > 0 || thread_data->params->combine > 0) { l2 = kseq_read(fqrec2); if (l2 < 0) { fprintf (stderr, "\n\ @@ -79,15 +80,15 @@ void* demult_runner(void *arg) } /* Step 1: Find matching barcode */ - thread_data->curr = head; - while(thread_data->curr) { - n_crop = chk_bc_mtch(thread_data->curr->bc, fqrec1->seq.s, thread_data->params->mismatch, thread_data->params->max_5prime_crop); + curr = thread_data->curr; + while(curr) { + n_crop = chk_bc_mtch(curr->bc, fqrec1->seq.s, thread_data->params->mismatch, thread_data->params->max_5prime_crop); if(n_crop >= 0) { //found matching barcode - actl_bc = strndup( (fqrec1->seq.s)+n_crop, strlen(thread_data->curr->bc) ); + actl_bc = strndup( (fqrec1->seq.s)+n_crop, strlen(curr->bc) ); break; } - thread_data->curr = thread_data->curr->next; + curr = thread_data->curr->next; } // unlock reading @@ -107,11 +108,11 @@ void* demult_runner(void *arg) char *umi_idx = NULL; - if(thread_data->curr != NULL) { + if(curr != NULL) { //for now assume barcode and umi are in R1 read 
if(thread_data->params->umi > 0) { - const char *actl_umi_idx = (fqrec1->seq.s)+strlen(thread_data->curr->bc)+n_crop; + const char *actl_umi_idx = (fqrec1->seq.s)+strlen(curr->bc)+n_crop; if(strlen(actl_umi_idx) < thread_data->params->min_umi_len) { //protect by mutex umis_2_short_file @@ -120,7 +121,7 @@ void* demult_runner(void *arg) } else { umi_idx = strdup(actl_umi_idx); - umi_idx[min_umi_len] = '\0'; + umi_idx[thread_data->params->min_umi_len] = '\0'; fq_size += strlen(umi_idx); } } @@ -131,7 +132,7 @@ void* demult_runner(void *arg) get_merged_fqread(&fqread1, fqrec1, fqrec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); //protect by mutex umis_2_short_file - gzwrite(thread_data->curr->bcfile1, fqread1, strlen(fqread1)); + gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); } else { fqread1 = (char*) malloc(fq_size + 1); @@ -141,16 +142,16 @@ void* demult_runner(void *arg) fqread2[0] = '\0'; get_fqread(&fqread1, fqrec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); - gzwrite(thread_data->curr->bcfile1, fqread1, strlen(fqread1)); + gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); if(thread_data->params->paired > 0) { get_fqread(&fqread2, fqrec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); //fprintf(curr->bcfile2, "%s", fqread2); - gzwrite(thread_data_.curr->bcfile2, fqread2, strlen(fqread2)); - *thread_data->curr->num_records += 1; + gzwrite(curr->bcfile2, fqread2, strlen(fqread2)); + curr->num_records += 1; } } - *thread_data->curr->num_records += 1; + curr->num_records += 1; } else { fqread1 = (char*) malloc(fq_size + 1); @@ -160,17 +161,17 @@ void* demult_runner(void *arg) fqread2[0] = '\0'; get_fqread(&fqread1, fqrec1, NULL, NULL, thread_data->params->no_comment, 0); - gzwrite(thread_data->unassigned1_fd, fqread1, strlen(fqread1)); - *metrics->num_unknown += 1; + gzwrite(thread_data->params->unassigned1_fd, fqread1, strlen(fqread1)); + thread_data->metrics->num_unknown += 1; - if(paired > 0) { - 
get_fqread(&fqread2, thread_data->fqrec2, NULL, NULL, no_comment, 0); - gzwrite(thread_data->unassigned1_fd, fqread2, strlen(fqread2)); - *metrics->num_unknown += 1; + if(thread_data->params->paired > 0) { + get_fqread(&fqread2, fqrec2, NULL, NULL, thread_data->params->no_comment, 0); + gzwrite(thread_data->params->unassigned2_fd, fqread2, strlen(fqread2)); + thread_data->metrics->num_unknown += 1; } } - *metrics->total += 2; + thread_data->metrics->total += 2; // unlock writing pthread_mutex_unlock(thread_data->out_lock); @@ -181,7 +182,7 @@ void* demult_runner(void *arg) free(umi_idx); } - free(data); + free(thread_data); return NULL; } diff --git a/src/sabre.c b/src/sabre.c index e00b66b..7cc126f 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -30,6 +30,7 @@ int main(int argc, char *argv[]) { // stack gets cleaned when function exits, // because of that no need to free either param_t params; + set_default_params(&params); metrics_t metrics; @@ -222,30 +223,29 @@ int main(int argc, char *argv[]) { //curr->bcfile1 = fopen (_mkdir(bcout_fn1), "w"); curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); //curr->bcfile1 = popen(_mkdir(bcout_fn1), "wb"); - // popen returns file handler - - if(paired > 0 && combine < 0) { - bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); - bcout_fn2[0] = '\0'; - get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); - //curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); - curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); - } + // popen returns file handler + + if(paired > 0 && combine < 0) { + bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); + bcout_fn2[0] = '\0'; + get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); + //curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); + curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); + } curr->num_records = 0; curr->next = head; head = curr; - } free(bcout_fn1); free(bcout_fn2); - free(fq1_fn); - free(fq2_fn); free(barfn); - free(unassigned1_fn); - free(unassigned2_fn); - free(umis_2_short_fn); + //free(fq1_fn); + 
//free(fq2_fn); + //free(unassigned1_fn); + //free(unassigned2_fn); + //free(umis_2_short_fn); // Threading pthread_t tid[threads]; diff --git a/src/usage.c b/src/usage.c index 96dc48b..b7280bc 100644 --- a/src/usage.c +++ b/src/usage.c @@ -1,5 +1,5 @@ -#include "sabre.h" +#include "usage.h" void usage(int status) { diff --git a/src/utils.c b/src/utils.c index 60e50b0..c797d3e 100644 --- a/src/utils.c +++ b/src/utils.c @@ -219,8 +219,18 @@ void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type) { } else { fprintf (stderr, - "ERROR: This shouldn't happened, wrong read type was passed through -> %d\n", + "ERROR: This shouldn't happen, wrong read type was passed through -> %d\n", read_type); exit(EXIT_FAILURE); } } + +void set_default_params(param_t *params) { + params->mismatch = 0; + params->combine = -1; + params->umi = -1; + params->paired = -1; + params->min_umi_len = 0; + params->max_5prime_crop = 0; + params->no_comment = -1; +} From a1af55541700971801e2569ed178cce7bbed22ea Mon Sep 17 00:00:00 2001 From: serine Date: Wed, 12 Dec 2018 13:44:30 +1100 Subject: [PATCH 36/55] individual c file compiles error free, check --- src/sabre.c | 55 ++++++++++++++++++++--------------------------------- src/sabre.h | 26 ------------------------- src/utils.c | 8 ++++++++ src/utils.h | 1 + 4 files changed, 30 insertions(+), 60 deletions(-) diff --git a/src/sabre.c b/src/sabre.c index 7cc126f..59a2701 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -1,4 +1,7 @@ #include "sabre.h" +#include "utils.h" +#include "usage.h" +#include "demultiplex.h" int main(int argc, char *argv[]) { @@ -21,8 +24,6 @@ int main(int argc, char *argv[]) { {"help", optional_argument, NULL, 'h'}, {"story", optional_argument, NULL, 'o'}, //{"quiet", no_argument, 0, 'z'}, - {GETOPT_HELP_OPTION_DECL}, - {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} }; @@ -40,9 +41,6 @@ int main(int argc, char *argv[]) { FILE* barfile = NULL; - gzFile unassigned1_fd=NULL; - gzFile unassigned2_fd=NULL; - 
char *unassigned1_fn=strdup("unassigned_R1.fq.gz"); char *unassigned2_fn=strdup("unassigned_R2.fq.gz"); char *umis_2_short_fn=strdup("umis_too_short.txt"); @@ -73,13 +71,13 @@ int main(int argc, char *argv[]) { if (paired_long_options[option_index].flag != 0) break; case 'f': - fq1 = (char*) malloc (strlen (optarg) + 1); - strcpy (fq1, optarg); + fq1_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (fq1_fn, optarg); break; case 'r': - fq2 = (char*) malloc (strlen (optarg) + 1); - strcpy (fq2, optarg); + fq2_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (fq2_fn, optarg); break; case 'b': @@ -152,8 +150,6 @@ int main(int argc, char *argv[]) { little_story(EXIT_SUCCESS); break; - case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION_MAJOR, AUTHORS); - case '?': usage(EXIT_FAILURE); break; @@ -171,7 +167,7 @@ int main(int argc, char *argv[]) { params.umis_2_short_fd = fopen(umis_2_short_fn, "a"); // ? where does this goes? - fprintf(umis_2_short_file, "name\tumi\tlen\tmin_len\n"); + fprintf(params.umis_2_short_fd, "name\tumi\tlen\tmin_len\n"); //TODO plugin sanity_chk here @@ -225,7 +221,7 @@ int main(int argc, char *argv[]) { //curr->bcfile1 = popen(_mkdir(bcout_fn1), "wb"); // popen returns file handler - if(paired > 0 && combine < 0) { + if(params.paired > 0 && params.combine < 0) { bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn2[0] = '\0'; get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); @@ -241,11 +237,11 @@ int main(int argc, char *argv[]) { free(bcout_fn1); free(bcout_fn2); free(barfn); - //free(fq1_fn); - //free(fq2_fn); - //free(unassigned1_fn); - //free(unassigned2_fn); - //free(umis_2_short_fn); + free(fq1_fn); + free(fq2_fn); + free(unassigned1_fn); + free(unassigned2_fn); + free(umis_2_short_fn); // Threading pthread_t tid[threads]; @@ -259,12 +255,12 @@ int main(int argc, char *argv[]) { pthread_mutex_init(&out_lock, NULL); pthread_cond_init(&cv, NULL); - thread_data_t threads_data[threads]; + thread_data_t thread_data[threads]; for(int i=0; i 
< threads; i++) { thread_data->params = &params; - thread_data->barcode_data = &curr; + thread_data->curr = &curr; thread_data->metrics = &metrics; thread_data->id = i; thread_data->in_lock = &in_lock; @@ -272,13 +268,8 @@ int main(int argc, char *argv[]) { thread_data->line_num = &line_num; thread_data->out_line_num = &out_line_num; thread_data->cv = &cv; - thread_data->fqrec1 = kseq_init(fq1_fd); - - if(paired > 0) { - thread_data->fqrec2 = kseq_init(fq2_fd); - } - pthread_create(&(tid[i]), NULL, &demult_runner, arg); + pthread_create(&(tid[i]), NULL, &demult_runner, thread_data); } for(int i=0; i < threads; i++) { @@ -312,10 +303,10 @@ int main(int argc, char *argv[]) { } int unknown_pairs = metrics.num_unknown/2; - float percent_unknown = (float) metrics.unknown_pairs/total_pairs; + float percent_unknown = (float) unknown_pairs/total_pairs; float tot_chk = (float) total_pairs/total_pairs; - fprintf(log_file, "unassigned\t%d\t%d\t%.2f\n", num_unknown, unknown_pairs, percent_unknown); + fprintf(log_file, "unassigned\t%d\t%d\t%.2f\n", metrics.num_unknown, unknown_pairs, percent_unknown); fprintf(log_file, "total\t%d\t%d\t%.2f\n", metrics.total, total_pairs, tot_chk); end = time(NULL); @@ -324,15 +315,11 @@ int main(int argc, char *argv[]) { difftime(end, start)/60); // good read :) - little_sotry(EXIT_SUCCESS); + little_story(EXIT_SUCCESS); - gzclose(fq1_fd); - gzclose(fq2_fd); - gzclose(unassigned1_fd); - gzclose(unassigned2_fd); fclose(barfile); fclose(log_file); - fclose(umis_2_short_file); + params_destroy(&params); free(log_fn); diff --git a/src/sabre.h b/src/sabre.h index bcc53e5..eecd4f3 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -36,32 +36,6 @@ KSEQ_INIT(gzFile, gzread) #define VERSION_PATCH 1 #endif -/* Options drawn from GNU's coreutils/src/system.h */ -/* These options are defined so as to avoid conflicting with option -values used by commands */ -enum { - GETOPT_HELP_CHAR = (CHAR_MIN - 2), - GETOPT_VERSION_CHAR = (CHAR_MIN - 3) -}; -#define 
GETOPT_HELP_OPTION_DECL \ -"help", no_argument, NULL, GETOPT_HELP_CHAR -#define GETOPT_VERSION_OPTION_DECL \ -"version", no_argument, NULL, GETOPT_VERSION_CHAR -#define case_GETOPT_HELP_CHAR(Usage_call) \ -case GETOPT_HELP_CHAR: \ -Usage_call(EXIT_SUCCESS); \ -break; -#define case_GETOPT_VERSION_CHAR(Program_name, Version, Authors) \ -case GETOPT_VERSION_CHAR: \ -fprintf(stdout, "%s version %0.3f\nCopyright (c) 2011 The Regents " \ -"of University of California, Davis Campus.\n" \ -"%s is free software and comes with ABSOLUTELY NO WARRANTY.\n"\ -"Distributed under the MIT License.\n\nWritten by %s\n", \ -Program_name, Version, Program_name, Authors); \ -exit(EXIT_SUCCESS); \ -break; -// end code drawn from system.h - #define MAX_BARCODE_LENGTH 100 #define MAX_FILENAME_LENGTH 200 diff --git a/src/utils.c b/src/utils.c index c797d3e..deb3779 100644 --- a/src/utils.c +++ b/src/utils.c @@ -234,3 +234,11 @@ void set_default_params(param_t *params) { params->max_5prime_crop = 0; params->no_comment = -1; } + +void params_destroy(param_t *params) { + gzclose(params->fq1_fd); + gzclose(params->fq2_fd); + gzclose(params->unassigned1_fd); + gzclose(params->unassigned2_fd); + fclose(params->umis_2_short_fd); +} diff --git a/src/utils.h b/src/utils.h index 20bbf7c..ace2f6f 100644 --- a/src/utils.h +++ b/src/utils.h @@ -26,5 +26,6 @@ void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barc void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); void set_default_params(param_t *params); +void params_destroy(param_t *params); #endif /*UTILS_H*/ From af518d74e3fc5a3cb27d20a17d876da4bf7962f2 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 17 Dec 2018 09:28:45 +1100 Subject: [PATCH 37/55] milestone majore. 
- droping kseq.h header can't multithread with that - wrote fastq parse - struct - pluged that in - updated Makefile --- src/Makefile | 9 +++-- src/demultiplex.c | 96 ++++++++++++++++++++++++----------------------- src/fastq.c | 40 ++++++++++++++++++++ src/fastq.h | 20 ++++++++++ src/sabre.c | 16 +++++--- src/sabre.h | 9 +---- src/utils.c | 36 +++++++++--------- 7 files changed, 146 insertions(+), 80 deletions(-) create mode 100644 src/fastq.c create mode 100644 src/fastq.h diff --git a/src/Makefile b/src/Makefile index 6d935c9..cdb2462 100644 --- a/src/Makefile +++ b/src/Makefile @@ -3,14 +3,14 @@ VERSION = 1.00 CC = gcc INCL = kseq.h sabre.h #SRC = demulti_paired.c demulti_single.c sabre.c utils.c -SRC = demulti_paired.c sabre.c utils.c +SRC = sabre.c usage.c demultiplex.c utils.c OBJ = $(SRC:.c=.o) DSRC=src CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION) CFLAGSDEV = -Wall -O0 -ggdb -std=c99 -pedantic -DVERSION=$(VERSION) -LDFLAGS = -lz +LDFLAGS = -lz -lpthread GPROF = -pg EXE = sabre @@ -23,8 +23,9 @@ default: build %.o: %.c $(CC) -c $(CFLAGS) $(SRC) -demulti_single.o: kseq.h sabre.h -demulti_paired.o: kseq.h sabre.h +usage.o: usage.h sabre.h kseq.h +utils.o: utils.h sabre.h kseq.h +demultiplex.o: demultiplex.h utils.h sabre.h kseq.h sabre.o: sabre.h build: $(OBJ) diff --git a/src/demultiplex.c b/src/demultiplex.c index 934bad7..43aeb5e 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -17,35 +17,54 @@ void* demult_runner(void *arg) { - kseq_t *fqrec1; - kseq_t *fqrec2; + fq_rec_t *fq_rec1; + fq_rec_t *fq_rec2; + + fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); + fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); + barcode_data_t *curr; - int l1, l2; thread_data_t* thread_data = (thread_data_t*)arg; int my_line_num; - fqrec1 = kseq_init(thread_data->params->fq1_fd); - - if(thread_data->params->paired > 0) { - fqrec2 = kseq_init(thread_data->params->fq2_fd); - } /* Get reads, one at a time */ while(1) { - // lock reading 
pthread_mutex_lock(thread_data->in_lock); - l1 = kseq_read(fqrec1); - - // sanity check no more reads - if(l1 < 0 ) { + //this is equivalent to if(false), which means this block + //is always skipped, unless when there is an error/end of the file + if(get_read(fq_rec1, thread_data->params->fq1_fd)) { + // sanity check no more reads pthread_mutex_unlock(thread_data->in_lock); - break; + break; } - + + if(thread_data->params->paired > 0) { + if(get_read(fq_rec2, thread_data->params->fq2_fd)) { + //error out there becuase if reached the end of the file + //then we should hit first break, above, since the assumptions + //that the files of equal length. If issues with R2 only this is an error + fprintf (stderr, "\n\ + \n ERROR: R2 file shorter than R1 file.\ + \n Stopping here:\ + \n %s\ + \n", + fq_rec1->name); + //should this be an error? + pthread_mutex_unlock(thread_data->in_lock); + exit(1); + } + } + + // unlock reading + my_line_num = *(thread_data->line_num); + *thread_data->line_num += 1; + pthread_mutex_unlock(thread_data->in_lock); + int n_crop = 0; char *actl_bc = NULL; @@ -55,47 +74,32 @@ void* demult_runner(void *arg) size_t fq_size = 0; - fq_size += strlen(fqrec1->seq.s); - fq_size += (strlen(fqrec1->name.s)*2); - fq_size += strlen(fqrec1->qual.s); - fq_size += (strlen(fqrec1->comment.s)*2); + fq_size += strlen(fq_rec1->seq); + fq_size += (strlen(fq_rec1->name)*2); + fq_size += strlen(fq_rec1->qual); + fq_size += (strlen(fq_rec1->comment)*2); fq_size += 2;// header signs @ and + fq_size += 2;//two colons (:) fq_size += 4;//cariage returns fq_size += 2;//two spaces fq_size += 1000;//test - if(thread_data->params->paired > 0 || thread_data->params->combine > 0) { - l2 = kseq_read(fqrec2); - if (l2 < 0) { - fprintf (stderr, "\n\ - \n ERROR: R2 file is shorter than R1 file.\ - \n Stopping here:\ - \n %s\ - \n", - fqrec1->name.s); - break; - } - fq_size += strlen(fqrec2->seq.s); + if(thread_data->params->combine > 0) { + fq_size += strlen(fq_rec2->seq); } /* 
Step 1: Find matching barcode */ curr = thread_data->curr; while(curr) { - n_crop = chk_bc_mtch(curr->bc, fqrec1->seq.s, thread_data->params->mismatch, thread_data->params->max_5prime_crop); + n_crop = chk_bc_mtch(curr->bc, fq_rec1->seq, thread_data->params->mismatch, thread_data->params->max_5prime_crop); if(n_crop >= 0) { //found matching barcode - actl_bc = strndup( (fqrec1->seq.s)+n_crop, strlen(curr->bc) ); + actl_bc = strndup( (fq_rec1->seq)+n_crop, strlen(curr->bc) ); break; } - curr = thread_data->curr->next; + curr = curr->next; } - // unlock reading - my_line_num = *(thread_data->line_num); - *thread_data->line_num += 1; - pthread_mutex_unlock(thread_data->in_lock); - /* Step 2: Write read out into barcode specific file */ // lock writing @@ -112,11 +116,11 @@ void* demult_runner(void *arg) //for now assume barcode and umi are in R1 read if(thread_data->params->umi > 0) { - const char *actl_umi_idx = (fqrec1->seq.s)+strlen(curr->bc)+n_crop; + const char *actl_umi_idx = (fq_rec1->seq)+strlen(curr->bc)+n_crop; if(strlen(actl_umi_idx) < thread_data->params->min_umi_len) { //protect by mutex umis_2_short_file - fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fqrec1->name.s, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); + fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); continue; } else { @@ -130,7 +134,7 @@ void* demult_runner(void *arg) fqread1 = (char*) malloc(fq_size); fqread1[0] = '\0'; - get_merged_fqread(&fqread1, fqrec1, fqrec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + get_merged_fqread(&fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); //protect by mutex umis_2_short_file gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); } @@ -141,11 +145,11 @@ void* demult_runner(void *arg) fqread1[0] = '\0'; fqread2[0] = '\0'; - get_fqread(&fqread1, fqrec1, 
actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + get_fqread(&fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); if(thread_data->params->paired > 0) { - get_fqread(&fqread2, fqrec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + get_fqread(&fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); //fprintf(curr->bcfile2, "%s", fqread2); gzwrite(curr->bcfile2, fqread2, strlen(fqread2)); curr->num_records += 1; @@ -160,12 +164,12 @@ void* demult_runner(void *arg) fqread1[0] = '\0'; fqread2[0] = '\0'; - get_fqread(&fqread1, fqrec1, NULL, NULL, thread_data->params->no_comment, 0); + get_fqread(&fqread1, fq_rec1, NULL, NULL, thread_data->params->no_comment, 0); gzwrite(thread_data->params->unassigned1_fd, fqread1, strlen(fqread1)); thread_data->metrics->num_unknown += 1; if(thread_data->params->paired > 0) { - get_fqread(&fqread2, fqrec2, NULL, NULL, thread_data->params->no_comment, 0); + get_fqread(&fqread2, fq_rec2, NULL, NULL, thread_data->params->no_comment, 0); gzwrite(thread_data->params->unassigned2_fd, fqread2, strlen(fqread2)); thread_data->metrics->num_unknown += 1; } diff --git a/src/fastq.c b/src/fastq.c new file mode 100644 index 0000000..146eaa3 --- /dev/null +++ b/src/fastq.c @@ -0,0 +1,40 @@ + +#include "fastq.h" + +int get_line(gzFile fq_fd, char *line, int buff) { + + char *new_line = gzgets(fq_fd, line, buff); + + if(new_line == NULL) { + return -1; + } + + int str_len = strlen(new_line); + + if(new_line[str_len-1] != '\n') { + fprintf(stderr, "Line too long %d\n", buff); + exit(1); + } + + new_line[str_len-1] = '\0'; + + return 0; +} +//user strchr +int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd) { + + int done = get_line(fq_fd, fq_rec->name, LINE_SIZE); + done = done || get_line(fq_fd, fq_rec->seq, LINE_SIZE); + done = done || get_line(fq_fd, fq_rec->other, LINE_SIZE); + done = done || get_line(fq_fd, fq_rec->qual, LINE_SIZE); 
+ char *ptr = strchr(fq_rec->name,' '); + if (ptr) { + *ptr='\0'; + read->comment = ptr+1; + } else { + read->comment = NULL; + } + // before writing it out check that comment isn't null + + return done; +} diff --git a/src/fastq.h b/src/fastq.h new file mode 100644 index 0000000..9c14f9e --- /dev/null +++ b/src/fastq.h @@ -0,0 +1,20 @@ +#ifndef _FASTQ_H +#define _FASTQ_H + +#include "sabre.h" + +#define LINE_SIZE 600 + +typedef struct { + char name[LINE_SIZE]; + char *comment; + char seq[LINE_SIZE]; + char other[LINE_SIZE]; + char qual[LINE_SIZE]; + char *r2; +} fq_rec_t; + +int get_line(gzFile fq_fd, char *line, int buff); +int get_fq_read(fq_rec_t *fq_rec, gzFile fq_fd); + +#endif /*_FASTQ_H*/ diff --git a/src/sabre.c b/src/sabre.c index 59a2701..a33f2a7 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -39,7 +39,14 @@ int main(int argc, char *argv[]) { time_t start, end; start = time(NULL); + gzFile fq1_fd; + gzFile fq2_fd; + + char *fq1_fn=NULL; + char *fq2_fn=NULL; + FILE* barfile = NULL; + char *barfn=NULL; char *unassigned1_fn=strdup("unassigned_R1.fq.gz"); char *unassigned2_fn=strdup("unassigned_R2.fq.gz"); @@ -49,11 +56,8 @@ int main(int argc, char *argv[]) { int optc; extern char *optarg; - char *fq1_fn=NULL; - char *fq2_fn=NULL; char *log_fn=NULL; - char *barfn=NULL; char s_name[MAX_FILENAME_LENGTH]; barcode_data_t *curr, *head, *temp; char barcode [MAX_BARCODE_LENGTH]; @@ -150,7 +154,7 @@ int main(int argc, char *argv[]) { little_story(EXIT_SUCCESS); break; - case '?': + case '*': usage(EXIT_FAILURE); break; @@ -159,9 +163,11 @@ int main(int argc, char *argv[]) { break; } } + // TODO check that what's requre if not run usage params.fq1_fd = gzopen(fq1_fn, "r"); params.fq2_fd = gzopen(fq2_fn, "r"); + params.unassigned1_fd = gzopen(unassigned1_fn, "wb"); params.unassigned2_fd = gzopen(unassigned1_fn, "wb"); params.umis_2_short_fd = fopen(umis_2_short_fn, "a"); @@ -260,7 +266,7 @@ int main(int argc, char *argv[]) { for(int i=0; i < threads; i++) { 
thread_data->params = &params; - thread_data->curr = &curr; + thread_data->curr = curr; thread_data->metrics = &metrics; thread_data->id = i; thread_data->in_lock = &in_lock; diff --git a/src/sabre.h b/src/sabre.h index eecd4f3..647ccb9 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -14,9 +14,6 @@ #include #include #include -#include "kseq.h" - -KSEQ_INIT(gzFile, gzread) #ifndef PROGRAM_NAME #define PROGRAM_NAME "sabre" @@ -29,12 +26,10 @@ KSEQ_INIT(gzFile, gzread) \n" #endif -#ifndef VERSION //https://semver.org/ #define VERSION_MAJOR 0 #define VERSION_MINOR 3 #define VERSION_PATCH 1 -#endif #define MAX_BARCODE_LENGTH 100 #define MAX_FILENAME_LENGTH 200 @@ -57,8 +52,8 @@ typedef struct listel_p { } barcode_data_t; typedef struct { - gzFile fq1_fd; //gzFile is file descriptor - fd //https://en.wikipedia.org/wiki/File_descriptor - gzFile fq2_fd; + fq_read_t *fq1_read; + fq_read_t *fq2_read; gzFile unassigned1_fd; gzFile unassigned2_fd; FILE* umis_2_short_fd; diff --git a/src/utils.c b/src/utils.c index deb3779..eb737df 100644 --- a/src/utils.c +++ b/src/utils.c @@ -106,7 +106,7 @@ int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int // https://stackoverflow.com/questions/21880730/c-what-is-the-best-and-fastest-way-to-concatenate-strings //TODO this is a fastq mystrcat function, that returns a pointer to the end of the string -void get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int no_comment, int n_crop) { +void get_fqread(char **fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop) { if(n_crop < 0) { fprintf(stderr, @@ -118,7 +118,7 @@ void get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int //@READNAME:BACRCODE:UMI //1st line strcat(*fqread, "@"); - strcat(*fqread, fqrec->name.s); + strcat(*fqread, fq_rec->name); //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI if(barcode) { strcat(*fqread, ":"); @@ -133,36 +133,36 @@ void 
get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int strcat(*fqread, umi_idx); } - if(fqrec->comment.l && no_comment == -1) { + if(fq_rec->comment && no_comment == -1) { strcat(*fqread, " "); - strcat(*fqread, fqrec->comment.s); + strcat(*fqread, fq_rec->comment); } strcat(*fqread, "\n"); //2nd line - strcat(*fqread, (fqrec->seq.s)+strlen(barcode)+n_crop); + strcat(*fqread, (fq_rec->seq)+strlen(barcode)+n_crop); strcat(*fqread, "\n"); //3rd line strcat(*fqread, "+"); - strcat(*fqread, fqrec->name.s); - if(fqrec->comment.l && no_comment == -1) { + strcat(*fqread, fq_rec->name); + if(fq_rec->comment && no_comment == -1) { strcat(*fqread, " "); - strcat(*fqread, fqrec->comment.s); + strcat(*fqread, fq_rec->comment); } strcat(*fqread, "\n"); //4th line - strcat(*fqread, (fqrec->qual.s)+strlen(barcode)+n_crop); + strcat(*fqread, (fq_rec->qual)+strlen(barcode)+n_crop); strcat(*fqread, "\n"); } -void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barcode, char *umi_idx, int no_comment, int n_crop) { +void get_merged_fqread(char **fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop) { //@READNAME:BACRCODE:UMI //1st line strcat(*fqread, "@"); - strcat(*fqread, fqrec1->name.s); + strcat(*fqread, fq_rec1->name); //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI if(barcode) { strcat(*fqread, ":"); @@ -174,27 +174,27 @@ void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barc strcat(*fqread, umi_idx); } - if(fqrec1->comment.l && no_comment == -1) { + if(fq_rec1->comment && no_comment == -1) { strcat(*fqread, " "); - strcat(*fqread, fqrec1->comment.s); + strcat(*fqread, fq_rec1->comment); } strcat(*fqread, "\n"); //2nd line - strcat(*fqread, fqrec2->seq.s); + strcat(*fqread, fq_rec2->seq); strcat(*fqread, "\n"); //3rd line strcat(*fqread, "+"); - strcat(*fqread, fqrec2->name.s); - if(fqrec2->comment.l && no_comment == -1) 
{ + strcat(*fqread, fq_rec2->name); + if(fq_rec2->comment && no_comment == -1) { strcat(*fqread, " "); - strcat(*fqread, fqrec2->comment.s); + strcat(*fqread, fq_rec2->comment); } strcat(*fqread, "\n"); //4th line - strcat(*fqread, (fqrec2->qual.s)); + strcat(*fqread, (fq_rec2->qual)); strcat(*fqread, "\n"); } From 98db795d18459166cbadc28d5a96e87157f51198 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 17 Dec 2018 09:49:51 +1100 Subject: [PATCH 38/55] yet another milestone --- src/Makefile | 10 +++++----- src/demultiplex.c | 4 ++-- src/demultiplex.h | 7 ++++--- src/fastq.c | 4 ++-- src/fastq.h | 9 ++++----- src/sabre.c | 3 --- src/sabre.h | 4 ++-- src/utils.c | 2 +- src/utils.h | 5 +++-- 9 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/Makefile b/src/Makefile index cdb2462..1e7fde2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,9 +1,9 @@ # Source, Executable, Includes, Library Defines VERSION = 1.00 CC = gcc -INCL = kseq.h sabre.h +INCL = fastq.h sabre.h #SRC = demulti_paired.c demulti_single.c sabre.c utils.c -SRC = sabre.c usage.c demultiplex.c utils.c +SRC = sabre.c usage.c demultiplex.c utils.c fastq.c OBJ = $(SRC:.c=.o) DSRC=src @@ -23,9 +23,9 @@ default: build %.o: %.c $(CC) -c $(CFLAGS) $(SRC) -usage.o: usage.h sabre.h kseq.h -utils.o: utils.h sabre.h kseq.h -demultiplex.o: demultiplex.h utils.h sabre.h kseq.h +usage.o: usage.h sabre.h fastq.h +utils.o: utils.h sabre.h fastq.h +demultiplex.o: demultiplex.h utils.h sabre.h fastq.h sabre.o: sabre.h build: $(OBJ) diff --git a/src/demultiplex.c b/src/demultiplex.c index 43aeb5e..f08df5c 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -37,14 +37,14 @@ void* demult_runner(void *arg) //this is equivalent to if(false), which means this block //is always skipped, unless when there is an error/end of the file - if(get_read(fq_rec1, thread_data->params->fq1_fd)) { + if(get_fq_rec(fq_rec1, thread_data->params->fq1_fd)) { // sanity check no more reads 
pthread_mutex_unlock(thread_data->in_lock); break; } if(thread_data->params->paired > 0) { - if(get_read(fq_rec2, thread_data->params->fq2_fd)) { + if(get_fq_rec(fq_rec2, thread_data->params->fq2_fd)) { //error out there becuase if reached the end of the file //then we should hit first break, above, since the assumptions //that the files of equal length. If issues with R2 only this is an error diff --git a/src/demultiplex.h b/src/demultiplex.h index 355ca23..ddca50b 100644 --- a/src/demultiplex.h +++ b/src/demultiplex.h @@ -1,9 +1,10 @@ -#ifndef _DEMULTIPLEX_H -#define _DEMULTIPLEX_H +#ifndef DEMULTIPLEX_H +#define DEMULTIPLEX_H #include "sabre.h" #include "utils.h" +#include "fastq.h" void* demult_runner(void *arg); -#endif /*_DEMULTIPLEX_H*/ +#endif /*DEMULTIPLEX_H*/ diff --git a/src/fastq.c b/src/fastq.c index 146eaa3..05097d7 100644 --- a/src/fastq.c +++ b/src/fastq.c @@ -30,9 +30,9 @@ int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd) { char *ptr = strchr(fq_rec->name,' '); if (ptr) { *ptr='\0'; - read->comment = ptr+1; + fq_rec->comment = ptr+1; } else { - read->comment = NULL; + fq_rec->comment = NULL; } // before writing it out check that comment isn't null diff --git a/src/fastq.h b/src/fastq.h index 9c14f9e..2c9a239 100644 --- a/src/fastq.h +++ b/src/fastq.h @@ -1,5 +1,5 @@ -#ifndef _FASTQ_H -#define _FASTQ_H +#ifndef FASTQ_H +#define FASTQ_H #include "sabre.h" @@ -11,10 +11,9 @@ typedef struct { char seq[LINE_SIZE]; char other[LINE_SIZE]; char qual[LINE_SIZE]; - char *r2; } fq_rec_t; int get_line(gzFile fq_fd, char *line, int buff); -int get_fq_read(fq_rec_t *fq_rec, gzFile fq_fd); +int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd); -#endif /*_FASTQ_H*/ +#endif /*FASTQ_H*/ diff --git a/src/sabre.c b/src/sabre.c index a33f2a7..2536e33 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -39,9 +39,6 @@ int main(int argc, char *argv[]) { time_t start, end; start = time(NULL); - gzFile fq1_fd; - gzFile fq2_fd; - char *fq1_fn=NULL; char *fq2_fn=NULL; diff --git 
a/src/sabre.h b/src/sabre.h index 647ccb9..36e0d5c 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -52,8 +52,8 @@ typedef struct listel_p { } barcode_data_t; typedef struct { - fq_read_t *fq1_read; - fq_read_t *fq2_read; + gzFile fq1_fd; + gzFile fq2_fd; gzFile unassigned1_fd; gzFile unassigned2_fd; FILE* umis_2_short_fd; diff --git a/src/utils.c b/src/utils.c index eb737df..4572895 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1,4 +1,4 @@ -#include "sabre.h" +#include "utils.h" // https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692 // https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c diff --git a/src/utils.h b/src/utils.h index ace2f6f..1c9085c 100644 --- a/src/utils.h +++ b/src/utils.h @@ -2,6 +2,7 @@ #define UTILS_H #include "sabre.h" +#include "fastq.h" typedef struct barcodes_t { char *bc; @@ -21,8 +22,8 @@ char *strndup(const char *s, size_t n); const char * _mkdir (const char *dir); int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); -void get_fqread(char **fqread, kseq_t *fqrec, char *barcode, char *umi_idx, int no_comment, int n_crop); -void get_merged_fqread(char **fqread, kseq_t *fqrec1, kseq_t *fqrec2, char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_fqread(char **fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_merged_fqread(char **fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop); void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); void set_default_params(param_t *params); From a17892b1ef251fc230761aced78552b386ed33b2 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 17 Dec 2018 14:42:23 +1100 Subject: [PATCH 39/55] milestone, compiles and runs --- src/demultiplex.c | 169 ++++++++++++++++++++++++++-------------------- src/demultiplex.h | 4 ++ src/sabre.c | 59 ++++++++++------ src/utils.c | 44 ++++++------ 
src/utils.h | 2 +- 5 files changed, 159 insertions(+), 119 deletions(-) diff --git a/src/demultiplex.c b/src/demultiplex.c index f08df5c..a28f8f3 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -17,21 +17,32 @@ void* demult_runner(void *arg) { + char fqread1[MAX_READ_SIZE]; + char fqread2[MAX_READ_SIZE]; + + fqread1[0] = '\0'; + fqread2[0] = '\0'; + + char *fq_read_buff; + fq_read_buff = (char*) malloc(MAX_READ_BUFFER); + fq_read_buff[0] = '\0'; + int buff_cnt = 0; + fq_rec_t *fq_rec1; fq_rec_t *fq_rec2; fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); - fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); + fq_rec2 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); barcode_data_t *curr; thread_data_t* thread_data = (thread_data_t*)arg; int my_line_num; - /* Get reads, one at a time */ while(1) { + // lock reading pthread_mutex_lock(thread_data->in_lock); @@ -54,7 +65,6 @@ void* demult_runner(void *arg) \n %s\ \n", fq_rec1->name); - //should this be an error? pthread_mutex_unlock(thread_data->in_lock); exit(1); } @@ -68,25 +78,8 @@ void* demult_runner(void *arg) int n_crop = 0; char *actl_bc = NULL; - - char *fqread1 = NULL; - char *fqread2 = NULL; - - size_t fq_size = 0; - - fq_size += strlen(fq_rec1->seq); - fq_size += (strlen(fq_rec1->name)*2); - fq_size += strlen(fq_rec1->qual); - fq_size += (strlen(fq_rec1->comment)*2); - fq_size += 2;// header signs @ and + - fq_size += 2;//two colons (:) - fq_size += 4;//cariage returns - fq_size += 2;//two spaces - fq_size += 1000;//test - - if(thread_data->params->combine > 0) { - fq_size += strlen(fq_rec2->seq); - } + //char actl_bc[MAX_BARCODE_LENGTH]; + //actl_bc[0] = '\0'; /* Step 1: Find matching barcode */ curr = thread_data->curr; @@ -97,18 +90,23 @@ void* demult_runner(void *arg) actl_bc = strndup( (fq_rec1->seq)+n_crop, strlen(curr->bc) ); break; } + //else { + // fprintf(stdout, "GOTCHA %s \n", actl_bc); + // exit(1); + //} curr = curr->next; } + //fprintf(stdout, "HERE %s %s %s \n", fq_rec1->name, fq_rec1->seq, 
fq_rec1->qual); /* Step 2: Write read out into barcode specific file */ - // lock writing - while(*(thread_data->out_line_num) != my_line_num) { - pthread_cond_wait(thread_data->cv, thread_data->out_lock); - } - *thread_data->out_line_num += 1; + //// lock writing + //while(*(thread_data->out_line_num) != my_line_num) { + // pthread_cond_wait(thread_data->cv, thread_data->out_lock); + //} + //*thread_data->out_line_num += 1; - pthread_cond_broadcast(thread_data->cv); // Tell everyone it might be their turn! + //pthread_cond_broadcast(thread_data->cv); // Tell everyone it might be their turn! char *umi_idx = NULL; @@ -119,73 +117,94 @@ void* demult_runner(void *arg) const char *actl_umi_idx = (fq_rec1->seq)+strlen(curr->bc)+n_crop; if(strlen(actl_umi_idx) < thread_data->params->min_umi_len) { - //protect by mutex umis_2_short_file - fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); + //protect by mutex umis_2_short_file + pthread_mutex_lock(thread_data->out_lock); + fprintf(stdout, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); + pthread_mutex_unlock(thread_data->out_lock); + //fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); continue; } else { umi_idx = strdup(actl_umi_idx); umi_idx[thread_data->params->min_umi_len] = '\0'; - fq_size += strlen(umi_idx); } } - if(thread_data->params->combine > 0) { - fqread1 = (char*) malloc(fq_size); - fqread1[0] = '\0'; + if(thread_data->params->combine > 0 && actl_bc != NULL) { + //fqread1 = (char*) malloc(fq_size); + //fqread1[0] = '\0'; - get_merged_fqread(&fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); //protect by mutex 
umis_2_short_file - gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); + //pthread_mutex_lock(thread_data->out_lock); + ////fprintf(stdout, "%s", fqread1); + //gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); + //pthread_mutex_unlock(thread_data->out_lock); + strcat(fq_read_buff, fqread1); + buff_cnt++; } - else { - fqread1 = (char*) malloc(fq_size + 1); - fqread2 = (char*) malloc(fq_size + 1); - - fqread1[0] = '\0'; - fqread2[0] = '\0'; - - get_fqread(&fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + if(buff_cnt > MAX_READ_NUMBER-1) { + pthread_mutex_lock(thread_data->out_lock); gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); - - if(thread_data->params->paired > 0) { - get_fqread(&fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); - //fprintf(curr->bcfile2, "%s", fqread2); - gzwrite(curr->bcfile2, fqread2, strlen(fqread2)); - curr->num_records += 1; - } - } - curr->num_records += 1; - } - else { - fqread1 = (char*) malloc(fq_size + 1); - fqread2 = (char*) malloc(fq_size + 1); - - fqread1[0] = '\0'; - fqread2[0] = '\0'; - - get_fqread(&fqread1, fq_rec1, NULL, NULL, thread_data->params->no_comment, 0); - gzwrite(thread_data->params->unassigned1_fd, fqread1, strlen(fqread1)); - thread_data->metrics->num_unknown += 1; - - if(thread_data->params->paired > 0) { - get_fqread(&fqread2, fq_rec2, NULL, NULL, thread_data->params->no_comment, 0); - gzwrite(thread_data->params->unassigned2_fd, fqread2, strlen(fqread2)); - thread_data->metrics->num_unknown += 1; - } + fq_read_buff[0] = '\0'; + buff_cnt = 0; + pthread_mutex_unlock(thread_data->out_lock); + + } +// else { +// fqread1 = (char*) malloc(fq_size + 1); +// fqread2 = (char*) malloc(fq_size + 1); +// +// fqread1[0] = '\0'; +// fqread2[0] = '\0'; +// +// get_fqread(&fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); +// pthread_mutex_lock(thread_data->out_lock); +// gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); +// 
pthread_mutex_unlock(thread_data->out_lock); +// +// if(thread_data->params->paired > 0) { +// get_fqread(&fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); +// //fprintf(curr->bcfile2, "%s", fqread2); +// pthread_mutex_lock(thread_data->out_lock); +// gzwrite(curr->bcfile2, fqread2, strlen(fqread2)); +// curr->num_records += 1; +// pthread_mutex_unlock(thread_data->out_lock); +// } +// } +// curr->num_records += 1; +// } +// else { +// fqread1 = (char*) malloc(fq_size + 1); +// fqread2 = (char*) malloc(fq_size + 1); +// +// fqread1[0] = '\0'; +// fqread2[0] = '\0'; +// +// get_fqread(&fqread1, fq_rec1, NULL, NULL, thread_data->params->no_comment, 0); +// pthread_mutex_lock(thread_data->out_lock); +// gzwrite(thread_data->params->unassigned1_fd, fqread1, strlen(fqread1)); +// thread_data->metrics->num_unknown += 1; +// pthread_mutex_unlock(thread_data->out_lock); +// +// if(thread_data->params->paired > 0) { +// get_fqread(&fqread2, fq_rec2, NULL, NULL, thread_data->params->no_comment, 0); +// pthread_mutex_lock(thread_data->out_lock); +// gzwrite(thread_data->params->unassigned2_fd, fqread2, strlen(fqread2)); +// thread_data->metrics->num_unknown += 1; +// pthread_mutex_unlock(thread_data->out_lock); +// } } thread_data->metrics->total += 2; // unlock writing - pthread_mutex_unlock(thread_data->out_lock); + //pthread_mutex_unlock(thread_data->out_lock); - free(fqread1); - free(fqread2); - free(actl_bc); - free(umi_idx); } + //free(actl_bc); + free(fq_read_buff); free(thread_data); return NULL; diff --git a/src/demultiplex.h b/src/demultiplex.h index ddca50b..3ffbaae 100644 --- a/src/demultiplex.h +++ b/src/demultiplex.h @@ -5,6 +5,10 @@ #include "utils.h" #include "fastq.h" +#define MAX_READ_SIZE 2048 +#define MAX_READ_NUMBER 1000 +#define MAX_READ_BUFFER MAX_READ_SIZE*MAX_READ_NUMBER + void* demult_runner(void *arg); #endif /*DEMULTIPLEX_H*/ diff --git a/src/sabre.c b/src/sabre.c index 2536e33..255bd0d 100644 --- a/src/sabre.c +++ 
b/src/sabre.c @@ -42,18 +42,19 @@ int main(int argc, char *argv[]) { char *fq1_fn=NULL; char *fq2_fn=NULL; - FILE* barfile = NULL; - char *barfn=NULL; + FILE* bc_fd; + char *bc_fn=NULL; char *unassigned1_fn=strdup("unassigned_R1.fq.gz"); char *unassigned2_fn=strdup("unassigned_R2.fq.gz"); char *umis_2_short_fn=strdup("umis_too_short.txt"); FILE* log_file=NULL; + char *log_fn=strdup("stats.txt"); + int optc; extern char *optarg; - char *log_fn=NULL; char s_name[MAX_FILENAME_LENGTH]; barcode_data_t *curr, *head, *temp; @@ -82,8 +83,8 @@ int main(int argc, char *argv[]) { break; case 'b': - barfn = (char*) malloc (strlen (optarg) + 1); - strcpy (barfn, optarg); + bc_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (bc_fn, optarg); break; case 'z': @@ -144,7 +145,7 @@ int main(int argc, char *argv[]) { //is printed it wasn't intended by user (or at least we don't know that) //and therefore exit code - fail case 'h': - version(EXIT_SUCCESS); + usage(EXIT_SUCCESS); break; case 'o': @@ -160,10 +161,23 @@ int main(int argc, char *argv[]) { break; } } + // TODO check that what's requre if not run usage + + params.fq1_fd = gzopen(fq1_fn, "r"); + params.fq2_fd = gzopen(fq2_fn, "r"); params.fq1_fd = gzopen(fq1_fn, "r"); + if(!params.fq1_fd) { + fprintf(stderr, "ERROR: Could not open input file R1 '%s'.\n", fq1_fn); + exit(EXIT_FAILURE); + } + params.fq2_fd = gzopen(fq2_fn, "r"); + if(!params.fq2_fd) { + fprintf(stderr, "ERROR: Could not open input file R2 '%s'.\n", fq2_fn); + exit(EXIT_FAILURE); + } params.unassigned1_fd = gzopen(unassigned1_fn, "wb"); params.unassigned2_fd = gzopen(unassigned1_fn, "wb"); @@ -198,7 +212,7 @@ int main(int argc, char *argv[]) { \n In Progess...\ \n", PROGRAM_NAME,\ fq1_fn, fq2_fn,\ - barfn,\ + bc_fn,\ unassigned1_fn, unassigned2_fn,\ params.combine, params.umi,\ params.mismatch, params.min_umi_len, @@ -210,8 +224,10 @@ int main(int argc, char *argv[]) { /* Creating linked list of barcode data */ // 
https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ // where each node is represents one barcode from the barcode file + bc_fd = fopen(bc_fn, "r"); head = NULL; - while (fscanf (barfile, "%s%s", barcode, s_name) != EOF) { + curr = NULL; + while (fscanf (bc_fd, "%s\t%s", barcode, s_name) != EOF) { curr = (barcode_data_t*) malloc(sizeof(barcode_data_t)); curr->bc = (char*) malloc(strlen(barcode) + 1); strcpy(curr->bc, barcode); @@ -239,13 +255,14 @@ int main(int argc, char *argv[]) { free(bcout_fn1); free(bcout_fn2); - free(barfn); + free(bc_fn); free(fq1_fn); free(fq2_fn); free(unassigned1_fn); free(unassigned2_fn); free(umis_2_short_fn); + // Threading pthread_t tid[threads]; pthread_mutex_t in_lock; @@ -262,17 +279,17 @@ int main(int argc, char *argv[]) { for(int i=0; i < threads; i++) { - thread_data->params = ¶ms; - thread_data->curr = curr; - thread_data->metrics = &metrics; - thread_data->id = i; - thread_data->in_lock = &in_lock; - thread_data->out_lock = &out_lock; - thread_data->line_num = &line_num; - thread_data->out_line_num = &out_line_num; - thread_data->cv = &cv; - - pthread_create(&(tid[i]), NULL, &demult_runner, thread_data); + thread_data[i].params = ¶ms; + thread_data[i].curr = curr; + thread_data[i].metrics = &metrics; + thread_data[i].id = i; + thread_data[i].in_lock = &in_lock; + thread_data[i].out_lock = &out_lock; + thread_data[i].line_num = &line_num; + thread_data[i].out_line_num = &out_line_num; + thread_data[i].cv = &cv; + + pthread_create(&(tid[i]), NULL, &demult_runner, thread_data+i); } for(int i=0; i < threads; i++) { @@ -320,7 +337,7 @@ int main(int argc, char *argv[]) { // good read :) little_story(EXIT_SUCCESS); - fclose(barfile); + fclose(bc_fd); fclose(log_file); params_destroy(¶ms); diff --git a/src/utils.c b/src/utils.c index 4572895..78a61f1 100644 --- a/src/utils.c +++ b/src/utils.c @@ -157,45 +157,45 @@ void get_fqread(char **fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, i 
strcat(*fqread, "\n"); } -void get_merged_fqread(char **fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop) { - +void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop) { + fqread[0] = '\0'; //@READNAME:BACRCODE:UMI //1st line - strcat(*fqread, "@"); - strcat(*fqread, fq_rec1->name); + strcat(fqread, fq_rec1->name); //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI if(barcode) { - strcat(*fqread, ":"); - strcat(*fqread, barcode); + strcat(fqread, ":"); + strcat(fqread, barcode); } if(umi_idx) { - strcat(*fqread, ":"); - strcat(*fqread, umi_idx); + strcat(fqread, ":"); + strcat(fqread, umi_idx); } if(fq_rec1->comment && no_comment == -1) { - strcat(*fqread, " "); - strcat(*fqread, fq_rec1->comment); + strcat(fqread, " "); + strcat(fqread, fq_rec1->comment); } - strcat(*fqread, "\n"); + strcat(fqread, "\n"); //2nd line - strcat(*fqread, fq_rec2->seq); - strcat(*fqread, "\n"); + strcat(fqread, fq_rec2->seq); + strcat(fqread, "\n"); //3rd line - strcat(*fqread, "+"); - strcat(*fqread, fq_rec2->name); - if(fq_rec2->comment && no_comment == -1) { - strcat(*fqread, " "); - strcat(*fqread, fq_rec2->comment); - } - strcat(*fqread, "\n"); + strcat(fqread, "+"); + strcat(fqread, ""); + //strcat(fqread, fq_rec2->name); + //if(fq_rec2->comment && no_comment == -1) { + // strcat(fqread, " "); + // strcat(fqread, fq_rec2->comment); + //} + strcat(fqread, "\n"); //4th line - strcat(*fqread, (fq_rec2->qual)); - strcat(*fqread, "\n"); + strcat(fqread, (fq_rec2->qual)); + strcat(fqread, "\n"); } void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type) { diff --git a/src/utils.h b/src/utils.h index 1c9085c..da82afa 100644 --- a/src/utils.h +++ b/src/utils.h @@ -23,7 +23,7 @@ char *strndup(const char *s, size_t n); const char * _mkdir (const char *dir); int chk_bc_mtch(const char *orig_bc, const char 
*orig_read, size_t mismatch, int max_5prime_crop); void get_fqread(char **fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop); -void get_merged_fqread(char **fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop); void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); void set_default_params(param_t *params); From cfc3bfa0575433ab1c147a9018d4aa77b0bb076c Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 17 Dec 2018 16:34:19 +1100 Subject: [PATCH 40/55] check --- src/demultiplex.c | 132 ++++++++++++++++++++++++---------------------- src/demultiplex.h | 3 +- src/sabre.h | 12 ++++- src/utils.c | 41 +++++++------- src/utils.h | 2 +- 5 files changed, 103 insertions(+), 87 deletions(-) diff --git a/src/demultiplex.c b/src/demultiplex.c index a28f8f3..aa7dee7 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -23,10 +23,26 @@ void* demult_runner(void *arg) fqread1[0] = '\0'; fqread2[0] = '\0'; - char *fq_read_buff; - fq_read_buff = (char*) malloc(MAX_READ_BUFFER); - fq_read_buff[0] = '\0'; + char *fq_read1_buff; + fq_read1_buff = (char*) malloc(MAX_READ_BUFFER); + + char *fq_read2_buff; + fq_read2_buff = (char*) malloc(MAX_READ_BUFFER); + + fq_read1_buff[0] = '\0'; + fq_read2_buff[0] = '\0'; + + char *fq_read1_unass_buff; + fq_read1_unass_buff = (char*) malloc(MAX_READ_BUFFER); + + char *fq_read2_unass_buff; + fq_read2_unass_buff = (char*) malloc(MAX_READ_BUFFER); + + fq_read1_unass_buff[0] = '\0'; + fq_read2_unass_buff[0] = '\0'; + int buff_cnt = 0; + int buff_unass_cnt = 0; fq_rec_t *fq_rec1; fq_rec_t *fq_rec2; @@ -90,10 +106,6 @@ void* demult_runner(void *arg) actl_bc = strndup( (fq_rec1->seq)+n_crop, strlen(curr->bc) ); break; } - //else { - // fprintf(stdout, "GOTCHA %s \n", actl_bc); - // exit(1); - //} curr = curr->next; } @@ 
-120,8 +132,8 @@ void* demult_runner(void *arg) //protect by mutex umis_2_short_file pthread_mutex_lock(thread_data->out_lock); fprintf(stdout, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); + fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); pthread_mutex_unlock(thread_data->out_lock); - //fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); continue; } else { @@ -131,69 +143,60 @@ void* demult_runner(void *arg) } if(thread_data->params->combine > 0 && actl_bc != NULL) { - //fqread1 = (char*) malloc(fq_size); - //fqread1[0] = '\0'; - get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); - //protect by mutex umis_2_short_file - //pthread_mutex_lock(thread_data->out_lock); - ////fprintf(stdout, "%s", fqread1); - //gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); - //pthread_mutex_unlock(thread_data->out_lock); - strcat(fq_read_buff, fqread1); + strcat(fq_read1_buff, fqread1); buff_cnt++; } + else { + get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + strcat(fq_read1_buff, fqread1); + buff_cnt++; + + if(thread_data->params->paired > 0) { + get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + strcat(fq_read2_buff, fqread2); + //dont need to increment buff_cnt, assuming fq_read1 keeps the right count + curr->num_records += 1; + } + } + curr->num_records += 1; + if(buff_cnt > MAX_READ_NUMBER-1) { pthread_mutex_lock(thread_data->out_lock); - gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); - fq_read_buff[0] = '\0'; + gzwrite(curr->bcfile1, fq_read1_buff, strlen(fq_read1_buff)); + fq_read1_buff[0] = '\0'; buff_cnt = 0; + if(thread_data->params->paired > 0) { + gzwrite(curr->bcfile2, 
fq_read2_buff, strlen(fq_read2_buff)); + fq_read1_buff[0] = '\0'; + } + pthread_mutex_unlock(thread_data->out_lock); + } + } + else { + + get_fqread(fqread1, fq_rec1, NULL, NULL, thread_data->params->no_comment, 0); + strcat(fq_read1_unass_buff, fqread1); + thread_data->metrics->num_unknown += 1; + buff_unass_cnt++; + + if(thread_data->params->paired > 0) { + get_fqread(fqread2, fq_rec2, NULL, NULL, thread_data->params->no_comment, 0); + strcat(fq_read2_unass_buff, fqread2); + thread_data->metrics->num_unknown += 1; + } + + if(buff_unass_cnt > MAX_READ_NUMBER-1) { + pthread_mutex_lock(thread_data->out_lock); + gzwrite(thread_data->params->unassigned1_fd, fq_read1_unass_buff, strlen(fq_read1_unass_buff)); + fq_read1_unass_buff[0] = '\0'; + buff_unass_cnt = 0; + if(thread_data->params->paired > 0) { + gzwrite(thread_data->params->unassigned2_fd, fq_read2_unass_buff, strlen(fq_read2_unass_buff)); + fq_read2_unass_buff[0] = '\0'; + } pthread_mutex_unlock(thread_data->out_lock); - } -// else { -// fqread1 = (char*) malloc(fq_size + 1); -// fqread2 = (char*) malloc(fq_size + 1); -// -// fqread1[0] = '\0'; -// fqread2[0] = '\0'; -// -// get_fqread(&fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); -// pthread_mutex_lock(thread_data->out_lock); -// gzwrite(curr->bcfile1, fqread1, strlen(fqread1)); -// pthread_mutex_unlock(thread_data->out_lock); -// -// if(thread_data->params->paired > 0) { -// get_fqread(&fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); -// //fprintf(curr->bcfile2, "%s", fqread2); -// pthread_mutex_lock(thread_data->out_lock); -// gzwrite(curr->bcfile2, fqread2, strlen(fqread2)); -// curr->num_records += 1; -// pthread_mutex_unlock(thread_data->out_lock); -// } -// } -// curr->num_records += 1; -// } -// else { -// fqread1 = (char*) malloc(fq_size + 1); -// fqread2 = (char*) malloc(fq_size + 1); -// -// fqread1[0] = '\0'; -// fqread2[0] = '\0'; -// -// get_fqread(&fqread1, fq_rec1, NULL, NULL, 
thread_data->params->no_comment, 0); -// pthread_mutex_lock(thread_data->out_lock); -// gzwrite(thread_data->params->unassigned1_fd, fqread1, strlen(fqread1)); -// thread_data->metrics->num_unknown += 1; -// pthread_mutex_unlock(thread_data->out_lock); -// -// if(thread_data->params->paired > 0) { -// get_fqread(&fqread2, fq_rec2, NULL, NULL, thread_data->params->no_comment, 0); -// pthread_mutex_lock(thread_data->out_lock); -// gzwrite(thread_data->params->unassigned2_fd, fqread2, strlen(fqread2)); -// thread_data->metrics->num_unknown += 1; -// pthread_mutex_unlock(thread_data->out_lock); -// } } thread_data->metrics->total += 2; @@ -204,7 +207,8 @@ void* demult_runner(void *arg) } //free(actl_bc); - free(fq_read_buff); + free(fq_read1_buff); + free(fq_read2_buff); free(thread_data); return NULL; diff --git a/src/demultiplex.h b/src/demultiplex.h index 3ffbaae..22e3417 100644 --- a/src/demultiplex.h +++ b/src/demultiplex.h @@ -6,7 +6,8 @@ #include "fastq.h" #define MAX_READ_SIZE 2048 -#define MAX_READ_NUMBER 1000 +//#define MAX_READ_NUMBER 100000 +#define MAX_READ_NUMBER 15000 #define MAX_READ_BUFFER MAX_READ_SIZE*MAX_READ_NUMBER void* demult_runner(void *arg); diff --git a/src/sabre.h b/src/sabre.h index 36e0d5c..f7dbbcc 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -43,7 +43,11 @@ typedef struct actl_bc_cnt { } actl_bc_cnt; typedef struct listel_p { - char* bc; + //TODO this is future implementation + // this is a pointer to an array of pointers + //char **bc; + //typedef struct barcode_t *b + char *bc; int num_records; gzFile bcfile1; gzFile bcfile2; @@ -51,6 +55,12 @@ typedef struct listel_p { struct listel_p *next; } barcode_data_t; +//TODO this is for future implementation +//typdef strcut { +// char *bc; +// barcode_data_t *bacrcode_data; +//} barcode_t + typedef struct { gzFile fq1_fd; gzFile fq2_fd; diff --git a/src/utils.c b/src/utils.c index 78a61f1..f45271c 100644 --- a/src/utils.c +++ b/src/utils.c @@ -106,7 +106,9 @@ int chk_bc_mtch(const char 
*orig_bc, const char *orig_read, size_t mismatch, int // https://stackoverflow.com/questions/21880730/c-what-is-the-best-and-fastest-way-to-concatenate-strings //TODO this is a fastq mystrcat function, that returns a pointer to the end of the string -void get_fqread(char **fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop) { +void get_fqread(char *fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop) { + + fqread[0] = '\0'; if(n_crop < 0) { fprintf(stderr, @@ -117,48 +119,47 @@ void get_fqread(char **fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, i //@READNAME:BACRCODE:UMI //1st line - strcat(*fqread, "@"); - strcat(*fqread, fq_rec->name); + strcat(fqread, fq_rec->name); //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI if(barcode) { - strcat(*fqread, ":"); - strcat(*fqread, barcode); + strcat(fqread, ":"); + strcat(fqread, barcode); } else if(!barcode) { barcode = ""; } if(umi_idx) { - strcat(*fqread, ":"); - strcat(*fqread, umi_idx); + strcat(fqread, ":"); + strcat(fqread, umi_idx); } if(fq_rec->comment && no_comment == -1) { - strcat(*fqread, " "); - strcat(*fqread, fq_rec->comment); + strcat(fqread, " "); + strcat(fqread, fq_rec->comment); } - strcat(*fqread, "\n"); + strcat(fqread, "\n"); //2nd line - strcat(*fqread, (fq_rec->seq)+strlen(barcode)+n_crop); - strcat(*fqread, "\n"); + strcat(fqread, (fq_rec->seq)+strlen(barcode)+n_crop); + strcat(fqread, "\n"); //3rd line - strcat(*fqread, "+"); - strcat(*fqread, fq_rec->name); + strcat(fqread, "+"); + strcat(fqread, fq_rec->name); if(fq_rec->comment && no_comment == -1) { - strcat(*fqread, " "); - strcat(*fqread, fq_rec->comment); + strcat(fqread, " "); + strcat(fqread, fq_rec->comment); } - strcat(*fqread, "\n"); + strcat(fqread, "\n"); //4th line - strcat(*fqread, (fq_rec->qual)+strlen(barcode)+n_crop); - strcat(*fqread, "\n"); + strcat(fqread, (fq_rec->qual)+strlen(barcode)+n_crop); + strcat(fqread, 
"\n"); } void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop) { - fqread[0] = '\0'; + fqread[0] = '\0'; //@READNAME:BACRCODE:UMI //1st line strcat(fqread, fq_rec1->name); diff --git a/src/utils.h b/src/utils.h index da82afa..3a9e32b 100644 --- a/src/utils.h +++ b/src/utils.h @@ -22,7 +22,7 @@ char *strndup(const char *s, size_t n); const char * _mkdir (const char *dir); int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); -void get_fqread(char **fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_fqread(char *fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop); void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop); void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); From 82bcba998c94ff663242162696898e68135f725b Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 17 Dec 2018 22:09:20 +1100 Subject: [PATCH 41/55] huzzah! --- src/demultiplex.c | 59 +++++++++++++++++++++++++++++++++-------------- src/sabre.c | 40 +++++++++++++++++++++----------- src/sabre.h | 5 ++-- src/utils.c | 4 ++-- 4 files changed, 73 insertions(+), 35 deletions(-) diff --git a/src/demultiplex.c b/src/demultiplex.c index aa7dee7..44aa551 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -51,9 +51,10 @@ void* demult_runner(void *arg) fq_rec2 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); barcode_data_t *curr; + curr = NULL; thread_data_t* thread_data = (thread_data_t*)arg; - int my_line_num; + //int my_line_num; /* Get reads, one at a time */ @@ -87,32 +88,40 @@ void* demult_runner(void *arg) } // unlock reading - my_line_num = *(thread_data->line_num); - *thread_data->line_num += 1; + // TODO this bit of code for ordered fastq files, implement later? 
+ //my_line_num = *(thread_data->line_num); + //*thread_data->line_num += 1; pthread_mutex_unlock(thread_data->in_lock); int n_crop = 0; char *actl_bc = NULL; - //char actl_bc[MAX_BARCODE_LENGTH]; - //actl_bc[0] = '\0'; /* Step 1: Find matching barcode */ + int got_match = 0; curr = thread_data->curr; while(curr) { - n_crop = chk_bc_mtch(curr->bc, fq_rec1->seq, thread_data->params->mismatch, thread_data->params->max_5prime_crop); - if(n_crop >= 0) { - //found matching barcode - actl_bc = strndup( (fq_rec1->seq)+n_crop, strlen(curr->bc) ); - break; - } + + if(got_match) { + break; + } + + for (int i=0; curr->bc[i]; i++) { + n_crop = chk_bc_mtch(curr->bc[i], fq_rec1->seq, thread_data->params->mismatch, thread_data->params->max_5prime_crop); + if(n_crop >= 0) { + //found matching barcode + actl_bc = strndup( (fq_rec1->seq)+n_crop, strlen(curr->bc[i]) ); + got_match = 1; + } + + } curr = curr->next; } - //fprintf(stdout, "HERE %s %s %s \n", fq_rec1->name, fq_rec1->seq, fq_rec1->qual); /* Step 2: Write read out into barcode specific file */ - - //// lock writing + //TODO this bit of code to keep fastq files ordered as per original fastq files + //which I don't think that needed? 
at least not at this stage + // lock writing //while(*(thread_data->out_line_num) != my_line_num) { // pthread_cond_wait(thread_data->cv, thread_data->out_lock); //} @@ -126,12 +135,10 @@ void* demult_runner(void *arg) //for now assume barcode and umi are in R1 read if(thread_data->params->umi > 0) { - const char *actl_umi_idx = (fq_rec1->seq)+strlen(curr->bc)+n_crop; + const char *actl_umi_idx = (fq_rec1->seq)+strlen(actl_bc)+n_crop; if(strlen(actl_umi_idx) < thread_data->params->min_umi_len) { - //protect by mutex umis_2_short_file pthread_mutex_lock(thread_data->out_lock); - fprintf(stdout, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); pthread_mutex_unlock(thread_data->out_lock); continue; @@ -206,6 +213,24 @@ void* demult_runner(void *arg) } + if(strlen(fq_read1_buff) > 0) { + pthread_mutex_lock(thread_data->out_lock); + gzwrite(curr->bcfile1, fq_read1_buff, strlen(fq_read1_buff)); + if(thread_data->params->paired > 0) { + gzwrite(curr->bcfile2, fq_read2_buff, strlen(fq_read2_buff)); + } + pthread_mutex_unlock(thread_data->out_lock); + } + if(strlen(fq_read1_unass_buff) > 0) { + pthread_mutex_lock(thread_data->out_lock); + gzwrite(thread_data->params->unassigned1_fd, fq_read1_unass_buff, strlen(fq_read1_unass_buff)); + if(thread_data->params->paired > 0) { + gzwrite(thread_data->params->unassigned2_fd, fq_read2_unass_buff, strlen(fq_read2_unass_buff)); + } + pthread_mutex_unlock(thread_data->out_lock); + } + //TODO need to print out very last buffer here + // //free(actl_bc); free(fq_read1_buff); free(fq_read2_buff); diff --git a/src/sabre.c b/src/sabre.c index 255bd0d..f0a6441 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -55,10 +55,7 @@ int main(int argc, char *argv[]) { int optc; extern char *optarg; - - char s_name[MAX_FILENAME_LENGTH]; 
barcode_data_t *curr, *head, *temp; - char barcode [MAX_BARCODE_LENGTH]; int threads=4; @@ -210,6 +207,7 @@ int main(int argc, char *argv[]) { \n --threads %d\ \n\ \n In Progess...\ + \n\ \n", PROGRAM_NAME,\ fq1_fn, fq2_fn,\ bc_fn,\ @@ -227,27 +225,41 @@ int main(int argc, char *argv[]) { bc_fd = fopen(bc_fn, "r"); head = NULL; curr = NULL; - while (fscanf (bc_fd, "%s\t%s", barcode, s_name) != EOF) { + + char line_buff[1024]; + while(fgets(line_buff, 1024, bc_fd)) { curr = (barcode_data_t*) malloc(sizeof(barcode_data_t)); - curr->bc = (char*) malloc(strlen(barcode) + 1); - strcpy(curr->bc, barcode); + + char *p = strtok(line_buff, "\t"); + char *s_name = strdup(p); + + p = strtok(NULL, "\t"); + curr->bc_grp = strdup(p); bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn1[0] = '\0'; - get_bc_fn(&bcout_fn1, s_name, curr->bc, 1); - //curr->bcfile1 = fopen (_mkdir(bcout_fn1), "w"); + get_bc_fn(&bcout_fn1, s_name, curr->bc_grp, 1); curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); - //curr->bcfile1 = popen(_mkdir(bcout_fn1), "wb"); - // popen returns file handler if(params.paired > 0 && params.combine < 0) { bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn2[0] = '\0'; - get_bc_fn(&bcout_fn2, s_name, curr->bc, 2); - //curr->bcfile2 = fopen (_mkdir(bcout_fn2), "w"); + get_bc_fn(&bcout_fn2, s_name, curr->bc_grp, 2); curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); } + //TODO for hardcode max limit of items in the barcodes file to 6 + curr->bc = calloc(6, sizeof(void*)); + + int i=0; + while(i<5 && (p = strtok(NULL, "\t\n"))) { + // remove the token, new line char + curr->bc[i] = strdup(p); + fprintf(stdout, " BC %s ", curr->bc[i]); + i++; + } + fprintf(stdout, "\n"); + curr->num_records = 0; curr->next = head; head = curr; @@ -262,7 +274,6 @@ int main(int argc, char *argv[]) { free(unassigned2_fn); free(umis_2_short_fn); - // Threading pthread_t tid[threads]; pthread_mutex_t in_lock; @@ -317,7 +328,7 @@ int main(int argc, char *argv[]) { int n_pairs = 
curr->num_records/2; float percent_pairs = (float) n_pairs/total_pairs; - fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc, curr->num_records, n_pairs, percent_pairs); + fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, curr->num_records, n_pairs, percent_pairs); curr = curr->next; } @@ -348,6 +359,7 @@ int main(int argc, char *argv[]) { gzclose(curr->bcfile1); gzclose(curr->bcfile2); + free (curr->bc_grp); free (curr->bc); temp = curr; curr = curr->next; diff --git a/src/sabre.h b/src/sabre.h index f7dbbcc..5b1dd83 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -44,10 +44,11 @@ typedef struct actl_bc_cnt { typedef struct listel_p { //TODO this is future implementation - // this is a pointer to an array of pointers //char **bc; //typedef struct barcode_t *b - char *bc; + // this is a pointer to an array of pointers + char **bc; + char *bc_grp; int num_records; gzFile bcfile1; gzFile bcfile2; diff --git a/src/utils.c b/src/utils.c index f45271c..2f56d87 100644 --- a/src/utils.c +++ b/src/utils.c @@ -54,11 +54,9 @@ const char * _mkdir(const char *file_path) { //strcmp can be used for sorting, returns pos, zero, neg //BUT this new implementation can't be used as such just FYI int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop) { - int orig_read_len = strlen(orig_read); int orig_bc_len = strlen(orig_bc); int n_crop = 0; - if(orig_bc_len > orig_read_len) { fprintf (stderr, "WARNING: Length of the barcode %d is greater than length of the reads %d.", @@ -82,6 +80,8 @@ int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int u1 = *bc++; u2 = *read++; + //fprintf(stdout, "BC %s READ %s\n", u1, u2); + if (u1 != u2) { cnt++; if (cnt > mismatch) { From 3782b8b34773e71683dcb3febfa94d3496653ebd Mon Sep 17 00:00:00 2001 From: serine Date: Tue, 18 Dec 2018 07:36:05 +1100 Subject: [PATCH 42/55] tweak --- src/demultiplex.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/src/demultiplex.c b/src/demultiplex.c index 44aa551..623527d 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -102,19 +102,21 @@ void* demult_runner(void *arg) curr = thread_data->curr; while(curr) { - if(got_match) { - break; - } - for (int i=0; curr->bc[i]; i++) { n_crop = chk_bc_mtch(curr->bc[i], fq_rec1->seq, thread_data->params->mismatch, thread_data->params->max_5prime_crop); if(n_crop >= 0) { //found matching barcode actl_bc = strndup( (fq_rec1->seq)+n_crop, strlen(curr->bc[i]) ); got_match = 1; + break; } } + + if(got_match) { + break; + } + curr = curr->next; } From 02a9c253a1c64eda9c7cd4edae6f80026d7b3530 Mon Sep 17 00:00:00 2001 From: serine Date: Thu, 21 Feb 2019 15:47:01 +1100 Subject: [PATCH 43/55] check2 --- .gitignore | 3 ++- src/sabre.c | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index a8275f3..135df85 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ sabre-dev metrics # ignore vim swap files *.swp - +*.gz +/tmp diff --git a/src/sabre.c b/src/sabre.c index f0a6441..85d1b5c 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -227,6 +227,7 @@ int main(int argc, char *argv[]) { curr = NULL; char line_buff[1024]; + int max_items = 6; while(fgets(line_buff, 1024, bc_fd)) { curr = (barcode_data_t*) malloc(sizeof(barcode_data_t)); @@ -249,11 +250,11 @@ int main(int argc, char *argv[]) { } //TODO for hardcode max limit of items in the barcodes file to 6 - curr->bc = calloc(6, sizeof(void*)); + curr->bc = calloc(max_items, sizeof(void*)); int i=0; - while(i<5 && (p = strtok(NULL, "\t\n"))) { - // remove the token, new line char + while(i <= max_items && (p = strtok(NULL, "\t\n"))) { + // remove the token, new line char curr->bc[i] = strdup(p); fprintf(stdout, " BC %s ", curr->bc[i]); i++; @@ -328,7 +329,8 @@ int main(int argc, char *argv[]) { int n_pairs = curr->num_records/2; float percent_pairs = (float) n_pairs/total_pairs; - fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, 
curr->num_records, n_pairs, percent_pairs); + //fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, curr->num_records, n_pairs, percent_pairs); + fprintf (stdout,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, curr->num_records, n_pairs, percent_pairs); curr = curr->next; } From 4429c005f8952f0c55bee14fe7fdf4b4f189d162 Mon Sep 17 00:00:00 2001 From: serine Date: Thu, 21 Feb 2019 21:46:54 +1100 Subject: [PATCH 44/55] removed gzWrite because it appears that it was very slow, although gziping by itself is slow as well and it is hard to tell what's better gzipping as I go or using stand alone pigz tools? I think the concern was that gzip ing worked better once the file stoped changing? - also in this commit some minor bug fixes --- src/demultiplex.c | 112 +++++++++++++--------------------------------- src/sabre.c | 34 +++++++------- src/sabre.h | 10 ++--- src/utils.c | 12 ++--- 4 files changed, 58 insertions(+), 110 deletions(-) diff --git a/src/demultiplex.c b/src/demultiplex.c index 623527d..901afe0 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -6,10 +6,10 @@ * */ -/* +/* * sabre FASTQ files demultiplexing * demultiplex.c: FASTQ demultiplexing - * + * */ #include "demultiplex.h" @@ -23,27 +23,6 @@ void* demult_runner(void *arg) fqread1[0] = '\0'; fqread2[0] = '\0'; - char *fq_read1_buff; - fq_read1_buff = (char*) malloc(MAX_READ_BUFFER); - - char *fq_read2_buff; - fq_read2_buff = (char*) malloc(MAX_READ_BUFFER); - - fq_read1_buff[0] = '\0'; - fq_read2_buff[0] = '\0'; - - char *fq_read1_unass_buff; - fq_read1_unass_buff = (char*) malloc(MAX_READ_BUFFER); - - char *fq_read2_unass_buff; - fq_read2_unass_buff = (char*) malloc(MAX_READ_BUFFER); - - fq_read1_unass_buff[0] = '\0'; - fq_read2_unass_buff[0] = '\0'; - - int buff_cnt = 0; - int buff_unass_cnt = 0; - fq_rec_t *fq_rec1; fq_rec_t *fq_rec2; @@ -68,7 +47,7 @@ void* demult_runner(void *arg) if(get_fq_rec(fq_rec1, thread_data->params->fq1_fd)) { // sanity check no more reads 
pthread_mutex_unlock(thread_data->in_lock); - break; + break; } if(thread_data->params->paired > 0) { @@ -83,7 +62,7 @@ void* demult_runner(void *arg) \n", fq_rec1->name); pthread_mutex_unlock(thread_data->in_lock); - exit(1); + exit(1); } } @@ -114,7 +93,7 @@ void* demult_runner(void *arg) } if(got_match) { - break; + break; } curr = curr->next; @@ -153,90 +132,61 @@ void* demult_runner(void *arg) if(thread_data->params->combine > 0 && actl_bc != NULL) { get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); - strcat(fq_read1_buff, fqread1); - buff_cnt++; + + pthread_mutex_lock(thread_data->out_lock); + fprintf(curr->bcfile1, fqread1); + pthread_mutex_unlock(thread_data->out_lock); + } else { get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); - strcat(fq_read1_buff, fqread1); - buff_cnt++; + + pthread_mutex_lock(thread_data->out_lock); + fprintf(curr->bcfile1, fqread1); + pthread_mutex_unlock(thread_data->out_lock); if(thread_data->params->paired > 0) { get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); - strcat(fq_read2_buff, fqread2); + + pthread_mutex_lock(thread_data->out_lock); + fprintf(curr->bcfile2, fqread2); + pthread_mutex_unlock(thread_data->out_lock); + //dont need to increment buff_cnt, assuming fq_read1 keeps the right count curr->num_records += 1; } } curr->num_records += 1; - if(buff_cnt > MAX_READ_NUMBER-1) { - pthread_mutex_lock(thread_data->out_lock); - gzwrite(curr->bcfile1, fq_read1_buff, strlen(fq_read1_buff)); - fq_read1_buff[0] = '\0'; - buff_cnt = 0; - if(thread_data->params->paired > 0) { - gzwrite(curr->bcfile2, fq_read2_buff, strlen(fq_read2_buff)); - fq_read1_buff[0] = '\0'; - } - pthread_mutex_unlock(thread_data->out_lock); - } } else { get_fqread(fqread1, fq_rec1, NULL, NULL, thread_data->params->no_comment, 0); - strcat(fq_read1_unass_buff, fqread1); + + pthread_mutex_lock(thread_data->out_lock); + 
fprintf(thread_data->params->unassigned1_fd, fqread1); + pthread_mutex_unlock(thread_data->out_lock); + thread_data->metrics->num_unknown += 1; - buff_unass_cnt++; if(thread_data->params->paired > 0) { get_fqread(fqread2, fq_rec2, NULL, NULL, thread_data->params->no_comment, 0); - strcat(fq_read2_unass_buff, fqread2); - thread_data->metrics->num_unknown += 1; - } - if(buff_unass_cnt > MAX_READ_NUMBER-1) { pthread_mutex_lock(thread_data->out_lock); - gzwrite(thread_data->params->unassigned1_fd, fq_read1_unass_buff, strlen(fq_read1_unass_buff)); - fq_read1_unass_buff[0] = '\0'; - buff_unass_cnt = 0; - if(thread_data->params->paired > 0) { - gzwrite(thread_data->params->unassigned2_fd, fq_read2_unass_buff, strlen(fq_read2_unass_buff)); - fq_read2_unass_buff[0] = '\0'; - } + fprintf(thread_data->params->unassigned2_fd, fqread2); pthread_mutex_unlock(thread_data->out_lock); - } + + thread_data->metrics->num_unknown += 1; + } + } thread_data->metrics->total += 2; - // unlock writing - //pthread_mutex_unlock(thread_data->out_lock); - } - if(strlen(fq_read1_buff) > 0) { - pthread_mutex_lock(thread_data->out_lock); - gzwrite(curr->bcfile1, fq_read1_buff, strlen(fq_read1_buff)); - if(thread_data->params->paired > 0) { - gzwrite(curr->bcfile2, fq_read2_buff, strlen(fq_read2_buff)); - } - pthread_mutex_unlock(thread_data->out_lock); - } - if(strlen(fq_read1_unass_buff) > 0) { - pthread_mutex_lock(thread_data->out_lock); - gzwrite(thread_data->params->unassigned1_fd, fq_read1_unass_buff, strlen(fq_read1_unass_buff)); - if(thread_data->params->paired > 0) { - gzwrite(thread_data->params->unassigned2_fd, fq_read2_unass_buff, strlen(fq_read2_unass_buff)); - } - pthread_mutex_unlock(thread_data->out_lock); - } - //TODO need to print out very last buffer here - // - //free(actl_bc); - free(fq_read1_buff); - free(fq_read2_buff); + free(fq_rec1); + free(fq_rec2); free(thread_data); - return NULL; } diff --git a/src/sabre.c b/src/sabre.c index 85d1b5c..3be837d 100644 --- a/src/sabre.c 
+++ b/src/sabre.c @@ -27,7 +27,7 @@ int main(int argc, char *argv[]) { {NULL, 0, NULL, 0} }; - // this is on the stack so no need to malloc + // this is on the stack so no need to malloc // stack gets cleaned when function exits, // because of that no need to free either param_t params; @@ -45,8 +45,8 @@ int main(int argc, char *argv[]) { FILE* bc_fd; char *bc_fn=NULL; - char *unassigned1_fn=strdup("unassigned_R1.fq.gz"); - char *unassigned2_fn=strdup("unassigned_R2.fq.gz"); + char *unassigned1_fn=strdup("unassigned_R1.fq"); + char *unassigned2_fn=strdup("unassigned_R2.fq"); char *umis_2_short_fn=strdup("umis_too_short.txt"); FILE* log_file=NULL; @@ -158,11 +158,8 @@ int main(int argc, char *argv[]) { break; } } - + // TODO check that what's requre if not run usage - - params.fq1_fd = gzopen(fq1_fn, "r"); - params.fq2_fd = gzopen(fq2_fn, "r"); params.fq1_fd = gzopen(fq1_fn, "r"); if(!params.fq1_fd) { @@ -176,15 +173,15 @@ int main(int argc, char *argv[]) { exit(EXIT_FAILURE); } - params.unassigned1_fd = gzopen(unassigned1_fn, "wb"); - params.unassigned2_fd = gzopen(unassigned1_fn, "wb"); + params.unassigned1_fd = fopen(unassigned1_fn, "wb"); + params.unassigned2_fd = fopen(unassigned2_fn, "wb"); params.umis_2_short_fd = fopen(umis_2_short_fn, "a"); // ? where does this goes? 
fprintf(params.umis_2_short_fd, "name\tumi\tlen\tmin_len\n"); //TODO plugin sanity_chk here - + if(params.fq2_fd) { params.paired = 1; } @@ -224,7 +221,7 @@ int main(int argc, char *argv[]) { // where each node is represents one barcode from the barcode file bc_fd = fopen(bc_fn, "r"); head = NULL; - curr = NULL; + curr = NULL; char line_buff[1024]; int max_items = 6; @@ -240,13 +237,13 @@ int main(int argc, char *argv[]) { bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn1[0] = '\0'; get_bc_fn(&bcout_fn1, s_name, curr->bc_grp, 1); - curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); + curr->bcfile1 = fopen(_mkdir(bcout_fn1), "wb"); if(params.paired > 0 && params.combine < 0) { bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn2[0] = '\0'; get_bc_fn(&bcout_fn2, s_name, curr->bc_grp, 2); - curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); + curr->bcfile2 = fopen(_mkdir(bcout_fn2), "wb"); } //TODO for hardcode max limit of items in the barcodes file to 6 @@ -254,7 +251,7 @@ int main(int argc, char *argv[]) { int i=0; while(i <= max_items && (p = strtok(NULL, "\t\n"))) { - // remove the token, new line char + // remove the token, new line char curr->bc[i] = strdup(p); fprintf(stdout, " BC %s ", curr->bc[i]); i++; @@ -329,8 +326,7 @@ int main(int argc, char *argv[]) { int n_pairs = curr->num_records/2; float percent_pairs = (float) n_pairs/total_pairs; - //fprintf (log_file,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, curr->num_records, n_pairs, percent_pairs); - fprintf (stdout,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, curr->num_records, n_pairs, percent_pairs); + fprintf(log_file,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, curr->num_records, n_pairs, percent_pairs); curr = curr->next; } @@ -358,8 +354,8 @@ int main(int argc, char *argv[]) { curr = head; while (curr) { - gzclose(curr->bcfile1); - gzclose(curr->bcfile2); + fclose(curr->bcfile1); + fclose(curr->bcfile2); free (curr->bc_grp); free (curr->bc); @@ -368,5 +364,7 @@ int main(int argc, char *argv[]) { free (temp); 
} + free(curr); + return EXIT_SUCCESS; } diff --git a/src/sabre.h b/src/sabre.h index 5b1dd83..757ee56 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -50,8 +50,8 @@ typedef struct listel_p { char **bc; char *bc_grp; int num_records; - gzFile bcfile1; - gzFile bcfile2; + FILE* bcfile1; + FILE* bcfile2; struct actl_bc_cnt *actl_bc_cnt; struct listel_p *next; } barcode_data_t; @@ -65,8 +65,8 @@ typedef struct listel_p { typedef struct { gzFile fq1_fd; gzFile fq2_fd; - gzFile unassigned1_fd; - gzFile unassigned2_fd; + FILE* unassigned1_fd; + FILE* unassigned2_fd; FILE* umis_2_short_fd; int mismatch; int combine; @@ -84,7 +84,7 @@ typedef struct { typedef struct { int id; - const param_t* params; + const param_t* params; barcode_data_t* curr; metrics_t* metrics; pthread_mutex_t *in_lock, *out_lock; diff --git a/src/utils.c b/src/utils.c index 2f56d87..d99144d 100644 --- a/src/utils.c +++ b/src/utils.c @@ -31,7 +31,7 @@ const char * _mkdir(const char *file_path) { const char *dir = dirname(dirc); snprintf(tmp, sizeof(tmp),"%s", dir); len = strlen(tmp); - + if(tmp[len - 1] == '/') { tmp[len - 1] = 0; } @@ -52,7 +52,7 @@ const char * _mkdir(const char *file_path) { //NOTE retuns zero on success //strcmp can be used for sorting, returns pos, zero, neg -//BUT this new implementation can't be used as such just FYI +//BUT this new implementation can't be used as such just FYI int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop) { int orig_read_len = strlen(orig_read); int orig_bc_len = strlen(orig_bc); @@ -213,10 +213,10 @@ void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type) { strcat(*bcout_fn, barcode); if(read_type == 1) { - strcat(*bcout_fn, "_R1.fastq.gz"); + strcat(*bcout_fn, "_R1.fastq"); } else if(read_type == 2) { - strcat(*bcout_fn, "_R2.fastq.gz"); + strcat(*bcout_fn, "_R2.fastq"); } else { fprintf (stderr, @@ -239,7 +239,7 @@ void set_default_params(param_t *params) { void params_destroy(param_t 
*params) { gzclose(params->fq1_fd); gzclose(params->fq2_fd); - gzclose(params->unassigned1_fd); - gzclose(params->unassigned2_fd); + fclose(params->unassigned1_fd); + fclose(params->unassigned2_fd); fclose(params->umis_2_short_fd); } From 850b92440eda474e3a4914f9d14350fa2742c076 Mon Sep 17 00:00:00 2001 From: serine Date: Fri, 22 Feb 2019 11:46:01 +1100 Subject: [PATCH 45/55] Polished off threads features, fixed all issues reaised by valgrind --leak-check=yes --track-origins=yes -v to get a dev version use make clean-all && make dev --- src/Makefile | 7 ++++--- src/demultiplex.c | 15 +++++++-------- src/fastq.c | 11 +++++++++-- src/fastq.h | 1 + src/sabre.c | 12 ++++-------- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/Makefile b/src/Makefile index 1e7fde2..f20d60d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,6 @@ # Source, Executable, Includes, Library Defines VERSION = 1.00 +VERSIONDEV = 1.00-dev CC = gcc INCL = fastq.h sabre.h #SRC = demulti_paired.c demulti_single.c sabre.c utils.c @@ -8,7 +9,7 @@ OBJ = $(SRC:.c=.o) DSRC=src CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION) -CFLAGSDEV = -Wall -O0 -ggdb -std=c99 -pedantic -DVERSION=$(VERSION) +CFLAGSDEV = -Wall -O0 -g -std=c99 -DVERSION=$(VERSIONDEV) LDFLAGS = -lz -lpthread GPROF = -pg @@ -32,8 +33,8 @@ build: $(OBJ) $(CC) $(CFLAGS) $(OBJ) -o $(EXE) $(LDFLAGS) #ln -sf $(DSRC)/$(EXE) .. 
-dev: $(OBJ) - $(CC) $(CFLAGSDEV) $(OBJ) -o $(EXE)-dev $(LDFLAGS) +dev: $(OBJDEV) + $(CC) $(CFLAGSDEV) $(SRC) -o $(EXE)-dev $(LDFLAGS) metrics: $(CC) $(CFLAGSDEV) -o metrics metrics.c $(LDFLAGS) diff --git a/src/demultiplex.c b/src/demultiplex.c index 901afe0..ad482a4 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -14,8 +14,7 @@ #include "demultiplex.h" -void* demult_runner(void *arg) -{ +void* demult_runner(void *arg) { char fqread1[MAX_READ_SIZE]; char fqread2[MAX_READ_SIZE]; @@ -23,11 +22,11 @@ void* demult_runner(void *arg) fqread1[0] = '\0'; fqread2[0] = '\0'; - fq_rec_t *fq_rec1; - fq_rec_t *fq_rec2; + fq_rec_t *fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); + fq_rec_t *fq_rec2 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); - fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); - fq_rec2 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); + init_fq_rec(fq_rec1); + init_fq_rec(fq_rec2); barcode_data_t *curr; curr = NULL; @@ -182,11 +181,11 @@ void* demult_runner(void *arg) } thread_data->metrics->total += 2; - + free(actl_bc); } free(fq_rec1); free(fq_rec2); - free(thread_data); + //free(thread_data); according to valgrind report this line isn't needed since no errors given out.. 
return NULL; } diff --git a/src/fastq.c b/src/fastq.c index 05097d7..1a33bbb 100644 --- a/src/fastq.c +++ b/src/fastq.c @@ -28,9 +28,9 @@ int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd) { done = done || get_line(fq_fd, fq_rec->other, LINE_SIZE); done = done || get_line(fq_fd, fq_rec->qual, LINE_SIZE); char *ptr = strchr(fq_rec->name,' '); - if (ptr) { + if(ptr) { *ptr='\0'; - fq_rec->comment = ptr+1; + fq_rec->comment = ptr+1; } else { fq_rec->comment = NULL; } @@ -38,3 +38,10 @@ int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd) { return done; } + +void init_fq_rec(fq_rec_t *fq_rec) { + fq_rec->name[0] = '\0'; + fq_rec->seq[0] = '\0'; + fq_rec->other[0] = '\0'; + fq_rec->qual[0] = '\0'; +} diff --git a/src/fastq.h b/src/fastq.h index 2c9a239..0083621 100644 --- a/src/fastq.h +++ b/src/fastq.h @@ -13,6 +13,7 @@ typedef struct { char qual[LINE_SIZE]; } fq_rec_t; +void init_fq_rec(fq_rec_t *fq_rec); int get_line(gzFile fq_fd, char *line, int buff); int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd); diff --git a/src/sabre.c b/src/sabre.c index 3be837d..2a4f49b 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -34,6 +34,8 @@ int main(int argc, char *argv[]) { set_default_params(&params); metrics_t metrics; + metrics.
num_unknown = 0; + metrics.total = 0; //clock_t begin = clock(); time_t start, end; @@ -49,7 +51,7 @@ int main(int argc, char *argv[]) { char *unassigned2_fn=strdup("unassigned_R2.fq"); char *umis_2_short_fn=strdup("umis_too_short.txt"); - FILE* log_file=NULL; + FILE* log_file; char *log_fn=strdup("stats.txt"); int optc; @@ -85,17 +87,11 @@ int main(int argc, char *argv[]) { break; case 'z': - if(unassigned1_fn) { - free(unassigned1_fn); - } unassigned1_fn = (char*) malloc (strlen (optarg) + 1); strcpy (unassigned1_fn, optarg); break; case 'w': - if(unassigned2_fn) { - free(unassigned2_fn); - } unassigned2_fn = (char*) malloc (strlen (optarg) + 1); strcpy (unassigned2_fn, optarg); break; @@ -321,7 +317,7 @@ int main(int argc, char *argv[]) { curr = head; int total_pairs = metrics.total/2; - while (curr) { + while(curr) { int n_pairs = curr->num_records/2; float percent_pairs = (float) n_pairs/total_pairs; From 8cf750142de4d3c2d1d6af5dfdba79e5d86180b4 Mon Sep 17 00:00:00 2001 From: serine Date: Fri, 22 Feb 2019 13:54:07 +1100 Subject: [PATCH 46/55] Stupid but necessary, reindented all C files to 4 spaces and no tabs! 
--- src/demultiplex.c | 53 +++++++++++++++++------------------ src/fastq.c | 4 +-- src/metrics.c | 32 ++++++++++----------- src/modes.c | 70 ---------------------------------------------- src/sabre.c | 39 +++++++++++++------------- src/sanity_check.c | 2 +- src/usage.c | 13 +++++---- src/utils.c | 8 ++---- 8 files changed, 75 insertions(+), 146 deletions(-) delete mode 100644 src/modes.c diff --git a/src/demultiplex.c b/src/demultiplex.c index ad482a4..206d617 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -41,32 +41,32 @@ void* demult_runner(void *arg) { // lock reading pthread_mutex_lock(thread_data->in_lock); - //this is equivalent to if(false), which means this block - //is always skipped, unless when there is an error/end of the file + //this is equivalent to if(false), which means this block + //is always skipped, unless when there is an error/end of the file if(get_fq_rec(fq_rec1, thread_data->params->fq1_fd)) { - // sanity check no more reads + // sanity check no more reads pthread_mutex_unlock(thread_data->in_lock); - break; - } + break; + } if(thread_data->params->paired > 0) { if(get_fq_rec(fq_rec2, thread_data->params->fq2_fd)) { - //error out there becuase if reached the end of the file - //then we should hit first break, above, since the assumptions - //that the files of equal length. If issues with R2 only this is an error + //error out there becuase if reached the end of the file + //then we should hit first break, above, since the assumptions + //that the files of equal length. If issues with R2 only this is an error fprintf (stderr, "\n\ - \n ERROR: R2 file shorter than R1 file.\ - \n Stopping here:\ - \n %s\ - \n", - fq_rec1->name); - pthread_mutex_unlock(thread_data->in_lock); - exit(1); - } + \n ERROR: R2 file shorter than R1 file.\ + \n Stopping here:\ + \n %s\ + \n", + fq_rec1->name); + pthread_mutex_unlock(thread_data->in_lock); + exit(1); + } } // unlock reading - // TODO this bit of code for ordered fastq files, implement later? 
+ // TODO this bit of code for ordered fastq files, implement later? //my_line_num = *(thread_data->line_num); //*thread_data->line_num += 1; pthread_mutex_unlock(thread_data->in_lock); @@ -86,21 +86,21 @@ void* demult_runner(void *arg) { //found matching barcode actl_bc = strndup( (fq_rec1->seq)+n_crop, strlen(curr->bc[i]) ); got_match = 1; - break; + break; } - } + } if(got_match) { - break; - } + break; + } curr = curr->next; } /* Step 2: Write read out into barcode specific file */ //TODO this bit of code to keep fastq files ordered as per original fastq files - //which I don't think that needed? at least not at this stage + //which I don't think that needed? at least not at this stage // lock writing //while(*(thread_data->out_line_num) != my_line_num) { // pthread_cond_wait(thread_data->cv, thread_data->out_lock); @@ -124,8 +124,8 @@ void* demult_runner(void *arg) { continue; } else { - umi_idx = strdup(actl_umi_idx); - umi_idx[thread_data->params->min_umi_len] = '\0'; + umi_idx = strdup(actl_umi_idx); + umi_idx[thread_data->params->min_umi_len] = '\0'; } } @@ -151,12 +151,12 @@ void* demult_runner(void *arg) { fprintf(curr->bcfile2, fqread2); pthread_mutex_unlock(thread_data->out_lock); - //dont need to increment buff_cnt, assuming fq_read1 keeps the right count + //dont need to increment buff_cnt, assuming fq_read1 keeps the right count curr->num_records += 1; } } curr->num_records += 1; - + free(actl_bc); } else { @@ -181,7 +181,6 @@ void* demult_runner(void *arg) { } thread_data->metrics->total += 2; - free(actl_bc); } free(fq_rec1); diff --git a/src/fastq.c b/src/fastq.c index 1a33bbb..333ebf5 100644 --- a/src/fastq.c +++ b/src/fastq.c @@ -13,7 +13,7 @@ int get_line(gzFile fq_fd, char *line, int buff) { if(new_line[str_len-1] != '\n') { fprintf(stderr, "Line too long %d\n", buff); - exit(1); + exit(1); } new_line[str_len-1] = '\0'; @@ -32,7 +32,7 @@ int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd) { *ptr='\0'; fq_rec->comment = ptr+1; } else { - 
fq_rec->comment = NULL; + fq_rec->comment = NULL; } // before writing it out check that comment isn't null diff --git a/src/metrics.c b/src/metrics.c index 0c58ae8..646b3ea 100644 --- a/src/metrics.c +++ b/src/metrics.c @@ -1,16 +1,16 @@ #include "utils.h" /* -gcc -Wall -O2 -std=c99 -o metrics metrics.c -lz -*/ + gcc -Wall -O2 -std=c99 -o metrics metrics.c -lz + */ #define BARCODE_ARRAY 1000000 //int chk_bc_arr(const char *arr, char *bc); /* - WISDOM char **arr and char *arr[] mean the same things - but later one more informative + WISDOM char **arr and char *arr[] mean the same things + but later one more informative */ int chk_bc_arr(barcodes_t *arr, char *bc) { @@ -80,13 +80,13 @@ int main (int argc, char *argv[]) { if (argc <= 2) { fprintf(stderr, "\n\ - \n Usage: metrics \ - \n\ - \n Options:\ - \n\ - \n mode INT [0|1]; 0 = sample barcode, 1 = umis barcodes\ - \n\ - \n"); + \n Usage: metrics \ + \n\ + \n Options:\ + \n\ + \n mode INT [0|1]; 0 = sample barcode, 1 = umis barcodes\ + \n\ + \n"); exit (EXIT_SUCCESS); } @@ -99,9 +99,9 @@ int main (int argc, char *argv[]) { fqrec1 = kseq_init(pe1); /* - WISDOM these tow are actually different things - char barcodes[BARCODE_ARRAY][BARCODE]; - char *barcodes[]; + WISDOM these tow are actually different things + char barcodes[BARCODE_ARRAY][BARCODE]; + char *barcodes[]; */ //char **barcodes = calloc(BARCODE_ARRAY, sizeof(char*)); @@ -147,8 +147,8 @@ int main (int argc, char *argv[]) { qsort(barcodes, n, sizeof(barcodes_t), bc_n_cmp); /* - WISDOM this is to limit the right scope for i - */ + WISDOM this is to limit the right scope for i + */ int i = 0; while(barcodes[i].bc != 0) { diff --git a/src/modes.c b/src/modes.c deleted file mode 100644 index a616cb5..0000000 --- a/src/modes.c +++ /dev/null @@ -1,70 +0,0 @@ - -/* - * Not valid C code, just some snipets for later use - * - * TODO build on get_fqread function in utils. - * I think that function should also take char *umi - * if umi == NULL, then none of 3? 
modes are true, return normal - * if umi != NULL then this must be one of 3? modes, merge R1 and R2 and return R1 only - */ - -typedef struct listel_p { - char* bc; - int num_records; - //FILE* bcfile1; - //FILE* bcfile2; - gzFile bcfile1; - gzFile bcfile2; - struct listel_p *next; -} barcode_data_paired; - - /* Creating linked list of barcode data */ - // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ - // where each node is represents one barcode from the barcode file - // number of nodes should equal to number of barcodes (lines) in the file - head = NULL; - char barcode [MAX_BARCODE_LENGTH]; - char s_name [MAX_SNAME_LENGTH]; - //while (fscanf (barfile, "%s%s%s", barcode, baroutfn1, baroutfn2) != EOF) { - while (fscanf (barfile, "%s\t%s", barcode, s_name) != EOF) { - char bcout_prefix [MAX_BARCODE_LENGTH+MAX_SNAME_LENGTH]; - char bcout_fn1 [MAX_FILENAME_LENGTH]; - char bcout_fn2 [MAX_FILENAME_LENGTH]; - - curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired)); - curr->bc = (char*) malloc (strlen(barcode) + 1); - strcpy (curr->bc, barcode); - - if(strlen(s_name) > MAX_FILENAME_LENGTH) { - fprintf (stderr, "ERROR: Too many characters in your sample name; %s:%d \n", s_name, strlen(s_name)); - } - //TODO make this into a function call later on. - //want a function in utils.c get_bc_fn(s_name, barcode, 1|2) to return - //a string = bcout_fn to... maybe this isn't worth a function call.. 
- strcat(bcout_prefix, s_name); - strcat(bcout_prefix, "_"); - strcat(bcout_prefix, barcode); - - strcpy(bcout_fn1, bcout_prefix); - strcat(bcout_fn1, "_R1.fastq.gz"); - - strcpy(bcout_fn2, bcout_prefix); - strcat(bcout_fn2, "_R2.fastq.gz"); - - curr->bcfile1 = gzopen(_mkdir(bcout_fn1), "wb"); - curr->bcfile2 = gzopen(_mkdir(bcout_fn2), "wb"); - curr->num_records = 0; - - curr->next = head; - head = curr; - } - - while (curr) { - gzclose(curr->bcfile1); - gzclose(curr->bcfile2); - free (curr->bc); - temp = curr; - curr = curr->next; - free (temp); - } - diff --git a/src/sabre.c b/src/sabre.c index 2a4f49b..0a0ea6e 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -133,10 +133,10 @@ int main(int argc, char *argv[]) { version(EXIT_SUCCESS); break; - //NOTE if user requrested the help menu i.e --help then - //return success for all other cases below while help menu - //is printed it wasn't intended by user (or at least we don't know that) - //and therefore exit code - fail + //NOTE if user requrested the help menu i.e --help then + //return success for all other cases below while help menu + //is printed it wasn't intended by user (or at least we don't know that) + //and therefore exit code - fail case 'h': usage(EXIT_SUCCESS); break; @@ -200,7 +200,7 @@ int main(int argc, char *argv[]) { \n --threads %d\ \n\ \n In Progess...\ - \n\ + \n\ \n", PROGRAM_NAME,\ fq1_fn, fq2_fn,\ bc_fn,\ @@ -225,10 +225,10 @@ int main(int argc, char *argv[]) { curr = (barcode_data_t*) malloc(sizeof(barcode_data_t)); char *p = strtok(line_buff, "\t"); - char *s_name = strdup(p); + char *s_name = strdup(p); p = strtok(NULL, "\t"); - curr->bc_grp = strdup(p); + curr->bc_grp = strdup(p); bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn1[0] = '\0'; @@ -242,17 +242,17 @@ int main(int argc, char *argv[]) { curr->bcfile2 = fopen(_mkdir(bcout_fn2), "wb"); } - //TODO for hardcode max limit of items in the barcodes file to 6 - curr->bc = calloc(max_items, sizeof(void*)); + //TODO for hardcode 
max limit of items in the barcodes file to 6 + curr->bc = calloc(max_items, sizeof(void*)); - int i=0; + int i=0; while(i <= max_items && (p = strtok(NULL, "\t\n"))) { - // remove the token, new line char - curr->bc[i] = strdup(p); - fprintf(stdout, " BC %s ", curr->bc[i]); - i++; - } - fprintf(stdout, "\n"); + // remove the token, new line char + curr->bc[i] = strdup(p); + fprintf(stdout, " BC %s ", curr->bc[i]); + i++; + } + fprintf(stdout, "\n"); curr->num_records = 0; curr->next = head; @@ -284,9 +284,9 @@ int main(int argc, char *argv[]) { for(int i=0; i < threads; i++) { - thread_data[i].params = &params; - thread_data[i].curr = curr; - thread_data[i].metrics = &metrics; + thread_data[i].params = &params; + thread_data[i].curr = curr; + thread_data[i].metrics = &metrics; thread_data[i].id = i; thread_data[i].in_lock = &in_lock; thread_data[i].out_lock = &out_lock; @@ -361,6 +361,5 @@ int main(int argc, char *argv[]) { } free(curr); - return EXIT_SUCCESS; } diff --git a/src/sanity_check.c b/src/sanity_check.c index 0876d23..c6ffb1c 100644 --- a/src/sanity_check.c +++ b/src/sanity_check.c @@ -1,5 +1,5 @@ -void* sanity_chk(void *arg) +void* sanity_chk(void *arg) { if (!fq1 || !fq2 || !unknownfn1 || !unknownfn2 || !barfn) { paired_usage (EXIT_FAILURE); } diff --git a/src/usage.c b/src/usage.c index b7280bc..0090788 100644 --- a/src/usage.c +++ b/src/usage.c @@ -29,7 +29,7 @@ void usage(int status) { \n\ \n -t, --threads INT specify number of threads to use [4]\ \n -v, --version get current version\ - \n -h, --hel get help menu, exit status is zero\ + \n -h, --help get help menu, exit status is zero\ \n -o, --story little story about sabre tool\ \n\ \n", @@ -61,7 +61,7 @@ void version(int status) { fprintf(stdout, "\n\ \n %s\ \n\ - \n version: %d.%d.%d\ + \n version: %d.%d.%d\ \n\ \n Copyright (c) 2011 The Regents of University of California, Davis Campus.\ \n %s is free software and comes with ABSOLUTELY NO WARRANTY.\ @@ -70,9 +70,12 @@ void version(int status) { \n
Written by: %s\ \n\ \n", - PROGRAM_NAME, - VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, - PROGRAM_NAME, AUTHORS); + PROGRAM_NAME, + VERSION_MAJOR, + VERSION_MINOR, + VERSION_PATCH, + PROGRAM_NAME, + AUTHORS); exit(status); } diff --git a/src/utils.c b/src/utils.c index d99144d..4ed7f05 100644 --- a/src/utils.c +++ b/src/utils.c @@ -38,9 +38,9 @@ const char * _mkdir(const char *file_path) { for(p = tmp + 1; *p; p++) { if(*p == '/') { - *p = 0; - mkdir(tmp, S_IRWXU); - *p = '/'; + *p = 0; + mkdir(tmp, S_IRWXU); + *p = '/'; } } @@ -80,8 +80,6 @@ int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int u1 = *bc++; u2 = *read++; - //fprintf(stdout, "BC %s READ %s\n", u1, u2); - if (u1 != u2) { cnt++; if (cnt > mismatch) { From f96b2348303c8a4c09162c4a219f0728070eb378 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 14 Oct 2019 10:38:59 +1100 Subject: [PATCH 47/55] update makefile to include commit hash into binary build --- src/Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Makefile b/src/Makefile index f20d60d..c2b5387 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,5 @@ # Source, Executable, Includes, Library Defines -VERSION = 1.00 -VERSIONDEV = 1.00-dev +GIT_VERSION := "$(shell git describe --abbrev=7 --always --tags)" CC = gcc INCL = fastq.h sabre.h #SRC = demulti_paired.c demulti_single.c sabre.c utils.c @@ -8,12 +7,14 @@ SRC = sabre.c usage.c demultiplex.c utils.c fastq.c OBJ = $(SRC:.c=.o) DSRC=src -CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION) +#CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION) +# need to quote GIT_VERSION so that the value gets passed as a string +CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=\"$(GIT_VERSION)\" CFLAGSDEV = -Wall -O0 -g -std=c99 -DVERSION=$(VERSIONDEV) LDFLAGS = -lz -lpthread GPROF = -pg -EXE = sabre +EXE = sabre-$(GIT_VERSION) .PHONY: default @@ -31,7 +32,7 @@ sabre.o: sabre.h build: $(OBJ) $(CC) $(CFLAGS) $(OBJ) -o $(EXE) 
$(LDFLAGS) - #ln -sf $(DSRC)/$(EXE) .. + ln -sf $(DSRC)/$(EXE) ../sabre dev: $(OBJDEV) $(CC) $(CFLAGSDEV) $(SRC) -o $(EXE)-dev $(LDFLAGS) From 5c071b837a6cd30aa635b8a9f275fd02f13bce5b Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 14 Oct 2019 10:49:13 +1100 Subject: [PATCH 48/55] fixed warning message and added git commit hash to the version print --- src/demultiplex.c | 10 +++++----- src/usage.c | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/demultiplex.c b/src/demultiplex.c index 206d617..f01072f 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -133,7 +133,7 @@ void* demult_runner(void *arg) { get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); pthread_mutex_lock(thread_data->out_lock); - fprintf(curr->bcfile1, fqread1); + fprintf(curr->bcfile1, "%s", fqread1); pthread_mutex_unlock(thread_data->out_lock); } @@ -141,14 +141,14 @@ void* demult_runner(void *arg) { get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); pthread_mutex_lock(thread_data->out_lock); - fprintf(curr->bcfile1, fqread1); + fprintf(curr->bcfile1, "%s", fqread1); pthread_mutex_unlock(thread_data->out_lock); if(thread_data->params->paired > 0) { get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); pthread_mutex_lock(thread_data->out_lock); - fprintf(curr->bcfile2, fqread2); + fprintf(curr->bcfile2, "%s", fqread2); pthread_mutex_unlock(thread_data->out_lock); //dont need to increment buff_cnt, assuming fq_read1 keeps the right count @@ -163,7 +163,7 @@ void* demult_runner(void *arg) { get_fqread(fqread1, fq_rec1, NULL, NULL, thread_data->params->no_comment, 0); pthread_mutex_lock(thread_data->out_lock); - fprintf(thread_data->params->unassigned1_fd, fqread1); + fprintf(thread_data->params->unassigned1_fd, "%s", fqread1); pthread_mutex_unlock(thread_data->out_lock); thread_data->metrics->num_unknown += 1; @@ -172,7 +172,7 @@ void* 
demult_runner(void *arg) { get_fqread(fqread2, fq_rec2, NULL, NULL, thread_data->params->no_comment, 0); pthread_mutex_lock(thread_data->out_lock); - fprintf(thread_data->params->unassigned2_fd, fqread2); + fprintf(thread_data->params->unassigned2_fd, "%s", fqread2); pthread_mutex_unlock(thread_data->out_lock); thread_data->metrics->num_unknown += 1; diff --git a/src/usage.c b/src/usage.c index 0090788..563b06c 100644 --- a/src/usage.c +++ b/src/usage.c @@ -61,7 +61,7 @@ void version(int status) { fprintf(stdout, "\n\ \n %s\ \n\ - \n version: %d.%d.%d\ + \n version: %d.%d.%d-%s\ \n\ \n Copyright (c) 2011 The Regents of University of California, Davis Campus.\ \n %s is free software and comes with ABSOLUTELY NO WARRANTY.\ @@ -74,6 +74,7 @@ void version(int status) { VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, + VERSION, PROGRAM_NAME, AUTHORS); From 6d4172e957701fc4c6573e74338fcf484efc5504 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 14 Oct 2019 21:16:52 +1100 Subject: [PATCH 49/55] updated readme --- README.md | 131 +++++++++++++++++------------------------------------- 1 file changed, 41 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index 82bb098..ab3aa22 100644 --- a/README.md +++ b/README.md @@ -1,107 +1,58 @@ -> This is a fork of the [original repo](https://github.com/najoshi/sabre). I might be taking this into slightly different direction +> This is a fork of the [original repo](https://github.com/najoshi/sabre). I might be taking this tool into a different direction to what was originally intended -# sabre - A barcode demultiplexing and trimming tool for FastQ files +# sabre -## About +> A cellular barcode demultiplexing tool of FASTQ files -Next-generation sequencing can currently produce hundreds of millions of reads -per lane of sample and that number increases at a dizzying rate. Barcoding -individual sequences for multiple lines or multiple species is a cost-efficient -method to sequence and analyze a broad range of data. 
+## Content -Sabre is a tool that will demultiplex barcoded reads into separate files. -It will work on both single-end and paired-end data in fastq format. -It simply compares the provided barcodes with each read and separates -the read into its appropriate barcode file, after stripping the barcode from -the read (and also stripping the quality values of the barcode bases). If -a read does not have a recognized barcode, then it is put into the unknown file. -Sabre also has an option (-m) to allow mismatches of the barcodes. +- [Install](#install) +- [Quick start](#quick-start) +- [Usage](#usage) -Sabre also supports gzipped file inputs. Also, since sabre does not use the -quality values in any way, it can be used on fasta data that is converted to -fastq by creating fake quality values. +## Install -Finally, after demultiplexing, sabre outputs a summary of how many records -went into each barcode file. +```BASH +git clone https://github.com/serine/sabre +cd src +make +``` -## Requirements +## Quick start -Sabre requires a C compiler; GCC or clang are recommended. Sabre -relies on Heng Li's kseq.h, which is bundled with the source. - -Sabre also requires Zlib, which can be obtained at -. - -## Building and Installing Sabre - -To build Sabre, enter: - - make - -Then, copy or move "sabre" to a directory in your $PATH. +```BASH +sabre -f MultiplexRNASeq_S1_R1_001.fastq.gz \ + -r MultiplexRNASeq_S1_R2_001.fastq.gz \ + -b barcodes.txt \ + -c \ + -u \ + -m 2 \ + -l 10 \ + -a 1 \ + -s sabre.txt \ + -t 12 +``` ## Usage -Sabre has two modes to work with both paired-end and single-end -reads: `sabre se` and `sabre pe`. 
- -Running sabre by itself will print the help: - - sabre - -Running sabre with either the "se" or "pe" commands will give help -specific to those commands: - - sabre se - sabre pe - -### Sabre Single End (`sabre se`) - -`sabre se` takes an input fastq file and an input barcode data file and outputs -the reads demultiplexed into separate files using the file names from the data file. -The barcodes will be stripped from the reads and the quality values of the barcode -bases will also be removed. Any reads with unknown barcodes get put into the "unknown" -file specified on the command line. The -m option allows for mismatches in the barcodes. - -#### Barcode data file format for single end - - barcode1 barcode1_output_file.fastq - barcode2 barcode2_output_file.fastq - etc... - -Be aware that if you do not format the barcode data file correctly, sabre will not work properly. - -#### Example - - sabre se -f input_file.fastq -b barcode_data.txt -u unknown_barcode.fastq - sabre se -m 1 -f input_file.fastq -b barcode_data.txt -u unknown_barcode.fastq - -### Sabre Paired End (`sabre pe`) - -`sabre pe` takes two paired-end files and a barcode data file as input and outputs -the reads demultiplexed into separate paired-end files using the file names from the -data file. The barcodes will be stripped from the reads and the quality values of the barcode -bases will also be removed. Any reads with unknown barcodes get put into the "unknown" files -specified on the command line. It also has an option (-c) to remove barcodes from both files. -Using this option means that if sabre finds a barcode in the first file, it assumes the paired -read in the other file has the same barcode and will strip it (along with the quality values). -The -m option allows for mismatches in the barcodes. 
- -#### Barcode data file format for paired end +> This tool is under development and this is very much an alpha version +> In it's current form the tool is highly customised a particular multiplexing protocol - barcode1 barcode1_output_file1.fastq barcode1_output_file2.fastq - barcode2 barcode2_output_file1.fastq barcode2_output_file2.fastq - etc... +### Cellular barcodes -Be aware that if you do not format the barcode data file correctly, sabre will not work properly. +In order to demultiplex the use needs to provide `barcodes.txt` file, which is three column tab delimited file -#### Examples +``` +sample_name group barcode +``` - sabre pe -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \ - -u unknown_barcode1.fastq -w unknown_barcode1.fastq +currently group is semi-redundant column, it there for a feature that in the development. for most use cases group can equals to barcode - sabre pe -c -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \ - -u unknown_barcode1.fastq -w unknown_barcode1.fastq +e.g - sabre pe -m 2 -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \ - -u unknown_barcode1.fastq -w unknown_barcode1.fastq +``` +cntr_rep1 TAAGGCGA TAAGGCGA +cntr_rep2 CGTACTAG CGTACTAG +treat_rep1 AGGCAGAA AGGCAGAA +treat_rep2 TCCTGAGC TCCTGAGC +``` From 01eda92fd90914374f2af3f3779a4178a8519854 Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 14 Oct 2019 21:54:20 +1100 Subject: [PATCH 50/55] removed symlinking sabre binary after build from make file. 
this causes issues with conda packaging --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index c2b5387..4035b40 100644 --- a/src/Makefile +++ b/src/Makefile @@ -32,7 +32,7 @@ sabre.o: sabre.h build: $(OBJ) $(CC) $(CFLAGS) $(OBJ) -o $(EXE) $(LDFLAGS) - ln -sf $(DSRC)/$(EXE) ../sabre + #ln -sf $(DSRC)/$(EXE) ../sabre dev: $(OBJDEV) $(CC) $(CFLAGSDEV) $(SRC) -o $(EXE)-dev $(LDFLAGS) From db0ee829d967df4c73961bbedc9cf4f5fa69bcfb Mon Sep 17 00:00:00 2001 From: serine Date: Mon, 14 Oct 2019 22:01:27 +1100 Subject: [PATCH 51/55] simplified the versioning to be just a hash of the last commit at this stage this is more then appropriate --- src/Makefile | 2 +- src/sabre.h | 3 --- src/usage.c | 5 +---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4035b40..71bd7ff 100644 --- a/src/Makefile +++ b/src/Makefile @@ -14,7 +14,7 @@ CFLAGSDEV = -Wall -O0 -g -std=c99 -DVERSION=$(VERSIONDEV) LDFLAGS = -lz -lpthread GPROF = -pg -EXE = sabre-$(GIT_VERSION) +EXE = sabre .PHONY: default diff --git a/src/sabre.h b/src/sabre.h index 757ee56..7f86712 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -27,9 +27,6 @@ #endif //https://semver.org/ -#define VERSION_MAJOR 0 -#define VERSION_MINOR 3 -#define VERSION_PATCH 1 #define MAX_BARCODE_LENGTH 100 #define MAX_FILENAME_LENGTH 200 diff --git a/src/usage.c b/src/usage.c index 563b06c..d4a97eb 100644 --- a/src/usage.c +++ b/src/usage.c @@ -61,7 +61,7 @@ void version(int status) { fprintf(stdout, "\n\ \n %s\ \n\ - \n version: %d.%d.%d-%s\ + \n version: %s\ \n\ \n Copyright (c) 2011 The Regents of University of California, Davis Campus.\ \n %s is free software and comes with ABSOLUTELY NO WARRANTY.\ @@ -71,9 +71,6 @@ void version(int status) { \n\ \n", PROGRAM_NAME, - VERSION_MAJOR, - VERSION_MINOR, - VERSION_PATCH, VERSION, PROGRAM_NAME, AUTHORS); From c529c298afc04100d92102facc0336e92dfdc558 Mon Sep 17 00:00:00 2001 From: David Powell Date: 
Tue, 2 Jun 2020 19:21:28 +1000 Subject: [PATCH 52/55] Update sabre.c Error on missing barcode file --- src/sabre.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sabre.c b/src/sabre.c index 0a0ea6e..2cfc4ab 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -216,6 +216,11 @@ int main(int argc, char *argv[]) { // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ // where each node is represents one barcode from the barcode file bc_fd = fopen(bc_fn, "r"); + if (!bc_fd) { + fprintf(stderr, "ERROR: Unable to barcode file\n"); + exit(EXIT_FAILURE); + } + head = NULL; curr = NULL; From a93381da004a1057972131d66a280aa90b93dcf3 Mon Sep 17 00:00:00 2001 From: David Powell Date: Fri, 5 Jun 2020 12:48:24 +1000 Subject: [PATCH 53/55] Find best matching barcode - Tries all barcodes up to max crop and max mismatch - Uses barcode that matches with fewest crop+mismatch --- src/demultiplex.c | 68 ++++++++++++++++++++++++++--------------------- src/utils.c | 25 ++++++++++------- src/utils.h | 7 ++++- 3 files changed, 59 insertions(+), 41 deletions(-) diff --git a/src/demultiplex.c b/src/demultiplex.c index f01072f..aebabf1 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -14,6 +14,20 @@ #include "demultiplex.h" +// Is m1 a better match than m2? 0 -> yes, use m1. 1 -> no, stick with m2 +int better(match_ret_t m1, match_ret_t m2) { + if (m1.cropped<0) + return 0; + if (m2.cropped<0) + return 1; + return (m1.cropped + m1.mismatches) < (m2.cropped+m2.mismatches); +} + +// Is m the best possible match? 
+int best(match_ret_t m) { + return m.cropped==0 && m.mismatches==0; +} + void* demult_runner(void *arg) { char fqread1[MAX_READ_SIZE]; @@ -28,9 +42,6 @@ void* demult_runner(void *arg) { init_fq_rec(fq_rec1); init_fq_rec(fq_rec2); - barcode_data_t *curr; - curr = NULL; - thread_data_t* thread_data = (thread_data_t*)arg; //int my_line_num; @@ -71,31 +82,28 @@ void* demult_runner(void *arg) { //*thread_data->line_num += 1; pthread_mutex_unlock(thread_data->in_lock); - int n_crop = 0; - + // Store a copy of the barcode found in the read (including the mismatches) char *actl_bc = NULL; /* Step 1: Find matching barcode */ - int got_match = 0; - curr = thread_data->curr; - while(curr) { - + match_ret_t best_match = {-1,-1}; + barcode_data_t *best_bc = NULL; + for (barcode_data_t *curr = thread_data->curr; curr!=NULL; curr = curr->next) { for (int i=0; curr->bc[i]; i++) { - n_crop = chk_bc_mtch(curr->bc[i], fq_rec1->seq, thread_data->params->mismatch, thread_data->params->max_5prime_crop); - if(n_crop >= 0) { - //found matching barcode - actl_bc = strndup( (fq_rec1->seq)+n_crop, strlen(curr->bc[i]) ); - got_match = 1; - break; + match_ret_t mtch = chk_bc_mtch(curr->bc[i], fq_rec1->seq, thread_data->params->mismatch, thread_data->params->max_5prime_crop); + if(better(mtch, best_match)) { + //found better match + best_match = mtch; + best_bc = curr; + if (actl_bc) free(actl_bc); + actl_bc = strndup( (fq_rec1->seq)+mtch.cropped, strlen(curr->bc[i]) ); + if (best(best_match)) + break; } - } - if(got_match) { + if (best(best_match)) break; - } - - curr = curr->next; } /* Step 2: Write read out into barcode specific file */ @@ -111,11 +119,11 @@ void* demult_runner(void *arg) { char *umi_idx = NULL; - if(curr != NULL) { + if(best_bc != NULL) { //for now assume barcode and umi are in R1 read if(thread_data->params->umi > 0) { - const char *actl_umi_idx = (fq_rec1->seq)+strlen(actl_bc)+n_crop; + const char *actl_umi_idx = (fq_rec1->seq)+strlen(actl_bc)+best_match.cropped; 
if(strlen(actl_umi_idx) < thread_data->params->min_umi_len) { pthread_mutex_lock(thread_data->out_lock); @@ -130,32 +138,32 @@ void* demult_runner(void *arg) { } if(thread_data->params->combine > 0 && actl_bc != NULL) { - get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, best_match.cropped); pthread_mutex_lock(thread_data->out_lock); - fprintf(curr->bcfile1, "%s", fqread1); + fprintf(best_bc->bcfile1, "%s", fqread1); pthread_mutex_unlock(thread_data->out_lock); } else { - get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, best_match.cropped); pthread_mutex_lock(thread_data->out_lock); - fprintf(curr->bcfile1, "%s", fqread1); + fprintf(best_bc->bcfile1, "%s", fqread1); pthread_mutex_unlock(thread_data->out_lock); if(thread_data->params->paired > 0) { - get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, n_crop); + get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, best_match.cropped); pthread_mutex_lock(thread_data->out_lock); - fprintf(curr->bcfile2, "%s", fqread2); + fprintf(best_bc->bcfile2, "%s", fqread2); pthread_mutex_unlock(thread_data->out_lock); //dont need to increment buff_cnt, assuming fq_read1 keeps the right count - curr->num_records += 1; + best_bc->num_records += 1; } } - curr->num_records += 1; + best_bc->num_records += 1; free(actl_bc); } else { diff --git a/src/utils.c b/src/utils.c index 4ed7f05..0ed0019 100644 --- a/src/utils.c +++ b/src/utils.c @@ -53,24 +53,25 @@ const char * _mkdir(const char *file_path) { //NOTE retuns zero on success //strcmp can be used for sorting, returns pos, zero, neg //BUT this new implementation can't be used as such just FYI -int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t 
mismatch, int max_5prime_crop) { +match_ret_t chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t max_mismatch, int max_5prime_crop) { int orig_read_len = strlen(orig_read); int orig_bc_len = strlen(orig_bc); int n_crop = 0; + match_ret_t ret = { -1,-1 }; if(orig_bc_len > orig_read_len) { fprintf (stderr, "WARNING: Length of the barcode %d is greater than length of the reads %d.", orig_bc_len, orig_read_len); - return -1; + return ret; } while(n_crop <= max_5prime_crop) { if(n_crop > orig_read_len) { - return -1; + return ret; } - int cnt = 0; + int mismatch = 0; char u1, u2; const char *bc = orig_bc; const char *read = orig_read+n_crop; @@ -81,25 +82,29 @@ int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int u2 = *read++; if (u1 != u2) { - cnt++; - if (cnt > mismatch) { + mismatch++; + if (mismatch > max_mismatch) { break; } } if (u1 == '\0' || u2 == '\0') { - return n_crop; + ret.cropped = n_crop; + ret.mismatches = mismatch; + return ret; } } - if(cnt <= mismatch) { - return n_crop; + if(mismatch <= max_mismatch) { + ret.cropped = n_crop; + ret.mismatches = mismatch; + return ret; } n_crop++; } //this is in the case of error - return -1; + return ret; } // https://stackoverflow.com/questions/21880730/c-what-is-the-best-and-fastest-way-to-concatenate-strings diff --git a/src/utils.h b/src/utils.h index 3a9e32b..66ab24d 100644 --- a/src/utils.h +++ b/src/utils.h @@ -15,13 +15,18 @@ typedef struct umis_t { int cnts; } umis_t; +typedef struct { + int mismatches; + int cropped; +} match_ret_t; + //This is needed if compilling with -std=c99, read below for more //https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with char *strdup(const char*); char *strndup(const char *s, size_t n); const char * _mkdir (const char *dir); -int chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t mismatch, int max_5prime_crop); +match_ret_t chk_bc_mtch(const char *orig_bc, const 
char *orig_read, size_t max_mismatch, int max_5prime_crop); void get_fqread(char *fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop); void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop); void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); From 379b3c71cff0be956b7ac1ac2f376bfb358b115e Mon Sep 17 00:00:00 2001 From: David Powell Date: Fri, 5 Jun 2020 17:34:15 +1000 Subject: [PATCH 54/55] Add option to compress (gzip) output files - Uses popen() to run either pigz or gzip --- src/Makefile | 4 +- src/demultiplex.c | 137 ++++++++++++++++++++++++---------------------- src/sabre.c | 66 ++++++++++++++++------ src/sabre.h | 1 + src/usage.c | 1 + src/utils.c | 9 ++- src/utils.h | 10 ++-- 7 files changed, 138 insertions(+), 90 deletions(-) diff --git a/src/Makefile b/src/Makefile index 71bd7ff..feb2a3d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -9,8 +9,8 @@ DSRC=src #CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION) # need to quote GIT_VERSION so that the value gets passed as a string -CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=\"$(GIT_VERSION)\" -CFLAGSDEV = -Wall -O0 -g -std=c99 -DVERSION=$(VERSIONDEV) +CFLAGS = -Wall -O2 -std=gnu99 -pedantic -DVERSION=\"$(GIT_VERSION)\" +CFLAGSDEV = -Wall -O0 -g -std=gnu99 -DVERSION=\"$(GIT_VERSION)-dev\" LDFLAGS = -lz -lpthread GPROF = -pg diff --git a/src/demultiplex.c b/src/demultiplex.c index aebabf1..da149e3 100644 --- a/src/demultiplex.c +++ b/src/demultiplex.c @@ -28,14 +28,12 @@ int best(match_ret_t m) { return m.cropped==0 && m.mismatches==0; } -void* demult_runner(void *arg) { - - char fqread1[MAX_READ_SIZE]; - char fqread2[MAX_READ_SIZE]; - - fqread1[0] = '\0'; - fqread2[0] = '\0'; +void write_out(const param_t *params, pthread_mutex_t *out_lock, metrics_t* metrics, + match_ret_t best_match, barcode_data_t *best_bc, + const char* actl_bc, + fq_rec_t *fq_rec1,fq_rec_t 
*fq_rec2); +void* demult_runner(void *arg) { fq_rec_t *fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); fq_rec_t *fq_rec2 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); @@ -117,82 +115,93 @@ void* demult_runner(void *arg) { //pthread_cond_broadcast(thread_data->cv); // Tell everyone it might be their turn! - char *umi_idx = NULL; + write_out(thread_data->params, thread_data->out_lock, thread_data->metrics, best_match, best_bc, actl_bc, fq_rec1, fq_rec2); + if (actl_bc) + free(actl_bc); - if(best_bc != NULL) { - //for now assume barcode and umi are in R1 read - if(thread_data->params->umi > 0) { + thread_data->metrics->total += 2; + } - const char *actl_umi_idx = (fq_rec1->seq)+strlen(actl_bc)+best_match.cropped; + free(fq_rec1); + free(fq_rec2); + //free(thread_data); according to valgrind report this line isn't needed since no errors given out.. + return NULL; +} - if(strlen(actl_umi_idx) < thread_data->params->min_umi_len) { - pthread_mutex_lock(thread_data->out_lock); - fprintf(thread_data->params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), thread_data->params->min_umi_len); - pthread_mutex_unlock(thread_data->out_lock); - continue; - } - else { - umi_idx = strdup(actl_umi_idx); - umi_idx[thread_data->params->min_umi_len] = '\0'; - } - } - if(thread_data->params->combine > 0 && actl_bc != NULL) { - get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, thread_data->params->no_comment, best_match.cropped); +void write_out(const param_t *params, pthread_mutex_t *out_lock, metrics_t* metrics, + match_ret_t best_match, barcode_data_t *best_bc, + const char* actl_bc, + fq_rec_t *fq_rec1,fq_rec_t *fq_rec2) { - pthread_mutex_lock(thread_data->out_lock); - fprintf(best_bc->bcfile1, "%s", fqread1); - pthread_mutex_unlock(thread_data->out_lock); + char fqread1[MAX_READ_SIZE]; + char fqread2[MAX_READ_SIZE]; - } - else { - get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, best_match.cropped); + 
fqread1[0] = '\0'; + fqread2[0] = '\0'; - pthread_mutex_lock(thread_data->out_lock); - fprintf(best_bc->bcfile1, "%s", fqread1); - pthread_mutex_unlock(thread_data->out_lock); + char *umi_idx = NULL; - if(thread_data->params->paired > 0) { - get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, thread_data->params->no_comment, best_match.cropped); + if(best_bc != NULL) { + //for now assume barcode and umi are in R1 read + if(params->umi > 0) { - pthread_mutex_lock(thread_data->out_lock); - fprintf(best_bc->bcfile2, "%s", fqread2); - pthread_mutex_unlock(thread_data->out_lock); + const char *actl_umi_idx = (fq_rec1->seq)+strlen(actl_bc)+best_match.cropped; - //dont need to increment buff_cnt, assuming fq_read1 keeps the right count - best_bc->num_records += 1; - } + if(strlen(actl_umi_idx) < params->min_umi_len) { + pthread_mutex_lock(out_lock); + fprintf(params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), params->min_umi_len); + pthread_mutex_unlock(out_lock); + return; + } + else { + umi_idx = strdup(actl_umi_idx); + umi_idx[params->min_umi_len] = '\0'; } - best_bc->num_records += 1; - free(actl_bc); } - else { - get_fqread(fqread1, fq_rec1, NULL, NULL, thread_data->params->no_comment, 0); + if(params->combine > 0 && actl_bc != NULL) { + get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, params->no_comment, best_match.cropped); - pthread_mutex_lock(thread_data->out_lock); - fprintf(thread_data->params->unassigned1_fd, "%s", fqread1); - pthread_mutex_unlock(thread_data->out_lock); + pthread_mutex_lock(out_lock); + fputs(fqread1, best_bc->bcfile1); + pthread_mutex_unlock(out_lock); - thread_data->metrics->num_unknown += 1; + } + else { + get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, params->no_comment, best_match.cropped); - if(thread_data->params->paired > 0) { - get_fqread(fqread2, fq_rec2, NULL, NULL, thread_data->params->no_comment, 0); + pthread_mutex_lock(out_lock); + fputs(fqread1, best_bc->bcfile1); - 
pthread_mutex_lock(thread_data->out_lock); - fprintf(thread_data->params->unassigned2_fd, "%s", fqread2); - pthread_mutex_unlock(thread_data->out_lock); + if(params->paired > 0) { + get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, params->no_comment, best_match.cropped); - thread_data->metrics->num_unknown += 1; - } + fputs(fqread2, best_bc->bcfile2); + //dont need to increment buff_cnt, assuming fq_read1 keeps the right count + best_bc->num_records += 1; + } + pthread_mutex_unlock(out_lock); } - - thread_data->metrics->total += 2; + best_bc->num_records += 1; } + else { - free(fq_rec1); - free(fq_rec2); - //free(thread_data); according to valgrind report this line isn't needed since no errors given out.. - return NULL; -} + get_fqread(fqread1, fq_rec1, NULL, NULL, params->no_comment, 0); + + pthread_mutex_lock(out_lock); + fputs(fqread1, params->unassigned1_fd); + + metrics->num_unknown += 1; + + if(params->paired > 0) { + get_fqread(fqread2, fq_rec2, NULL, NULL, params->no_comment, 0); + + fputs(fqread2, params->unassigned2_fd); + + metrics->num_unknown += 1; + } + pthread_mutex_unlock(out_lock); + } +} \ No newline at end of file diff --git a/src/sabre.c b/src/sabre.c index 2cfc4ab..be9d5cf 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -1,8 +1,32 @@ + #include "sabre.h" #include "utils.h" #include "usage.h" #include "demultiplex.h" +FILE* my_fopen(const char* fname, int gz) { + static char* compressor = NULL; + + if (gz) { + + if (!compressor) { + // Guess whether to use pigz or gzip + char tmp[100]; + FILE* fin = popen("pigz --version 2>&1", "r"); + char* str = fgets(tmp, 50, fin); + int found = strncmp("pigz",tmp,4)==0; + compressor = str && found ? 
"pigz -p 2" : "gzip"; + } + + char command[2048]; + sprintf(command, "%s > %s", compressor, fname); + FILE* ret = popen(command, "w"); + return ret; + } else { + return fopen(fname, "w"); + } +} + int main(int argc, char *argv[]) { //more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 @@ -20,6 +44,7 @@ int main(int argc, char *argv[]) { {"stats", required_argument, NULL, 's'}, {"no-comment", no_argument, 0, 'n'}, {"threads", optional_argument, 0, 't'}, + {"gz-out", optional_argument, NULL, 'g'}, {"version", optional_argument, NULL, 'v'}, {"help", optional_argument, NULL, 'h'}, {"story", optional_argument, NULL, 'o'}, @@ -47,8 +72,8 @@ int main(int argc, char *argv[]) { FILE* bc_fd; char *bc_fn=NULL; - char *unassigned1_fn=strdup("unassigned_R1.fq"); - char *unassigned2_fn=strdup("unassigned_R2.fq"); + char *unassigned1_fn=NULL; + char *unassigned2_fn=NULL; char *umis_2_short_fn=strdup("umis_too_short.txt"); FILE* log_file; @@ -64,7 +89,7 @@ int main(int argc, char *argv[]) { while (1) { int option_index = 0; //colon after a flag means should have arguments and no colon means just a flag i.e bool, no args after it - optc = getopt_long (argc, argv, "dnucvof:r:b:z:w:m:s:l:z:a:t:", paired_long_options, &option_index); + optc = getopt_long (argc, argv, "dnucvogf:r:b:z:w:m:s:l:z:a:t:", paired_long_options, &option_index); if (optc == -1) break; @@ -120,6 +145,10 @@ int main(int argc, char *argv[]) { params.no_comment = 1; break; + case 'g': + params.gz_out = 1; + break; + case 't': threads = atoi (optarg); break; @@ -169,8 +198,14 @@ int main(int argc, char *argv[]) { exit(EXIT_FAILURE); } - params.unassigned1_fd = fopen(unassigned1_fn, "wb"); - params.unassigned2_fd = fopen(unassigned2_fn, "wb"); + if (!unassigned1_fn) { + unassigned1_fn= params.gz_out ? strdup("unassigned_R1.fq.gz") : strdup("unassigned_R1.fq"); + } + if (!unassigned2_fn) { + unassigned2_fn= params.gz_out ? 
strdup("unassigned_R2.fq.gz") : strdup("unassigned_R2.fq"); + } + params.unassigned1_fd = my_fopen(unassigned1_fn, params.gz_out); + params.unassigned2_fd = my_fopen(unassigned2_fn, params.gz_out); params.umis_2_short_fd = fopen(umis_2_short_fn, "a"); // ? where does this goes? @@ -227,7 +262,7 @@ int main(int argc, char *argv[]) { char line_buff[1024]; int max_items = 6; while(fgets(line_buff, 1024, bc_fd)) { - curr = (barcode_data_t*) malloc(sizeof(barcode_data_t)); + curr = (barcode_data_t*) calloc(1,sizeof(barcode_data_t)); char *p = strtok(line_buff, "\t"); char *s_name = strdup(p); @@ -237,14 +272,14 @@ int main(int argc, char *argv[]) { bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn1[0] = '\0'; - get_bc_fn(&bcout_fn1, s_name, curr->bc_grp, 1); - curr->bcfile1 = fopen(_mkdir(bcout_fn1), "wb"); + get_bc_fn(&bcout_fn1, s_name, curr->bc_grp, 1, params.gz_out); + curr->bcfile1 = my_fopen(_mkdir(bcout_fn1), params.gz_out); if(params.paired > 0 && params.combine < 0) { bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); bcout_fn2[0] = '\0'; - get_bc_fn(&bcout_fn2, s_name, curr->bc_grp, 2); - curr->bcfile2 = fopen(_mkdir(bcout_fn2), "wb"); + get_bc_fn(&bcout_fn2, s_name, curr->bc_grp, 2, params.gz_out); + curr->bcfile2 = my_fopen(_mkdir(bcout_fn2), params.gz_out); } //TODO for hardcode max limit of items in the barcodes file to 6 @@ -271,7 +306,7 @@ int main(int argc, char *argv[]) { free(fq2_fn); free(unassigned1_fn); free(unassigned2_fn); - free(umis_2_short_fn); + //free(umis_2_short_fn); // Threading pthread_t tid[threads]; @@ -344,8 +379,6 @@ int main(int argc, char *argv[]) { \n It took %.2f minutes\n", difftime(end, start)/60); - // good read :) - little_story(EXIT_SUCCESS); fclose(bc_fd); fclose(log_file); @@ -356,7 +389,8 @@ int main(int argc, char *argv[]) { curr = head; while (curr) { fclose(curr->bcfile1); - fclose(curr->bcfile2); + if (curr->bcfile2) + fclose(curr->bcfile2); free (curr->bc_grp); free (curr->bc); @@ -365,6 +399,6 @@ int 
main(int argc, char *argv[]) { free (temp); } - free(curr); - return EXIT_SUCCESS; + // good read :) + little_story(EXIT_SUCCESS); } diff --git a/src/sabre.h b/src/sabre.h index 7f86712..d6a8f33 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -72,6 +72,7 @@ typedef struct { int min_umi_len; int max_5prime_crop; int no_comment; + int gz_out; } param_t; typedef struct { diff --git a/src/usage.c b/src/usage.c index d4a97eb..550326b 100644 --- a/src/usage.c +++ b/src/usage.c @@ -23,6 +23,7 @@ void usage(int status) { \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ \n -a, --max-5prime-crop INT Maximum number of possible bases cropped from 5prime [0]\ \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\ + \n -g, --gz-out gzip the output files (requires pigz or gzip in PATH)\ \n -s, --stats FILE Write stats to file instead of STDOUT [STDOUT]\ \n\ \n Extras:\ diff --git a/src/utils.c b/src/utils.c index 0ed0019..d5f75c7 100644 --- a/src/utils.c +++ b/src/utils.c @@ -109,7 +109,7 @@ match_ret_t chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t max_m // https://stackoverflow.com/questions/21880730/c-what-is-the-best-and-fastest-way-to-concatenate-strings //TODO this is a fastq mystrcat function, that returns a pointer to the end of the string -void get_fqread(char *fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop) { +void get_fqread(char *fqread, fq_rec_t *fq_rec, const char *barcode, char *umi_idx, int no_comment, int n_crop) { fqread[0] = '\0'; @@ -161,7 +161,7 @@ void get_fqread(char *fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, in strcat(fqread, "\n"); } -void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop) { +void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, const char *barcode, char *umi_idx, int no_comment, int n_crop) { fqread[0] = '\0'; //@READNAME:BACRCODE:UMI //1st line @@ -202,7 
+202,7 @@ void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char strcat(fqread, "\n"); } -void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type) { +void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type, int gz) { if(strlen(s_name) > MAX_FILENAME_LENGTH) { fprintf (stderr, @@ -227,6 +227,9 @@ void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type) { read_type); exit(EXIT_FAILURE); } + if (gz) { + strcat(*bcout_fn, ".gz"); + } } void set_default_params(param_t *params) { diff --git a/src/utils.h b/src/utils.h index 66ab24d..ca32703 100644 --- a/src/utils.h +++ b/src/utils.h @@ -22,14 +22,14 @@ typedef struct { //This is needed if compilling with -std=c99, read below for more //https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with -char *strdup(const char*); -char *strndup(const char *s, size_t n); +// char *strdup(const char*); +// char *strndup(const char *s, size_t n); const char * _mkdir (const char *dir); match_ret_t chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t max_mismatch, int max_5prime_crop); -void get_fqread(char *fqread, fq_rec_t *fq_rec, char *barcode, char *umi_idx, int no_comment, int n_crop); -void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, char *barcode, char *umi_idx, int no_comment, int n_crop); -void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type); +void get_fqread(char *fqread, fq_rec_t *fq_rec, const char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, const char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type, int gz); void set_default_params(param_t *params); void params_destroy(param_t *params); From 663365faa7341c943c364c88264a3a2d2c5b371d Mon Sep 17 00:00:00 2001 From: 
serine Date: Wed, 1 Jul 2020 18:20:46 +1000 Subject: [PATCH 55/55] fixed printing of help when no args are given and also drop what appears to be "random" if statement inside switch --- src/sabre.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sabre.c b/src/sabre.c index be9d5cf..a41ce4f 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -29,6 +29,10 @@ FILE* my_fopen(const char* fname, int gz) { int main(int argc, char *argv[]) { + if(argc < 2) { + usage(EXIT_SUCCESS); + } + //more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 static struct option paired_long_options[] = { {"fq1", required_argument, NULL, 'f'}, @@ -94,7 +98,6 @@ int main(int argc, char *argv[]) { if (optc == -1) break; switch (optc) { - if (paired_long_options[option_index].flag != 0) break; case 'f': fq1_fn = (char*) malloc (strlen (optarg) + 1);