diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..135df85 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# ignore C object files +*.o +# ignore executables +sabre +sabre-dev +metrics +# ignore vim swap files +*.swp +*.gz +/tmp diff --git a/Makefile b/Makefile deleted file mode 100644 index 2ff3141..0000000 --- a/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -PROGRAM_NAME = sabre -VERSION = 1.00 -CC = gcc -CFLAGS = -Wall -pedantic -DVERSION=$(VERSION) -DEBUG = -g -OPT = -O3 -ARCHIVE = $(PROGRAM_NAME)_$(VERSION) -LDFLAGS = -lz -SDIR = src - -.PHONY: clean default build distclean dist debug - -default: build - -barcode.o: $(SDIR)/barcode.c - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - -demulti_single.o: $(SDIR)/demulti_single.c $(SDIR)/sabre.h $(SDIR)/kseq.h - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - -demulti_paired.o: $(SDIR)/demulti_paired.c $(SDIR)/sabre.h $(SDIR)/kseq.h - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - -sabre.o: $(SDIR)/sabre.c $(SDIR)/sabre.h - $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c - -clean: - rm -rf *.o $(SDIR)/*.gch ./sabre - -distclean: clean - rm -rf *.tar.gz - -dist: - tar -zcf $(ARCHIVE).tar.gz src Makefile - -build: barcode.o demulti_single.o demulti_paired.o sabre.o - $(CC) $(CFLAGS) $(OPT) $? -o $(PROGRAM_NAME) $(LDFLAGS) - -debug: - $(MAKE) build "CFLAGS=-Wall -pedantic -g -DDEBUG" - diff --git a/README.md b/README.md index 5e84da3..ab3aa22 100644 --- a/README.md +++ b/README.md @@ -1,105 +1,58 @@ -# sabre - A barcode demultiplexing and trimming tool for FastQ files +> This is a fork of the [original repo](https://github.com/najoshi/sabre). I might be taking this tool into a different direction to what was originally intended -## About +# sabre -Next-generation sequencing can currently produce hundreds of millions of reads -per lane of sample and that number increases at a dizzying rate. Barcoding -individual sequences for multiple lines or multiple species is a cost-efficient -method to sequence and analyze a broad range of data. +> A cellular barcode demultiplexing tool of FASTQ files -Sabre is a tool that will demultiplex barcoded reads into separate files. -It will work on both single-end and paired-end data in fastq format. -It simply compares the provided barcodes with each read and separates -the read into its appropriate barcode file, after stripping the barcode from -the read (and also stripping the quality values of the barcode bases). If -a read does not have a recognized barcode, then it is put into the unknown file. -Sabre also has an option (-m) to allow mismatches of the barcodes. +## Content -Sabre also supports gzipped file inputs. Also, since sabre does not use the -quality values in any way, it can be used on fasta data that is converted to -fastq by creating fake quality values. +- [Install](#install) +- [Quick start](#quick-start) +- [Usage](#usage) -Finally, after demultiplexing, sabre outputs a summary of how many records -went into each barcode file. +## Install -## Requirements +```BASH +git clone https://github.com/serine/sabre +cd src +make +``` -Sabre requires a C compiler; GCC or clang are recommended. Sabre -relies on Heng Li's kseq.h, which is bundled with the source. +## Quick start -Sabre also requires Zlib, which can be obtained at -. - -## Building and Installing Sabre - -To build Sabre, enter: - - make - -Then, copy or move "sabre" to a directory in your $PATH. +```BASH +sabre -f MultiplexRNASeq_S1_R1_001.fastq.gz \ + -r MultiplexRNASeq_S1_R2_001.fastq.gz \ + -b barcodes.txt \ + -c \ + -u \ + -m 2 \ + -l 10 \ + -a 1 \ + -s sabre.txt \ + -t 12 +``` ## Usage -Sabre has two modes to work with both paired-end and single-end -reads: `sabre se` and `sabre pe`. - -Running sabre by itself will print the help: - - sabre - -Running sabre with either the "se" or "pe" commands will give help -specific to those commands: - - sabre se - sabre pe - -### Sabre Single End (`sabre se`) - -`sabre se` takes an input fastq file and an input barcode data file and outputs -the reads demultiplexed into separate files using the file names from the data file. -The barcodes will be stripped from the reads and the quality values of the barcode -bases will also be removed. Any reads with unknown barcodes get put into the "unknown" -file specified on the command line. The -m option allows for mismatches in the barcodes. - -#### Barcode data file format for single end - - barcode1 barcode1_output_file.fastq - barcode2 barcode2_output_file.fastq - etc... - -Be aware that if you do not format the barcode data file correctly, sabre will not work properly. - -#### Example - - sabre se -f input_file.fastq -b barcode_data.txt -u unknown_barcode.fastq - sabre se -m 1 -f input_file.fastq -b barcode_data.txt -u unknown_barcode.fastq - -### Sabre Paired End (`sabre pe`) - -`sabre pe` takes two paired-end files and a barcode data file as input and outputs -the reads demultiplexed into separate paired-end files using the file names from the -data file. The barcodes will be stripped from the reads and the quality values of the barcode -bases will also be removed. Any reads with unknown barcodes get put into the "unknown" files -specified on the command line. It also has an option (-c) to remove barcodes from both files. -Using this option means that if sabre finds a barcode in the first file, it assumes the paired -read in the other file has the same barcode and will strip it (along with the quality values). -The -m option allows for mismatches in the barcodes. - -#### Barcode data file format for paired end +> This tool is under development and this is very much an alpha version +> In it's current form the tool is highly customised a particular multiplexing protocol - barcode1 barcode1_output_file1.fastq barcode1_output_file2.fastq - barcode2 barcode2_output_file1.fastq barcode2_output_file2.fastq - etc... +### Cellular barcodes -Be aware that if you do not format the barcode data file correctly, sabre will not work properly. +In order to demultiplex the use needs to provide `barcodes.txt` file, which is three column tab delimited file -#### Examples +``` +sample_name group barcode +``` - sabre pe -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \ - -u unknown_barcode1.fastq -w unknown_barcode1.fastq +currently group is semi-redundant column, it there for a feature that in the development. for most use cases group can equals to barcode - sabre pe -c -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \ - -u unknown_barcode1.fastq -w unknown_barcode1.fastq +e.g - sabre pe -m 2 -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \ - -u unknown_barcode1.fastq -w unknown_barcode1.fastq +``` +cntr_rep1 TAAGGCGA TAAGGCGA +cntr_rep2 CGTACTAG CGTACTAG +treat_rep1 AGGCAGAA AGGCAGAA +treat_rep2 TCCTGAGC TCCTGAGC +``` diff --git a/docs/definitions.md b/docs/definitions.md new file mode 100644 index 0000000..b0a8921 --- /dev/null +++ b/docs/definitions.md @@ -0,0 +1,37 @@ +define blocks along the read + +BARCODE +UMI +READ + +then set values to different block + +BARCODE = 8 +UMI = 10 + +## Andrew's suggestion + +``` +--input sample_A_R1.fastq.gz:i8{index1},r151{read1},i8{index2} +``` + +``` +--fq1 sample_A_R1.fastq.gz:i8{index1},r151{READ1},i8{index2} + +--fq2 sample_A_R2.fastq.gz:i8{index1},r151{READ1},i8{index2} +``` + +We need to check that BARCODE == index1 in both fq1 and fq2 but also check that index1_fq1 == index1_fq2 + +``` +--merge 12 merge R1 into R2 +--merge 21 merge R2 into R1 +``` + +either way resulting read is R1 + +``` +--fq1 sample_A_R1.fastq.gz:8index1,*index2 + +--fq2 sample_A_R2.fastq.gz:i8{index1},r151{read2},i8{index2} +``` diff --git a/docs/modes.md b/docs/modes.md new file mode 100644 index 0000000..62bca89 --- /dev/null +++ b/docs/modes.md @@ -0,0 +1,72 @@ +# Sabre + +## Different running modes + +DOCS: In each case BARCODE and/or UMI are trimed off and +put into FASTQ header: + +Not sure if I should have: + + BARCODE always has a precedent i.e BARCODE:UMI + OR + It follows the same structure as per experiment i.e + if BARCODE+UMI then BARCODE:UMI + else if UMI+BARCODE then UMI:BARCODE + +All modes that begin with 3 will return single - R1 file, merging +R1 read into R2 header and renaming R2 into R1 + +10 = single-end where R1 has the following structure: + + R1 --> + BARCODE+READ + +20 = paired-end where R1 and R2 have the following structure: + + R1 --> <--R2 + BARCODE+READ----READ+BARCODE + +this mode returns single file (R1) with barcode appended and into R1 header + +30 = paired-end where R1 and R2 have the following structure: + + R1 --> <-R2 + BARCODE----READ + +40 = paired-end where + +11 = single-end where R1 has the following structure: + + R1 --> + BARCODE+UMI+READ + +21 = paired-end where R1 and R2 have the following structure: + + R1 --> <--R2 + BARCODE+UMI+READ----READ+UMI+BARCODE + +this mode returns single file (R1) with barcode appended and into R1 header + +31 = paired-end where R1 and R2 have the following structure: + + R1 --> <-R2 + BARCODE+UMI----READ + +NOTE this gives me room for yet another mode e.g 12, 22, 32 + +12 = sinle-end where R1 has the following structure: + + R1 --> + UMI+READ + +22 = paired-end where R1 and R2 have the following structure: + + R1 --> <--R2 + UMI+READ----READ+UMI + +this mode returns single file (R1) with barcode appended and into R1 header + +32 = paired-end where R1 and R2 have the following structure: + + R1 --> <-R2 + UMI----READ diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..feb2a3d --- /dev/null +++ b/src/Makefile @@ -0,0 +1,50 @@ +# Source, Executable, Includes, Library Defines +GIT_VERSION := "$(shell git describe --abbrev=7 --always --tags)" +CC = gcc +INCL = fastq.h sabre.h +#SRC = demulti_paired.c demulti_single.c sabre.c utils.c +SRC = sabre.c usage.c demultiplex.c utils.c fastq.c +OBJ = $(SRC:.c=.o) +DSRC=src + +#CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION) +# need to quote GIT_VERSION so that the value gets passed as a string +CFLAGS = -Wall -O2 -std=gnu99 -pedantic -DVERSION=\"$(GIT_VERSION)\" +CFLAGSDEV = -Wall -O0 -g -std=gnu99 -DVERSION=\"$(GIT_VERSION)-dev\" + +LDFLAGS = -lz -lpthread +GPROF = -pg +EXE = sabre + +.PHONY: default + +default: build +# a smarter way to have an if statement here instead of explicit grpof target +# have a look at gcc -M + +%.o: %.c + $(CC) -c $(CFLAGS) $(SRC) + +usage.o: usage.h sabre.h fastq.h +utils.o: utils.h sabre.h fastq.h +demultiplex.o: demultiplex.h utils.h sabre.h fastq.h +sabre.o: sabre.h + +build: $(OBJ) + $(CC) $(CFLAGS) $(OBJ) -o $(EXE) $(LDFLAGS) + #ln -sf $(DSRC)/$(EXE) ../sabre + +dev: $(OBJDEV) + $(CC) $(CFLAGSDEV) $(SRC) -o $(EXE)-dev $(LDFLAGS) + +metrics: + $(CC) $(CFLAGSDEV) -o metrics metrics.c $(LDFLAGS) + +gprof: + $(CC) $(CFLAGS) $(GPROF) $(SRC) -o $(EXE).gprof $(LDFLAGS) + +clean: + $(RM) $(OBJ) $(EXE) core + +clean-all: + $(RM) $(OBJ) $(EXE) $(EXE)-dev $(EXE).gprof core gmon.out metrics diff --git a/src/barcode.c b/src/barcode.c deleted file mode 100644 index a260281..0000000 --- a/src/barcode.c +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include -#include -#include - -int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch) { - - register unsigned char u1, u2; - int cnt=0; - - while (n-- > 0) { - u1 = (unsigned char) *s1++; - u2 = (unsigned char) *s2++; - - if (u1 != u2) { - cnt++; - if (cnt > mismatch) return u1 - u2; - } - - if (u1 == '\0') return 0; - } - - return 0; -} diff --git a/src/demulti_paired.c b/src/demulti_paired.c deleted file mode 100644 index 2acdb11..0000000 --- a/src/demulti_paired.c +++ /dev/null @@ -1,334 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "sabre.h" -#include "kseq.h" - -KSEQ_INIT(gzFile, gzread) - - -static struct option paired_long_options[] = { - {"pe-file1", required_argument, 0, 'f'}, - {"pe-file2", required_argument, 0, 'r'}, - {"barcode-file", required_argument, 0, 'b'}, - {"unknown-output1", required_argument, 0, 'u'}, - {"unknown-output2", required_argument, 0, 'w'}, - {"both-barcodes", optional_argument, 0, 'c'}, - {"max-mismatch", optional_argument, 0, 'm'}, - {"quiet", optional_argument, 0, 'z'}, - {GETOPT_HELP_OPTION_DECL}, - {GETOPT_VERSION_OPTION_DECL}, - {NULL, 0, NULL, 0} -}; - -void paired_usage (int status) { - - fprintf (stderr, "\nUsage: %s pe -f -r -b -u -w \n\n\ -Options:\n\ --f, --pe-file1, Input paired-end fastq file 1 (required, must have same number of records as pe2)\n\ --r, --pe-file2, Input paired-end fastq file 2 (required, must have same number of records as pe1)\n\ --b, --barcode-file, File with barcode and two output file names per line (required)\n", PROGRAM_NAME); - -fprintf (stderr, "-u, --unknown-output1, Output paired-end file 1 that contains records with no barcodes found. (required)\n\ --w, --unknown-output2, Output paired-end file 2 that contains records with no barcodes found. (required)\n\ --c, --both-barcodes, Optional flag that indicates that both fastq files have barcodes.\n\ --m , --max-mismatch , Optional argument that is the maximum number of mismatches allowed in a barcode. Default 0.\n"); - -fprintf (stderr, "--quiet, don't print barcode matching info\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n"); - - exit (status); -} - -int paired_main (int argc, char *argv[]) { - - gzFile pe1=NULL; - gzFile pe2=NULL; - kseq_t *fqrec1; - kseq_t *fqrec2; - int l1,l2; - FILE* barfile = NULL; - FILE* unknownfile1=NULL; - FILE* unknownfile2=NULL; - int debug=0; - int optc; - extern char *optarg; - char *infn1=NULL; - char *infn2=NULL; - char *barfn=NULL; - char *unknownfn1=NULL; - char *unknownfn2=NULL; - int both_have_barcodes=0; - barcode_data_paired *curr, *head, *temp; - char barcode [MAX_BARCODE_LENGTH]; - char baroutfn1 [MAX_FILENAME_LENGTH]; - char baroutfn2 [MAX_FILENAME_LENGTH]; - int num_unknown=0; - int total=0; - int mismatch=0; - int quiet=0; - - - while (1) { - int option_index = 0; - optc = getopt_long (argc, argv, "dcf:r:b:u:w:m:z", paired_long_options, &option_index); - - if (optc == -1) break; - - switch (optc) { - if (paired_long_options[option_index].flag != 0) break; - - case 'f': - infn1 = (char*) malloc (strlen (optarg) + 1); - strcpy (infn1, optarg); - break; - - case 'r': - infn2 = (char*) malloc (strlen (optarg) + 1); - strcpy (infn2, optarg); - break; - - case 'b': - barfn = (char*) malloc (strlen (optarg) + 1); - strcpy (barfn, optarg); - break; - - case 'u': - unknownfn1 = (char*) malloc (strlen (optarg) + 1); - strcpy (unknownfn1, optarg); - break; - - case 'w': - unknownfn2 = (char*) malloc (strlen (optarg) + 1); - strcpy (unknownfn2, optarg); - break; - - case 'c': - both_have_barcodes=1; - break; - - case 'm': - mismatch = atoi (optarg); - break; - - case 'z': - quiet=1; - break; - - case 'd': - debug = 1; - break; - - case_GETOPT_HELP_CHAR(paired_usage); - case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); - - case '?': - paired_usage (EXIT_FAILURE); - break; - - default: - paired_usage (EXIT_FAILURE); - break; - } - } - - - if (!infn1 || !infn2 || !unknownfn1 || !unknownfn2 || !barfn) { - paired_usage (EXIT_FAILURE); - } - - if (!strcmp (infn1, infn2) || !strcmp (infn1, unknownfn1) || !strcmp (infn1, unknownfn2) || - !strcmp (infn1, barfn) || !strcmp (infn2, unknownfn1) || !strcmp (infn2, unknownfn2) || - !strcmp (infn2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || - !strcmp (unknownfn2, barfn)) { - - fprintf (stderr, "Error: Duplicate input and/or output file names.\n"); - return EXIT_FAILURE; - } - - pe1 = gzopen (infn1, "r"); - if (!pe1) { - fprintf (stderr, "Could not open input file 1 '%s'.\n", infn1); - return EXIT_FAILURE; - } - - pe2 = gzopen (infn2, "r"); - if (!pe2) { - fprintf (stderr, "Could not open input file 2 '%s'.\n", infn2); - return EXIT_FAILURE; - } - - unknownfile1 = fopen (unknownfn1, "w"); - if (!unknownfile1) { - fprintf (stderr, "Could not open unknown output file 1 '%s'.\n", unknownfn1); - return EXIT_FAILURE; - } - - unknownfile2 = fopen (unknownfn2, "w"); - if (!unknownfile2) { - fprintf (stderr, "Could not open unknown output file 2 '%s'.\n", unknownfn2); - return EXIT_FAILURE; - } - - barfile = fopen (barfn, "r"); - if (!barfile) { - fprintf (stderr, "Could not open barcode file '%s'.\n", barfn); - return EXIT_FAILURE; - } - - - /* Creating linked list of barcode data */ - head = NULL; - while (fscanf (barfile, "%s%s%s", barcode, baroutfn1, baroutfn2) != EOF) { - curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired)); - curr->bc = (char*) malloc (strlen(barcode) + 1); - strcpy (curr->bc, barcode); - - curr->bcfile1 = fopen (baroutfn1, "w"); - curr->bcfile2 = fopen (baroutfn2, "w"); - curr->num_records = 0; - - curr->next = head; - head = curr; - } - - - fqrec1 = kseq_init (pe1); - fqrec2 = kseq_init (pe2); - - while ((l1 = kseq_read (fqrec1)) >= 0) { - - l2 = kseq_read (fqrec2); - if (l2 < 0) { - fprintf (stderr, "Error: PE file 2 is shorter than PE file 1. Disregarding rest of PE file 1.\n"); - break; - } - - - /* Go through all barcode data and check if any match to beginning of read */ - /* If it does then put read in that barcode's file, otherwise put in unknown file */ - curr = head; - while (curr) { - if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, strlen (curr->bc), mismatch) == 0) { - break; - } - - curr = curr->next; - } - - - if (curr != NULL) { - fprintf (curr->bcfile1, "@%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); - else fprintf (curr->bcfile1, "\n"); - - fprintf (curr->bcfile1, "%s\n", (fqrec1->seq.s)+strlen(curr->bc)); - - fprintf (curr->bcfile1, "+%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s); - else fprintf (curr->bcfile1, "\n"); - - fprintf (curr->bcfile1, "%s\n", (fqrec1->qual.s)+strlen(curr->bc)); - - - fprintf (curr->bcfile2, "@%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); - else fprintf (curr->bcfile2, "\n"); - - if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->seq.s); - else fprintf (curr->bcfile2, "%s\n", (fqrec2->seq.s)+strlen(curr->bc)); - - fprintf (curr->bcfile2, "+%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s); - else fprintf (curr->bcfile2, "\n"); - - if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->qual.s); - else fprintf (curr->bcfile2, "%s\n", (fqrec2->qual.s)+strlen(curr->bc)); - - curr->num_records += 2; - } - - else { - fprintf (unknownfile1, "@%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s); - else fprintf (unknownfile1, "\n"); - - fprintf (unknownfile1, "%s\n", fqrec1->seq.s); - - fprintf (unknownfile1, "+%s", fqrec1->name.s); - if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s); - else fprintf (unknownfile1, "\n"); - - fprintf (unknownfile1, "%s\n", fqrec1->qual.s); - - - fprintf (unknownfile2, "@%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s); - else fprintf (unknownfile2, "\n"); - - fprintf (unknownfile2, "%s\n", fqrec2->seq.s); - - fprintf (unknownfile2, "+%s", fqrec2->name.s); - if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s); - else fprintf (unknownfile2, "\n"); - - fprintf (unknownfile2, "%s\n", fqrec2->qual.s); - - num_unknown += 2; - } - - total += 2; - } - - if (l1 < 0) { - l2 = kseq_read (fqrec2); - if (l2 >= 0) { - fprintf (stderr, "Error: PE file 1 is shorter than PE file 2. Disregarding rest of PE file 2.\n"); - } - } - - - if (!quiet) { - fprintf (stdout, "\nTotal FastQ records: %d (%d pairs)\n\n", total, total/2); - curr = head; - while (curr) { - fprintf (stdout, "FastQ records for barcode %s: %d (%d pairs)\n", curr->bc, curr->num_records, curr->num_records/2); - curr = curr->next; - } - fprintf (stdout, "\nFastQ records with no barcode match: %d (%d pairs)\n", num_unknown, num_unknown/2); - fprintf (stdout, "\nNumber of mismatches allowed: %d\n\n", mismatch); - } - - - kseq_destroy (fqrec1); - kseq_destroy (fqrec2); - gzclose (pe1); - gzclose (pe2); - fclose (unknownfile1); - fclose (unknownfile2); - fclose (barfile); - - free (infn1); - free (infn2); - free (barfn); - free (unknownfn1); - free (unknownfn2); - - curr = head; - while (curr) { - fclose (curr->bcfile1); - fclose (curr->bcfile2); - free (curr->bc); - temp = curr; - curr = curr->next; - free (temp); - } - - return EXIT_SUCCESS; -} diff --git a/src/demulti_single.c b/src/demulti_single.c deleted file mode 100644 index d304410..0000000 --- a/src/demulti_single.c +++ /dev/null @@ -1,239 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "sabre.h" -#include "kseq.h" - -KSEQ_INIT(gzFile, gzread) - - -static struct option single_long_options[] = { - {"fastq-file", required_argument, 0, 'f'}, - {"barcode-file", required_argument, 0, 'b'}, - {"unknown-output", required_argument, 0, 'u'}, - {"max-mismatch", optional_argument, 0, 'm'}, - {"quiet", optional_argument, 0, 'z'}, - {GETOPT_HELP_OPTION_DECL}, - {GETOPT_VERSION_OPTION_DECL}, - {NULL, 0, NULL, 0} -}; - -void single_usage (int status) { - - fprintf (stderr, "\nUsage: %s se -f -b -u \n\ -\n\ -Options:\n\ --f, --fastq-file, Input fastq file (required)\n", PROGRAM_NAME); - - fprintf (stderr, "-b, --barcode-file, File with barcode and output file name per line (required)\n\ --u, --unknown-output, Output file that contains records with no barcodes found. (required)\n\ --m , --max-mismatch , Optional argument that is the maximum number of mismatches allowed in a barcode. Default 0.\n\ ---quiet, don't output matching info\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n"); - - exit (status); -} - -int single_main (int argc, char *argv[]) { - - gzFile se=NULL; - kseq_t *fqrec; - int debug=0; - int optc; - extern char *optarg; - FILE* barfile = NULL; - FILE* unknownfile=NULL; - char *barfn=NULL; - char *infn=NULL; - char *unknownfn=NULL; - barcode_data *curr, *head, *temp; - char barcode [MAX_BARCODE_LENGTH]; - char baroutfn [MAX_FILENAME_LENGTH]; - int num_unknown=0; - int total=0; - int mismatch=0; - int quiet=0; - - while (1) { - int option_index = 0; - optc = getopt_long (argc, argv, "df:b:u:m:z", single_long_options, &option_index); - - if (optc == -1) break; - - switch (optc) { - if (single_long_options[option_index].flag != 0) break; - - case 'f': - infn = (char*) malloc (strlen (optarg) + 1); - strcpy (infn, optarg); - break; - - case 'b': - barfn = (char*) malloc (strlen (optarg) + 1); - strcpy (barfn, optarg); - break; - - case 'u': - unknownfn = (char*) malloc (strlen (optarg) + 1); - strcpy (unknownfn, optarg); - break; - - case 'm': - mismatch = atoi (optarg); - break; - - case 'z': - quiet=1; - break; - - case 'd': - debug = 1; - break; - - case_GETOPT_HELP_CHAR(single_usage) - case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); - - case '?': - single_usage (EXIT_FAILURE); - break; - - default: - single_usage (EXIT_FAILURE); - break; - } - } - - - if (!infn || !barfn) { - single_usage (EXIT_FAILURE); - } - - if (!strcmp (infn, barfn)) { - fprintf (stderr, "Error: Input file is same as barcode file.\n"); - return EXIT_FAILURE; - } - - se = gzopen (infn, "r"); - if (!se) { - fprintf (stderr, "Could not open input file '%s'.\n", infn); - return EXIT_FAILURE; - } - - barfile = fopen (barfn, "r"); - if (!barfile) { - fprintf (stderr, "Could not open barcode file '%s'.\n", barfn); - return EXIT_FAILURE; - } - - unknownfile = fopen (unknownfn, "w"); - if (!unknownfile) { - fprintf (stderr, "Could not open unknown output file '%s'.\n", unknownfn); - return EXIT_FAILURE; - } - - - /* Set up a linked list of the barcode data */ - head = NULL; - while (fscanf (barfile, "%s%s", barcode, baroutfn) != EOF) { - curr = (barcode_data*) malloc (sizeof (barcode_data)); - curr->bc = (char*) malloc (strlen(barcode) + 1); - strcpy (curr->bc, barcode); - - curr->bcfile = fopen (baroutfn, "w"); - curr->num_records = 0; - - curr->next = head; - head = curr; - } - - - fqrec = kseq_init (se); - - while (kseq_read (fqrec) >= 0) { - - /* Go through linked list of barcode data and compare each barcode */ - /* with the sequence until a match is found or no match is found for any */ - curr = head; - while (curr) { - if (strncmp_with_mismatch (curr->bc, fqrec->seq.s, strlen (curr->bc), mismatch) == 0) { - break; - } - - curr = curr->next; - } - - - /* If barcode data is found, output to demultiplexed file, else output to unknown file */ - if (curr != NULL) { - fprintf (curr->bcfile, "@%s", fqrec->name.s); - if (fqrec->comment.l) fprintf (curr->bcfile, " %s\n", fqrec->comment.s); - else fprintf (curr->bcfile, "\n"); - - fprintf (curr->bcfile, "%s\n", (fqrec->seq.s)+strlen(curr->bc)); - - fprintf (curr->bcfile, "+%s", fqrec->name.s); - if (fqrec->comment.l) fprintf (curr->bcfile, " %s\n", fqrec->comment.s); - else fprintf (curr->bcfile, "\n"); - - fprintf (curr->bcfile, "%s\n", (fqrec->qual.s)+strlen(curr->bc)); - - curr->num_records++; - } - - else { - fprintf (unknownfile, "@%s", fqrec->name.s); - if (fqrec->comment.l) fprintf (unknownfile, " %s\n", fqrec->comment.s); - else fprintf (unknownfile, "\n"); - - fprintf (unknownfile, "%s\n", fqrec->seq.s); - - fprintf (unknownfile, "+%s", fqrec->name.s); - if (fqrec->comment.l) fprintf (unknownfile, " %s\n", fqrec->comment.s); - else fprintf (unknownfile, "\n"); - - fprintf (unknownfile, "%s\n", fqrec->qual.s); - - num_unknown++; - } - - total++; - } - - - if (!quiet) { - fprintf (stdout, "\nTotal FastQ records: %d\n\n", total); - curr = head; - while (curr) { - fprintf (stdout, "FastQ records for barcode %s: %d\n", curr->bc, curr->num_records); - curr = curr->next; - } - fprintf (stdout, "\nFastQ records with no barcode match: %d\n", num_unknown); - fprintf (stdout, "\nNumber of mismatches allowed: %d\n\n", mismatch); - } - - kseq_destroy (fqrec); - gzclose (se); - fclose (barfile); - fclose (unknownfile); - - free (infn); - free (unknownfn); - free (barfn); - - curr = head; - while (curr) { - fclose (curr->bcfile); - free (curr->bc); - temp = curr; - curr = curr->next; - free (temp); - } - - return 0; -} diff --git a/src/demultiplex.c b/src/demultiplex.c new file mode 100644 index 0000000..da149e3 --- /dev/null +++ b/src/demultiplex.c @@ -0,0 +1,207 @@ + +/* + * set softtab=4 + * set shiftwidth=4 + * set expandtab + * + */ + +/* + * sabre FASTQ files demultiplexing + * demultiplex.c: FASTQ demultiplexing + * + */ + +#include "demultiplex.h" + +// Is m1 a better match than m2? 0 -> yes, use m1. 1 -> no, stick with m2 +int better(match_ret_t m1, match_ret_t m2) { + if (m1.cropped<0) + return 0; + if (m2.cropped<0) + return 1; + return (m1.cropped + m1.mismatches) < (m2.cropped+m2.mismatches); +} + +// Is m the best possible match? +int best(match_ret_t m) { + return m.cropped==0 && m.mismatches==0; +} + +void write_out(const param_t *params, pthread_mutex_t *out_lock, metrics_t* metrics, + match_ret_t best_match, barcode_data_t *best_bc, + const char* actl_bc, + fq_rec_t *fq_rec1,fq_rec_t *fq_rec2); + +void* demult_runner(void *arg) { + fq_rec_t *fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); + fq_rec_t *fq_rec2 = (fq_rec_t*) malloc(sizeof(fq_rec_t)); + + init_fq_rec(fq_rec1); + init_fq_rec(fq_rec2); + + thread_data_t* thread_data = (thread_data_t*)arg; + //int my_line_num; + + /* Get reads, one at a time */ + + while(1) { + + // lock reading + pthread_mutex_lock(thread_data->in_lock); + + //this is equivalent to if(false), which means this block + //is always skipped, unless when there is an error/end of the file + if(get_fq_rec(fq_rec1, thread_data->params->fq1_fd)) { + // sanity check no more reads + pthread_mutex_unlock(thread_data->in_lock); + break; + } + + if(thread_data->params->paired > 0) { + if(get_fq_rec(fq_rec2, thread_data->params->fq2_fd)) { + //error out there becuase if reached the end of the file + //then we should hit first break, above, since the assumptions + //that the files of equal length. If issues with R2 only this is an error + fprintf (stderr, "\n\ + \n ERROR: R2 file shorter than R1 file.\ + \n Stopping here:\ + \n %s\ + \n", + fq_rec1->name); + pthread_mutex_unlock(thread_data->in_lock); + exit(1); + } + } + + // unlock reading + // TODO this bit of code for ordered fastq files, implement later? + //my_line_num = *(thread_data->line_num); + //*thread_data->line_num += 1; + pthread_mutex_unlock(thread_data->in_lock); + + // Store a copy of the barcode found in the read (including the mismatches) + char *actl_bc = NULL; + + /* Step 1: Find matching barcode */ + match_ret_t best_match = {-1,-1}; + barcode_data_t *best_bc = NULL; + for (barcode_data_t *curr = thread_data->curr; curr!=NULL; curr = curr->next) { + for (int i=0; curr->bc[i]; i++) { + match_ret_t mtch = chk_bc_mtch(curr->bc[i], fq_rec1->seq, thread_data->params->mismatch, thread_data->params->max_5prime_crop); + if(better(mtch, best_match)) { + //found better match + best_match = mtch; + best_bc = curr; + if (actl_bc) free(actl_bc); + actl_bc = strndup( (fq_rec1->seq)+mtch.cropped, strlen(curr->bc[i]) ); + if (best(best_match)) + break; + } + } + + if (best(best_match)) + break; + } + + /* Step 2: Write read out into barcode specific file */ + //TODO this bit of code to keep fastq files ordered as per original fastq files + //which I don't think that needed? at least not at this stage + // lock writing + //while(*(thread_data->out_line_num) != my_line_num) { + // pthread_cond_wait(thread_data->cv, thread_data->out_lock); + //} + //*thread_data->out_line_num += 1; + + //pthread_cond_broadcast(thread_data->cv); // Tell everyone it might be their turn! + + write_out(thread_data->params, thread_data->out_lock, thread_data->metrics, best_match, best_bc, actl_bc, fq_rec1, fq_rec2); + if (actl_bc) + free(actl_bc); + + thread_data->metrics->total += 2; + } + + free(fq_rec1); + free(fq_rec2); + //free(thread_data); according to valgrind report this line isn't needed since no errors given out.. + return NULL; +} + + +void write_out(const param_t *params, pthread_mutex_t *out_lock, metrics_t* metrics, + match_ret_t best_match, barcode_data_t *best_bc, + const char* actl_bc, + fq_rec_t *fq_rec1,fq_rec_t *fq_rec2) { + + char fqread1[MAX_READ_SIZE]; + char fqread2[MAX_READ_SIZE]; + + fqread1[0] = '\0'; + fqread2[0] = '\0'; + + char *umi_idx = NULL; + + if(best_bc != NULL) { + //for now assume barcode and umi are in R1 read + if(params->umi > 0) { + + const char *actl_umi_idx = (fq_rec1->seq)+strlen(actl_bc)+best_match.cropped; + + if(strlen(actl_umi_idx) < params->min_umi_len) { + pthread_mutex_lock(out_lock); + fprintf(params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), params->min_umi_len); + pthread_mutex_unlock(out_lock); + return; + } + else { + umi_idx = strdup(actl_umi_idx); + umi_idx[params->min_umi_len] = '\0'; + } + } + + if(params->combine > 0 && actl_bc != NULL) { + get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, params->no_comment, best_match.cropped); + + pthread_mutex_lock(out_lock); + fputs(fqread1, best_bc->bcfile1); + pthread_mutex_unlock(out_lock); + + } + else { + get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, params->no_comment, best_match.cropped); + + pthread_mutex_lock(out_lock); + fputs(fqread1, best_bc->bcfile1); + + if(params->paired > 0) { + get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, params->no_comment, best_match.cropped); + + fputs(fqread2, best_bc->bcfile2); + + //dont need to increment buff_cnt, assuming fq_read1 keeps the right count + best_bc->num_records += 1; + } + pthread_mutex_unlock(out_lock); + } + best_bc->num_records += 1; + } + else { + + get_fqread(fqread1, fq_rec1, NULL, NULL, params->no_comment, 0); + + pthread_mutex_lock(out_lock); + fputs(fqread1, params->unassigned1_fd); + + metrics->num_unknown += 1; + + if(params->paired > 0) { + get_fqread(fqread2, fq_rec2, NULL, NULL, params->no_comment, 0); + + fputs(fqread2, params->unassigned2_fd); + + metrics->num_unknown += 1; + } + pthread_mutex_unlock(out_lock); + } +} \ No newline at end of file diff --git a/src/demultiplex.h b/src/demultiplex.h new file mode 100644 index 0000000..22e3417 --- /dev/null +++ b/src/demultiplex.h @@ -0,0 +1,15 @@ +#ifndef DEMULTIPLEX_H +#define DEMULTIPLEX_H + +#include "sabre.h" +#include "utils.h" +#include "fastq.h" + +#define MAX_READ_SIZE 2048 +//#define MAX_READ_NUMBER 100000 +#define MAX_READ_NUMBER 15000 +#define MAX_READ_BUFFER MAX_READ_SIZE*MAX_READ_NUMBER + +void* demult_runner(void *arg); + +#endif /*DEMULTIPLEX_H*/ diff --git a/src/fastq.c b/src/fastq.c new file mode 100644 index 0000000..333ebf5 --- /dev/null +++ b/src/fastq.c @@ -0,0 +1,47 @@ + +#include "fastq.h" + +int get_line(gzFile fq_fd, char *line, int buff) { + + char *new_line = gzgets(fq_fd, line, buff); + + if(new_line == NULL) { + return -1; + } + + int str_len = strlen(new_line); + + if(new_line[str_len-1] != '\n') { + fprintf(stderr, "Line too long %d\n", buff); + exit(1); + } + + new_line[str_len-1] = '\0'; + + return 0; +} +//user strchr +int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd) { + + int done = get_line(fq_fd, fq_rec->name, LINE_SIZE); + done = done || get_line(fq_fd, fq_rec->seq, LINE_SIZE); + done = done || get_line(fq_fd, fq_rec->other, LINE_SIZE); + done = done || get_line(fq_fd, fq_rec->qual, LINE_SIZE); + char *ptr = strchr(fq_rec->name,' '); + if(ptr) { + *ptr='\0'; + fq_rec->comment = ptr+1; + } else { + fq_rec->comment = NULL; + } + // before writing it out check that comment isn't null + + return done; +} + +void init_fq_rec(fq_rec_t *fq_rec) { + fq_rec->name[0] = '\0'; + fq_rec->seq[0] = '\0'; + fq_rec->other[0] = '\0'; + fq_rec->qual[0] = '\0'; +} diff --git a/src/fastq.h b/src/fastq.h new file mode 100644 index 0000000..0083621 --- /dev/null +++ b/src/fastq.h @@ -0,0 +1,20 @@ +#ifndef FASTQ_H +#define FASTQ_H + +#include "sabre.h" + +#define LINE_SIZE 600 + +typedef struct { + char name[LINE_SIZE]; + char *comment; + char seq[LINE_SIZE]; + char other[LINE_SIZE]; + char qual[LINE_SIZE]; +} fq_rec_t; + +void init_fq_rec(fq_rec_t *fq_rec); +int get_line(gzFile fq_fd, char *line, int buff); +int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd); + +#endif /*FASTQ_H*/ diff --git a/src/kseq.h b/src/kseq.h index 73600c4..8f9e498 100644 --- a/src/kseq.h +++ b/src/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,9 +23,7 @@ SOFTWARE. */ -/* Contact: Heng Li */ - -/* Last Modified: 12APR2009 */ +/* Last Modified: 2017-02-11 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -34,47 +32,50 @@ #include #include -#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */ -#define KS_SEP_TAB 1 /* isspace() && !' ' */ -#define KS_SEP_MAX 1 +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - char *buf; \ - int begin, end, is_eof; \ - type_t f; \ - } kstream_t; +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; +#define ks_err(ks) ((ks)->end < 0) #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) -#define __KS_BASIC(type_t, __bufsize) \ - static inline kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; \ - ks->buf = (char*)malloc(__bufsize); \ - return ks; \ - } \ - static inline void ks_destroy(kstream_t *ks) \ - { \ - if (ks) { \ - free(ks->buf); \ - free(ks); \ - } \ - } - -#define __KS_GETC(__read, __bufsize) \ - static inline int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) return -1; \ - } \ - return (int)ks->buf[ks->begin++]; \ +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks_err(ks)) return -3; \ + if (ks_eof(ks)) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; return -1; } \ + else if (ks->end < 0) { ks->is_eof = 1; return -3; } \ + } \ + return (int)ks->buf[ks->begin++]; \ } #ifndef KSTRING_T @@ -89,135 +90,154 @@ typedef struct __kstring_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif -#define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { \ - if (dret) *dret = 0; \ - str->l = 0; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ - } else break; \ - } \ - if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; /* never come to here! */ \ - if (str->m - str->l < i - ks->begin + 1) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (str->l == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ - } \ - str->s[str->l] = '\0'; \ - return str->l; \ - } - -#define KSTREAM_INIT(type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(type_t, __bufsize) \ - __KS_GETC(__read, __bufsize) \ - __KS_GETUNTIL(__read, __bufsize) - -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - static inline void kseq_rewind(kseq_t *ks) \ - { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ - free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ - } +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + int gotany = 0; \ + if (dret) *dret = 0; \ + str->l = append? str->l : 0; \ + for (;;) { \ + int i; \ + if (ks_err(ks)) return -3; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; break; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + gotany = 1; \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (!gotany && ks_eof(ks)) return -1; \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } /* Return value: >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string -*/ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ - if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ - seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - return seq->seq.l; \ - } - -#define __KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ - } kseq_t; - -#define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ + -3 error reading stream + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c,r; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + if (c < 0) return c; /* end of file or error*/ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + seq->is_fastq = (c == '+'); \ + if (!seq->is_fastq) return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \ + if (c == -3) return -3; /* stream error */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char, is_fastq; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); #endif diff --git a/src/metrics.c b/src/metrics.c new file mode 100644 index 0000000..646b3ea --- /dev/null +++ b/src/metrics.c @@ -0,0 +1,187 @@ +#include "utils.h" + +/* + gcc -Wall -O2 -std=c99 -o metrics metrics.c -lz + */ + +#define BARCODE_ARRAY 1000000 + +//int chk_bc_arr(const char *arr, char *bc); + +/* + WISDOM char **arr and char *arr[] mean the same things + but later one more informative + */ + +int chk_bc_arr(barcodes_t *arr, char *bc) { + + int i = 0; + while(arr[i].bc != 0) { + + if(strcmp(bc, arr[i].bc) == 0) { + return i; + } + + i += 1; + } + + arr[i].bc = strdup(bc); + + // error handling + if(i == BARCODE_ARRAY-1) { + fprintf(stderr, "ERROR: gone too far in the array\n"); + exit(1); + } + + // could also do initialisation of the next element here + // arr[i+1]=0; + return i; +} + +int chk_umi_arr(umis_t *arr, char *bc) { + + int i = 0; + while(arr[i].bc != 0) { + + if(strcmp(bc, arr[i].bc) == 0) { + return i; + } + + i += 1; + } + + arr[i].bc = strdup(bc); + arr[i].len = strlen(bc); + + if(i == BARCODE_ARRAY-1) { + fprintf(stderr, "ERROR: gone too far in the array\n"); + exit(1); + } + return i; +} + +int umi_n_cmp(const void *a1, const void *b1) { + + const umis_t* a=a1; + const umis_t* b=b1; + + return a->cnts - b->cnts; +} + +int bc_n_cmp(const void *a1, const void *b1) { + + const barcodes_t* a=a1; + const barcodes_t* b=b1; + + return a->cnts - b->cnts; +} + +int main (int argc, char *argv[]) { + + if (argc <= 2) { + fprintf(stderr, "\n\ + \n Usage: metrics \ + \n\ + \n Options:\ + \n\ + \n mode INT [0|1]; 0 = sample barcode, 1 = umis barcodes\ + \n\ + \n"); + exit (EXIT_SUCCESS); + } + + gzFile pe1=NULL; + pe1 = gzopen(argv[1], "r"); + kseq_t *fqrec1; + int l1; + int mode = atoi(argv[2]); + + fqrec1 = kseq_init(pe1); + + /* + WISDOM these tow are actually different things + char barcodes[BARCODE_ARRAY][BARCODE]; + char *barcodes[]; + */ + + //char **barcodes = calloc(BARCODE_ARRAY, sizeof(char*)); + //int *bc_cnts = calloc(BARCODE_ARRAY, sizeof(int)); + + barcodes_t *barcodes; + barcodes = calloc(BARCODE_ARRAY, sizeof(barcodes_t)); + + umis_t *umis; + umis = calloc(BARCODE_ARRAY, sizeof(umis_t)); + + /* Get reads, one at a time */ + while ((l1 = kseq_read(fqrec1)) >= 0) { + + char *last; + char *last2; + + char *p = strtok(fqrec1->name.s, ":"); + + while (p != NULL) { + last2 = last; + last = p; + p = strtok(NULL, ":"); + } + + if(mode == 0) { + int bc_idx = chk_bc_arr(barcodes, last2); + barcodes[bc_idx].cnts += 1; + } + else if(mode == 1) { + + int umi_idx = chk_umi_arr(umis, last); + umis[umi_idx].cnts += 1; + } + } + + if(mode == 0) { + + int n = 0; + while(barcodes[n].bc != 0) { + n++; + } + qsort(barcodes, n, sizeof(barcodes_t), bc_n_cmp); + + /* + WISDOM this is to limit the right scope for i + */ + + int i = 0; + while(barcodes[i].bc != 0) { + fprintf(stdout, "%s\t%d\n", barcodes[i].bc, barcodes[i].cnts); + i++; + } + } + else if(mode == 1) { + + int n = 0; + while(umis[n].bc != 0) { + n++; + } + qsort(umis, n, sizeof(umis_t), umi_n_cmp); + + int i = 0; + while(umis[i].bc != 0) { + fprintf(stdout, "%s\t%d\t%d\n", umis[i].bc, umis[i].len, umis[i].cnts); + i++; + } + } + + gzclose(pe1); + kseq_destroy(fqrec1); + + return EXIT_SUCCESS; +} +//TODO use struct instead +//used qsort +//write a function to sort e.g bubbleSort +//actually sorting function should be simpler then bubbleSort +//it should return -1, 0 or 1. OR -1 or 1. +// +//barcodes_t barcodes[size]; +// +//barcodes[i].cnts +=1; diff --git a/src/sabre.c b/src/sabre.c index 4e066bb..a41ce4f 100644 --- a/src/sabre.c +++ b/src/sabre.c @@ -1,54 +1,407 @@ -#include -#include -#include -#include -#include -#include -#include -#include + #include "sabre.h" +#include "utils.h" +#include "usage.h" +#include "demultiplex.h" + +FILE* my_fopen(const char* fname, int gz) { + static char* compressor = NULL; -void main_usage (int status) { + if (gz) { - fprintf (stdout, "\nUsage: %s [options]\n\ -\n\ -Command:\n\ -pe\tpaired-end barcode de-multiplexing\n\ -se\tsingle-end barcode de-multiplexing\n\ -\n\ ---help, display this help and exit\n\ ---version, output version information and exit\n\n", PROGRAM_NAME); + if (!compressor) { + // Guess whether to use pigz or gzip + char tmp[100]; + FILE* fin = popen("pigz --version 2>&1", "r"); + char* str = fgets(tmp, 50, fin); + int found = strncmp("pigz",tmp,4)==0; + compressor = str && found ? "pigz -p 2" : "gzip"; + } - exit (status); + char command[2048]; + sprintf(command, "%s > %s", compressor, fname); + FILE* ret = popen(command, "w"); + return ret; + } else { + return fopen(fname, "w"); + } } -int main (int argc, char *argv[]) { - int retval=0; +int main(int argc, char *argv[]) { + + if(argc < 2) { + usage(EXIT_SUCCESS); + } + + //more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3 + static struct option paired_long_options[] = { + {"fq1", required_argument, NULL, 'f'}, + {"fq2", required_argument, NULL, 'r'}, + {"barcodes", required_argument, NULL, 'b'}, + {"unassinged1", required_argument, NULL, 'z'}, + {"unassinged2", required_argument, NULL, 'w'}, + {"combine", optional_argument, NULL, 'c'}, + {"umi", optional_argument, NULL, 'u'}, + {"max-mismatch", required_argument, 0, 'm'}, + {"min-umi-len", required_argument, 0, 'l'}, + {"max-5prime-crop", required_argument, 0, 'a'}, + {"stats", required_argument, NULL, 's'}, + {"no-comment", no_argument, 0, 'n'}, + {"threads", optional_argument, 0, 't'}, + {"gz-out", optional_argument, NULL, 'g'}, + {"version", optional_argument, NULL, 'v'}, + {"help", optional_argument, NULL, 'h'}, + {"story", optional_argument, NULL, 'o'}, + //{"quiet", no_argument, 0, 'z'}, + {NULL, 0, NULL, 0} + }; + + // this is on the stack so no need to malloc + // stack gets cleaned when function exits, + // because of that no need to free either + param_t params; + set_default_params(¶ms); + + metrics_t metrics; + metrics. num_unknown = 0; + metrics.total = 0; + + //clock_t begin = clock(); + time_t start, end; + start = time(NULL); + + char *fq1_fn=NULL; + char *fq2_fn=NULL; + + FILE* bc_fd; + char *bc_fn=NULL; + + char *unassigned1_fn=NULL; + char *unassigned2_fn=NULL; + char *umis_2_short_fn=strdup("umis_too_short.txt"); + + FILE* log_file; + char *log_fn=strdup("stats.txt"); + + int optc; + extern char *optarg; + + barcode_data_t *curr, *head, *temp; + + int threads=4; + + while (1) { + int option_index = 0; + //colon after a flag means should have arguments and no colon means just a flag i.e bool, no args after it + optc = getopt_long (argc, argv, "dnucvogf:r:b:z:w:m:s:l:z:a:t:", paired_long_options, &option_index); + + if (optc == -1) break; + + switch (optc) { + + case 'f': + fq1_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (fq1_fn, optarg); + break; + + case 'r': + fq2_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (fq2_fn, optarg); + break; + + case 'b': + bc_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (bc_fn, optarg); + break; + + case 'z': + unassigned1_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (unassigned1_fn, optarg); + break; + + case 'w': + unassigned2_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (unassigned2_fn, optarg); + break; + + case 'c': + params.combine=1; + break; + + case 'u': + params.umi=1; + break; + + case 'm': + params.mismatch = atoi (optarg); + break; + + case 'l': + params.min_umi_len = atoi (optarg); + break; + + case 'a': + params.max_5prime_crop = atoi (optarg); + break; + + case 'n': + params.no_comment = 1; + break; + + case 'g': + params.gz_out = 1; + break; + + case 't': + threads = atoi (optarg); + break; + + case 's': + log_fn = (char*) malloc (strlen (optarg) + 1); + strcpy (log_fn, optarg); + break; + + case 'v': + version(EXIT_SUCCESS); + break; + + //NOTE if user requrested the help menu i.e --help then + //return success for all other cases below while help menu + //is printed it wasn't intended by user (or at least we don't know that) + //and therefore exit code - fail + case 'h': + usage(EXIT_SUCCESS); + break; + + case 'o': + little_story(EXIT_SUCCESS); + break; + + case '*': + usage(EXIT_FAILURE); + break; + + default: + usage(EXIT_FAILURE); + break; + } + } + + // TODO check that what's requre if not run usage + + params.fq1_fd = gzopen(fq1_fn, "r"); + if(!params.fq1_fd) { + fprintf(stderr, "ERROR: Could not open input file R1 '%s'.\n", fq1_fn); + exit(EXIT_FAILURE); + } + + params.fq2_fd = gzopen(fq2_fn, "r"); + if(!params.fq2_fd) { + fprintf(stderr, "ERROR: Could not open input file R2 '%s'.\n", fq2_fn); + exit(EXIT_FAILURE); + } + + if (!unassigned1_fn) { + unassigned1_fn= params.gz_out ? strdup("unassigned_R1.fq.gz") : strdup("unassigned_R1.fq"); + } + if (!unassigned2_fn) { + unassigned2_fn= params.gz_out ? strdup("unassigned_R2.fq.gz") : strdup("unassigned_R2.fq"); + } + params.unassigned1_fd = my_fopen(unassigned1_fn, params.gz_out); + params.unassigned2_fd = my_fopen(unassigned2_fn, params.gz_out); + params.umis_2_short_fd = fopen(umis_2_short_fn, "a"); + + // ? where does this goes? + fprintf(params.umis_2_short_fd, "name\tumi\tlen\tmin_len\n"); + + //TODO plugin sanity_chk here + + if(params.fq2_fd) { + params.paired = 1; + } + + fprintf(stderr, "\n\ + \n Running: %s\ + \n Command line args:\ + \n --fq1 %s\ + \n --fq2 %s\ + \n --barcodes %s\ + \n --unassigned_R1 %s\ + \n --unassigned_R2 %s\ + \n --combine %d\ + \n --umi %d\ + \n --max-mismatch %d\ + \n --min-umi-len %d\ + \n --max-5prime-crop %d\ + \n --no-comment %d\ + \n --stats %s\ + \n --threads %d\ + \n\ + \n In Progess...\ + \n\ + \n", PROGRAM_NAME,\ + fq1_fn, fq2_fn,\ + bc_fn,\ + unassigned1_fn, unassigned2_fn,\ + params.combine, params.umi,\ + params.mismatch, params.min_umi_len, + params.max_5prime_crop, params.no_comment,\ + log_fn, threads); + + char *bcout_fn1 = NULL; + char *bcout_fn2 = NULL; + /* Creating linked list of barcode data */ + // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/ + // where each node is represents one barcode from the barcode file + bc_fd = fopen(bc_fn, "r"); + if (!bc_fd) { + fprintf(stderr, "ERROR: Unable to barcode file\n"); + exit(EXIT_FAILURE); + } + + head = NULL; + curr = NULL; + + char line_buff[1024]; + int max_items = 6; + while(fgets(line_buff, 1024, bc_fd)) { + curr = (barcode_data_t*) calloc(1,sizeof(barcode_data_t)); + + char *p = strtok(line_buff, "\t"); + char *s_name = strdup(p); + + p = strtok(NULL, "\t"); + curr->bc_grp = strdup(p); + + bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2); + bcout_fn1[0] = '\0'; + get_bc_fn(&bcout_fn1, s_name, curr->bc_grp, 1, params.gz_out); + curr->bcfile1 = my_fopen(_mkdir(bcout_fn1), params.gz_out); + + if(params.paired > 0 && params.combine < 0) { + bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2); + bcout_fn2[0] = '\0'; + get_bc_fn(&bcout_fn2, s_name, curr->bc_grp, 2, params.gz_out); + curr->bcfile2 = my_fopen(_mkdir(bcout_fn2), params.gz_out); + } + + //TODO for hardcode max limit of items in the barcodes file to 6 + curr->bc = calloc(max_items, sizeof(void*)); + + int i=0; + while(i <= max_items && (p = strtok(NULL, "\t\n"))) { + // remove the token, new line char + curr->bc[i] = strdup(p); + fprintf(stdout, " BC %s ", curr->bc[i]); + i++; + } + fprintf(stdout, "\n"); + + curr->num_records = 0; + curr->next = head; + head = curr; + } + + free(bcout_fn1); + free(bcout_fn2); + free(bc_fn); + free(fq1_fn); + free(fq2_fn); + free(unassigned1_fn); + free(unassigned2_fn); + //free(umis_2_short_fn); + + // Threading + pthread_t tid[threads]; + pthread_mutex_t in_lock; + pthread_mutex_t out_lock; + pthread_cond_t cv; + int line_num = 0; + int out_line_num = 0; + + pthread_mutex_init(&in_lock, NULL); + pthread_mutex_init(&out_lock, NULL); + pthread_cond_init(&cv, NULL); + + thread_data_t thread_data[threads]; + + for(int i=0; i < threads; i++) { + + thread_data[i].params = ¶ms; + thread_data[i].curr = curr; + thread_data[i].metrics = &metrics; + thread_data[i].id = i; + thread_data[i].in_lock = &in_lock; + thread_data[i].out_lock = &out_lock; + thread_data[i].line_num = &line_num; + thread_data[i].out_line_num = &out_line_num; + thread_data[i].cv = &cv; + + pthread_create(&(tid[i]), NULL, &demult_runner, thread_data+i); + } + + for(int i=0; i < threads; i++) { + pthread_join(tid[i], NULL); + } + printf("Threads all done\n"); + + pthread_mutex_destroy(&in_lock); + pthread_mutex_destroy(&out_lock); + + //if (!log_fn) { is this better? + if (log_fn == NULL) { + log_file = stdout; + } + else { + log_file = fopen(log_fn, "w"); + } + + fprintf (log_file, "Barcode\tN_records\tN_pairs\tP_pairs\n"); + curr = head; + int total_pairs = metrics.total/2; + + while(curr) { + + int n_pairs = curr->num_records/2; + float percent_pairs = (float) n_pairs/total_pairs; + + fprintf(log_file,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, curr->num_records, n_pairs, percent_pairs); + + curr = curr->next; + } + + int unknown_pairs = metrics.num_unknown/2; + float percent_unknown = (float) unknown_pairs/total_pairs; + float tot_chk = (float) total_pairs/total_pairs; - if (argc < 2 || (strcmp (argv[1],"pe") != 0 && strcmp (argv[1],"se") != 0 && strcmp (argv[1],"--version") != 0 && strcmp (argv[1],"--help") != 0)) { - main_usage (EXIT_FAILURE); - } + fprintf(log_file, "unassigned\t%d\t%d\t%.2f\n", metrics.num_unknown, unknown_pairs, percent_unknown); + fprintf(log_file, "total\t%d\t%d\t%.2f\n", metrics.total, total_pairs, tot_chk); - if (strcmp (argv[1],"--version") == 0) { - fprintf(stdout, "%s version %0.3f\nCopyright (c) 2011 The Regents of University of California, Davis Campus.\n%s is free software and comes with ABSOLUTELY NO WARRANTY.\nDistributed under the MIT License.\n\nWritten by %s\n", PROGRAM_NAME, VERSION, PROGRAM_NAME, AUTHORS); + end = time(NULL); + fprintf(stderr, "\n All done :) \ + \n It took %.2f minutes\n", + difftime(end, start)/60); - exit (EXIT_SUCCESS); - } + fclose(bc_fd); + fclose(log_file); + params_destroy(¶ms); - else if (strcmp (argv[1],"--help") == 0) { - main_usage (EXIT_SUCCESS); - } + free(log_fn); - else if (strcmp (argv[1],"pe") == 0) { - retval = paired_main (argc, argv); - return (retval); - } + curr = head; + while (curr) { + fclose(curr->bcfile1); + if (curr->bcfile2) + fclose(curr->bcfile2); - else if (strcmp (argv[1],"se") == 0) { - retval = single_main (argc, argv); - return (retval); - } + free (curr->bc_grp); + free (curr->bc); + temp = curr; + curr = curr->next; + free (temp); + } - return 0; + // good read :) + little_story(EXIT_SUCCESS); } diff --git a/src/sabre.h b/src/sabre.h index 693af37..d6a8f33 100644 --- a/src/sabre.h +++ b/src/sabre.h @@ -1,67 +1,95 @@ -#ifndef SABRE_H -#define SABRE_H +#ifndef _SABRE_H +#define _SABRE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #ifndef PROGRAM_NAME #define PROGRAM_NAME "sabre" #endif #ifndef AUTHORS -#define AUTHORS "Nikhil Joshi, UC Davis Bioinformatics Core\n" -#endif - -#ifndef VERSION -#define VERSION 0.0 +#define AUTHORS "\n\ + \n Nikhil Joshi, UC Davis Bioinformatics Core\ + \n Kirill Tsyganov, Monash Bioinformatics Platform\ + \n" #endif -/* Options drawn from GNU's coreutils/src/system.h */ -/* These options are defined so as to avoid conflicting with option -values used by commands */ -enum { - GETOPT_HELP_CHAR = (CHAR_MIN - 2), - GETOPT_VERSION_CHAR = (CHAR_MIN - 3) -}; -#define GETOPT_HELP_OPTION_DECL \ -"help", no_argument, NULL, GETOPT_HELP_CHAR -#define GETOPT_VERSION_OPTION_DECL \ -"version", no_argument, NULL, GETOPT_VERSION_CHAR -#define case_GETOPT_HELP_CHAR(Usage_call) \ -case GETOPT_HELP_CHAR: \ -Usage_call(EXIT_SUCCESS); \ -break; -#define case_GETOPT_VERSION_CHAR(Program_name, Version, Authors) \ -case GETOPT_VERSION_CHAR: \ -fprintf(stdout, "%s version %0.3f\nCopyright (c) 2011 The Regents " \ -"of University of California, Davis Campus.\n" \ -"%s is free software and comes with ABSOLUTELY NO WARRANTY.\n"\ -"Distributed under the MIT License.\n\nWritten by %s\n", \ -Program_name, Version, Program_name, Authors); \ -exit(EXIT_SUCCESS); \ -break; -/* end code drawn from system.h */ +//https://semver.org/ #define MAX_BARCODE_LENGTH 100 #define MAX_FILENAME_LENGTH 200 - -typedef struct listel { - char* bc; - int num_records; - FILE* bcfile; - struct listel *next; -} barcode_data; +//NOTE more info on struc and typedef +//https://stackoverflow.com/questions/1675351/typedef-struct-vs-struct-definitions +typedef struct actl_bc_cnt { + char *bc; + int cnt; + struct actl_bc_cnt *next; +} actl_bc_cnt; typedef struct listel_p { - char* bc; + //TODO this is future implementation + //char **bc; + //typedef struct barcode_t *b + // this is a pointer to an array of pointers + char **bc; + char *bc_grp; int num_records; FILE* bcfile1; FILE* bcfile2; + struct actl_bc_cnt *actl_bc_cnt; struct listel_p *next; -} barcode_data_paired; +} barcode_data_t; + +//TODO this is for future implementation +//typdef strcut { +// char *bc; +// barcode_data_t *bacrcode_data; +//} barcode_t + +typedef struct { + gzFile fq1_fd; + gzFile fq2_fd; + FILE* unassigned1_fd; + FILE* unassigned2_fd; + FILE* umis_2_short_fd; + int mismatch; + int combine; + int umi; + int paired; + int min_umi_len; + int max_5prime_crop; + int no_comment; + int gz_out; +} param_t; + +typedef struct { + int num_unknown; + int total; +} metrics_t; +typedef struct { + int id; + const param_t* params; + barcode_data_t* curr; + metrics_t* metrics; + pthread_mutex_t *in_lock, *out_lock; + pthread_cond_t *cv; + volatile int *line_num; // Pointer to line number. Access to this protected by in_lock + volatile int *out_line_num; // Pointer to line number output. Protected by out_lock +} thread_data_t; -/* Function Prototypes */ -int single_main (int argc, char *argv[]); -int paired_main (int argc, char *argv[]); -int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch); -#endif /*SABRE_H*/ +#endif /*_SABRE_H*/ diff --git a/src/sanity_check.c b/src/sanity_check.c new file mode 100644 index 0000000..c6ffb1c --- /dev/null +++ b/src/sanity_check.c @@ -0,0 +1,50 @@ + +void* sanity_chk(void *arg) { + if (!fq1 || !fq2 || !unknownfn1 || !unknownfn2 || !barfn) { + paired_usage (EXIT_FAILURE); + } + + if (!strcmp (fq1, fq2) || !strcmp (fq1, unknownfn1) || !strcmp (fq1, unknownfn2) || + !strcmp (fq1, barfn) || !strcmp (fq2, unknownfn1) || !strcmp (fq2, unknownfn2) || + !strcmp (fq2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) || + !strcmp (unknownfn2, barfn)) { + + fprintf (stderr, "ERROR: Duplicate input and/or output file names.\n"); + return EXIT_FAILURE; + } + + pe1 = gzopen (fq1, "r"); + if (!pe1) { + fprintf (stderr, "ERROR: Could not open input file 1 '%s'.\n", fq1); + return EXIT_FAILURE; + } + + pe2 = gzopen (fq2, "r"); + if (!pe2) { + fprintf (stderr, "ERROR: Could not open input file 2 '%s'.\n", fq2); + return EXIT_FAILURE; + } + + unknownfile1 = gzopen(unknownfn1, "wb"); + if (!unknownfile1) { + fprintf (stderr, "ERROR: Could not open unknown output file 1 '%s'.\n", unknownfn1); + return EXIT_FAILURE; + } + + unknownfile2 = gzopen(unknownfn2, "wb"); + if (!unknownfile2) { + fprintf (stderr, "Could not open unknown output file 2 '%s'.\n", unknownfn2); + return EXIT_FAILURE; + } + + barfile = fopen (barfn, "r"); + if (!barfile) { + fprintf (stderr, "Could not open barcode file '%s'.\n", barfn); + return EXIT_FAILURE; + } + + if(threads < 0) { + fprintf(stderr, "WARNING: Negative number of threads detected %d, setting threads to 1\n", threads); + threads = 1; + } +} diff --git a/src/usage.c b/src/usage.c new file mode 100644 index 0000000..550326b --- /dev/null +++ b/src/usage.c @@ -0,0 +1,80 @@ + +#include "usage.h" + +void usage(int status) { + + fprintf(stderr, "\n Usage: %s [OPTIONS] -f -r -b \ + \n\ + \n\ + \n Options:\ + \n\ + \n Required:\ + \n\ + \n -f, --fq1 FILE Input FASTQ R1 read\ + \n -r, --fq2 FILE Input FASTQ R2 reads\ + \n -b, --barcodes FILE Barcodes files, one barcode per line, e.g BC\\tPREFIX\ + \n -w, --unassigned CHAR Unassigned prefix\ + \n\ + \n Other:\ + \n\ + \n -c, --combine Combine R1 and R2 [NULL]\ + \n -u, --umi Indicates that umi present in the R1 read [NULL]\ + \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\ + \n -l, --min-umi-len INT Minimum UMI length to keep [0]\ + \n -a, --max-5prime-crop INT Maximum number of possible bases cropped from 5prime [0]\ + \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\ + \n -g, --gz-out gzip the output files (requires pigz or gzip in PATH)\ + \n -s, --stats FILE Write stats to file instead of STDOUT [STDOUT]\ + \n\ + \n Extras:\ + \n\ + \n -t, --threads INT specify number of threads to use [4]\ + \n -v, --version get current version\ + \n -h, --help get help menu, exit status is zero\ + \n -o, --story little story about sabre tool\ + \n\ + \n", + PROGRAM_NAME); + + exit(status); +} + +void little_story(int status) { + + fprintf(stdout, "\n\ + \n Little story:\ + \n\ + \n Sabre is a heavy cavalry sword with a curved blade and a single cutting edge\ + \n Not sure though if the meaning was intended by original author...\ + \n\ + \n Later on I was pointed out to me that yes of course it was intended\ + \n since we are cutting off adaptors..\ + \n\ + \n to be continued...\ + \n\ + \n"); + + exit(status); +} + +void version(int status) { + + fprintf(stdout, "\n\ + \n %s\ + \n\ + \n version: %s\ + \n\ + \n Copyright (c) 2011 The Regents of University of California, Davis Campus.\ + \n %s is free software and comes with ABSOLUTELY NO WARRANTY.\ + \n Distributed under the MIT License.\ + \n\ + \n Written by: %s\ + \n\ + \n", + PROGRAM_NAME, + VERSION, + PROGRAM_NAME, + AUTHORS); + + exit(status); +} diff --git a/src/usage.h b/src/usage.h new file mode 100644 index 0000000..4ab8f3f --- /dev/null +++ b/src/usage.h @@ -0,0 +1,10 @@ +#ifndef _USAGE_H +#define _USAGE_H + +#include "sabre.h" + +void usage(int status); +void little_story(int status); +void version(int status); + +#endif /*_USAGE_H*/ diff --git a/src/utils.c b/src/utils.c new file mode 100644 index 0000000..d5f75c7 --- /dev/null +++ b/src/utils.c @@ -0,0 +1,251 @@ +#include "utils.h" + +// https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692 +// https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c +const char * _mkdir(const char *file_path) { + + if(!file_path) { + fprintf (stderr, + "ERROR: This shouldn't happend, file path == %s\n", + file_path); + exit(EXIT_FAILURE); + } + + // return straigth away if a file_path is not nested file path + if(strstr(file_path, "/") == NULL) { + return file_path; + } + //TODO check if directory already exists or not + //struct stat st = {0}; + // + //if (stat(dir, &st) == -1) { + // mkdir(tmp, S_IRWXU); + //} + + //char tmp[PATH_MAX]; // can't get this to work.. + char tmp[256]; + char *p = NULL; + char *dirc = strdup(file_path); + size_t len; + + const char *dir = dirname(dirc); + snprintf(tmp, sizeof(tmp),"%s", dir); + len = strlen(tmp); + + if(tmp[len - 1] == '/') { + tmp[len - 1] = 0; + } + + for(p = tmp + 1; *p; p++) { + if(*p == '/') { + *p = 0; + mkdir(tmp, S_IRWXU); + *p = '/'; + } + } + + mkdir(tmp, S_IRWXU); + free(dirc); + + return file_path; +} + +//NOTE retuns zero on success +//strcmp can be used for sorting, returns pos, zero, neg +//BUT this new implementation can't be used as such just FYI +match_ret_t chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t max_mismatch, int max_5prime_crop) { + int orig_read_len = strlen(orig_read); + int orig_bc_len = strlen(orig_bc); + int n_crop = 0; + match_ret_t ret = { -1,-1 }; + if(orig_bc_len > orig_read_len) { + fprintf (stderr, + "WARNING: Length of the barcode %d is greater than length of the reads %d.", + orig_bc_len, orig_read_len); + return ret; + } + + while(n_crop <= max_5prime_crop) { + + if(n_crop > orig_read_len) { + return ret; + } + + int mismatch = 0; + char u1, u2; + const char *bc = orig_bc; + const char *read = orig_read+n_crop; + int bc_len = orig_bc_len; + + while (bc_len-- > 0) { + u1 = *bc++; + u2 = *read++; + + if (u1 != u2) { + mismatch++; + if (mismatch > max_mismatch) { + break; + } + } + + if (u1 == '\0' || u2 == '\0') { + ret.cropped = n_crop; + ret.mismatches = mismatch; + return ret; + } + } + + if(mismatch <= max_mismatch) { + ret.cropped = n_crop; + ret.mismatches = mismatch; + return ret; + } + + n_crop++; + } + //this is in the case of error + return ret; +} + +// https://stackoverflow.com/questions/21880730/c-what-is-the-best-and-fastest-way-to-concatenate-strings +//TODO this is a fastq mystrcat function, that returns a pointer to the end of the string +void get_fqread(char *fqread, fq_rec_t *fq_rec, const char *barcode, char *umi_idx, int no_comment, int n_crop) { + + fqread[0] = '\0'; + + if(n_crop < 0) { + fprintf(stderr, + "ERROR: n_crop set to %d. This can't happend\n", + n_crop); + exit(EXIT_FAILURE); + } + + //@READNAME:BACRCODE:UMI + //1st line + strcat(fqread, fq_rec->name); + //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI + if(barcode) { + strcat(fqread, ":"); + strcat(fqread, barcode); + } + else if(!barcode) { + barcode = ""; + } + + if(umi_idx) { + strcat(fqread, ":"); + strcat(fqread, umi_idx); + } + + if(fq_rec->comment && no_comment == -1) { + strcat(fqread, " "); + strcat(fqread, fq_rec->comment); + } + strcat(fqread, "\n"); + + //2nd line + strcat(fqread, (fq_rec->seq)+strlen(barcode)+n_crop); + strcat(fqread, "\n"); + + //3rd line + strcat(fqread, "+"); + strcat(fqread, fq_rec->name); + if(fq_rec->comment && no_comment == -1) { + strcat(fqread, " "); + strcat(fqread, fq_rec->comment); + } + strcat(fqread, "\n"); + + //4th line + strcat(fqread, (fq_rec->qual)+strlen(barcode)+n_crop); + strcat(fqread, "\n"); +} + +void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, const char *barcode, char *umi_idx, int no_comment, int n_crop) { + fqread[0] = '\0'; + //@READNAME:BACRCODE:UMI + //1st line + strcat(fqread, fq_rec1->name); + //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI + if(barcode) { + strcat(fqread, ":"); + strcat(fqread, barcode); + } + + if(umi_idx) { + strcat(fqread, ":"); + strcat(fqread, umi_idx); + } + + if(fq_rec1->comment && no_comment == -1) { + strcat(fqread, " "); + strcat(fqread, fq_rec1->comment); + } + strcat(fqread, "\n"); + + //2nd line + strcat(fqread, fq_rec2->seq); + strcat(fqread, "\n"); + + //3rd line + strcat(fqread, "+"); + strcat(fqread, ""); + //strcat(fqread, fq_rec2->name); + //if(fq_rec2->comment && no_comment == -1) { + // strcat(fqread, " "); + // strcat(fqread, fq_rec2->comment); + //} + strcat(fqread, "\n"); + + //4th line + strcat(fqread, (fq_rec2->qual)); + strcat(fqread, "\n"); +} + +void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type, int gz) { + + if(strlen(s_name) > MAX_FILENAME_LENGTH) { + fprintf (stderr, + "ERROR: Too many characters in your sample name; %s:%zd \n", + s_name, strlen(s_name)); + exit(EXIT_FAILURE); + } + + strcat(*bcout_fn, s_name); + strcat(*bcout_fn, "_"); + strcat(*bcout_fn, barcode); + + if(read_type == 1) { + strcat(*bcout_fn, "_R1.fastq"); + } + else if(read_type == 2) { + strcat(*bcout_fn, "_R2.fastq"); + } + else { + fprintf (stderr, + "ERROR: This shouldn't happen, wrong read type was passed through -> %d\n", + read_type); + exit(EXIT_FAILURE); + } + if (gz) { + strcat(*bcout_fn, ".gz"); + } +} + +void set_default_params(param_t *params) { + params->mismatch = 0; + params->combine = -1; + params->umi = -1; + params->paired = -1; + params->min_umi_len = 0; + params->max_5prime_crop = 0; + params->no_comment = -1; +} + +void params_destroy(param_t *params) { + gzclose(params->fq1_fd); + gzclose(params->fq2_fd); + fclose(params->unassigned1_fd); + fclose(params->unassigned2_fd); + fclose(params->umis_2_short_fd); +} diff --git a/src/utils.h b/src/utils.h new file mode 100644 index 0000000..ca32703 --- /dev/null +++ b/src/utils.h @@ -0,0 +1,37 @@ +#ifndef UTILS_H +#define UTILS_H + +#include "sabre.h" +#include "fastq.h" + +typedef struct barcodes_t { + char *bc; + int cnts; +} barcodes_t; + +typedef struct umis_t { + char *bc; + int len; + int cnts; +} umis_t; + +typedef struct { + int mismatches; + int cropped; +} match_ret_t; + +//This is needed if compilling with -std=c99, read below for more +//https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with +// char *strdup(const char*); +// char *strndup(const char *s, size_t n); + +const char * _mkdir (const char *dir); +match_ret_t chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t max_mismatch, int max_5prime_crop); +void get_fqread(char *fqread, fq_rec_t *fq_rec, const char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, const char *barcode, char *umi_idx, int no_comment, int n_crop); +void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type, int gz); + +void set_default_params(param_t *params); +void params_destroy(param_t *params); + +#endif /*UTILS_H*/