diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..135df85
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# ignore C object files
+*.o
+# ignore executables
+sabre
+sabre-dev
+metrics
+# ignore vim swap files
+*.swp
+*.gz
+/tmp
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 2ff3141..0000000
--- a/Makefile
+++ /dev/null
@@ -1,41 +0,0 @@
-PROGRAM_NAME = sabre
-VERSION = 1.00
-CC = gcc
-CFLAGS = -Wall -pedantic -DVERSION=$(VERSION)
-DEBUG = -g
-OPT = -O3
-ARCHIVE = $(PROGRAM_NAME)_$(VERSION)
-LDFLAGS = -lz
-SDIR = src
-
-.PHONY: clean default build distclean dist debug
-
-default: build
-
-barcode.o: $(SDIR)/barcode.c
- $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c
-
-demulti_single.o: $(SDIR)/demulti_single.c $(SDIR)/sabre.h $(SDIR)/kseq.h
- $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c
-
-demulti_paired.o: $(SDIR)/demulti_paired.c $(SDIR)/sabre.h $(SDIR)/kseq.h
- $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c
-
-sabre.o: $(SDIR)/sabre.c $(SDIR)/sabre.h
- $(CC) $(CFLAGS) $(OPT) -c $(SDIR)/$*.c
-
-clean:
- rm -rf *.o $(SDIR)/*.gch ./sabre
-
-distclean: clean
- rm -rf *.tar.gz
-
-dist:
- tar -zcf $(ARCHIVE).tar.gz src Makefile
-
-build: barcode.o demulti_single.o demulti_paired.o sabre.o
- $(CC) $(CFLAGS) $(OPT) $? -o $(PROGRAM_NAME) $(LDFLAGS)
-
-debug:
- $(MAKE) build "CFLAGS=-Wall -pedantic -g -DDEBUG"
-
diff --git a/README.md b/README.md
index 5e84da3..ab3aa22 100644
--- a/README.md
+++ b/README.md
@@ -1,105 +1,58 @@
-# sabre - A barcode demultiplexing and trimming tool for FastQ files
+> This is a fork of the [original repo](https://github.com/najoshi/sabre). I might be taking this tool into a different direction to what was originally intended
-## About
+# sabre
-Next-generation sequencing can currently produce hundreds of millions of reads
-per lane of sample and that number increases at a dizzying rate. Barcoding
-individual sequences for multiple lines or multiple species is a cost-efficient
-method to sequence and analyze a broad range of data.
+> A cellular barcode demultiplexing tool of FASTQ files
-Sabre is a tool that will demultiplex barcoded reads into separate files.
-It will work on both single-end and paired-end data in fastq format.
-It simply compares the provided barcodes with each read and separates
-the read into its appropriate barcode file, after stripping the barcode from
-the read (and also stripping the quality values of the barcode bases). If
-a read does not have a recognized barcode, then it is put into the unknown file.
-Sabre also has an option (-m) to allow mismatches of the barcodes.
+## Content
-Sabre also supports gzipped file inputs. Also, since sabre does not use the
-quality values in any way, it can be used on fasta data that is converted to
-fastq by creating fake quality values.
+- [Install](#install)
+- [Quick start](#quick-start)
+- [Usage](#usage)
-Finally, after demultiplexing, sabre outputs a summary of how many records
-went into each barcode file.
+## Install
-## Requirements
+```BASH
+git clone https://github.com/serine/sabre
+cd src
+make
+```
-Sabre requires a C compiler; GCC or clang are recommended. Sabre
-relies on Heng Li's kseq.h, which is bundled with the source.
+## Quick start
-Sabre also requires Zlib, which can be obtained at
-.
-
-## Building and Installing Sabre
-
-To build Sabre, enter:
-
- make
-
-Then, copy or move "sabre" to a directory in your $PATH.
+```BASH
+sabre -f MultiplexRNASeq_S1_R1_001.fastq.gz \
+ -r MultiplexRNASeq_S1_R2_001.fastq.gz \
+ -b barcodes.txt \
+ -c \
+ -u \
+ -m 2 \
+ -l 10 \
+ -a 1 \
+ -s sabre.txt \
+ -t 12
+```
## Usage
-Sabre has two modes to work with both paired-end and single-end
-reads: `sabre se` and `sabre pe`.
-
-Running sabre by itself will print the help:
-
- sabre
-
-Running sabre with either the "se" or "pe" commands will give help
-specific to those commands:
-
- sabre se
- sabre pe
-
-### Sabre Single End (`sabre se`)
-
-`sabre se` takes an input fastq file and an input barcode data file and outputs
-the reads demultiplexed into separate files using the file names from the data file.
-The barcodes will be stripped from the reads and the quality values of the barcode
-bases will also be removed. Any reads with unknown barcodes get put into the "unknown"
-file specified on the command line. The -m option allows for mismatches in the barcodes.
-
-#### Barcode data file format for single end
-
- barcode1 barcode1_output_file.fastq
- barcode2 barcode2_output_file.fastq
- etc...
-
-Be aware that if you do not format the barcode data file correctly, sabre will not work properly.
-
-#### Example
-
- sabre se -f input_file.fastq -b barcode_data.txt -u unknown_barcode.fastq
- sabre se -m 1 -f input_file.fastq -b barcode_data.txt -u unknown_barcode.fastq
-
-### Sabre Paired End (`sabre pe`)
-
-`sabre pe` takes two paired-end files and a barcode data file as input and outputs
-the reads demultiplexed into separate paired-end files using the file names from the
-data file. The barcodes will be stripped from the reads and the quality values of the barcode
-bases will also be removed. Any reads with unknown barcodes get put into the "unknown" files
-specified on the command line. It also has an option (-c) to remove barcodes from both files.
-Using this option means that if sabre finds a barcode in the first file, it assumes the paired
-read in the other file has the same barcode and will strip it (along with the quality values).
-The -m option allows for mismatches in the barcodes.
-
-#### Barcode data file format for paired end
+> This tool is under development and this is very much an alpha version
+> In it's current form the tool is highly customised a particular multiplexing protocol
- barcode1 barcode1_output_file1.fastq barcode1_output_file2.fastq
- barcode2 barcode2_output_file1.fastq barcode2_output_file2.fastq
- etc...
+### Cellular barcodes
-Be aware that if you do not format the barcode data file correctly, sabre will not work properly.
+In order to demultiplex the use needs to provide `barcodes.txt` file, which is three column tab delimited file
-#### Examples
+```
+sample_name group barcode
+```
- sabre pe -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \
- -u unknown_barcode1.fastq -w unknown_barcode1.fastq
+currently group is semi-redundant column, it there for a feature that in the development. for most use cases group can equals to barcode
- sabre pe -c -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \
- -u unknown_barcode1.fastq -w unknown_barcode1.fastq
+e.g
- sabre pe -m 2 -f input_file1.fastq -r input_file2.fastq -b barcode_data.txt \
- -u unknown_barcode1.fastq -w unknown_barcode1.fastq
+```
+cntr_rep1 TAAGGCGA TAAGGCGA
+cntr_rep2 CGTACTAG CGTACTAG
+treat_rep1 AGGCAGAA AGGCAGAA
+treat_rep2 TCCTGAGC TCCTGAGC
+```
diff --git a/docs/definitions.md b/docs/definitions.md
new file mode 100644
index 0000000..b0a8921
--- /dev/null
+++ b/docs/definitions.md
@@ -0,0 +1,37 @@
+define blocks along the read
+
+BARCODE
+UMI
+READ
+
+then set values to different block
+
+BARCODE = 8
+UMI = 10
+
+## Andrew's suggestion
+
+```
+--input sample_A_R1.fastq.gz:i8{index1},r151{read1},i8{index2}
+```
+
+```
+--fq1 sample_A_R1.fastq.gz:i8{index1},r151{READ1},i8{index2}
+
+--fq2 sample_A_R2.fastq.gz:i8{index1},r151{READ1},i8{index2}
+```
+
+We need to check that BARCODE == index1 in both fq1 and fq2 but also check that index1_fq1 == index1_fq2
+
+```
+--merge 12 merge R1 into R2
+--merge 21 merge R2 into R1
+```
+
+either way resulting read is R1
+
+```
+--fq1 sample_A_R1.fastq.gz:8index1,*index2
+
+--fq2 sample_A_R2.fastq.gz:i8{index1},r151{read2},i8{index2}
+```
diff --git a/docs/modes.md b/docs/modes.md
new file mode 100644
index 0000000..62bca89
--- /dev/null
+++ b/docs/modes.md
@@ -0,0 +1,72 @@
+# Sabre
+
+## Different running modes
+
+DOCS: In each case BARCODE and/or UMI are trimed off and
+put into FASTQ header:
+
+Not sure if I should have:
+
+ BARCODE always has a precedent i.e BARCODE:UMI
+ OR
+ It follows the same structure as per experiment i.e
+ if BARCODE+UMI then BARCODE:UMI
+ else if UMI+BARCODE then UMI:BARCODE
+
+All modes that begin with 3 will return single - R1 file, merging
+R1 read into R2 header and renaming R2 into R1
+
+10 = single-end where R1 has the following structure:
+
+ R1 -->
+ BARCODE+READ
+
+20 = paired-end where R1 and R2 have the following structure:
+
+ R1 --> <--R2
+ BARCODE+READ----READ+BARCODE
+
+this mode returns single file (R1) with barcode appended and into R1 header
+
+30 = paired-end where R1 and R2 have the following structure:
+
+ R1 --> <-R2
+ BARCODE----READ
+
+40 = paired-end where
+
+11 = single-end where R1 has the following structure:
+
+ R1 -->
+ BARCODE+UMI+READ
+
+21 = paired-end where R1 and R2 have the following structure:
+
+ R1 --> <--R2
+ BARCODE+UMI+READ----READ+UMI+BARCODE
+
+this mode returns single file (R1) with barcode appended and into R1 header
+
+31 = paired-end where R1 and R2 have the following structure:
+
+ R1 --> <-R2
+ BARCODE+UMI----READ
+
+NOTE this gives me room for yet another mode e.g 12, 22, 32
+
+12 = sinle-end where R1 has the following structure:
+
+ R1 -->
+ UMI+READ
+
+22 = paired-end where R1 and R2 have the following structure:
+
+ R1 --> <--R2
+ UMI+READ----READ+UMI
+
+this mode returns single file (R1) with barcode appended and into R1 header
+
+32 = paired-end where R1 and R2 have the following structure:
+
+ R1 --> <-R2
+ UMI----READ
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..feb2a3d
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,50 @@
+# Source, Executable, Includes, Library Defines
+GIT_VERSION := "$(shell git describe --abbrev=7 --always --tags)"
+CC = gcc
+INCL = fastq.h sabre.h
+#SRC = demulti_paired.c demulti_single.c sabre.c utils.c
+SRC = sabre.c usage.c demultiplex.c utils.c fastq.c
+OBJ = $(SRC:.c=.o)
+DSRC=src
+
+#CFLAGS = -Wall -O2 -std=c99 -pedantic -DVERSION=$(VERSION)
+# need to quote GIT_VERSION so that the value gets passed as a string
+CFLAGS = -Wall -O2 -std=gnu99 -pedantic -DVERSION=\"$(GIT_VERSION)\"
+CFLAGSDEV = -Wall -O0 -g -std=gnu99 -DVERSION=\"$(GIT_VERSION)-dev\"
+
+LDFLAGS = -lz -lpthread
+GPROF = -pg
+EXE = sabre
+
+.PHONY: default
+
+default: build
+# a smarter way to have an if statement here instead of explicit grpof target
+# have a look at gcc -M
+
+%.o: %.c
+ $(CC) -c $(CFLAGS) $(SRC)
+
+usage.o: usage.h sabre.h fastq.h
+utils.o: utils.h sabre.h fastq.h
+demultiplex.o: demultiplex.h utils.h sabre.h fastq.h
+sabre.o: sabre.h
+
+build: $(OBJ)
+ $(CC) $(CFLAGS) $(OBJ) -o $(EXE) $(LDFLAGS)
+ #ln -sf $(DSRC)/$(EXE) ../sabre
+
+dev: $(OBJDEV)
+ $(CC) $(CFLAGSDEV) $(SRC) -o $(EXE)-dev $(LDFLAGS)
+
+metrics:
+ $(CC) $(CFLAGSDEV) -o metrics metrics.c $(LDFLAGS)
+
+gprof:
+ $(CC) $(CFLAGS) $(GPROF) $(SRC) -o $(EXE).gprof $(LDFLAGS)
+
+clean:
+ $(RM) $(OBJ) $(EXE) core
+
+clean-all:
+ $(RM) $(OBJ) $(EXE) $(EXE)-dev $(EXE).gprof core gmon.out metrics
diff --git a/src/barcode.c b/src/barcode.c
deleted file mode 100644
index a260281..0000000
--- a/src/barcode.c
+++ /dev/null
@@ -1,24 +0,0 @@
-#include
-#include
-#include
-#include
-
-int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch) {
-
- register unsigned char u1, u2;
- int cnt=0;
-
- while (n-- > 0) {
- u1 = (unsigned char) *s1++;
- u2 = (unsigned char) *s2++;
-
- if (u1 != u2) {
- cnt++;
- if (cnt > mismatch) return u1 - u2;
- }
-
- if (u1 == '\0') return 0;
- }
-
- return 0;
-}
diff --git a/src/demulti_paired.c b/src/demulti_paired.c
deleted file mode 100644
index 2acdb11..0000000
--- a/src/demulti_paired.c
+++ /dev/null
@@ -1,334 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "sabre.h"
-#include "kseq.h"
-
-KSEQ_INIT(gzFile, gzread)
-
-
-static struct option paired_long_options[] = {
- {"pe-file1", required_argument, 0, 'f'},
- {"pe-file2", required_argument, 0, 'r'},
- {"barcode-file", required_argument, 0, 'b'},
- {"unknown-output1", required_argument, 0, 'u'},
- {"unknown-output2", required_argument, 0, 'w'},
- {"both-barcodes", optional_argument, 0, 'c'},
- {"max-mismatch", optional_argument, 0, 'm'},
- {"quiet", optional_argument, 0, 'z'},
- {GETOPT_HELP_OPTION_DECL},
- {GETOPT_VERSION_OPTION_DECL},
- {NULL, 0, NULL, 0}
-};
-
-void paired_usage (int status) {
-
- fprintf (stderr, "\nUsage: %s pe -f -r -b -u -w \n\n\
-Options:\n\
--f, --pe-file1, Input paired-end fastq file 1 (required, must have same number of records as pe2)\n\
--r, --pe-file2, Input paired-end fastq file 2 (required, must have same number of records as pe1)\n\
--b, --barcode-file, File with barcode and two output file names per line (required)\n", PROGRAM_NAME);
-
-fprintf (stderr, "-u, --unknown-output1, Output paired-end file 1 that contains records with no barcodes found. (required)\n\
--w, --unknown-output2, Output paired-end file 2 that contains records with no barcodes found. (required)\n\
--c, --both-barcodes, Optional flag that indicates that both fastq files have barcodes.\n\
--m , --max-mismatch , Optional argument that is the maximum number of mismatches allowed in a barcode. Default 0.\n");
-
-fprintf (stderr, "--quiet, don't print barcode matching info\n\
---help, display this help and exit\n\
---version, output version information and exit\n\n");
-
- exit (status);
-}
-
-int paired_main (int argc, char *argv[]) {
-
- gzFile pe1=NULL;
- gzFile pe2=NULL;
- kseq_t *fqrec1;
- kseq_t *fqrec2;
- int l1,l2;
- FILE* barfile = NULL;
- FILE* unknownfile1=NULL;
- FILE* unknownfile2=NULL;
- int debug=0;
- int optc;
- extern char *optarg;
- char *infn1=NULL;
- char *infn2=NULL;
- char *barfn=NULL;
- char *unknownfn1=NULL;
- char *unknownfn2=NULL;
- int both_have_barcodes=0;
- barcode_data_paired *curr, *head, *temp;
- char barcode [MAX_BARCODE_LENGTH];
- char baroutfn1 [MAX_FILENAME_LENGTH];
- char baroutfn2 [MAX_FILENAME_LENGTH];
- int num_unknown=0;
- int total=0;
- int mismatch=0;
- int quiet=0;
-
-
- while (1) {
- int option_index = 0;
- optc = getopt_long (argc, argv, "dcf:r:b:u:w:m:z", paired_long_options, &option_index);
-
- if (optc == -1) break;
-
- switch (optc) {
- if (paired_long_options[option_index].flag != 0) break;
-
- case 'f':
- infn1 = (char*) malloc (strlen (optarg) + 1);
- strcpy (infn1, optarg);
- break;
-
- case 'r':
- infn2 = (char*) malloc (strlen (optarg) + 1);
- strcpy (infn2, optarg);
- break;
-
- case 'b':
- barfn = (char*) malloc (strlen (optarg) + 1);
- strcpy (barfn, optarg);
- break;
-
- case 'u':
- unknownfn1 = (char*) malloc (strlen (optarg) + 1);
- strcpy (unknownfn1, optarg);
- break;
-
- case 'w':
- unknownfn2 = (char*) malloc (strlen (optarg) + 1);
- strcpy (unknownfn2, optarg);
- break;
-
- case 'c':
- both_have_barcodes=1;
- break;
-
- case 'm':
- mismatch = atoi (optarg);
- break;
-
- case 'z':
- quiet=1;
- break;
-
- case 'd':
- debug = 1;
- break;
-
- case_GETOPT_HELP_CHAR(paired_usage);
- case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS);
-
- case '?':
- paired_usage (EXIT_FAILURE);
- break;
-
- default:
- paired_usage (EXIT_FAILURE);
- break;
- }
- }
-
-
- if (!infn1 || !infn2 || !unknownfn1 || !unknownfn2 || !barfn) {
- paired_usage (EXIT_FAILURE);
- }
-
- if (!strcmp (infn1, infn2) || !strcmp (infn1, unknownfn1) || !strcmp (infn1, unknownfn2) ||
- !strcmp (infn1, barfn) || !strcmp (infn2, unknownfn1) || !strcmp (infn2, unknownfn2) ||
- !strcmp (infn2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) ||
- !strcmp (unknownfn2, barfn)) {
-
- fprintf (stderr, "Error: Duplicate input and/or output file names.\n");
- return EXIT_FAILURE;
- }
-
- pe1 = gzopen (infn1, "r");
- if (!pe1) {
- fprintf (stderr, "Could not open input file 1 '%s'.\n", infn1);
- return EXIT_FAILURE;
- }
-
- pe2 = gzopen (infn2, "r");
- if (!pe2) {
- fprintf (stderr, "Could not open input file 2 '%s'.\n", infn2);
- return EXIT_FAILURE;
- }
-
- unknownfile1 = fopen (unknownfn1, "w");
- if (!unknownfile1) {
- fprintf (stderr, "Could not open unknown output file 1 '%s'.\n", unknownfn1);
- return EXIT_FAILURE;
- }
-
- unknownfile2 = fopen (unknownfn2, "w");
- if (!unknownfile2) {
- fprintf (stderr, "Could not open unknown output file 2 '%s'.\n", unknownfn2);
- return EXIT_FAILURE;
- }
-
- barfile = fopen (barfn, "r");
- if (!barfile) {
- fprintf (stderr, "Could not open barcode file '%s'.\n", barfn);
- return EXIT_FAILURE;
- }
-
-
- /* Creating linked list of barcode data */
- head = NULL;
- while (fscanf (barfile, "%s%s%s", barcode, baroutfn1, baroutfn2) != EOF) {
- curr = (barcode_data_paired*) malloc (sizeof (barcode_data_paired));
- curr->bc = (char*) malloc (strlen(barcode) + 1);
- strcpy (curr->bc, barcode);
-
- curr->bcfile1 = fopen (baroutfn1, "w");
- curr->bcfile2 = fopen (baroutfn2, "w");
- curr->num_records = 0;
-
- curr->next = head;
- head = curr;
- }
-
-
- fqrec1 = kseq_init (pe1);
- fqrec2 = kseq_init (pe2);
-
- while ((l1 = kseq_read (fqrec1)) >= 0) {
-
- l2 = kseq_read (fqrec2);
- if (l2 < 0) {
- fprintf (stderr, "Error: PE file 2 is shorter than PE file 1. Disregarding rest of PE file 1.\n");
- break;
- }
-
-
- /* Go through all barcode data and check if any match to beginning of read */
- /* If it does then put read in that barcode's file, otherwise put in unknown file */
- curr = head;
- while (curr) {
- if (strncmp_with_mismatch (curr->bc, fqrec1->seq.s, strlen (curr->bc), mismatch) == 0) {
- break;
- }
-
- curr = curr->next;
- }
-
-
- if (curr != NULL) {
- fprintf (curr->bcfile1, "@%s", fqrec1->name.s);
- if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s);
- else fprintf (curr->bcfile1, "\n");
-
- fprintf (curr->bcfile1, "%s\n", (fqrec1->seq.s)+strlen(curr->bc));
-
- fprintf (curr->bcfile1, "+%s", fqrec1->name.s);
- if (fqrec1->comment.l) fprintf (curr->bcfile1, " %s\n", fqrec1->comment.s);
- else fprintf (curr->bcfile1, "\n");
-
- fprintf (curr->bcfile1, "%s\n", (fqrec1->qual.s)+strlen(curr->bc));
-
-
- fprintf (curr->bcfile2, "@%s", fqrec2->name.s);
- if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s);
- else fprintf (curr->bcfile2, "\n");
-
- if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->seq.s);
- else fprintf (curr->bcfile2, "%s\n", (fqrec2->seq.s)+strlen(curr->bc));
-
- fprintf (curr->bcfile2, "+%s", fqrec2->name.s);
- if (fqrec2->comment.l) fprintf (curr->bcfile2, " %s\n", fqrec2->comment.s);
- else fprintf (curr->bcfile2, "\n");
-
- if (!both_have_barcodes) fprintf (curr->bcfile2, "%s\n", fqrec2->qual.s);
- else fprintf (curr->bcfile2, "%s\n", (fqrec2->qual.s)+strlen(curr->bc));
-
- curr->num_records += 2;
- }
-
- else {
- fprintf (unknownfile1, "@%s", fqrec1->name.s);
- if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s);
- else fprintf (unknownfile1, "\n");
-
- fprintf (unknownfile1, "%s\n", fqrec1->seq.s);
-
- fprintf (unknownfile1, "+%s", fqrec1->name.s);
- if (fqrec1->comment.l) fprintf (unknownfile1, " %s\n", fqrec1->comment.s);
- else fprintf (unknownfile1, "\n");
-
- fprintf (unknownfile1, "%s\n", fqrec1->qual.s);
-
-
- fprintf (unknownfile2, "@%s", fqrec2->name.s);
- if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s);
- else fprintf (unknownfile2, "\n");
-
- fprintf (unknownfile2, "%s\n", fqrec2->seq.s);
-
- fprintf (unknownfile2, "+%s", fqrec2->name.s);
- if (fqrec2->comment.l) fprintf (unknownfile2, " %s\n", fqrec2->comment.s);
- else fprintf (unknownfile2, "\n");
-
- fprintf (unknownfile2, "%s\n", fqrec2->qual.s);
-
- num_unknown += 2;
- }
-
- total += 2;
- }
-
- if (l1 < 0) {
- l2 = kseq_read (fqrec2);
- if (l2 >= 0) {
- fprintf (stderr, "Error: PE file 1 is shorter than PE file 2. Disregarding rest of PE file 2.\n");
- }
- }
-
-
- if (!quiet) {
- fprintf (stdout, "\nTotal FastQ records: %d (%d pairs)\n\n", total, total/2);
- curr = head;
- while (curr) {
- fprintf (stdout, "FastQ records for barcode %s: %d (%d pairs)\n", curr->bc, curr->num_records, curr->num_records/2);
- curr = curr->next;
- }
- fprintf (stdout, "\nFastQ records with no barcode match: %d (%d pairs)\n", num_unknown, num_unknown/2);
- fprintf (stdout, "\nNumber of mismatches allowed: %d\n\n", mismatch);
- }
-
-
- kseq_destroy (fqrec1);
- kseq_destroy (fqrec2);
- gzclose (pe1);
- gzclose (pe2);
- fclose (unknownfile1);
- fclose (unknownfile2);
- fclose (barfile);
-
- free (infn1);
- free (infn2);
- free (barfn);
- free (unknownfn1);
- free (unknownfn2);
-
- curr = head;
- while (curr) {
- fclose (curr->bcfile1);
- fclose (curr->bcfile2);
- free (curr->bc);
- temp = curr;
- curr = curr->next;
- free (temp);
- }
-
- return EXIT_SUCCESS;
-}
diff --git a/src/demulti_single.c b/src/demulti_single.c
deleted file mode 100644
index d304410..0000000
--- a/src/demulti_single.c
+++ /dev/null
@@ -1,239 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "sabre.h"
-#include "kseq.h"
-
-KSEQ_INIT(gzFile, gzread)
-
-
-static struct option single_long_options[] = {
- {"fastq-file", required_argument, 0, 'f'},
- {"barcode-file", required_argument, 0, 'b'},
- {"unknown-output", required_argument, 0, 'u'},
- {"max-mismatch", optional_argument, 0, 'm'},
- {"quiet", optional_argument, 0, 'z'},
- {GETOPT_HELP_OPTION_DECL},
- {GETOPT_VERSION_OPTION_DECL},
- {NULL, 0, NULL, 0}
-};
-
-void single_usage (int status) {
-
- fprintf (stderr, "\nUsage: %s se -f -b -u \n\
-\n\
-Options:\n\
--f, --fastq-file, Input fastq file (required)\n", PROGRAM_NAME);
-
- fprintf (stderr, "-b, --barcode-file, File with barcode and output file name per line (required)\n\
--u, --unknown-output, Output file that contains records with no barcodes found. (required)\n\
--m , --max-mismatch , Optional argument that is the maximum number of mismatches allowed in a barcode. Default 0.\n\
---quiet, don't output matching info\n\
---help, display this help and exit\n\
---version, output version information and exit\n\n");
-
- exit (status);
-}
-
-int single_main (int argc, char *argv[]) {
-
- gzFile se=NULL;
- kseq_t *fqrec;
- int debug=0;
- int optc;
- extern char *optarg;
- FILE* barfile = NULL;
- FILE* unknownfile=NULL;
- char *barfn=NULL;
- char *infn=NULL;
- char *unknownfn=NULL;
- barcode_data *curr, *head, *temp;
- char barcode [MAX_BARCODE_LENGTH];
- char baroutfn [MAX_FILENAME_LENGTH];
- int num_unknown=0;
- int total=0;
- int mismatch=0;
- int quiet=0;
-
- while (1) {
- int option_index = 0;
- optc = getopt_long (argc, argv, "df:b:u:m:z", single_long_options, &option_index);
-
- if (optc == -1) break;
-
- switch (optc) {
- if (single_long_options[option_index].flag != 0) break;
-
- case 'f':
- infn = (char*) malloc (strlen (optarg) + 1);
- strcpy (infn, optarg);
- break;
-
- case 'b':
- barfn = (char*) malloc (strlen (optarg) + 1);
- strcpy (barfn, optarg);
- break;
-
- case 'u':
- unknownfn = (char*) malloc (strlen (optarg) + 1);
- strcpy (unknownfn, optarg);
- break;
-
- case 'm':
- mismatch = atoi (optarg);
- break;
-
- case 'z':
- quiet=1;
- break;
-
- case 'd':
- debug = 1;
- break;
-
- case_GETOPT_HELP_CHAR(single_usage)
- case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS);
-
- case '?':
- single_usage (EXIT_FAILURE);
- break;
-
- default:
- single_usage (EXIT_FAILURE);
- break;
- }
- }
-
-
- if (!infn || !barfn) {
- single_usage (EXIT_FAILURE);
- }
-
- if (!strcmp (infn, barfn)) {
- fprintf (stderr, "Error: Input file is same as barcode file.\n");
- return EXIT_FAILURE;
- }
-
- se = gzopen (infn, "r");
- if (!se) {
- fprintf (stderr, "Could not open input file '%s'.\n", infn);
- return EXIT_FAILURE;
- }
-
- barfile = fopen (barfn, "r");
- if (!barfile) {
- fprintf (stderr, "Could not open barcode file '%s'.\n", barfn);
- return EXIT_FAILURE;
- }
-
- unknownfile = fopen (unknownfn, "w");
- if (!unknownfile) {
- fprintf (stderr, "Could not open unknown output file '%s'.\n", unknownfn);
- return EXIT_FAILURE;
- }
-
-
- /* Set up a linked list of the barcode data */
- head = NULL;
- while (fscanf (barfile, "%s%s", barcode, baroutfn) != EOF) {
- curr = (barcode_data*) malloc (sizeof (barcode_data));
- curr->bc = (char*) malloc (strlen(barcode) + 1);
- strcpy (curr->bc, barcode);
-
- curr->bcfile = fopen (baroutfn, "w");
- curr->num_records = 0;
-
- curr->next = head;
- head = curr;
- }
-
-
- fqrec = kseq_init (se);
-
- while (kseq_read (fqrec) >= 0) {
-
- /* Go through linked list of barcode data and compare each barcode */
- /* with the sequence until a match is found or no match is found for any */
- curr = head;
- while (curr) {
- if (strncmp_with_mismatch (curr->bc, fqrec->seq.s, strlen (curr->bc), mismatch) == 0) {
- break;
- }
-
- curr = curr->next;
- }
-
-
- /* If barcode data is found, output to demultiplexed file, else output to unknown file */
- if (curr != NULL) {
- fprintf (curr->bcfile, "@%s", fqrec->name.s);
- if (fqrec->comment.l) fprintf (curr->bcfile, " %s\n", fqrec->comment.s);
- else fprintf (curr->bcfile, "\n");
-
- fprintf (curr->bcfile, "%s\n", (fqrec->seq.s)+strlen(curr->bc));
-
- fprintf (curr->bcfile, "+%s", fqrec->name.s);
- if (fqrec->comment.l) fprintf (curr->bcfile, " %s\n", fqrec->comment.s);
- else fprintf (curr->bcfile, "\n");
-
- fprintf (curr->bcfile, "%s\n", (fqrec->qual.s)+strlen(curr->bc));
-
- curr->num_records++;
- }
-
- else {
- fprintf (unknownfile, "@%s", fqrec->name.s);
- if (fqrec->comment.l) fprintf (unknownfile, " %s\n", fqrec->comment.s);
- else fprintf (unknownfile, "\n");
-
- fprintf (unknownfile, "%s\n", fqrec->seq.s);
-
- fprintf (unknownfile, "+%s", fqrec->name.s);
- if (fqrec->comment.l) fprintf (unknownfile, " %s\n", fqrec->comment.s);
- else fprintf (unknownfile, "\n");
-
- fprintf (unknownfile, "%s\n", fqrec->qual.s);
-
- num_unknown++;
- }
-
- total++;
- }
-
-
- if (!quiet) {
- fprintf (stdout, "\nTotal FastQ records: %d\n\n", total);
- curr = head;
- while (curr) {
- fprintf (stdout, "FastQ records for barcode %s: %d\n", curr->bc, curr->num_records);
- curr = curr->next;
- }
- fprintf (stdout, "\nFastQ records with no barcode match: %d\n", num_unknown);
- fprintf (stdout, "\nNumber of mismatches allowed: %d\n\n", mismatch);
- }
-
- kseq_destroy (fqrec);
- gzclose (se);
- fclose (barfile);
- fclose (unknownfile);
-
- free (infn);
- free (unknownfn);
- free (barfn);
-
- curr = head;
- while (curr) {
- fclose (curr->bcfile);
- free (curr->bc);
- temp = curr;
- curr = curr->next;
- free (temp);
- }
-
- return 0;
-}
diff --git a/src/demultiplex.c b/src/demultiplex.c
new file mode 100644
index 0000000..da149e3
--- /dev/null
+++ b/src/demultiplex.c
@@ -0,0 +1,207 @@
+
+/*
+ * set softtab=4
+ * set shiftwidth=4
+ * set expandtab
+ *
+ */
+
+/*
+ * sabre FASTQ files demultiplexing
+ * demultiplex.c: FASTQ demultiplexing
+ *
+ */
+
+#include "demultiplex.h"
+
+// Is m1 a better match than m2? 0 -> yes, use m1. 1 -> no, stick with m2
+int better(match_ret_t m1, match_ret_t m2) {
+ if (m1.cropped<0)
+ return 0;
+ if (m2.cropped<0)
+ return 1;
+ return (m1.cropped + m1.mismatches) < (m2.cropped+m2.mismatches);
+}
+
+// Is m the best possible match?
+int best(match_ret_t m) {
+ return m.cropped==0 && m.mismatches==0;
+}
+
+void write_out(const param_t *params, pthread_mutex_t *out_lock, metrics_t* metrics,
+ match_ret_t best_match, barcode_data_t *best_bc,
+ const char* actl_bc,
+ fq_rec_t *fq_rec1,fq_rec_t *fq_rec2);
+
+void* demult_runner(void *arg) {
+ fq_rec_t *fq_rec1 = (fq_rec_t*) malloc(sizeof(fq_rec_t));
+ fq_rec_t *fq_rec2 = (fq_rec_t*) malloc(sizeof(fq_rec_t));
+
+ init_fq_rec(fq_rec1);
+ init_fq_rec(fq_rec2);
+
+ thread_data_t* thread_data = (thread_data_t*)arg;
+ //int my_line_num;
+
+ /* Get reads, one at a time */
+
+ while(1) {
+
+ // lock reading
+ pthread_mutex_lock(thread_data->in_lock);
+
+ //this is equivalent to if(false), which means this block
+ //is always skipped, unless when there is an error/end of the file
+ if(get_fq_rec(fq_rec1, thread_data->params->fq1_fd)) {
+ // sanity check no more reads
+ pthread_mutex_unlock(thread_data->in_lock);
+ break;
+ }
+
+ if(thread_data->params->paired > 0) {
+ if(get_fq_rec(fq_rec2, thread_data->params->fq2_fd)) {
+ //error out there becuase if reached the end of the file
+ //then we should hit first break, above, since the assumptions
+ //that the files of equal length. If issues with R2 only this is an error
+ fprintf (stderr, "\n\
+ \n ERROR: R2 file shorter than R1 file.\
+ \n Stopping here:\
+ \n %s\
+ \n",
+ fq_rec1->name);
+ pthread_mutex_unlock(thread_data->in_lock);
+ exit(1);
+ }
+ }
+
+ // unlock reading
+ // TODO this bit of code for ordered fastq files, implement later?
+ //my_line_num = *(thread_data->line_num);
+ //*thread_data->line_num += 1;
+ pthread_mutex_unlock(thread_data->in_lock);
+
+ // Store a copy of the barcode found in the read (including the mismatches)
+ char *actl_bc = NULL;
+
+ /* Step 1: Find matching barcode */
+ match_ret_t best_match = {-1,-1};
+ barcode_data_t *best_bc = NULL;
+ for (barcode_data_t *curr = thread_data->curr; curr!=NULL; curr = curr->next) {
+ for (int i=0; curr->bc[i]; i++) {
+ match_ret_t mtch = chk_bc_mtch(curr->bc[i], fq_rec1->seq, thread_data->params->mismatch, thread_data->params->max_5prime_crop);
+ if(better(mtch, best_match)) {
+ //found better match
+ best_match = mtch;
+ best_bc = curr;
+ if (actl_bc) free(actl_bc);
+ actl_bc = strndup( (fq_rec1->seq)+mtch.cropped, strlen(curr->bc[i]) );
+ if (best(best_match))
+ break;
+ }
+ }
+
+ if (best(best_match))
+ break;
+ }
+
+ /* Step 2: Write read out into barcode specific file */
+ //TODO this bit of code to keep fastq files ordered as per original fastq files
+ //which I don't think that needed? at least not at this stage
+ // lock writing
+ //while(*(thread_data->out_line_num) != my_line_num) {
+ // pthread_cond_wait(thread_data->cv, thread_data->out_lock);
+ //}
+ //*thread_data->out_line_num += 1;
+
+ //pthread_cond_broadcast(thread_data->cv); // Tell everyone it might be their turn!
+
+ write_out(thread_data->params, thread_data->out_lock, thread_data->metrics, best_match, best_bc, actl_bc, fq_rec1, fq_rec2);
+ if (actl_bc)
+ free(actl_bc);
+
+ thread_data->metrics->total += 2;
+ }
+
+ free(fq_rec1);
+ free(fq_rec2);
+ //free(thread_data); according to valgrind report this line isn't needed since no errors given out..
+ return NULL;
+}
+
+
+void write_out(const param_t *params, pthread_mutex_t *out_lock, metrics_t* metrics,
+ match_ret_t best_match, barcode_data_t *best_bc,
+ const char* actl_bc,
+ fq_rec_t *fq_rec1,fq_rec_t *fq_rec2) {
+
+ char fqread1[MAX_READ_SIZE];
+ char fqread2[MAX_READ_SIZE];
+
+ fqread1[0] = '\0';
+ fqread2[0] = '\0';
+
+ char *umi_idx = NULL;
+
+ if(best_bc != NULL) {
+ //for now assume barcode and umi are in R1 read
+ if(params->umi > 0) {
+
+ const char *actl_umi_idx = (fq_rec1->seq)+strlen(actl_bc)+best_match.cropped;
+
+ if(strlen(actl_umi_idx) < params->min_umi_len) {
+ pthread_mutex_lock(out_lock);
+ fprintf(params->umis_2_short_fd, "%s\t%s\t%zu\t%d\n", fq_rec1->name, actl_umi_idx, strlen(actl_umi_idx), params->min_umi_len);
+ pthread_mutex_unlock(out_lock);
+ return;
+ }
+ else {
+ umi_idx = strdup(actl_umi_idx);
+ umi_idx[params->min_umi_len] = '\0';
+ }
+ }
+
+ if(params->combine > 0 && actl_bc != NULL) {
+ get_merged_fqread(fqread1, fq_rec1, fq_rec2, actl_bc, umi_idx, params->no_comment, best_match.cropped);
+
+ pthread_mutex_lock(out_lock);
+ fputs(fqread1, best_bc->bcfile1);
+ pthread_mutex_unlock(out_lock);
+
+ }
+ else {
+ get_fqread(fqread1, fq_rec1, actl_bc, umi_idx, params->no_comment, best_match.cropped);
+
+ pthread_mutex_lock(out_lock);
+ fputs(fqread1, best_bc->bcfile1);
+
+ if(params->paired > 0) {
+ get_fqread(fqread2, fq_rec1, actl_bc, umi_idx, params->no_comment, best_match.cropped);
+
+ fputs(fqread2, best_bc->bcfile2);
+
+ //dont need to increment buff_cnt, assuming fq_read1 keeps the right count
+ best_bc->num_records += 1;
+ }
+ pthread_mutex_unlock(out_lock);
+ }
+ best_bc->num_records += 1;
+ }
+ else {
+
+ get_fqread(fqread1, fq_rec1, NULL, NULL, params->no_comment, 0);
+
+ pthread_mutex_lock(out_lock);
+ fputs(fqread1, params->unassigned1_fd);
+
+ metrics->num_unknown += 1;
+
+ if(params->paired > 0) {
+ get_fqread(fqread2, fq_rec2, NULL, NULL, params->no_comment, 0);
+
+ fputs(fqread2, params->unassigned2_fd);
+
+ metrics->num_unknown += 1;
+ }
+ pthread_mutex_unlock(out_lock);
+ }
+}
\ No newline at end of file
diff --git a/src/demultiplex.h b/src/demultiplex.h
new file mode 100644
index 0000000..22e3417
--- /dev/null
+++ b/src/demultiplex.h
@@ -0,0 +1,15 @@
+#ifndef DEMULTIPLEX_H
+#define DEMULTIPLEX_H
+
+#include "sabre.h"
+#include "utils.h"
+#include "fastq.h"
+
+#define MAX_READ_SIZE 2048
+//#define MAX_READ_NUMBER 100000
+#define MAX_READ_NUMBER 15000
+#define MAX_READ_BUFFER MAX_READ_SIZE*MAX_READ_NUMBER
+
+void* demult_runner(void *arg);
+
+#endif /*DEMULTIPLEX_H*/
diff --git a/src/fastq.c b/src/fastq.c
new file mode 100644
index 0000000..333ebf5
--- /dev/null
+++ b/src/fastq.c
@@ -0,0 +1,47 @@
+
+#include "fastq.h"
+
+int get_line(gzFile fq_fd, char *line, int buff) {
+
+ char *new_line = gzgets(fq_fd, line, buff);
+
+ if(new_line == NULL) {
+ return -1;
+ }
+
+ int str_len = strlen(new_line);
+
+ if(new_line[str_len-1] != '\n') {
+ fprintf(stderr, "Line too long %d\n", buff);
+ exit(1);
+ }
+
+ new_line[str_len-1] = '\0';
+
+ return 0;
+}
+//user strchr
+int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd) {
+
+ int done = get_line(fq_fd, fq_rec->name, LINE_SIZE);
+ done = done || get_line(fq_fd, fq_rec->seq, LINE_SIZE);
+ done = done || get_line(fq_fd, fq_rec->other, LINE_SIZE);
+ done = done || get_line(fq_fd, fq_rec->qual, LINE_SIZE);
+ char *ptr = strchr(fq_rec->name,' ');
+ if(ptr) {
+ *ptr='\0';
+ fq_rec->comment = ptr+1;
+ } else {
+ fq_rec->comment = NULL;
+ }
+ // before writing it out check that comment isn't null
+
+ return done;
+}
+
+void init_fq_rec(fq_rec_t *fq_rec) {
+ fq_rec->name[0] = '\0';
+ fq_rec->seq[0] = '\0';
+ fq_rec->other[0] = '\0';
+ fq_rec->qual[0] = '\0';
+}
diff --git a/src/fastq.h b/src/fastq.h
new file mode 100644
index 0000000..0083621
--- /dev/null
+++ b/src/fastq.h
@@ -0,0 +1,20 @@
+#ifndef FASTQ_H
+#define FASTQ_H
+
+#include "sabre.h"
+
+#define LINE_SIZE 600
+
+typedef struct {
+ char name[LINE_SIZE];
+ char *comment;
+ char seq[LINE_SIZE];
+ char other[LINE_SIZE];
+ char qual[LINE_SIZE];
+} fq_rec_t;
+
+void init_fq_rec(fq_rec_t *fq_rec);
+int get_line(gzFile fq_fd, char *line, int buff);
+int get_fq_rec(fq_rec_t *fq_rec, gzFile fq_fd);
+
+#endif /*FASTQ_H*/
diff --git a/src/kseq.h b/src/kseq.h
index 73600c4..8f9e498 100644
--- a/src/kseq.h
+++ b/src/kseq.h
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2008 Genome Research Ltd (GRL).
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@@ -23,9 +23,7 @@
SOFTWARE.
*/
-/* Contact: Heng Li */
-
-/* Last Modified: 12APR2009 */
+/* Last Modified: 2017-02-11 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
@@ -34,47 +32,50 @@
#include
#include
-#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */
-#define KS_SEP_TAB 1 /* isspace() && !' ' */
-#define KS_SEP_MAX 1
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX 2
-#define __KS_TYPE(type_t) \
- typedef struct __kstream_t { \
- char *buf; \
- int begin, end, is_eof; \
- type_t f; \
- } kstream_t;
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ unsigned char *buf; \
+ int begin, end, is_eof; \
+ type_t f; \
+ } kstream_t;
+#define ks_err(ks) ((ks)->end < 0)
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
-#define __KS_BASIC(type_t, __bufsize) \
- static inline kstream_t *ks_init(type_t f) \
- { \
- kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
- ks->f = f; \
- ks->buf = (char*)malloc(__bufsize); \
- return ks; \
- } \
- static inline void ks_destroy(kstream_t *ks) \
- { \
- if (ks) { \
- free(ks->buf); \
- free(ks); \
- } \
- }
-
-#define __KS_GETC(__read, __bufsize) \
- static inline int ks_getc(kstream_t *ks) \
- { \
- if (ks->is_eof && ks->begin >= ks->end) return -1; \
- if (ks->begin >= ks->end) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) return -1; \
- } \
- return (int)ks->buf[ks->begin++]; \
+#define __KS_BASIC(type_t, __bufsize) \
+ static inline kstream_t *ks_init(type_t f) \
+ { \
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+ ks->f = f; \
+ ks->buf = (unsigned char*)malloc(__bufsize); \
+ return ks; \
+ } \
+ static inline void ks_destroy(kstream_t *ks) \
+ { \
+ if (ks) { \
+ free(ks->buf); \
+ free(ks); \
+ } \
+ }
+
+#define __KS_GETC(__read, __bufsize) \
+ static inline int ks_getc(kstream_t *ks) \
+ { \
+ if (ks_err(ks)) return -3; \
+ if (ks_eof(ks)) return -1; \
+ if (ks->begin >= ks->end) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end == 0) { ks->is_eof = 1; return -1; } \
+ else if (ks->end < 0) { ks->is_eof = 1; return -3; } \
+ } \
+ return (int)ks->buf[ks->begin++]; \
}
#ifndef KSTRING_T
@@ -89,135 +90,154 @@ typedef struct __kstring_t {
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
-#define __KS_GETUNTIL(__read, __bufsize) \
- static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
- { \
- if (dret) *dret = 0; \
- str->l = 0; \
- if (ks->begin >= ks->end && ks->is_eof) return -1; \
- for (;;) { \
- int i; \
- if (ks->begin >= ks->end) { \
- if (!ks->is_eof) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) break; \
- } else break; \
- } \
- if (delimiter > KS_SEP_MAX) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (ks->buf[i] == delimiter) break; \
- } else if (delimiter == KS_SEP_SPACE) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i])) break; \
- } else if (delimiter == KS_SEP_TAB) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
- } else i = 0; /* never come to here! */ \
- if (str->m - str->l < i - ks->begin + 1) { \
- str->m = str->l + (i - ks->begin) + 1; \
- kroundup32(str->m); \
- str->s = (char*)realloc(str->s, str->m); \
- } \
- memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
- str->l = str->l + (i - ks->begin); \
- ks->begin = i + 1; \
- if (i < ks->end) { \
- if (dret) *dret = ks->buf[i]; \
- break; \
- } \
- } \
- if (str->l == 0) { \
- str->m = 1; \
- str->s = (char*)calloc(1, 1); \
- } \
- str->s[str->l] = '\0'; \
- return str->l; \
- }
-
-#define KSTREAM_INIT(type_t, __read, __bufsize) \
- __KS_TYPE(type_t) \
- __KS_BASIC(type_t, __bufsize) \
- __KS_GETC(__read, __bufsize) \
- __KS_GETUNTIL(__read, __bufsize)
-
-#define __KSEQ_BASIC(type_t) \
- static inline kseq_t *kseq_init(type_t fd) \
- { \
- kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
- s->f = ks_init(fd); \
- return s; \
- } \
- static inline void kseq_rewind(kseq_t *ks) \
- { \
- ks->last_char = 0; \
- ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
- } \
- static inline void kseq_destroy(kseq_t *ks) \
- { \
- if (!ks) return; \
- free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
- ks_destroy(ks->f); \
- free(ks); \
- }
+#define __KS_GETUNTIL(__read, __bufsize) \
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+ { \
+ int gotany = 0; \
+ if (dret) *dret = 0; \
+ str->l = append? str->l : 0; \
+ for (;;) { \
+ int i; \
+ if (ks_err(ks)) return -3; \
+ if (ks->begin >= ks->end) { \
+ if (!ks->is_eof) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end == 0) { ks->is_eof = 1; break; } \
+ if (ks->end == -1) { ks->is_eof = 1; return -3; } \
+ } else break; \
+ } \
+ if (delimiter == KS_SEP_LINE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == '\n') break; \
+ } else if (delimiter > KS_SEP_MAX) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == delimiter) break; \
+ } else if (delimiter == KS_SEP_SPACE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i])) break; \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+ str->m = str->l + (i - ks->begin) + 1; \
+ kroundup32(str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
+ } \
+ gotany = 1; \
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+ str->l = str->l + (i - ks->begin); \
+ ks->begin = i + 1; \
+ if (i < ks->end) { \
+ if (dret) *dret = ks->buf[i]; \
+ break; \
+ } \
+ } \
+ if (!gotany && ks_eof(ks)) return -1; \
+ if (str->s == 0) { \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+ str->s[str->l] = '\0'; \
+ return str->l; \
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(type_t, __bufsize) \
+ __KS_GETC(__read, __bufsize) \
+ __KS_GETUNTIL(__read, __bufsize)
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) \
+ SCOPE kseq_t *kseq_init(type_t fd) \
+ { \
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+ s->f = ks_init(fd); \
+ return s; \
+ } \
+ SCOPE void kseq_destroy(kseq_t *ks) \
+ { \
+ if (!ks) return; \
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+ ks_destroy(ks->f); \
+ free(ks); \
+ }
/* Return value:
>=0 length of the sequence (normal)
-1 end-of-file
-2 truncated quality string
-*/
-#define __KSEQ_READ \
- static int kseq_read(kseq_t *seq) \
- { \
- int c; \
- kstream_t *ks = seq->f; \
- if (seq->last_char == 0) { /* then jump to the next header line */ \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
- if (c == -1) return -1; /* end of file */ \
- seq->last_char = c; \
- } /* the first header char has been read */ \
- seq->comment.l = seq->seq.l = seq->qual.l = 0; \
- if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
- if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
- if (isgraph(c)) { /* printable non-space character */ \
- if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
- seq->seq.m = seq->seq.l + 2; \
- kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
- seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
- } \
- seq->seq.s[seq->seq.l++] = (char)c; \
- } \
- } \
- if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
- seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
- if (c != '+') return seq->seq.l; /* FASTA */ \
- if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
- seq->qual.m = seq->seq.m; \
- seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
- } \
- while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
- if (c == -1) return -2; /* we should not stop here */ \
- while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
- if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
- seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
- seq->last_char = 0; /* we have not come to the next header line */ \
- if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
- return seq->seq.l; \
- }
-
-#define __KSEQ_TYPE(type_t) \
- typedef struct { \
- kstring_t name, comment, seq, qual; \
- int last_char; \
- kstream_t *f; \
- } kseq_t;
-
-#define KSEQ_INIT(type_t, __read) \
- KSTREAM_INIT(type_t, __read, 4096) \
- __KSEQ_TYPE(type_t) \
- __KSEQ_BASIC(type_t) \
- __KSEQ_READ
+ -3 error reading stream
+ */
+#define __KSEQ_READ(SCOPE) \
+ SCOPE int kseq_read(kseq_t *seq) \
+ { \
+ int c,r; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
+ if (c < 0) return c; /* end of file or error*/ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
+ while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \
+ if (c == '\n') continue; /* skip empty lines */ \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ seq->is_fastq = (c == '+'); \
+ if (!seq->is_fastq) return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \
+ if (c == -3) return -3; /* stream error */ \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
+ }
+
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char, is_fastq; \
+ kstream_t *f; \
+ } kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 16384) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(SCOPE, type_t) \
+ __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+ __KS_TYPE(type_t) \
+ __KSEQ_TYPE(type_t) \
+ extern kseq_t *kseq_init(type_t fd); \
+ void kseq_destroy(kseq_t *ks); \
+ int kseq_read(kseq_t *seq);
#endif
diff --git a/src/metrics.c b/src/metrics.c
new file mode 100644
index 0000000..646b3ea
--- /dev/null
+++ b/src/metrics.c
@@ -0,0 +1,187 @@
+#include "utils.h"
+
+/*
+ gcc -Wall -O2 -std=c99 -o metrics metrics.c -lz
+ */
+
+#define BARCODE_ARRAY 1000000
+
+//int chk_bc_arr(const char *arr, char *bc);
+
+/*
+ WISDOM char **arr and char *arr[] mean the same things
+ but later one more informative
+ */
+
+int chk_bc_arr(barcodes_t *arr, char *bc) {
+
+ int i = 0;
+ while(arr[i].bc != 0) {
+
+ if(strcmp(bc, arr[i].bc) == 0) {
+ return i;
+ }
+
+ i += 1;
+ }
+
+ arr[i].bc = strdup(bc);
+
+ // error handling
+ if(i == BARCODE_ARRAY-1) {
+ fprintf(stderr, "ERROR: gone too far in the array\n");
+ exit(1);
+ }
+
+ // could also do initialisation of the next element here
+ // arr[i+1]=0;
+ return i;
+}
+
+int chk_umi_arr(umis_t *arr, char *bc) {
+
+ int i = 0;
+ while(arr[i].bc != 0) {
+
+ if(strcmp(bc, arr[i].bc) == 0) {
+ return i;
+ }
+
+ i += 1;
+ }
+
+ arr[i].bc = strdup(bc);
+ arr[i].len = strlen(bc);
+
+ if(i == BARCODE_ARRAY-1) {
+ fprintf(stderr, "ERROR: gone too far in the array\n");
+ exit(1);
+ }
+ return i;
+}
+
+int umi_n_cmp(const void *a1, const void *b1) {
+
+ const umis_t* a=a1;
+ const umis_t* b=b1;
+
+ return a->cnts - b->cnts;
+}
+
+int bc_n_cmp(const void *a1, const void *b1) {
+
+ const barcodes_t* a=a1;
+ const barcodes_t* b=b1;
+
+ return a->cnts - b->cnts;
+}
+
+int main (int argc, char *argv[]) {
+
+ if (argc <= 2) {
+ fprintf(stderr, "\n\
+ \n Usage: metrics \
+ \n\
+ \n Options:\
+ \n\
+ \n mode INT [0|1]; 0 = sample barcode, 1 = umis barcodes\
+ \n\
+ \n");
+ exit (EXIT_SUCCESS);
+ }
+
+ gzFile pe1=NULL;
+ pe1 = gzopen(argv[1], "r");
+ kseq_t *fqrec1;
+ int l1;
+ int mode = atoi(argv[2]);
+
+ fqrec1 = kseq_init(pe1);
+
+ /*
+ WISDOM these tow are actually different things
+ char barcodes[BARCODE_ARRAY][BARCODE];
+ char *barcodes[];
+ */
+
+ //char **barcodes = calloc(BARCODE_ARRAY, sizeof(char*));
+ //int *bc_cnts = calloc(BARCODE_ARRAY, sizeof(int));
+
+ barcodes_t *barcodes;
+ barcodes = calloc(BARCODE_ARRAY, sizeof(barcodes_t));
+
+ umis_t *umis;
+ umis = calloc(BARCODE_ARRAY, sizeof(umis_t));
+
+ /* Get reads, one at a time */
+ while ((l1 = kseq_read(fqrec1)) >= 0) {
+
+ char *last;
+ char *last2;
+
+ char *p = strtok(fqrec1->name.s, ":");
+
+ while (p != NULL) {
+ last2 = last;
+ last = p;
+ p = strtok(NULL, ":");
+ }
+
+ if(mode == 0) {
+ int bc_idx = chk_bc_arr(barcodes, last2);
+ barcodes[bc_idx].cnts += 1;
+ }
+ else if(mode == 1) {
+
+ int umi_idx = chk_umi_arr(umis, last);
+ umis[umi_idx].cnts += 1;
+ }
+ }
+
+ if(mode == 0) {
+
+ int n = 0;
+ while(barcodes[n].bc != 0) {
+ n++;
+ }
+ qsort(barcodes, n, sizeof(barcodes_t), bc_n_cmp);
+
+ /*
+ WISDOM this is to limit the right scope for i
+ */
+
+ int i = 0;
+ while(barcodes[i].bc != 0) {
+ fprintf(stdout, "%s\t%d\n", barcodes[i].bc, barcodes[i].cnts);
+ i++;
+ }
+ }
+ else if(mode == 1) {
+
+ int n = 0;
+ while(umis[n].bc != 0) {
+ n++;
+ }
+ qsort(umis, n, sizeof(umis_t), umi_n_cmp);
+
+ int i = 0;
+ while(umis[i].bc != 0) {
+ fprintf(stdout, "%s\t%d\t%d\n", umis[i].bc, umis[i].len, umis[i].cnts);
+ i++;
+ }
+ }
+
+ gzclose(pe1);
+ kseq_destroy(fqrec1);
+
+ return EXIT_SUCCESS;
+}
+//TODO use struct instead
+//used qsort
+//write a function to sort e.g bubbleSort
+//actually sorting function should be simpler then bubbleSort
+//it should return -1, 0 or 1. OR -1 or 1.
+//
+//barcodes_t barcodes[size];
+//
+//barcodes[i].cnts +=1;
diff --git a/src/sabre.c b/src/sabre.c
index 4e066bb..a41ce4f 100644
--- a/src/sabre.c
+++ b/src/sabre.c
@@ -1,54 +1,407 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+
#include "sabre.h"
+#include "utils.h"
+#include "usage.h"
+#include "demultiplex.h"
+
+FILE* my_fopen(const char* fname, int gz) {
+ static char* compressor = NULL;
-void main_usage (int status) {
+ if (gz) {
- fprintf (stdout, "\nUsage: %s [options]\n\
-\n\
-Command:\n\
-pe\tpaired-end barcode de-multiplexing\n\
-se\tsingle-end barcode de-multiplexing\n\
-\n\
---help, display this help and exit\n\
---version, output version information and exit\n\n", PROGRAM_NAME);
+ if (!compressor) {
+ // Guess whether to use pigz or gzip
+ char tmp[100];
+ FILE* fin = popen("pigz --version 2>&1", "r");
+ char* str = fgets(tmp, 50, fin);
+ int found = strncmp("pigz",tmp,4)==0;
+ compressor = str && found ? "pigz -p 2" : "gzip";
+ }
- exit (status);
+ char command[2048];
+ sprintf(command, "%s > %s", compressor, fname);
+ FILE* ret = popen(command, "w");
+ return ret;
+ } else {
+ return fopen(fname, "w");
+ }
}
-int main (int argc, char *argv[]) {
- int retval=0;
+int main(int argc, char *argv[]) {
+
+ if(argc < 2) {
+ usage(EXIT_SUCCESS);
+ }
+
+ //more about getopts http://www.informit.com/articles/article.aspx?p=175771&seqNum=3
+ static struct option paired_long_options[] = {
+ {"fq1", required_argument, NULL, 'f'},
+ {"fq2", required_argument, NULL, 'r'},
+ {"barcodes", required_argument, NULL, 'b'},
+ {"unassinged1", required_argument, NULL, 'z'},
+ {"unassinged2", required_argument, NULL, 'w'},
+ {"combine", optional_argument, NULL, 'c'},
+ {"umi", optional_argument, NULL, 'u'},
+ {"max-mismatch", required_argument, 0, 'm'},
+ {"min-umi-len", required_argument, 0, 'l'},
+ {"max-5prime-crop", required_argument, 0, 'a'},
+ {"stats", required_argument, NULL, 's'},
+ {"no-comment", no_argument, 0, 'n'},
+ {"threads", optional_argument, 0, 't'},
+ {"gz-out", optional_argument, NULL, 'g'},
+ {"version", optional_argument, NULL, 'v'},
+ {"help", optional_argument, NULL, 'h'},
+ {"story", optional_argument, NULL, 'o'},
+ //{"quiet", no_argument, 0, 'z'},
+ {NULL, 0, NULL, 0}
+ };
+
+ // this is on the stack so no need to malloc
+ // stack gets cleaned when function exits,
+ // because of that no need to free either
+ param_t params;
+ set_default_params(¶ms);
+
+ metrics_t metrics;
+ metrics. num_unknown = 0;
+ metrics.total = 0;
+
+ //clock_t begin = clock();
+ time_t start, end;
+ start = time(NULL);
+
+ char *fq1_fn=NULL;
+ char *fq2_fn=NULL;
+
+ FILE* bc_fd;
+ char *bc_fn=NULL;
+
+ char *unassigned1_fn=NULL;
+ char *unassigned2_fn=NULL;
+ char *umis_2_short_fn=strdup("umis_too_short.txt");
+
+ FILE* log_file;
+ char *log_fn=strdup("stats.txt");
+
+ int optc;
+ extern char *optarg;
+
+ barcode_data_t *curr, *head, *temp;
+
+ int threads=4;
+
+ while (1) {
+ int option_index = 0;
+ //colon after a flag means should have arguments and no colon means just a flag i.e bool, no args after it
+ optc = getopt_long (argc, argv, "dnucvogf:r:b:z:w:m:s:l:z:a:t:", paired_long_options, &option_index);
+
+ if (optc == -1) break;
+
+ switch (optc) {
+
+ case 'f':
+ fq1_fn = (char*) malloc (strlen (optarg) + 1);
+ strcpy (fq1_fn, optarg);
+ break;
+
+ case 'r':
+ fq2_fn = (char*) malloc (strlen (optarg) + 1);
+ strcpy (fq2_fn, optarg);
+ break;
+
+ case 'b':
+ bc_fn = (char*) malloc (strlen (optarg) + 1);
+ strcpy (bc_fn, optarg);
+ break;
+
+ case 'z':
+ unassigned1_fn = (char*) malloc (strlen (optarg) + 1);
+ strcpy (unassigned1_fn, optarg);
+ break;
+
+ case 'w':
+ unassigned2_fn = (char*) malloc (strlen (optarg) + 1);
+ strcpy (unassigned2_fn, optarg);
+ break;
+
+ case 'c':
+ params.combine=1;
+ break;
+
+ case 'u':
+ params.umi=1;
+ break;
+
+ case 'm':
+ params.mismatch = atoi (optarg);
+ break;
+
+ case 'l':
+ params.min_umi_len = atoi (optarg);
+ break;
+
+ case 'a':
+ params.max_5prime_crop = atoi (optarg);
+ break;
+
+ case 'n':
+ params.no_comment = 1;
+ break;
+
+ case 'g':
+ params.gz_out = 1;
+ break;
+
+ case 't':
+ threads = atoi (optarg);
+ break;
+
+ case 's':
+ log_fn = (char*) malloc (strlen (optarg) + 1);
+ strcpy (log_fn, optarg);
+ break;
+
+ case 'v':
+ version(EXIT_SUCCESS);
+ break;
+
+ //NOTE if user requrested the help menu i.e --help then
+ //return success for all other cases below while help menu
+ //is printed it wasn't intended by user (or at least we don't know that)
+ //and therefore exit code - fail
+ case 'h':
+ usage(EXIT_SUCCESS);
+ break;
+
+ case 'o':
+ little_story(EXIT_SUCCESS);
+ break;
+
+ case '*':
+ usage(EXIT_FAILURE);
+ break;
+
+ default:
+ usage(EXIT_FAILURE);
+ break;
+ }
+ }
+
+ // TODO check that what's requre if not run usage
+
+ params.fq1_fd = gzopen(fq1_fn, "r");
+ if(!params.fq1_fd) {
+ fprintf(stderr, "ERROR: Could not open input file R1 '%s'.\n", fq1_fn);
+ exit(EXIT_FAILURE);
+ }
+
+ params.fq2_fd = gzopen(fq2_fn, "r");
+ if(!params.fq2_fd) {
+ fprintf(stderr, "ERROR: Could not open input file R2 '%s'.\n", fq2_fn);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!unassigned1_fn) {
+ unassigned1_fn= params.gz_out ? strdup("unassigned_R1.fq.gz") : strdup("unassigned_R1.fq");
+ }
+ if (!unassigned2_fn) {
+ unassigned2_fn= params.gz_out ? strdup("unassigned_R2.fq.gz") : strdup("unassigned_R2.fq");
+ }
+ params.unassigned1_fd = my_fopen(unassigned1_fn, params.gz_out);
+ params.unassigned2_fd = my_fopen(unassigned2_fn, params.gz_out);
+ params.umis_2_short_fd = fopen(umis_2_short_fn, "a");
+
+ // ? where does this goes?
+ fprintf(params.umis_2_short_fd, "name\tumi\tlen\tmin_len\n");
+
+ //TODO plugin sanity_chk here
+
+ if(params.fq2_fd) {
+ params.paired = 1;
+ }
+
+ fprintf(stderr, "\n\
+ \n Running: %s\
+ \n Command line args:\
+ \n --fq1 %s\
+ \n --fq2 %s\
+ \n --barcodes %s\
+ \n --unassigned_R1 %s\
+ \n --unassigned_R2 %s\
+ \n --combine %d\
+ \n --umi %d\
+ \n --max-mismatch %d\
+ \n --min-umi-len %d\
+ \n --max-5prime-crop %d\
+ \n --no-comment %d\
+ \n --stats %s\
+ \n --threads %d\
+ \n\
+ \n In Progess...\
+ \n\
+ \n", PROGRAM_NAME,\
+ fq1_fn, fq2_fn,\
+ bc_fn,\
+ unassigned1_fn, unassigned2_fn,\
+ params.combine, params.umi,\
+ params.mismatch, params.min_umi_len,
+ params.max_5prime_crop, params.no_comment,\
+ log_fn, threads);
+
+ char *bcout_fn1 = NULL;
+ char *bcout_fn2 = NULL;
+ /* Creating linked list of barcode data */
+ // https://www.hackerearth.com/practice/data-structures/linked-list/singly-linked-list/tutorial/
+ // where each node is represents one barcode from the barcode file
+ bc_fd = fopen(bc_fn, "r");
+ if (!bc_fd) {
+ fprintf(stderr, "ERROR: Unable to barcode file\n");
+ exit(EXIT_FAILURE);
+ }
+
+ head = NULL;
+ curr = NULL;
+
+ char line_buff[1024];
+ int max_items = 6;
+ while(fgets(line_buff, 1024, bc_fd)) {
+ curr = (barcode_data_t*) calloc(1,sizeof(barcode_data_t));
+
+ char *p = strtok(line_buff, "\t");
+ char *s_name = strdup(p);
+
+ p = strtok(NULL, "\t");
+ curr->bc_grp = strdup(p);
+
+ bcout_fn1 = (char *) malloc(MAX_FILENAME_LENGTH*2);
+ bcout_fn1[0] = '\0';
+ get_bc_fn(&bcout_fn1, s_name, curr->bc_grp, 1, params.gz_out);
+ curr->bcfile1 = my_fopen(_mkdir(bcout_fn1), params.gz_out);
+
+ if(params.paired > 0 && params.combine < 0) {
+ bcout_fn2 = (char *) malloc(MAX_FILENAME_LENGTH*2);
+ bcout_fn2[0] = '\0';
+ get_bc_fn(&bcout_fn2, s_name, curr->bc_grp, 2, params.gz_out);
+ curr->bcfile2 = my_fopen(_mkdir(bcout_fn2), params.gz_out);
+ }
+
+ //TODO for hardcode max limit of items in the barcodes file to 6
+ curr->bc = calloc(max_items, sizeof(void*));
+
+ int i=0;
+ while(i <= max_items && (p = strtok(NULL, "\t\n"))) {
+ // remove the token, new line char
+ curr->bc[i] = strdup(p);
+ fprintf(stdout, " BC %s ", curr->bc[i]);
+ i++;
+ }
+ fprintf(stdout, "\n");
+
+ curr->num_records = 0;
+ curr->next = head;
+ head = curr;
+ }
+
+ free(bcout_fn1);
+ free(bcout_fn2);
+ free(bc_fn);
+ free(fq1_fn);
+ free(fq2_fn);
+ free(unassigned1_fn);
+ free(unassigned2_fn);
+ //free(umis_2_short_fn);
+
+ // Threading
+ pthread_t tid[threads];
+ pthread_mutex_t in_lock;
+ pthread_mutex_t out_lock;
+ pthread_cond_t cv;
+ int line_num = 0;
+ int out_line_num = 0;
+
+ pthread_mutex_init(&in_lock, NULL);
+ pthread_mutex_init(&out_lock, NULL);
+ pthread_cond_init(&cv, NULL);
+
+ thread_data_t thread_data[threads];
+
+ for(int i=0; i < threads; i++) {
+
+ thread_data[i].params = ¶ms;
+ thread_data[i].curr = curr;
+ thread_data[i].metrics = &metrics;
+ thread_data[i].id = i;
+ thread_data[i].in_lock = &in_lock;
+ thread_data[i].out_lock = &out_lock;
+ thread_data[i].line_num = &line_num;
+ thread_data[i].out_line_num = &out_line_num;
+ thread_data[i].cv = &cv;
+
+ pthread_create(&(tid[i]), NULL, &demult_runner, thread_data+i);
+ }
+
+ for(int i=0; i < threads; i++) {
+ pthread_join(tid[i], NULL);
+ }
+ printf("Threads all done\n");
+
+ pthread_mutex_destroy(&in_lock);
+ pthread_mutex_destroy(&out_lock);
+
+ //if (!log_fn) { is this better?
+ if (log_fn == NULL) {
+ log_file = stdout;
+ }
+ else {
+ log_file = fopen(log_fn, "w");
+ }
+
+ fprintf (log_file, "Barcode\tN_records\tN_pairs\tP_pairs\n");
+ curr = head;
+ int total_pairs = metrics.total/2;
+
+ while(curr) {
+
+ int n_pairs = curr->num_records/2;
+ float percent_pairs = (float) n_pairs/total_pairs;
+
+ fprintf(log_file,"%s\t%d\t%d\t%.2f\n", curr->bc_grp, curr->num_records, n_pairs, percent_pairs);
+
+ curr = curr->next;
+ }
+
+ int unknown_pairs = metrics.num_unknown/2;
+ float percent_unknown = (float) unknown_pairs/total_pairs;
+ float tot_chk = (float) total_pairs/total_pairs;
- if (argc < 2 || (strcmp (argv[1],"pe") != 0 && strcmp (argv[1],"se") != 0 && strcmp (argv[1],"--version") != 0 && strcmp (argv[1],"--help") != 0)) {
- main_usage (EXIT_FAILURE);
- }
+ fprintf(log_file, "unassigned\t%d\t%d\t%.2f\n", metrics.num_unknown, unknown_pairs, percent_unknown);
+ fprintf(log_file, "total\t%d\t%d\t%.2f\n", metrics.total, total_pairs, tot_chk);
- if (strcmp (argv[1],"--version") == 0) {
- fprintf(stdout, "%s version %0.3f\nCopyright (c) 2011 The Regents of University of California, Davis Campus.\n%s is free software and comes with ABSOLUTELY NO WARRANTY.\nDistributed under the MIT License.\n\nWritten by %s\n", PROGRAM_NAME, VERSION, PROGRAM_NAME, AUTHORS);
+ end = time(NULL);
+ fprintf(stderr, "\n All done :) \
+ \n It took %.2f minutes\n",
+ difftime(end, start)/60);
- exit (EXIT_SUCCESS);
- }
+ fclose(bc_fd);
+ fclose(log_file);
+ params_destroy(¶ms);
- else if (strcmp (argv[1],"--help") == 0) {
- main_usage (EXIT_SUCCESS);
- }
+ free(log_fn);
- else if (strcmp (argv[1],"pe") == 0) {
- retval = paired_main (argc, argv);
- return (retval);
- }
+ curr = head;
+ while (curr) {
+ fclose(curr->bcfile1);
+ if (curr->bcfile2)
+ fclose(curr->bcfile2);
- else if (strcmp (argv[1],"se") == 0) {
- retval = single_main (argc, argv);
- return (retval);
- }
+ free (curr->bc_grp);
+ free (curr->bc);
+ temp = curr;
+ curr = curr->next;
+ free (temp);
+ }
- return 0;
+ // good read :)
+ little_story(EXIT_SUCCESS);
}
diff --git a/src/sabre.h b/src/sabre.h
index 693af37..d6a8f33 100644
--- a/src/sabre.h
+++ b/src/sabre.h
@@ -1,67 +1,95 @@
-#ifndef SABRE_H
-#define SABRE_H
+#ifndef _SABRE_H
+#define _SABRE_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
#ifndef PROGRAM_NAME
#define PROGRAM_NAME "sabre"
#endif
#ifndef AUTHORS
-#define AUTHORS "Nikhil Joshi, UC Davis Bioinformatics Core\n"
-#endif
-
-#ifndef VERSION
-#define VERSION 0.0
+#define AUTHORS "\n\
+ \n Nikhil Joshi, UC Davis Bioinformatics Core\
+ \n Kirill Tsyganov, Monash Bioinformatics Platform\
+ \n"
#endif
-/* Options drawn from GNU's coreutils/src/system.h */
-/* These options are defined so as to avoid conflicting with option
-values used by commands */
-enum {
- GETOPT_HELP_CHAR = (CHAR_MIN - 2),
- GETOPT_VERSION_CHAR = (CHAR_MIN - 3)
-};
-#define GETOPT_HELP_OPTION_DECL \
-"help", no_argument, NULL, GETOPT_HELP_CHAR
-#define GETOPT_VERSION_OPTION_DECL \
-"version", no_argument, NULL, GETOPT_VERSION_CHAR
-#define case_GETOPT_HELP_CHAR(Usage_call) \
-case GETOPT_HELP_CHAR: \
-Usage_call(EXIT_SUCCESS); \
-break;
-#define case_GETOPT_VERSION_CHAR(Program_name, Version, Authors) \
-case GETOPT_VERSION_CHAR: \
-fprintf(stdout, "%s version %0.3f\nCopyright (c) 2011 The Regents " \
-"of University of California, Davis Campus.\n" \
-"%s is free software and comes with ABSOLUTELY NO WARRANTY.\n"\
-"Distributed under the MIT License.\n\nWritten by %s\n", \
-Program_name, Version, Program_name, Authors); \
-exit(EXIT_SUCCESS); \
-break;
-/* end code drawn from system.h */
+//https://semver.org/
#define MAX_BARCODE_LENGTH 100
#define MAX_FILENAME_LENGTH 200
-
-typedef struct listel {
- char* bc;
- int num_records;
- FILE* bcfile;
- struct listel *next;
-} barcode_data;
+//NOTE more info on struc and typedef
+//https://stackoverflow.com/questions/1675351/typedef-struct-vs-struct-definitions
+typedef struct actl_bc_cnt {
+ char *bc;
+ int cnt;
+ struct actl_bc_cnt *next;
+} actl_bc_cnt;
typedef struct listel_p {
- char* bc;
+ //TODO this is future implementation
+ //char **bc;
+ //typedef struct barcode_t *b
+ // this is a pointer to an array of pointers
+ char **bc;
+ char *bc_grp;
int num_records;
FILE* bcfile1;
FILE* bcfile2;
+ struct actl_bc_cnt *actl_bc_cnt;
struct listel_p *next;
-} barcode_data_paired;
+} barcode_data_t;
+
+//TODO this is for future implementation
+//typdef strcut {
+// char *bc;
+// barcode_data_t *bacrcode_data;
+//} barcode_t
+
+typedef struct {
+ gzFile fq1_fd;
+ gzFile fq2_fd;
+ FILE* unassigned1_fd;
+ FILE* unassigned2_fd;
+ FILE* umis_2_short_fd;
+ int mismatch;
+ int combine;
+ int umi;
+ int paired;
+ int min_umi_len;
+ int max_5prime_crop;
+ int no_comment;
+ int gz_out;
+} param_t;
+
+typedef struct {
+ int num_unknown;
+ int total;
+} metrics_t;
+typedef struct {
+ int id;
+ const param_t* params;
+ barcode_data_t* curr;
+ metrics_t* metrics;
+ pthread_mutex_t *in_lock, *out_lock;
+ pthread_cond_t *cv;
+ volatile int *line_num; // Pointer to line number. Access to this protected by in_lock
+ volatile int *out_line_num; // Pointer to line number output. Protected by out_lock
+} thread_data_t;
-/* Function Prototypes */
-int single_main (int argc, char *argv[]);
-int paired_main (int argc, char *argv[]);
-int strncmp_with_mismatch (const char *s1, const char *s2, register size_t n, register size_t mismatch);
-#endif /*SABRE_H*/
+#endif /*_SABRE_H*/
diff --git a/src/sanity_check.c b/src/sanity_check.c
new file mode 100644
index 0000000..c6ffb1c
--- /dev/null
+++ b/src/sanity_check.c
@@ -0,0 +1,50 @@
+
+void* sanity_chk(void *arg) {
+ if (!fq1 || !fq2 || !unknownfn1 || !unknownfn2 || !barfn) {
+ paired_usage (EXIT_FAILURE);
+ }
+
+ if (!strcmp (fq1, fq2) || !strcmp (fq1, unknownfn1) || !strcmp (fq1, unknownfn2) ||
+ !strcmp (fq1, barfn) || !strcmp (fq2, unknownfn1) || !strcmp (fq2, unknownfn2) ||
+ !strcmp (fq2, barfn) || !strcmp (unknownfn1, unknownfn2) || !strcmp (unknownfn1, barfn) ||
+ !strcmp (unknownfn2, barfn)) {
+
+ fprintf (stderr, "ERROR: Duplicate input and/or output file names.\n");
+ return EXIT_FAILURE;
+ }
+
+ pe1 = gzopen (fq1, "r");
+ if (!pe1) {
+ fprintf (stderr, "ERROR: Could not open input file 1 '%s'.\n", fq1);
+ return EXIT_FAILURE;
+ }
+
+ pe2 = gzopen (fq2, "r");
+ if (!pe2) {
+ fprintf (stderr, "ERROR: Could not open input file 2 '%s'.\n", fq2);
+ return EXIT_FAILURE;
+ }
+
+ unknownfile1 = gzopen(unknownfn1, "wb");
+ if (!unknownfile1) {
+ fprintf (stderr, "ERROR: Could not open unknown output file 1 '%s'.\n", unknownfn1);
+ return EXIT_FAILURE;
+ }
+
+ unknownfile2 = gzopen(unknownfn2, "wb");
+ if (!unknownfile2) {
+ fprintf (stderr, "Could not open unknown output file 2 '%s'.\n", unknownfn2);
+ return EXIT_FAILURE;
+ }
+
+ barfile = fopen (barfn, "r");
+ if (!barfile) {
+ fprintf (stderr, "Could not open barcode file '%s'.\n", barfn);
+ return EXIT_FAILURE;
+ }
+
+ if(threads < 0) {
+ fprintf(stderr, "WARNING: Negative number of threads detected %d, setting threads to 1\n", threads);
+ threads = 1;
+ }
+}
diff --git a/src/usage.c b/src/usage.c
new file mode 100644
index 0000000..550326b
--- /dev/null
+++ b/src/usage.c
@@ -0,0 +1,80 @@
+
+#include "usage.h"
+
+void usage(int status) {
+
+ fprintf(stderr, "\n Usage: %s [OPTIONS] -f -r -b \
+ \n\
+ \n\
+ \n Options:\
+ \n\
+ \n Required:\
+ \n\
+ \n -f, --fq1 FILE Input FASTQ R1 read\
+ \n -r, --fq2 FILE Input FASTQ R2 reads\
+ \n -b, --barcodes FILE Barcodes files, one barcode per line, e.g BC\\tPREFIX\
+ \n -w, --unassigned CHAR Unassigned prefix\
+ \n\
+ \n Other:\
+ \n\
+ \n -c, --combine Combine R1 and R2 [NULL]\
+ \n -u, --umi Indicates that umi present in the R1 read [NULL]\
+ \n -m, --max-mismatch INT Maximum number of mismatches allowed in a barcode [0]\
+ \n -l, --min-umi-len INT Minimum UMI length to keep [0]\
+ \n -a, --max-5prime-crop INT Maximum number of possible bases cropped from 5prime [0]\
+ \n -n, --no-comment Drop extra comments from FASTQ header [NULL]\
+ \n -g, --gz-out gzip the output files (requires pigz or gzip in PATH)\
+ \n -s, --stats FILE Write stats to file instead of STDOUT [STDOUT]\
+ \n\
+ \n Extras:\
+ \n\
+ \n -t, --threads INT specify number of threads to use [4]\
+ \n -v, --version get current version\
+ \n -h, --help get help menu, exit status is zero\
+ \n -o, --story little story about sabre tool\
+ \n\
+ \n",
+ PROGRAM_NAME);
+
+ exit(status);
+}
+
+void little_story(int status) {
+
+ fprintf(stdout, "\n\
+ \n Little story:\
+ \n\
+ \n Sabre is a heavy cavalry sword with a curved blade and a single cutting edge\
+ \n Not sure though if the meaning was intended by original author...\
+ \n\
+ \n Later on I was pointed out to me that yes of course it was intended\
+ \n since we are cutting off adaptors..\
+ \n\
+ \n to be continued...\
+ \n\
+ \n");
+
+ exit(status);
+}
+
+void version(int status) {
+
+ fprintf(stdout, "\n\
+ \n %s\
+ \n\
+ \n version: %s\
+ \n\
+ \n Copyright (c) 2011 The Regents of University of California, Davis Campus.\
+ \n %s is free software and comes with ABSOLUTELY NO WARRANTY.\
+ \n Distributed under the MIT License.\
+ \n\
+ \n Written by: %s\
+ \n\
+ \n",
+ PROGRAM_NAME,
+ VERSION,
+ PROGRAM_NAME,
+ AUTHORS);
+
+ exit(status);
+}
diff --git a/src/usage.h b/src/usage.h
new file mode 100644
index 0000000..4ab8f3f
--- /dev/null
+++ b/src/usage.h
@@ -0,0 +1,10 @@
+#ifndef _USAGE_H
+#define _USAGE_H
+
+#include "sabre.h"
+
+void usage(int status);
+void little_story(int status);
+void version(int status);
+
+#endif /*_USAGE_H*/
diff --git a/src/utils.c b/src/utils.c
new file mode 100644
index 0000000..d5f75c7
--- /dev/null
+++ b/src/utils.c
@@ -0,0 +1,251 @@
+#include "utils.h"
+
+// https://stackoverflow.com/questions/2336242/recursive-mkdir-system-call-on-unix/11425692
+// https://stackoverflow.com/questions/7430248/creating-a-new-directory-in-c
+const char * _mkdir(const char *file_path) {
+
+ if(!file_path) {
+ fprintf (stderr,
+ "ERROR: This shouldn't happend, file path == %s\n",
+ file_path);
+ exit(EXIT_FAILURE);
+ }
+
+ // return straigth away if a file_path is not nested file path
+ if(strstr(file_path, "/") == NULL) {
+ return file_path;
+ }
+ //TODO check if directory already exists or not
+ //struct stat st = {0};
+ //
+ //if (stat(dir, &st) == -1) {
+ // mkdir(tmp, S_IRWXU);
+ //}
+
+ //char tmp[PATH_MAX]; // can't get this to work..
+ char tmp[256];
+ char *p = NULL;
+ char *dirc = strdup(file_path);
+ size_t len;
+
+ const char *dir = dirname(dirc);
+ snprintf(tmp, sizeof(tmp),"%s", dir);
+ len = strlen(tmp);
+
+ if(tmp[len - 1] == '/') {
+ tmp[len - 1] = 0;
+ }
+
+ for(p = tmp + 1; *p; p++) {
+ if(*p == '/') {
+ *p = 0;
+ mkdir(tmp, S_IRWXU);
+ *p = '/';
+ }
+ }
+
+ mkdir(tmp, S_IRWXU);
+ free(dirc);
+
+ return file_path;
+}
+
+//NOTE retuns zero on success
+//strcmp can be used for sorting, returns pos, zero, neg
+//BUT this new implementation can't be used as such just FYI
+match_ret_t chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t max_mismatch, int max_5prime_crop) {
+ int orig_read_len = strlen(orig_read);
+ int orig_bc_len = strlen(orig_bc);
+ int n_crop = 0;
+ match_ret_t ret = { -1,-1 };
+ if(orig_bc_len > orig_read_len) {
+ fprintf (stderr,
+ "WARNING: Length of the barcode %d is greater than length of the reads %d.",
+ orig_bc_len, orig_read_len);
+ return ret;
+ }
+
+ while(n_crop <= max_5prime_crop) {
+
+ if(n_crop > orig_read_len) {
+ return ret;
+ }
+
+ int mismatch = 0;
+ char u1, u2;
+ const char *bc = orig_bc;
+ const char *read = orig_read+n_crop;
+ int bc_len = orig_bc_len;
+
+ while (bc_len-- > 0) {
+ u1 = *bc++;
+ u2 = *read++;
+
+ if (u1 != u2) {
+ mismatch++;
+ if (mismatch > max_mismatch) {
+ break;
+ }
+ }
+
+ if (u1 == '\0' || u2 == '\0') {
+ ret.cropped = n_crop;
+ ret.mismatches = mismatch;
+ return ret;
+ }
+ }
+
+ if(mismatch <= max_mismatch) {
+ ret.cropped = n_crop;
+ ret.mismatches = mismatch;
+ return ret;
+ }
+
+ n_crop++;
+ }
+ //this is in the case of error
+ return ret;
+}
+
+// https://stackoverflow.com/questions/21880730/c-what-is-the-best-and-fastest-way-to-concatenate-strings
+//TODO this is a fastq mystrcat function, that returns a pointer to the end of the string
+void get_fqread(char *fqread, fq_rec_t *fq_rec, const char *barcode, char *umi_idx, int no_comment, int n_crop) {
+
+ fqread[0] = '\0';
+
+ if(n_crop < 0) {
+ fprintf(stderr,
+ "ERROR: n_crop set to %d. This can't happend\n",
+ n_crop);
+ exit(EXIT_FAILURE);
+ }
+
+ //@READNAME:BACRCODE:UMI
+ //1st line
+ strcat(fqread, fq_rec->name);
+ //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI
+ if(barcode) {
+ strcat(fqread, ":");
+ strcat(fqread, barcode);
+ }
+ else if(!barcode) {
+ barcode = "";
+ }
+
+ if(umi_idx) {
+ strcat(fqread, ":");
+ strcat(fqread, umi_idx);
+ }
+
+ if(fq_rec->comment && no_comment == -1) {
+ strcat(fqread, " ");
+ strcat(fqread, fq_rec->comment);
+ }
+ strcat(fqread, "\n");
+
+ //2nd line
+ strcat(fqread, (fq_rec->seq)+strlen(barcode)+n_crop);
+ strcat(fqread, "\n");
+
+ //3rd line
+ strcat(fqread, "+");
+ strcat(fqread, fq_rec->name);
+ if(fq_rec->comment && no_comment == -1) {
+ strcat(fqread, " ");
+ strcat(fqread, fq_rec->comment);
+ }
+ strcat(fqread, "\n");
+
+ //4th line
+ strcat(fqread, (fq_rec->qual)+strlen(barcode)+n_crop);
+ strcat(fqread, "\n");
+}
+
+void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, const char *barcode, char *umi_idx, int no_comment, int n_crop) {
+ fqread[0] = '\0';
+ //@READNAME:BACRCODE:UMI
+ //1st line
+ strcat(fqread, fq_rec1->name);
+ //TODO later can have conditional here depending on the the structure and/or BARCODE/UMI
+ if(barcode) {
+ strcat(fqread, ":");
+ strcat(fqread, barcode);
+ }
+
+ if(umi_idx) {
+ strcat(fqread, ":");
+ strcat(fqread, umi_idx);
+ }
+
+ if(fq_rec1->comment && no_comment == -1) {
+ strcat(fqread, " ");
+ strcat(fqread, fq_rec1->comment);
+ }
+ strcat(fqread, "\n");
+
+ //2nd line
+ strcat(fqread, fq_rec2->seq);
+ strcat(fqread, "\n");
+
+ //3rd line
+ strcat(fqread, "+");
+ strcat(fqread, "");
+ //strcat(fqread, fq_rec2->name);
+ //if(fq_rec2->comment && no_comment == -1) {
+ // strcat(fqread, " ");
+ // strcat(fqread, fq_rec2->comment);
+ //}
+ strcat(fqread, "\n");
+
+ //4th line
+ strcat(fqread, (fq_rec2->qual));
+ strcat(fqread, "\n");
+}
+
+void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type, int gz) {
+
+ if(strlen(s_name) > MAX_FILENAME_LENGTH) {
+ fprintf (stderr,
+ "ERROR: Too many characters in your sample name; %s:%zd \n",
+ s_name, strlen(s_name));
+ exit(EXIT_FAILURE);
+ }
+
+ strcat(*bcout_fn, s_name);
+ strcat(*bcout_fn, "_");
+ strcat(*bcout_fn, barcode);
+
+ if(read_type == 1) {
+ strcat(*bcout_fn, "_R1.fastq");
+ }
+ else if(read_type == 2) {
+ strcat(*bcout_fn, "_R2.fastq");
+ }
+ else {
+ fprintf (stderr,
+ "ERROR: This shouldn't happen, wrong read type was passed through -> %d\n",
+ read_type);
+ exit(EXIT_FAILURE);
+ }
+ if (gz) {
+ strcat(*bcout_fn, ".gz");
+ }
+}
+
+void set_default_params(param_t *params) {
+ params->mismatch = 0;
+ params->combine = -1;
+ params->umi = -1;
+ params->paired = -1;
+ params->min_umi_len = 0;
+ params->max_5prime_crop = 0;
+ params->no_comment = -1;
+}
+
+void params_destroy(param_t *params) {
+ gzclose(params->fq1_fd);
+ gzclose(params->fq2_fd);
+ fclose(params->unassigned1_fd);
+ fclose(params->unassigned2_fd);
+ fclose(params->umis_2_short_fd);
+}
diff --git a/src/utils.h b/src/utils.h
new file mode 100644
index 0000000..ca32703
--- /dev/null
+++ b/src/utils.h
@@ -0,0 +1,37 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+#include "sabre.h"
+#include "fastq.h"
+
+typedef struct barcodes_t {
+ char *bc;
+ int cnts;
+} barcodes_t;
+
+typedef struct umis_t {
+ char *bc;
+ int len;
+ int cnts;
+} umis_t;
+
+typedef struct {
+ int mismatches;
+ int cropped;
+} match_ret_t;
+
+//This is needed if compilling with -std=c99, read below for more
+//https://stackoverflow.com/questions/26284110/strdup-confused-about-warnings-implicit-declaration-makes-pointer-with
+// char *strdup(const char*);
+// char *strndup(const char *s, size_t n);
+
+const char * _mkdir (const char *dir);
+match_ret_t chk_bc_mtch(const char *orig_bc, const char *orig_read, size_t max_mismatch, int max_5prime_crop);
+void get_fqread(char *fqread, fq_rec_t *fq_rec, const char *barcode, char *umi_idx, int no_comment, int n_crop);
+void get_merged_fqread(char *fqread, fq_rec_t *fq_rec1, fq_rec_t *fq_rec2, const char *barcode, char *umi_idx, int no_comment, int n_crop);
+void get_bc_fn(char **bcout_fn, char *s_name, char *barcode, int read_type, int gz);
+
+void set_default_params(param_t *params);
+void params_destroy(param_t *params);
+
+#endif /*UTILS_H*/