From 8125d5b05449b81c2979a10359e26f310d79d799 Mon Sep 17 00:00:00 2001 From: John Didion Date: Tue, 21 Oct 2014 16:37:53 -0400 Subject: [PATCH 1/2] Orphan quality and length thresholds Added two new options, -Q and -L, to specify quality and length thresholds, respectively, for orphaned reads, i.e. read pairs where one mate but not both fails the initial quality check. --- src/trim_paired.c | 130 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 99 insertions(+), 31 deletions(-) diff --git a/src/trim_paired.c b/src/trim_paired.c index f63b877..13f4597 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -14,7 +14,10 @@ __KS_GETUNTIL(gzread, BUFFER_SIZE) __KSEQ_READ int paired_qual_threshold = 20; +int orphan_qual_threshold = -1; int paired_length_threshold = 20; +int orphan_length_threshold = -1; +int recheck_orphans = 1; static struct option paired_long_options[] = { {"qual-type", required_argument, 0, 't'}, @@ -26,7 +29,9 @@ static struct option paired_long_options[] = { {"output-single", optional_argument, 0, 's'}, {"output-combo", optional_argument, 0, 'm'}, {"qual-threshold", optional_argument, 0, 'q'}, + {"orphan-qual-threshold", optional_argument, 0, 'Q'}, {"length-threshold", optional_argument, 0, 'l'}, + {"orphan-length-threshold", optional_argument, 0, 'L'}, {"no-fiveprime", optional_argument, 0, 'x'}, {"truncate-n", optional_argument, 0, 'n'}, {"gzip-output", optional_argument, 0, 'g'}, @@ -62,7 +67,9 @@ Global options\n\ -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n"); fprintf(stderr, "-s, --output-single, Output trimmed singles fastq file\n\ -q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ +-Q, --orphan-qual-threshold, Quality threshold to use for retaining a single mate if one (but not both) mate fails the paired-end quality threshold. Defaults to the value specified for --qual-threshold.\n\ -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ +-L, --orphan-length-threshold, Length threshold to use for retaining a single mate if one (but not both) mate fails the paired-end length threshold. Defaults to the value specified for --length-threshold.\n\ -x, --no-fiveprime, Don't do five prime trimming.\n\ -n, --truncate-n, Truncate sequences at position of first N.\n"); @@ -118,10 +125,10 @@ int paired_main(int argc, char *argv[]) { int combo_all=0; int combo_s=0; int total=0; - + while (1) { int option_index = 0; - optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:xng", paired_long_options, &option_index); + optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:Q:L:xng", paired_long_options, &option_index); if (optc == -1) break; @@ -189,7 +196,15 @@ int paired_main(int argc, char *argv[]) { return EXIT_FAILURE; } break; - + + case 'Q': + orphan_qual_threshold = atoi(optarg); + if (orphan_qual_threshold < 0) { + fprintf(stderr, "Orphan quality threshold must be >= 0\n"); + return EXIT_FAILURE; + } + break; + case 'l': paired_length_threshold = atoi(optarg); if (paired_length_threshold < 0) { @@ -198,6 +213,14 @@ int paired_main(int argc, char *argv[]) { } break; + case 'L': + orphan_length_threshold = atoi(optarg); + if (orphan_length_threshold < 0) { + fprintf(stderr, "Orphan length threshold must be >= 0\n"); + return EXIT_FAILURE; + } + break; + case 'x': no_fiveprime = 1; break; @@ -241,6 +264,20 @@ int paired_main(int argc, char *argv[]) { paired_usage(EXIT_FAILURE, "****Error: Must have either -f OR -c argument."); } + /* set orphan length and quality thresholds to paired thresholds if unset */ + if ((orphan_qual_threshold == -1 || orphan_qual_threshold == paired_qual_threshold) && + (orphan_length_threshold == -1 || orphan_length_threshold == paired_length_threshold)) { + recheck_orphans = 0; + } + else { + if (orphan_qual_threshold == -1) { + orphan_qual_threshold = paired_qual_threshold; + } + if (orphan_length_threshold == -1) { + orphan_length_threshold = paired_length_threshold; + } + } + if (infnc) { /* using combined input file */ if (infn1 || infn2 || outfn1 || outfn2) { @@ -411,45 +448,76 @@ int paired_main(int argc, char *argv[]) { /* if only one sequence passed filter, then put its record in singles and discard the other */ /* or put an "N" record in if that option was chosen. */ else if (p1cut->three_prime_cut >= 0 && p2cut->three_prime_cut < 0) { - if (!gzip_output) { - if (combo_all) { - print_record (combo, fqrec1, p1cut); - print_record_N (combo, fqrec2, qualtype); - } else { - print_record (single, fqrec1, p1cut); + int write_orphan = 1; + + if (recheck_orphans) { + p1cut = sliding_window(fqrec1, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, debug); + if (p1cut->three_prime_cut < 0) { + if (debug) printf("p1 orhpan discarded"); + write_orphan = 0; } - } else { - if (combo_all) { - print_record_gzip (combo_gzip, fqrec1, p1cut); - print_record_N_gzip (combo_gzip, fqrec2, qualtype); + else if (debug) printf("p1 orhpan retained"); + } + + if (write_orphan) { + if (!gzip_output) { + if (combo_all) { + print_record (combo, fqrec1, p1cut); + print_record_N (combo, fqrec2, qualtype); + } else { + print_record (single, fqrec1, p1cut); + } } else { - print_record_gzip (single_gzip, fqrec1, p1cut); + if (combo_all) { + print_record_gzip (combo_gzip, fqrec1, p1cut); + print_record_N_gzip (combo_gzip, fqrec2, qualtype); + } else { + print_record_gzip (single_gzip, fqrec1, p1cut); + } } - } - kept_s1++; - discard_s2++; + kept_s1++; + discard_s2++; + } + else { + discard_s2 += 2; + } } else if (p1cut->three_prime_cut < 0 && p2cut->three_prime_cut >= 0) { - if (!gzip_output) { - if (combo_all) { - print_record_N (combo, fqrec1, qualtype); - print_record (combo, fqrec2, p2cut); - } else { - print_record (single, fqrec2, p2cut); + int write_orphan = 1; + + if (recheck_orphans) { + p2cut = sliding_window(fqrec2, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, debug); + if (p2cut->three_prime_cut < 0) { + if (debug) printf("p2 orhpan discarded"); + write_orphan = 0; } - } else { - if (combo_all) { - print_record_N_gzip (combo_gzip, fqrec1, qualtype); - print_record_gzip (combo_gzip, fqrec2, p2cut); + else if (debug) printf("p2 orhpan retained"); + } + + if (write_orphan) { + if (!gzip_output) { + if (combo_all) { + print_record_N (combo, fqrec1, qualtype); + print_record (combo, fqrec2, p2cut); + } else { + print_record (single, fqrec2, p2cut); + } } else { - print_record_gzip (single_gzip, fqrec2, p2cut); + if (combo_all) { + print_record_N_gzip (combo_gzip, fqrec1, qualtype); + print_record_gzip (combo_gzip, fqrec2, p2cut); + } else { + print_record_gzip (single_gzip, fqrec2, p2cut); + } } + kept_s2++; + discard_s1++; + } + else { + discard_s2 += 2; } - - kept_s2++; - discard_s1++; } else { From 482e83988a6ee18824b4c7ea391bdc644197dea5 Mon Sep 17 00:00:00 2001 From: "Didion, John (NIH/NHGRI) [F]" Date: Mon, 15 Jun 2015 11:29:51 -0400 Subject: [PATCH 2/2] add parameters for truncating reads --- foo | Bin 0 -> 31592 bytes src/sickle.h | 2 +- src/sliding.c | 25 +++++++++++++++++-- src/trim_paired.c | 62 +++++++++++++++++++++++++++++++++++----------- src/trim_single.c | 9 +++++-- 5 files changed, 78 insertions(+), 20 deletions(-) create mode 100755 foo diff --git a/foo b/foo new file mode 100755 index 0000000000000000000000000000000000000000..736b20793b8b063b5561ad4b1c68acf3a7fd12a3 GIT binary patch literal 31592 zcmeHw4}4VBmH(TNXc+!XP@r^+`lO&i6HHJ9!oLh;@QqFY`L9)nVKO08l9_S-Kv06w z$!zm{#-?5EkM62VceQ1A+x?|2wy{5h?edG8CX^Vi>`F+p5_q};D zNwn^M?C;Nq_u;*F&$;KGd+xdCo_p@SZ^AFnyz}w7l4KhpN$o|FB#pyeb&e#ZBpE45 z`XX)xx7WMEd5!Bv*UIID&A)8<;E)jUX+;HI?`qfj)%hf>{Dyp-Bw)xUmEfi~U0!cI z&>H7y4(IceN1-vuIC3?oA-Fh4PC&aA^m=3Q#KxG3DwOYplk@8q6b5V*C>8vmGInu( z-ljlvBoGCrkYC4Q&W{O+e1b{u<~2w@e>fD&HMo#p_Y%&pPcR@jue~N?p7478v3N8X z+9dFW{8rR)evkbXCm`6$FHcsGc)f|x)?lc?8w@pu8JyoAqrIG8MAS!cArFzoUS4mY z$=ew9wFEe}P`-A-uT>Nx7`8j?MRwN-gYbImoU757=CwieLz%z;W|x4;-iWW!cCR{aIRMe%U8C6%O{Hh1Y6rl{6rTq3a_`> z_et_qp*)R?K1Pi(4%o_%+IykMqnFp);EVfmh(i4x-@y5G3k4Bu=qgt#m&*AOk9*m5 zt~$5j17H{=mt-goFkMX;w{($QET2J=G)0oyt0)aRDpH2`27<*E0o;TFFWo9hJ$RF? zk>1Oo?74Wi!f=iQN$gkPrmIxkn{2EcTE2{*~{>J6S>zk2TM*w3nutla#)pH)l+j@lvF?d-nHfC-1I9POoY8x}Iq zd{8}AxG9g?SW-k0IYr>s;epy=sM1{dc$e zqWYoYKd5w#u2s4wY=={=(;DtBuABRtI<5Kc((PZYo13{~{NqG<-m!1~q&oFok9tP^5B*CYXR}WAfXns=C5s+%4a|MX zrT#AQqDy_ZPHTDC?SG2coNziWd+LN^oU1$8Hc8T4&pFmXp388LbM1C?B%!P7J`nb@ zXM3HRxTn!^QJo|a9m*a!;kd=sP1HMkY)aRHZ_XyON$japPb%8Yz3B6&uaYG7rMi*F z9jR}i4JRCBuAPmJ8Q@+%5WN9KeuN_Wzkte>dS}O46ut-2_9Dh4ftuE<;6%mT1@Kpt4@F0%{l$ zLa6$kB-w6AwqILw&(^V|F;CZ3w)%Nnl`R(m>P?=qIXeCx#gc7%BuDCRkur#caHZ<6 z)T5Bjt^S+-z@MQ0=?9RbxaS^>T|gv@(Lq&=qE3BZzl|ttjKZU)NlGQVpSoMQ+%*96 za{Tp9bYc3!f-a~&#PmVRe+J60?vV{=Bd~vhGxpuqfVimw0<)Yaph#rC3%s>RD|9lH5n@UBUDtU{eZ?-2Da1kaHiC$ zzK|J#2lP;<>LSylJwoGY$3oixjV}LxxYcI{sX#x&RIq&_RNzrx&+eaB|M2z~L{Z}T zYH}})Y$;yp*1Wa*mN64Lrk|;XDWv)m=j{s#jLtFflIbsD42m9#f3Z$|4f<@KJ}@su zeqnnl4Z6e?nYq1yvarT>LO35MC=Ta$N16Uja58gw4oo=)3)Z(H3yo}D%GloAo*8HH z2hQv>K%@G(Pqj^FsUO;`3hA2Qgk!Ak)BhgvyjycX?AbuANA@fkNUnxeG6VVqbBMq2sq>* zA5@O7eG#Sik*<`k3!WH>m%liv*j~!MW{mK@^mULP6H$+u$gGD@rP9@E z(`Qjab#@;9g~)yGw(K1*Ke0A@miWpDbY zK!M*b#&0N@UItet(|0I=&Vk$>^>_N?0JXMVR`sO63T9ls6+_53btw7X1EDG3OpE$f}!`$^dg`Zw4py8sb@fn22Z$-UKDBtQ3XcF4=}bp zT_sa75;5VGyo0u4;#QC7rzdj|b@bGJuJ^f)@MKwTKFj*d%^ zs85}AbX-6f^_WLJss9P(Qhjw=+XL|!YTH0Dk$OyT0U8EbpZcw%<1QwWYd5kZ$o8lo zAlm8p8nF6OkSQw7e(f|WQ{=0a)FDTTg0u8AYzi&d0aRjib+-DX-j2y3d1?_03%7qJQZ)vKWfa)A>Gr7a>34y?UJErjlRGv@j{Ev{Bj6xE=S;ShNlwjG>eT8- z$4lo9IJzE!InEu}KFitff@MP>ErGjj($84MBf!Y1x<)U^Iwa|iG7yz<({XeJP|)pn zm3ApMr&?d?*6Jn4zG9o|dhRiZ2DN4#9dYi{;-iwMY7=+yc+aUW-QezDs<6uXm%=fF zPyf)+hrzfE;#tN$Iyo@-tJ!g;4*oSQ(srnEI743e~f_@_iGZT&Y z`|dT*DP6bP-0GI9&hgYx+-pY9J>qzvk2N37KH}(_h-U1zC5CDCXlybK)9hc9xP{H0 zZ&TZzOJ*;-*~=Xrdw@u07dkq=#}c)U4jL7;9og0|DE@v$8}r?3 z;IHcLoc|c@wOu@rvG1dVu3|TdEN2$l>DHE2DP2yRd*0GHv5{_VX>Ga;XtMSrX=NmA zen8o;7bznL2(>?5Y~o58?x3>&-7;n5K_z<-%9-QVcGT)g@OP_AYl(OI3HZ0h?ta&E z;F`@8JJlTPQV&9Iw}#?#x2%Yb1EUpSv|g{_jMfifVf9e;@D}!+ z@?ljy9t8)fnp+qoB@2sBOrEkQw6q3I=@VL%?YhW|CgVl;a8a)2H4v74!0d zT09j#LrXm4!qlnN_v*?Cs8@Z;nLU6YVunY}D9I;_ofru)onj2@t_5@8?Q}O}Ws3uA z{3EE7^Q6|Ph|hMX`r;Le*47SRhFMtuXd22?TH-0t?8?GZxOSzlN8paAdoMXDL4F~~ zLD9n~deT>bLUafl3h5~zH4vnP^f)|}uIpj%YZbS)c712DeF&Q`o_nNi6YRZu=wUwj zQfnuS&JFYU%pp1eipMCAUkJlc-1C-I#m0B7IF>wf%htb6M|0xAbRf{O^<+BE{-qmW zH!z%KRVbl2I&OsUXlEya-Amy}e~N`5ccj0DyM@NJHQiXqd!Pw)&@TNE?6P8_@MwF< zDaKu}xib9$Adx87|Ar@O^}Bguy?!T4+|oorVVR=M(<4C8G}44O`d;6_-dVkj@VwV5 zsn_%@_16rDG}#nsEE5TCNxu-(Yy!w&S;xAK(L>%l4|Jk^mIxOdUS2}HX&(6kCgFW_ zc#!__X*R|D+{}6vbIQJpC~GIrdYEO=obpX%F;%o@QUHGkCTyE{8k^)49LJld>TizO-HH~IX8^Fh3>Xd|x0^156}Wn0g8 zYIXbIlWUnzcChmM2-1&#nH+O`45$}7eo|bXJS8U*sH|?@=F!_9&wMqvALOcj$CdhR zV!f!UZ8^lX>i5s9ubtJGIM1l!5GGR!X=ns2!sb?~GX0EpT`AV7(+9dDHpPEZnR_C( zO4+H{PW{(rh^?5?v266AVz+ut8L9#QOi4EuyJd>4U-9p!PO}}XQ`6J;EB+(;G*F`c z%a@dT{KxA@%&u2wJG<7{RAv*bvute8uQ{xwo{f*AeIxW5g$zZR5%t<^jPLk4#69yR zcmFk|WLG>iqLi*p;c_fKSX>3SJX<}ECFLZ`fHPKO~gSW!zRHVPxr zN5gF%81jXq7hK^UFuFRB;pv+8B1U+1pGSQ|pUJYQ%nTa6PP5GAW|?)!V8mGC>qRF0 zw-E~OD=RYU`0!$*@^2v6+4nx0!z+Ko%=qd|WXv96WE?UxRD`$tzGGJ1gN*uAHa?!q z?81CR^`FsG$e?xZX1rlsR{JvbLHn~<@0Qyj4z0BR)bHw}Q3@BMrZymfp#FbRwB0pk zz)8w*J!h>#S2O&py~eNuVKJsUxW9qSV-N5HLSsW1el`~%F@1J zNp~$@LP0i%x(ULkzkEKGDEa*YQ4k>olj>>|WNXxq;OC)h(j8l-eymS0c|!RMv4${~ zU<2uMh)OGY#NcL#u^miuVldUEUuFZVWLqKaiagqq=NDN--h(n}?K~!Otx?brIZ2Yz zQvX^?sIHx9GFc%UGnRBCud}4EsA3d_FQua0={s^oUAxmWc)5R@Pc>WA`!MFmT)Rpt z7g=O2C0WmmqY@?e83jLftG}#QpVD7MK_=@%l(4Axr2Z4eQ}?K+q26N{WQKa%)8E4i z*JuwJVftG@fc-3ky`hR$gZ(1L-qn--pAr*fR3YUugK`3=>`kXQP>{9tN0dMenVANe?<^!cV328?tW%iVcL8!8Lm$6HNSoe; z023S*aSn>QS0S_Onw4d4->|tRvm8txZUdeQvmYCmj*iWYg;DUAc?GX1CyT+ z;DdPuPfj)@!t582SF}{Qs+r;=97$n>Ju%_LFdTA>|QD5?Cft{X_uQ@sz zfYZiofk7lsXSI?b5+3Y6d9;>J?0B@ti|6i7oUcu|nF!8g_ze?l*GCSE6w}+T-cAfpH8S!JwVrCp2Y#m$N@*k zcPvy-Fsi=#)I0U+dye~`blkVU<5!N<*Fho=I8tB3+XLUMJ;;6CQT>uHQUArgC>OsF zSswLc{XCAlj&1yWtdHQ}dnf58b3+<%)p9p(V5ta-)r;k&bu9f}Clj;|-pKMmk7&Vm

aZ*?t6m9OX`r*asR>UK4<9*iCaDDhh%q;ffqm0BJ=*K zOf7_Y$JglJ_yVQ(gHP&EVr{w)OClKZ-%%{aHWw$%;8MJtnFxH+#MAyV4^~s)z?=RR zY>!nR%9QENK%w2#MPXv7b{&R@Q0f3)Jg|;HAblq2(DfcQ(9W?COwuPak#OA0qZNob zr>^tSV<_rTfAAV(v}6+BoV=8gj&NghrRnF4(5biDdxzpE4Mq|_@$chRI~ zq-9IG%91Wex<54Gg*x=XzMs@1;siin@q_e#`wF0|UZ{j$+kDY}-*b1bRCZ9(lNV%}| zP>)f^x_Yz;AHX)WF=;$G3zVh$P!J`E^cIjpyGd7PkqZlnIT*|sxlhu5msX{%D8?=r zTG6#2fTfLo!vw6iwX2=lTD}Q(3nA-(r0ow@4&;2RY-uqzz%U=Sx4PgI)ejw=Z$pdZ zDpGwkMX(P5l;mC-ViK0-=x@vpwSX`PaA1R3UVoN?QT15*C^7|;RwR=P%F!MvzDk=g z83}F7SUlNI9t(c4nMX~8dOR9iT6nZBQj8uu%9@Hj@M6i)IT^D9HiO2p)*Y=tLv9~$ z2q>guGawT-!ln|7c-opi-A~~?bMn2k7c`ZH&JU6Sq_>P9G4DkDjtSzu_)2X;6lrZt z1D<)3q9X?j`iozap#9F|9VF=AQR@vs$C9AWvULS%($VonrZ_fLcbsNvi5l2}HK3Agq!pz*n9iwXtP3<&{=uAh)S3;s z#!(l%&mt1u1;6J~qPU}DYhhVZ%mE_KDP{tTupeZK`7P*8mH#XD{&g=?TZ(d%eMNAZ zjP5=3uSw@6y4ge0J;v(aTgJ4+O#2!p>0XJHE!ut+P)6H3QGf+-8f%QBBh3bn!GW6o zpIm#yA^j|J1v$xuFeQ(iTKm`e6%vPk1c&r#P@)4#t=0}a7kRl3Jk3NSS6GC;NWYb< zj;uZ2ie40LBfd}Ps>(E2#ES38(UC|?SS)4gfQojbXeJ0UHZ=*wJPo=GHia2d*v?N0 zR|_0XUWNuwj3I+cRY4}&F6W6S6_x2QGh0v#4o`97a)X#9h?q2J-7kR}^u`z~0J8}f;;#pfb&06-)yAowW5$5m$jYp> zeVga+;rXbI`Uka<Sip~RcspRMed5=;we3^O^lbnsRj!9cs#Xt( z)O!6ZEQPSeJSK1h11F(7@uqlUF5YOLM|+fFp7=S`shQ>Ij$SEVk5ynVcCE|w^Qn|} zd%14siCW#p6YKR;V`#i>ZgQ)yV$p$JPpwVTe?v$d)4syqZ|c9o`(%oOD_dd;pEKih zz(Auy400j9W5Y$3NnJegYqlRVdrBF$qks~Sx5jkZ2uXsMW zVtp*qnM;sEK0&V{2sXq9xITr`k<-y9A`N@H^8jriTJ~Mh)9=F-@}Ay<;2?cHP7Bw zKUsWwI%Rjw`X91!=rc9V5dUH#&gY0jCZdQVPMC=IbynodP$Ubb_18G!iYW%-mmG1c ziP#53W4A4n&DP-V#QorV{N7_p8@SK%e&8+6=N<6*!rMGuj`vEDw({Bl+_!PRiTf1p zDeqvq!kxnXecbQjw$kr@m&1?Yowwh;!mWPn)^6b+LL|1)xs$e}RJ%@m{4ju+14q-< zL!CmE+TrtuI)(Zh3RL_{=)(u~x>9zAG1`WYe0$MtS?uUIIyQnLd3s^IBzd|vp*huM z!+!8UA39W@WbxMt{X9gG>VC$C4zMX=WuH8p@9tVVNqqJ|&gm^=;4DU4#lLExM#aa^ zJ+i&5zH3LVI2=ybVU!Y#Z=kQG)7`yNDf{AK$hJ-%IZRvMmj<=f{MKP2ed|z>eg!1f z`ICL`a1coLy#p&90O{|M1AD!)p{!;0~RLy22E zH{<@Von48q8y*2){vYhcqonMW%%hX-?Y{4`dcP@m)s2xAQ`3W;^o0WDV>FNP1pt}f$x}KBu9Tt z--lOYw`t#WUX5*81!#|o_>gd@D{(1nUxUer-6p|q!2tH zP`w;=K2SFa)FTX)pie3n3Dh?^)mMSKTA=P=C`X5e+NqukkvB%<`8oN8)qNEGvES>| z(H_7_15AJ8b=t1VL*4tV*lg5)i;)eT{F`j(L%WClnzzBmFD@VNo1PD|A2wC?b$#QzLCN(L z5FSGjQyX;9k2BCukhdFLA@5EQ=F59K%OH8TBQG-!P51@$!m7}NKo!co+$caYD#2TH*WbZdDbCL7HF+{D%gHd+C}oiWYaFRNJ8ceBmz`F_p7YsrA$!)c=VJD(W6!1R zsj%m@?CD`oIt$KDTfv?y*>g2}u4T{l?0F-5-o&1_u;&K$+{m5{?AeGXjgJf3Agyy% zc3KPG-A#l{{|>Wc6ISIo-=Y*Uq%?+GwyBa*1C*LaseP0p&7}8G>IO>vGo?0BY8Rzq zl=>E>?xfT`l)9HvcTwsgO0`nz`;-b%iUPZI1EmHibt9!-rPK;aWhk|jQfDYdTMA8d z8kwF+Df;*}T~4V>D0LyFW>IQ9r6_V`za?l|MJZa9frsU)T7qA8yDSJK2}f6z>u1pPqJajyC25r}w2A$R3TaiiIne4OoV&R> z(Gm-A=w{}Mg#u193^4o#&A@g7Lx-Zf4BjEqGEhRLx1cdyw&U5vI!K8c8LmjS-#hwh;u&Vf5jCH#exk1c~&JOkwGHa*Tj6A zh<{9u1>~=UBP2lVX1TIap5KTT-3s)r2`B*y2{s27%9U|>{;fix@of>pgys48D;_N^ zfd;6PiZDvMeK}`VW3OzikSi-$UM1yK$nH=iLCQC%Xp3cQ$fk%+loYtxZ9H zlN^l6qV&QAAeb|KdY&Xk+M)?7anJ9GrDoL zkz^*uIKjT;jlLKZ7!Jw4ErBTd#B44s(+3l!o}MAs1sZ*c=D0kwYKFbCnb?oTbZH!82OYd6 zAaf%!GH(M4Z=%ARZVN_4PgTgGlL!|fqlC%DKnR-(24Gyg84y1dm(jVhFSN~6 zf=qupCfeeoVSp0!r*kUghF~nx>;q8~?1fM$(jW&~gK?0cw_+f!AXa450=6>Bk=rFW z2D{YISQD=a*W6muTytA1{``$$vJy9Gi5scpiblim5{4h4(EyhbyOhk9oR^PWTChQ; z4EQIL&M*bJB>@F*49GC(NQ4g*O!4S}VI9EVRAPkfj3i_a9-xeBMzLrP=st0R8jnM$CA>0_> z>Wc=LPW<7PK#aLx*|~Vta?hI8F3%0}vgPtR=gO7NWvg$PVXq64``nm_qa)$|$X+2s zy?eFn3Hk$}7#eP07Y)Yafsnj$8}qx%f}5Ly&GNP3SW~bERAqt+5j z!inYv)|24~#Lrd4dKnFqwtfP+g|kV&qFVU<3YtRft6UbQH1Na9X} z{D5Q`*x-ioE71Uygr!P$ccZ*5oZ$Ty3q*WTaye|oGxrL6gtav=Yl{Y`RDk)ySdCrE zO??dzbBHPoTgh3rz0w*KMFJ3Pc|lAhhMctwCQ&nxiUd$oL7Bnz%#fKs zPGG#kV0c`LIUG}q$q1pl$%w*uQjM5R5NW{j;m^qwVu|RM;1-Ipn9eKs@bJ;J&b=AK z%Ui`Pf}n}_Asb_4H$It==*r#=8Wu;xJ^b2)k-rEQ@F2O4p!tgA&aGEa6 z+S<_+C)+DqENvGYgc=DNf>9w0)}Ub&TVKe$(yHFiQh`t}f~xD7Hb=g1&Fy z%xKEDSb_bT=I}<$Vq&#YFo^yyT0lsZF^gKc5t$}yxJc8Zo2w!F=e=;ao=FvrH2Fee zp=C_b6}f2BK+)QXh9TSy21?u)4AD?B#{^2V z1#K2{q_}{hMl%*Q7at%EGittCk3|ChU}KO^VHo=P0R+SoBmFyHv=7;_1&p zQM}gAuBSq2p{f7Qf{vDDf5A1K^w}46Se2675T4sumBN=chCi2e9F}m=8b4dcMQU1V z>T698LLaGC@|@%>_#{%PQr1W8~0c5bF$M-z0A-l3U6I zu(gFrAKN+0SIU)s*%#fEXuh~F$+bcN+*(*ISD>+CmbcFS!~qM~yoF-HvW@e#7NRx!rw)D-ZAVl3cgmc(?y=10+Km58+{BsP^8%j>b#G?kjnghGwUmTiJk zD8cqWj2?$EAoL((sLx}S)3q6hlNPO-_%qZugwhTDt@B|sA%wk6S}suxKa;Jjo5E4H zZ-v>*%&Eo%5{yLyo3Pm9ksbwqjh1}76v{Ul* z8Yl~yi8ILt30^&{;4?o(@R``yv5Vy**0H1pwvaDaBr+T$w1kp5YDuj7c}FcRT>aCg zI1itvqc)z+XKVNu+R%=fI|6y6oKo~)A*Hb;K1{Y;-WUzHpo6MuGXcBX9GXcGq!>~N zQ|UzX>5}A;V+;=|ZXVGm3k4Grt1%-nlM>7U7#kp15#^FIjU-}eBgxfaK%(qWp_mlr z*utrt?~bq;IhSO6C4A&@c_@6h?(WY=O)c;X3v)XicI@ymT#Y>v7-@;l%q@Ro8nwp) zBe~pcOJs2V=Q71c0#Xc{=*A8!Qf%udf1h&Z|GWE?wbNn}+pwHA<1!oefus9`6AxUqxT)M|e27+o zzpG4F?KzUI2VdKCkC%#e;d{)oE2R;&S4u^PXGu02{lkS1CfPOOww02twaivF`rOiP zV zeVKS)CEg3gTM_S-;(e2NH;Q*eyl)e4(yO6cYWxqcVGhRKhM z{AWdeQI0iB-74VMhskdi`6q_SZx#93Ve;v$m#&c>-oC>6m+@?A{4n`+cuto*O#W>= zTe?K#3($P+5a-&L50l?E=-k^ZnwOxy0%_KG7xvSq>Hi;Y8oK>P|NhG``B5Roo?-I+ zB7g5N`KEmglTYXJbp2|Wd>_x2jt-MgpES_*_AvP^JX`uS`#U{M`RhgbPqRC%wh8XT>GsZraXNxLf8Lk`{q)deV*08Sq+@kz*!BP)xcQ|oYlZt z4V=}$Uq%Dtrc`a1w77lLq{#aA@sqBX&a1`$^u<4TT$u<3JYl~NUsYZoE$c=G{;xcI z4Hk_^20*)C1Yn;WryNMp|9HjzM?Eigv_UJiAEwnCUAAs&QM77Yt#{LH{^l_DnYyLu z90t<2SOnEFh71qtVE@x?adJ4IG{Jutp_I7qo0kcP0$VLw4lfaq&M zN=J}xG;3@n$f&U0Mo_N8P7WvHj7c|x@ChAp*~tL*$!@~6 zm@@P*Fwo+UY@^&>0&`XMF)$I2QQiOnXs03mVS@1$p1&zd*)JJj3)C$|Uo$|?SQj96 zEs-lOn-@Q7I@jCM1T_S(?H<7YN$8^nDpHdw2Lr8+J(PLQ-qHovFY#SAIVCsNH+((QrifrXA=G=xCDLT zEd7Y{{Hcwq^O^_G(yK*$gVln1qiBTm+7vK9{ADm_AMbdj=WX=CDD*=6;nMeu48C_4 zF0&vIgVmSPzkYec%?78bH>`4?#a^HwSE!;JtSb7+=#nNu} z{62g3P-$GdCUzI~*dsj?drG>Cj#nN&x2L44r>Mu)V}lU5&Kw<$->9+wV$ImfnGBfn zYLSPRG*!UZag>PW9R6RxA%#v8G_;*bWel28pfUd2H%3PV#}dT_oREHiL2%D(4*!*a z+wt2ABD<7e{6L^j?61*p4hTLjVEQ2g!Ezm^Ur+ETJbWCceF}p6Hgfnbzz$m(o*e!b zgHawdRNwamO#dGq!9Ngim4NAo8xGKZU+|+pCPMJT0!|Bfw}6Yl#DTj{=KLs4^t(hp z{e**XJ^0xaUeZ+p?#3@82wp7U7zl%on5hsT?3Ak3|-$pR__Xz&dT&t6%~I{|*oQD?*Z8XA0nRkYmjsR{+z`vaR`3 z3gF8M;Hm<6P652A09Fd%0Mq|&VXgnU0{GPe_%bx}a@-ZTD{*63N;2+?aZkYwZ^;J!G`ugt zP2-UUIsWc~Gz0e)xT|r`#C;|1t8mZ4jlaevVMw$86B4glawZqzhM7xMxIgcDWy!la zM)PMP6c08N5V(~lz=i_6F;TJq1s7kNVy>c3#MvRYWKLt2Tz+Q8req3B=%+Q9-uQ$< z^K=e5#N0>%nM`SJ#B7d9kv+xSA!bAGApdS;6wFCx-RzeDmKo2;;57&dhbhdbEgn1= z(_tRuU4}+lCQI)!K4s-2*(^ro^QLvLp|U{|iVQBH0171(Xdz)NKP(t3mb8X&mXMMs zSwP7{IwP3lVVn`biO|iA*^r=M4gX~@qBL(K56|-AGBeslf`xr}izo~!ZgL`K^S1C{ m&4_2Y_8OsD3e(6uS$t*$X6%iKjJ*+x(VJqhK|wIa)BgrV(%F3g literal 0 HcmV?d00001 diff --git a/src/sickle.h b/src/sickle.h index e7b679c..d30df14 100644 --- a/src/sickle.h +++ b/src/sickle.h @@ -96,6 +96,6 @@ typedef struct __cutsites_ { /* Function Prototypes */ int single_main (int argc, char *argv[]); int paired_main (int argc, char *argv[]); -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug); +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int trunc_size, int debug); #endif /*SICKLE_H*/ diff --git a/src/sliding.c b/src/sliding.c index 7573106..056b7a8 100644 --- a/src/sliding.c +++ b/src/sliding.c @@ -32,8 +32,11 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) { } -cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) { - +cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int trunc_size, int debug) { + if (trunc_size >= 0 && trunc_size < length_threshold) { + trunc_size = length_threshold; + } + int window_size = (int) (0.1 * fqrec->seq.l); int i,j; int window_start=0; @@ -130,6 +133,24 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int if (debug) printf ("\n\n"); + /* if trunc_size is less than read length, remove equal number of bases from each side */ + /* TODO: this should acutally remove bases based on quality */ + + + if (trunc_size >= 0) { + int side = 0; + while (trunc_size < three_prime_cut - five_prime_cut) { + if (side == 0) { + five_prime_cut++; + side = 1; + } + else { + three_prime_cut--; + side = 0; + } + } + } + retvals = (cutsites*) malloc (sizeof(cutsites)); retvals->three_prime_cut = three_prime_cut; retvals->five_prime_cut = five_prime_cut; diff --git a/src/trim_paired.c b/src/trim_paired.c index 13f4597..c138830 100644 --- a/src/trim_paired.c +++ b/src/trim_paired.c @@ -26,6 +26,7 @@ static struct option paired_long_options[] = { {"pe-combo", optional_argument, 0, 'c'}, {"output-pe1", optional_argument, 0, 'o'}, {"output-pe2", optional_argument, 0, 'p'}, + {"output-all", optional_argument, 0, 'a'}, {"output-single", optional_argument, 0, 's'}, {"output-combo", optional_argument, 0, 'm'}, {"qual-threshold", optional_argument, 0, 'q'}, @@ -34,6 +35,7 @@ static struct option paired_long_options[] = { {"orphan-length-threshold", optional_argument, 0, 'L'}, {"no-fiveprime", optional_argument, 0, 'x'}, {"truncate-n", optional_argument, 0, 'n'}, + {"truncate-size", optional_argument, 0, 'N'}, {"gzip-output", optional_argument, 0, 'g'}, {"output-combo-all", optional_argument, 0, 'M'}, {"quiet", optional_argument, 0, 'z'}, @@ -56,7 +58,8 @@ Paired-end separated reads\n\ -f, --pe-file1, Input paired-end forward fastq file (Input files must have same number of records)\n\ -r, --pe-file2, Input paired-end reverse fastq file\n\ -o, --output-pe1, Output trimmed forward fastq file\n\ --p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\n\ +-p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\ +-a, --output-all, Output all pairs with at least one surviving reads, with a discarded read written to output file as a single N.\n\n\ Paired-end interleaved reads\n\ ----------------------------\n"); fprintf(stderr,"-c, --pe-combo, Combined (interleaved) input paired-end fastq\n\ @@ -71,7 +74,8 @@ Global options\n\ -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ -L, --orphan-length-threshold, Length threshold to use for retaining a single mate if one (but not both) mate fails the paired-end length threshold. Defaults to the value specified for --length-threshold.\n\ -x, --no-fiveprime, Don't do five prime trimming.\n\ --n, --truncate-n, Truncate sequences at position of first N.\n"); +-n, --truncate-n, Truncate sequences at position of first N.\n\ +-N, --truncate-size, Truncate sequences at a maximum length.\n"); fprintf(stderr, "-g, --gzip-output, Output gzipped files.\n--quiet, do not output trimming info\n\ @@ -121,14 +125,16 @@ int paired_main(int argc, char *argv[]) { int quiet = 0; int no_fiveprime = 0; int trunc_n = 0; + int trunc_size = -1; int gzip_output = 0; + int split_all = 0; int combo_all=0; int combo_s=0; int total=0; while (1) { int option_index = 0; - optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:Q:L:xng", paired_long_options, &option_index); + optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:Q:L:xnga", paired_long_options, &option_index); if (optc == -1) break; @@ -171,7 +177,11 @@ int paired_main(int argc, char *argv[]) { outfn2 = (char *) malloc(strlen(optarg) + 1); strcpy(outfn2, optarg); break; - + + case 'a': + split_all = 1; + break; + case 'm': outfnc = (char *) malloc(strlen(optarg) + 1); strcpy(outfnc, optarg); @@ -228,7 +238,9 @@ int paired_main(int argc, char *argv[]) { case 'n': trunc_n = 1; break; - + case 'N': + trunc_size = atoi(optarg); + break; case 'g': gzip_output = 1; break; @@ -278,6 +290,10 @@ int paired_main(int argc, char *argv[]) { } } + if (split_all) { + sfn = ""; + } + if (infnc) { /* using combined input file */ if (infn1 || infn2 || outfn1 || outfn2) { @@ -321,7 +337,7 @@ int paired_main(int argc, char *argv[]) { } else { /* using forward and reverse input files */ - if (infn1 && (!infn2 || !outfn1 || !outfn2 || !sfn)) { + if (infn1 && (!infn2 || !outfn1 || !outfn2 || (!sfn && !split_all))) { paired_usage(EXIT_FAILURE, "****Error: Using the -f option means you must have the -r, -o, -p, and -s options."); } @@ -378,7 +394,7 @@ int paired_main(int argc, char *argv[]) { } /* get singles output file handle */ - if (sfn && !combo_all) { + if (sfn && !combo_all && !split_all) { if (!gzip_output) { single = fopen(sfn, "w"); if (!single) { @@ -411,8 +427,8 @@ int paired_main(int argc, char *argv[]) { break; } - p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); - p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); + p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); total += 2; if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); @@ -451,7 +467,7 @@ int paired_main(int argc, char *argv[]) { int write_orphan = 1; if (recheck_orphans) { - p1cut = sliding_window(fqrec1, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec1, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); if (p1cut->three_prime_cut < 0) { if (debug) printf("p1 orhpan discarded"); write_orphan = 0; @@ -464,14 +480,22 @@ int paired_main(int argc, char *argv[]) { if (combo_all) { print_record (combo, fqrec1, p1cut); print_record_N (combo, fqrec2, qualtype); - } else { + } else if (split_all) { + print_record(outfile1, fqrec1, p1cut); + print_record_N(outfile2, fqrec2, qualtype); + } + else { print_record (single, fqrec1, p1cut); } } else { if (combo_all) { print_record_gzip (combo_gzip, fqrec1, p1cut); print_record_N_gzip (combo_gzip, fqrec2, qualtype); - } else { + } else if (split_all) { + print_record_gzip(outfile1, fqrec1, p1cut); + print_record_N_gzip(outfile2, fqrec2, qualtype); + } + else { print_record_gzip (single_gzip, fqrec1, p1cut); } } @@ -488,7 +512,7 @@ int paired_main(int argc, char *argv[]) { int write_orphan = 1; if (recheck_orphans) { - p2cut = sliding_window(fqrec2, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, debug); + p2cut = sliding_window(fqrec2, qualtype, orphan_length_threshold, orphan_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); if (p2cut->three_prime_cut < 0) { if (debug) printf("p2 orhpan discarded"); write_orphan = 0; @@ -501,14 +525,22 @@ int paired_main(int argc, char *argv[]) { if (combo_all) { print_record_N (combo, fqrec1, qualtype); print_record (combo, fqrec2, p2cut); - } else { + } else if (split_all) { + print_record_N(outfile1, fqrec1, qualtype); + print_record(outfile2, fqrec2, p2cut); + } + else { print_record (single, fqrec2, p2cut); } } else { if (combo_all) { print_record_N_gzip (combo_gzip, fqrec1, qualtype); print_record_gzip (combo_gzip, fqrec2, p2cut); - } else { + } else if (split_all) { + print_record_N_gzip(outfile1, fqrec1, qualtype); + print_record_gzip(outfile2, fqrec2, p2cut); + } + else { print_record_gzip (single_gzip, fqrec2, p2cut); } } diff --git a/src/trim_single.c b/src/trim_single.c index 44fe013..e182c04 100644 --- a/src/trim_single.c +++ b/src/trim_single.c @@ -23,6 +23,7 @@ static struct option single_long_options[] = { {"length-threshold", optional_argument, 0, 'l'}, {"no-fiveprime", optional_argument, 0, 'x'}, {"discard-n", optional_argument, 0, 'n'}, + {"trunc-size", optional_argument, 0, 'N'}, {"gzip-output", optional_argument, 0, 'g'}, {"quiet", optional_argument, 0, 'z'}, {GETOPT_HELP_OPTION_DECL}, @@ -43,6 +44,7 @@ Options:\n\ -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ -x, --no-fiveprime, Don't do five prime trimming.\n\ -n, --trunc-n, Truncate sequences at position of first N.\n\ +-N, --truncate-size, Truncate sequences at a maximum length.\n\ -g, --gzip-output, Output gzipped files.\n\ --quiet, Don't print out any trimming information\n\ --help, display this help and exit\n\ @@ -71,6 +73,7 @@ int single_main(int argc, char *argv[]) { int quiet = 0; int no_fiveprime = 0; int trunc_n = 0; + int trunc_size = -1; int gzip_output = 0; int total=0; @@ -131,7 +134,9 @@ int single_main(int argc, char *argv[]) { case 'n': trunc_n = 1; break; - + case 'N': + trunc_size = atoi(optarg); + break; case 'g': gzip_output = 1; break; @@ -192,7 +197,7 @@ int single_main(int argc, char *argv[]) { while ((l = kseq_read(fqrec)) >= 0) { - p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug); + p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, trunc_size, debug); total++; if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);