% Source: bactopia/docs/data/bactopia-datasets-software.bib (kusandeep/bactopia, master branch)
% Generated by Paperpile. Check out https://paperpile.com for more information.
% BibTeX export options can be customized via Settings -> BibTeX.

% Barrnap software repository. Standard @misc styles print howpublished (and
% url where supported) but not institution, so the host is given in both.
@MISC{Seemann_undated-ra,
  title        = "Barrnap: Bacterial ribosomal {RNA} predictor",
  author       = "Seemann, Torsten",
  abstract     = "Bacterial ribosomal RNA predictor - tseemann/barrnap",
  howpublished = "GitHub",
  url          = "https://github.com/tseemann/barrnap",
  institution  = "GitHub"
}

% seqtk software repository. Standard @misc styles ignore publisher, so the
% host is also given as howpublished; publisher is kept for other tools.
@MISC{Li2012-wr,
  title        = "seqtk Toolkit for processing sequences in {FASTA/Q} formats",
  author       = "Li, Heng",
  howpublished = "GitHub",
  url          = "https://github.com/lh3/seqtk",
  publisher    = "GitHub",
  year         =  2012
}

% Lighter: sequencing error correction that avoids counting k-mers.
@ARTICLE{Song2014-hn,
  author = {Song, Li and Florea, Liliana and Langmead, Ben},
  title = {Lighter: fast and memory-efficient sequencing error correction
    without counting},
  journal = {Genome Biol.},
  year = {2014},
  volume = {15},
  number = {11},
  pages = {509},
  language = {en},
  abstract = {Lighter is a fast, memory-efficient tool for correcting
    sequencing errors. Lighter avoids counting k-mers. Instead, it
    uses a pair of Bloom filters, one holding a sample of the input
    k-mers and the other holding k-mers likely to be correct. As long
    as the sampling fraction is adjusted in inverse proportion to the
    depth of sequencing, Bloom filter size can be held constant while
    maintaining near-constant accuracy. Lighter is parallelized, uses
    no secondary storage, and is both faster and more
    memory-efficient than competing approaches while achieving
    comparable accuracy.},
}

% Roary: builds large-scale prokaryote pan genomes (core and accessory genes).
@ARTICLE{Page2015-ob,
  author = {Page, Andrew J and Cummins, Carla A and Hunt, Martin and Wong,
    Vanessa K and Reuter, Sandra and Holden, Matthew T G and Fookes,
    Maria and Falush, Daniel and Keane, Jacqueline A and Parkhill,
    Julian},
  title = {Roary: rapid large-scale prokaryote pan genome analysis},
  journal = {Bioinformatics},
  year = {2015},
  month = nov,
  volume = {31},
  number = {22},
  pages = {3691--3693},
  language = {en},
  abstract = {UNLABELLED: A typical prokaryote population sequencing study can
    now consist of hundreds or thousands of isolates. Interrogating
    these datasets can provide detailed insights into the genetic
    structure of prokaryotic genomes. We introduce Roary, a tool that
    rapidly builds large-scale pan genomes, identifying the core and
    accessory genes. Roary makes construction of the pan genome of
    thousands of prokaryote samples possible on a standard desktop
    without compromising on the accuracy of results. Using a single
    CPU Roary can produce a pan genome consisting of 1000 isolates in
    4.5 hours using 13 GB of RAM, with further speedups possible
    using multiple processors. AVAILABILITY AND IMPLEMENTATION: Roary
    is implemented in Perl and is freely available under an open
    source GPLv3 license from http://sanger-pathogens.github.io/Roary
    CONTACT: roary@sanger.ac.uk SUPPLEMENTARY INFORMATION:
    Supplementary data are available at Bioinformatics online.},
}

% FastQC software entry. The distributor belongs in howpublished rather than
% in the title, so the title holds only the tool name and description.
% NOTE(review): the exported title also carried the year 2012 while the year
% field says 2016 -- confirm which release year should be cited.
% NOTE(review): author spellings follow the FastQC project page -- confirm.
@MISC{Andrews2016-kz,
  title        = "{FastQC}: A Quality Control tool for High Throughput Sequence
                  Data",
  author       = "Andrews, S and Krueger, F and Segonds-Pichon, A and Biggins, L
                  and Wingett, S",
  howpublished = "Babraham Bioinformatics",
  url          = "https://www.bioinformatics.babraham.ac.uk/projects/fastqc/",
  year         =  2016
}

% UFBoot2: ultrafast bootstrap approximation as implemented in IQ-TREE.
@ARTICLE{Hoang2018-mu,
  author = {Hoang, Diep Thi and Chernomor, Olga and von Haeseler, Arndt and
    Minh, Bui Quang and Vinh, Le Sy},
  title = {{UFBoot2}: Improving the Ultrafast Bootstrap Approximation},
  journal = {Mol. Biol. Evol.},
  year = {2018},
  month = feb,
  volume = {35},
  number = {2},
  pages = {518--522},
  language = {en},
  keywords = {maximum likelihood; model violation; phylogenetic inference;
    polytomies; ultrafast bootstrap},
  abstract = {The standard bootstrap (SBS), despite being computationally
    intensive, is widely used in maximum likelihood phylogenetic
    analyses. We recently proposed the ultrafast bootstrap
    approximation (UFBoot) to reduce computing time while achieving
    more unbiased branch supports than SBS under mild model
    violations. UFBoot has been steadily adopted as an efficient
    alternative to SBS and other bootstrap approaches. Here, we
    present UFBoot2, which substantially accelerates UFBoot and
    reduces the risk of overestimating branch supports due to
    polytomies or severe model violations. Additionally, UFBoot2
    provides suitable bootstrap resampling strategies for
    phylogenomic data. UFBoot2 is 778 times (median) faster than SBS
    and 8.4 times (median) faster than RAxML rapid bootstrap on
    tested data sets. UFBoot2 is implemented in the IQ-TREE software
    package version 1.6 and freely available at
    http://www.iqtree.org.},
}

% DIAMOND: double-indexing protein aligner, a fast alternative to BLASTX.
@ARTICLE{Buchfink2015-yu,
  author = {Buchfink, Benjamin and Xie, Chao and Huson, Daniel H},
  title = {Fast and sensitive protein alignment using {DIAMOND}},
  journal = {Nat. Methods},
  year = {2015},
  month = jan,
  volume = {12},
  number = {1},
  pages = {59--60},
  language = {en},
  abstract = {The alignment of sequencing reads against a protein reference
    database is a major computational bottleneck in metagenomics and
    data-intensive evolutionary projects. Although recent tools offer
    improved performance over the gold standard BLASTX, they exhibit
    only a modest speedup or low sensitivity. We introduce DIAMOND,
    an open-source algorithm based on double indexing that is 20,000
    times faster than BLASTX on short reads and has a similar degree
    of sensitivity.},
}

% vcflib software repository. There is no author field, so `key` supplies the
% sort/label fallback; {C++} is braced so sentence-casing styles keep it intact.
@MISC{noauthor_undated-dt,
  key          = "vcflib",
  title        = "Vcflib: A {C++} library for parsing and manipulating {VCF}
                  files",
  abstract     = "a simple C++ library for parsing and manipulating VCF files, +
                  many command-line utilities - vcflib/vcflib",
  howpublished = "GitHub",
  url          = "https://github.com/vcflib/vcflib",
  institution  = "GitHub"
}

% Nextflow workflow framework paper.
@ARTICLE{Di_Tommaso2017-jf,
  author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and
    Barja, Pablo Prieto and Palumbo, Emilio and Notredame, Cedric},
  title = {Nextflow enables reproducible computational workflows},
  journal = {Nat. Biotechnol.},
  year = {2017},
  month = apr,
  volume = {35},
  number = {4},
  pages = {316--319},
  language = {en},
}

% Mashtree paper in the Journal of Open Source Software (abbreviated like the
% other journal names in this file).
% NOTE(review): the "den Bakker" surname and the DOI come from the published
% record, not from the export -- confirm.
@ARTICLE{Katz2019-cp,
  title    = "Mashtree: a rapid comparison of whole genome sequence files",
  author   = "Katz, Lee and Griswold, Taylor and Morrison, Shatavia and
              Caravas, Jason and Zhang, Shaokang and den Bakker, Henk and
              Deng, Xiangyu and Carleton, Heather",
  journal  = "J. Open Source Softw.",
  volume   =  4,
  number   =  44,
  pages    = "1762",
  month    =  dec,
  year     =  2019,
  doi      = "10.21105/joss.01762"
}

% ARAGORN: detection of tRNA and tmRNA genes in nucleotide sequences.
@ARTICLE{Laslett2004-li,
  author = {Laslett, Dean and Canback, Bjorn},
  title = {{ARAGORN}, a program to detect {tRNA} genes and {tmRNA} genes in
    nucleotide sequences},
  journal = {Nucleic Acids Res.},
  year = {2004},
  month = jan,
  volume = {32},
  number = {1},
  pages = {11--16},
  language = {en},
  abstract = {A computer program, ARAGORN, identifies tRNA and tmRNA genes. The
    program employs heuristic algorithms to predict tRNA secondary
    structure, based on homology with recognized tRNA consensus
    sequences and ability to form a base-paired cloverleaf. tmRNA
    genes are identified using a modified version of the BRUCE
    program. ARAGORN achieves a detection sensitivity of 99\% from a
    set of 1290 eubacterial, eukaryotic and archaeal tRNA genes and
    detects all complete tmRNA sequences in the tmRNA database,
    improving on the performance of the BRUCE program. Recently
    discovered tmRNA genes in the chloroplasts of two species from
    the 'green' algae lineage are detected. The output of the program
    reports the proposed tRNA secondary structure and, for tmRNA
    genes, the secondary structure of the tRNA domain, the tmRNA gene
    sequence, the tag peptide and a list of organisms with matching
    tmRNA peptide tags.},
}

% VSEARCH: open-source alternative to USEARCH for nucleotide sequence analysis.
@ARTICLE{Rognes2016-qx,
  author = {Rognes, Torbj{\o}rn and Flouri, Tom{\'a}{\v s} and Nichols, Ben
    and Quince, Christopher and Mah{\'e}, Fr{\'e}d{\'e}ric},
  title = {{VSEARCH}: a versatile open source tool for metagenomics},
  journal = {PeerJ},
  year = {2016},
  month = oct,
  volume = {4},
  pages = {e2584},
  language = {en},
  keywords = {Alignment; Chimera detection; Clustering; Dereplication; Masking;
    Metagenomics; Parallellization; Searching; Sequences; Shuffling},
  abstract = {BACKGROUND: VSEARCH is an open source and free of charge
    multithreaded 64-bit tool for processing and preparing
    metagenomics, genomics and population genomics nucleotide
    sequence data. It is designed as an alternative to the widely
    used USEARCH tool (Edgar, 2010) for which the source code is not
    publicly available, algorithm details are only rudimentarily
    described, and only a memory-confined 32-bit version is freely
    available for academic use. METHODS: When searching nucleotide
    sequences, VSEARCH uses a fast heuristic based on words shared by
    the query and target sequences in order to quickly identify
    similar sequences, a similar strategy is probably used in
    USEARCH. VSEARCH then performs optimal global sequence alignment
    of the query against potential target sequences, using full
    dynamic programming instead of the seed-and-extend heuristic used
    by USEARCH. Pairwise alignments are computed in parallel using
    vectorisation and multiple threads. RESULTS: VSEARCH includes
    most commands for analysing nucleotide sequences available in
    USEARCH version 7 and several of those available in USEARCH
    version 8, including searching (exact or based on global
    alignment), clustering by similarity (using length pre-sorting,
    abundance pre-sorting or a user-defined order), chimera detection
    (reference-based or de novo), dereplication (full length or
    prefix), pairwise alignment, reverse complementation, sorting,
    and subsampling. VSEARCH also includes commands for FASTQ file
    processing, i.e., format detection, filtering, read quality
    statistics, and merging of paired reads. Furthermore, VSEARCH
    extends functionality with several new commands and improvements,
    including shuffling, rereplication, masking of low-complexity
    sequences with the well-known DUST algorithm, a choice among
    different similarity definitions, and FASTQ file format
    conversion. VSEARCH is here shown to be more accurate than
    USEARCH when performing searching, clustering, chimera detection
    and subsampling, while on a par with USEARCH for paired-ends read
    merging. VSEARCH is slower than USEARCH when performing
    clustering and chimera detection, but significantly faster when
    performing paired-end reads merging and dereplication. VSEARCH is
    available at https://github.com/torognes/vsearch under either the
    BSD 2-clause license or the GNU General Public License version
    3.0. DISCUSSION: VSEARCH has been shown to be a fast, accurate
    and full-fledged alternative to USEARCH. A free and open-source
    versatile tool for sequence analysis is now available to the
    metagenomics community.},
}

% FreeBayes preprint. It carries arXiv identifiers and no journal, so it is
% typed as @misc (an @article without a journal triggers a missing-field
% warning); howpublished and url make the arXiv ID visible under styles that
% do not understand the eprint fields.
@MISC{Garrison2012-sy,
  title         = "Haplotype-based variant detection from short-read sequencing",
  author        = "Garrison, Erik and Marth, Gabor",
  abstract      = "The direct detection of haplotypes from short-read DNA
                   sequencing data requires changes to existing small-variant
                   detection methods. Here, we develop a Bayesian statistical
                   framework which is capable of modeling multiallelic loci in
                   sets of individuals with non-uniform copy number. We then
                   describe our implementation of this framework in a
                   haplotype-based variant detector, FreeBayes.",
  howpublished  = "{arXiv}:1207.3907 [q-bio.GN]",
  month         =  jul,
  year          =  2012,
  archivePrefix = "arXiv",
  primaryClass  = "q-bio.GN",
  eprint        = "1207.3907",
  url           = "https://arxiv.org/abs/1207.3907"
}

% Real-time WGS typing of VTEC (VirulenceFinder). The genus name in the title
% is brace-protected so sentence-casing styles do not lowercase it.
@ARTICLE{Joensen2014-gi,
  title    = "Real-time whole-genome sequencing for routine typing,
              surveillance, and outbreak detection of verotoxigenic
              {Escherichia} coli",
  author   = "Joensen, Katrine Grimstrup and Scheutz, Flemming and Lund, Ole
              and Hasman, Henrik and Kaas, Rolf S and Nielsen, Eva M and
              Aarestrup, Frank M",
  abstract = "Fast and accurate identification and typing of pathogens are
              essential for effective surveillance and outbreak detection. The
              current routine procedure is based on a variety of techniques,
              making the procedure laborious, time-consuming, and expensive.
              With whole-genome sequencing (WGS) becoming cheaper, it has huge
              potential in both diagnostics and routine surveillance. The aim
              of this study was to perform a real-time evaluation of WGS for
              routine typing and surveillance of verocytotoxin-producing
              Escherichia coli (VTEC). In Denmark, the Statens Serum Institut
              (SSI) routinely receives all suspected VTEC isolates. During a
              7-week period in the fall of 2012, all incoming isolates were
              concurrently subjected to WGS using IonTorrent PGM. Real-time
              bioinformatics analysis was performed using web-tools
              (www.genomicepidemiology.org) for species determination,
              multilocus sequence type (MLST) typing, and determination of
              phylogenetic relationship, and a specific VirulenceFinder for
              detection of E. coli virulence genes was developed as part of
              this study. In total, 46 suspected VTEC isolates were
              characterized in parallel during the study. VirulenceFinder
              proved successful in detecting virulence genes included in
              routine typing, explicitly verocytotoxin 1 (vtx1), verocytotoxin
              2 (vtx2), and intimin (eae), and also detected additional
              virulence genes. VirulenceFinder is also a robust method for
              assigning verocytotoxin (vtx) subtypes. A real-time clustering of
              isolates in agreement with the epidemiology was established from
              WGS, enabling discrimination between sporadic and outbreak
              isolates. Overall, WGS typing produced results faster and at a
              lower cost than the current routine. Therefore, WGS typing is a
              superior alternative to conventional typing strategies. This
              approach may also be applied to typing and surveillance of other
              pathogens.",
  journal  = "J. Clin. Microbiol.",
  volume   =  52,
  number   =  5,
  pages    = "1501--1510",
  month    =  may,
  year     =  2014,
  language = "en"
}

% Prodigal: prokaryotic gene prediction and translation initiation site finder.
@ARTICLE{Hyatt2010-wx,
  author = {Hyatt, Doug and Chen, Gwo-Liang and Locascio, Philip F and Land,
    Miriam L and Larimer, Frank W and Hauser, Loren J},
  title = {Prodigal: prokaryotic gene recognition and translation initiation
    site identification},
  journal = {BMC Bioinformatics},
  year = {2010},
  month = mar,
  volume = {11},
  pages = {119},
  language = {en},
  abstract = {BACKGROUND: The quality of automated gene prediction in microbial
    organisms has improved steadily over the past decade, but there
    is still room for improvement. Increasing the number of correct
    identifications, both of genes and of the translation initiation
    sites for each gene, and reducing the overall number of false
    positives, are all desirable goals. RESULTS: With our years of
    experience in manually curating genomes for the Joint Genome
    Institute, we developed a new gene prediction algorithm called
    Prodigal (PROkaryotic DYnamic programming Gene-finding
    ALgorithm). With Prodigal, we focused specifically on the three
    goals of improved gene structure prediction, improved translation
    initiation site recognition, and reduced false positives. We
    compared the results of Prodigal to existing gene-finding methods
    to demonstrate that it met each of these objectives. CONCLUSION:
    We built a fast, lightweight, open source gene prediction program
    called Prodigal http://compbio.ornl.gov/prodigal/. Prodigal
    achieved good results compared to existing methods, and we
    believe it will be a valuable asset to automated microbial
    annotation pipelines.},
}

% CARD 2020 update paper (Comprehensive Antibiotic Resistance Database).
% The compound surname "Van Domselaar" is given in "Last, First" form so it is
% not split into a surname plus a middle initial.
% NOTE(review): surname form taken from the published author list -- confirm.
@ARTICLE{Alcock2020-cx,
  title    = "{CARD} 2020: antibiotic resistome surveillance with the
              comprehensive antibiotic resistance database",
  author   = "Alcock, Brian P and Raphenya, Amogelang R and Lau, Tammy T Y and
              Tsang, Kara K and Bouchard, M{\'e}gane and Edalatmand, Arman and
              Huynh, William and Nguyen, Anna-Lisa V and Cheng, Annie A and
              Liu, Sihan and Min, Sally Y and Miroshnichenko, Anatoly and Tran,
              Hiu-Ki and Werfalli, Rafik E and Nasir, Jalees A and Oloni,
              Martins and Speicher, David J and Florescu, Alexandra and Singh,
              Bhavya and Faltyn, Mateusz and Hernandez-Koutoucheva, Anastasia
              and Sharma, Arjun N and Bordeleau, Emily and Pawlowski, Andrew C
              and Zubyk, Haley L and Dooley, Damion and Griffiths, Emma and
              Maguire, Finlay and Winsor, Geoff L and Beiko, Robert G and
              Brinkman, Fiona S L and Hsiao, William W L and Van Domselaar,
              Gary and McArthur, Andrew G",
  abstract = "The Comprehensive Antibiotic Resistance Database (CARD;
              https://card.mcmaster.ca) is a curated resource providing
              reference DNA and protein sequences, detection models and
              bioinformatics tools on the molecular basis of bacterial
              antimicrobial resistance (AMR). CARD focuses on providing
              high-quality reference data and molecular sequences within a
              controlled vocabulary, the Antibiotic Resistance Ontology (ARO),
              designed by the CARD biocuration team to integrate with software
              development efforts for resistome analysis and prediction, such
              as CARD's Resistance Gene Identifier (RGI) software. Since 2017,
              CARD has expanded through extensive curation of reference
              sequences, revision of the ontological structure, curation of
              over 500 new AMR detection models, development of a new
              classification paradigm and expansion of analytical tools. Most
              notably, a new Resistomes \& Variants module provides analysis
              and statistical summary of in silico predicted resistance
              variants from 82 pathogens and over 100 000 genomes. By adding
              these resistance variants to CARD, we are able to summarize
              predicted resistance using the information included in CARD,
              identify trends in AMR mobility and determine previously
              undescribed and novel resistance variants. Here, we describe
              updates and recent expansions to CARD and its biocuration
              process, including new resources for community biocuration of AMR
              molecular reference data.",
  journal  = "Nucleic Acids Res.",
  volume   =  48,
  number   = "D1",
  pages    = "D517--D525",
  month    =  jan,
  year     =  2020,
  language = "en"
}

% Mash: MinHash-based genome and metagenome distance estimation.
@ARTICLE{Ondov2016-cn,
  author = {Ondov, Brian D and Treangen, Todd J and Melsted, P{\'a}ll and
    Mallonee, Adam B and Bergman, Nicholas H and Koren, Sergey and
    Phillippy, Adam M},
  title = {Mash: fast genome and metagenome distance estimation using
    {MinHash}},
  journal = {Genome Biol.},
  year = {2016},
  month = jun,
  volume = {17},
  number = {1},
  pages = {132},
  language = {en},
  keywords = {Alignment; Comparative genomics; Genomic distance; Metagenomics;
    Nanopore; Sequencing},
  abstract = {Mash extends the MinHash dimensionality-reduction technique to
    include a pairwise mutation distance and P value significance
    test, enabling the efficient clustering and search of massive
    sequence collections. Mash reduces large sequences and sequence
    sets to small, representative sketches, from which global
    mutation distances can be rapidly estimated. We demonstrate
    several use cases, including the clustering of all 54,118 NCBI
    RefSeq genomes in 33 CPU h; real-time database search using
    assembled or unassembled Illumina, Pacific Biosciences, and
    Oxford Nanopore data; and the scalable clustering of hundreds of
    metagenomic samples by composition. Mash is freely released under
    a BSD license ( https://github.com/marbl/mash ).},
}

% eggNOG-mapper paper. The abstract is pure ASCII: the "approximately" sign
% before the BLAST speed-up is written as $\sim$ so classic 8-bit BibTeX
% toolchains can process the entry.
@ARTICLE{Huerta-Cepas2017-sn,
  title    = "Fast {Genome-Wide} Functional Annotation through Orthology
              Assignment by {eggNOG-Mapper}",
  author   = "Huerta-Cepas, Jaime and Forslund, Kristoffer and Coelho, Luis
              Pedro and Szklarczyk, Damian and Jensen, Lars Juhl and von
              Mering, Christian and Bork, Peer",
  abstract = "Orthology assignment is ideally suited for functional inference.
              However, because predicting orthology is computationally
              intensive at large scale, and most pipelines are relatively
              inaccessible (e.g., new assignments only available through
              database updates), less precise homology-based functional
              transfer is still the default for (meta-)genome annotation. We,
              therefore, developed eggNOG-mapper, a tool for functional
              annotation of large sets of sequences based on fast orthology
              assignments using precomputed clusters and phylogenies from the
              eggNOG database. To validate our method, we benchmarked Gene
              Ontology (GO) predictions against two widely used homology-based
              approaches: BLAST and InterProScan. Orthology filters applied to
              BLAST results reduced the rate of false positive assignments by
              11\%, and increased the ratio of experimentally validated terms
              recovered over all terms assigned per protein by 15\%. Compared
              with InterProScan, eggNOG-mapper achieved similar proteome
              coverage and precision while predicting, on average, 41 more
              terms per protein and increasing the rate of experimentally
              validated terms recovered over total term assignments per protein
              by 35\%. EggNOG-mapper predictions scored within the top-5
              methods in the three GO categories using the CAFA2 NK-partial
              benchmark. Finally, we evaluated eggNOG-mapper for functional
              annotation of metagenomics data, yielding better performance than
              interProScan. eggNOG-mapper runs $\sim$15$\times$ faster than
              BLAST and at least 2.5$\times$ faster than InterProScan. The tool
              is available standalone and as an online service at
              http://eggnog-mapper.embl.de.",
  journal  = "Mol. Biol. Evol.",
  volume   =  34,
  number   =  8,
  pages    = "2115--2122",
  month    =  aug,
  year     =  2017,
  keywords = "comparative genomics; functional annotation; gene function;
              genomics; metagenomics; orthology",
  language = "en"
}

% Pilon: automated draft-assembly correction and variant calling.
@ARTICLE{Walker2014-lr,
  author = {Walker, Bruce J and Abeel, Thomas and Shea, Terrance and Priest,
    Margaret and Abouelliel, Amr and Sakthikumar, Sharadha and
    Cuomo, Christina A and Zeng, Qiandong and Wortman, Jennifer and
    Young, Sarah K and Earl, Ashlee M},
  title = {Pilon: An Integrated Tool for Comprehensive Microbial Variant
    Detection and Genome Assembly Improvement},
  journal = {PLoS One},
  publisher = {Public Library of Science},
  year = {2014},
  month = nov,
  volume = {9},
  number = {11},
  pages = {e112963},
  abstract = {Advances in modern sequencing technologies allow us to generate
    sufficient data to analyze hundreds of bacterial genomes from a
    single machine in a single day. This potential for sequencing
    massive numbers of genomes calls for fully automated methods to
    produce high-quality assemblies and variant calls. We introduce
    Pilon, a fully automated, all-in-one tool for correcting draft
    assemblies and calling sequence variants of multiple sizes,
    including very large insertions and deletions. Pilon works with
    many types of sequence data, but is particularly strong when
    supplied with paired end data from two Illumina libraries with
    small e.g., 180 bp and large e.g., 3--5 Kb inserts. Pilon
    significantly improves draft genome assemblies by correcting
    bases, fixing mis-assemblies and filling gaps. For both haploid
    and diploid genomes, Pilon produces more contiguous genomes with
    fewer errors, enabling identification of more biologically
    relevant genes. Furthermore, Pilon identifies small variants
    with high accuracy as compared to state-of-the-art tools and is
    unique in its ability to accurately identify large sequence
    variants including duplications and resolve large insertions.
    Pilon is being used to improve the assemblies of thousands of
    new genomes and to identify variants from thousands of
    clinically relevant bacterial strains. Pilon is freely available
    as open source software.},
}

% FLASH (Magoc and Salzberg, 2011, Bioinformatics).
% The relational operators in the abstract are set in math mode: a bare
% "<" or ">" in text mode prints as an inverted punctuation mark under the
% default OT1 font encoding.
@ARTICLE{Magoc2011-jn,
  title    = "{FLASH}: fast length adjustment of short reads to improve genome
              assemblies",
  author   = "Mago{\v c}, Tanja and Salzberg, Steven L",
  abstract = "MOTIVATION: Next-generation sequencing technologies generate very
              large numbers of short reads. Even with very deep genome
              coverage, short read lengths cause problems in de novo
              assemblies. The use of paired-end libraries with a fragment size
              shorter than twice the read length provides an opportunity to
              generate much longer reads by overlapping and merging read pairs
              before assembling a genome. RESULTS: We present FLASH, a fast
              computational tool to extend the length of short reads by
              overlapping paired-end reads from fragment libraries that are
              sufficiently short. We tested the correctness of the tool on one
              million simulated read pairs, and we then applied it as a
              pre-processor for genome assemblies of Illumina reads from the
              bacterium Staphylococcus aureus and human chromosome 14. FLASH
              correctly extended and merged reads $>$99\% of the time on
              simulated reads with an error rate of $<$1\%. With adequately set
              parameters, FLASH correctly merged reads over 90\% of the time
              even when the reads contained up to 5\% errors. When FLASH was
              used to extend reads prior to assembly, the resulting assemblies
              had substantially greater N50 lengths for both contigs and
              scaffolds. AVAILABILITY AND IMPLEMENTATION: The FLASH system is
              implemented in C and is freely available as open-source code at
              http://www.cbcb.umd.edu/software/flash. CONTACT:
              t.magoc@gmail.com.",
  journal  = "Bioinformatics",
  volume   =  27,
  number   =  21,
  pages    = "2957--2963",
  month    =  nov,
  year     =  2011,
  keywords = "PubMed S. aureus 2000-15/2011"
}

% SignalP 4.0 (Petersen et al., 2011, Nat. Methods): discriminating signal
% peptides from transmembrane regions. No abstract is recorded for this entry.
@ARTICLE{Petersen2011-nh,
  title    = "{SignalP} 4.0: discriminating signal peptides from transmembrane
              regions",
  author   = "Petersen, Thomas Nordahl and Brunak, S{\o}ren and von Heijne,
              Gunnar and Nielsen, Henrik",
  journal  = "Nat. Methods",
  volume   =  8,
  number   =  10,
  pages    = "785--786",
  month    =  sep,
  year     =  2011,
  language = "en"
}

% MinCED (Skennerton, undated): software citation for a GitHub repository.
% `howpublished` and `url` give the citation a printable locator, because
% standard styles ignore `institution` on a misc entry. The URL is taken from
% the repository path quoted in the abstract. No release year is recorded.
@MISC{Skennerton_undated-gp,
  title        = "{MinCED}: Mining {CRISPRs} in Environmental Datasets",
  author       = "Skennerton, Connor T",
  abstract     = "Mining CRISPRs in Environmental Datasets. Contribute to
                  ctSkennerton/minced development by creating an account on
                  GitHub.",
  howpublished = "GitHub repository",
  url          = "https://github.com/ctSkennerton/minced",
  institution  = "Github"
}

% ModelFinder (Kalyaanamoorthy et al., 2017, Nat. Methods): fast model
% selection for phylogenetic estimates.
@ARTICLE{Kalyaanamoorthy2017-ea,
  title    = "{ModelFinder}: fast model selection for accurate phylogenetic
              estimates",
  author   = "Kalyaanamoorthy, Subha and Minh, Bui Quang and Wong, Thomas K F
              and von Haeseler, Arndt and Jermiin, Lars S",
  abstract = "Model-based molecular phylogenetics plays an important role in
              comparisons of genomic data, and model selection is a key step in
              all such analyses. We present ModelFinder, a fast model-selection
              method that greatly improves the accuracy of phylogenetic
              estimates by incorporating a model of rate heterogeneity across
              sites not previously considered in this context and by allowing
              concurrent searches of model space and tree space.",
  journal  = "Nat. Methods",
  volume   =  14,
  number   =  6,
  pages    = "587--589",
  month    =  jun,
  year     =  2017,
  language = "en"
}

% FastTree 2 (Price, Dehal and Arkin, 2010, PLoS One).
% The abstract is a structured abstract; each section heading is followed by
% a colon so it does not run into the sentence that follows it.
@ARTICLE{Price2010-bv,
  title     = "{FastTree} 2 -- Approximately {Maximum-Likelihood} Trees for
               Large Alignments",
  author    = "Price, Morgan N. and Dehal, Paramvir S. and Arkin, Adam P.",
  abstract  = "Background: We recently described FastTree, a tool for inferring
               phylogenies for alignments with up to hundreds of thousands of
               sequences. Here, we describe improvements to FastTree that
               improve its accuracy without sacrificing scalability.
               Methodology/Principal Findings: Where FastTree 1 used
               nearest-neighbor interchanges (NNIs) and the minimum-evolution
               criterion to improve the tree, FastTree 2 adds minimum-evolution
               subtree-pruning-regrafting (SPRs) and maximum-likelihood NNIs.
               FastTree 2 uses heuristics to restrict the search for better
               trees and estimates a rate of evolution for each site (the
               ``CAT'' approximation). Nevertheless, for both simulated and
               genuine alignments, FastTree 2 is slightly more accurate than a
               standard implementation of maximum-likelihood NNIs (PhyML 3 with
               default settings). Although FastTree 2 is not quite as accurate
               as methods that use maximum-likelihood SPRs, most of the splits
               that disagree are poorly supported, and for large alignments,
               FastTree 2 is 100--1,000 times faster. FastTree 2 inferred a
               topology and likelihood-based local support values for 237,882
               distinct 16S ribosomal RNAs on a desktop computer in 22 hours
               and 5.8 gigabytes of memory. Conclusions/Significance: FastTree 2
               allows the inference of maximum-likelihood phylogenies for huge
               alignments. FastTree 2 is freely available at
               http://www.microbesonline.org/fasttree .",
  journal   = "PLoS One",
  publisher = "Public Library of Science",
  volume    =  5,
  number    =  3,
  pages     = "e9490",
  month     =  mar,
  year      =  2010,
  keywords  = "00-Ungrouped"
}

% ResFinder (Zankari et al., 2012, J. Antimicrob. Chemother.): web-based,
% BLAST-driven identification of acquired antimicrobial resistance genes.
@ARTICLE{Zankari2012-jn,
  title    = "Identification of acquired antimicrobial resistance genes",
  author   = "Zankari, Ea and Hasman, Henrik and Cosentino, Salvatore and
              Vestergaard, Martin and Rasmussen, Simon and Lund, Ole and
              Aarestrup, Frank M and Larsen, Mette Voldby",
  abstract = "OBJECTIVES: Identification of antimicrobial resistance genes is
              important for understanding the underlying mechanisms and the
              epidemiology of antimicrobial resistance. As the costs of
              whole-genome sequencing (WGS) continue to decline, it becomes
              increasingly available in routine diagnostic laboratories and is
              anticipated to substitute traditional methods for resistance gene
              identification. Thus, the current challenge is to extract the
              relevant information from the large amount of generated data.
              METHODS: We developed a web-based method, ResFinder that uses
              BLAST for identification of acquired antimicrobial resistance
              genes in whole-genome data. As input, the method can use both
              pre-assembled, complete or partial genomes, and short sequence
              reads from four different sequencing platforms. The method was
              evaluated on 1862 GenBank files containing 1411 different
              resistance genes, as well as on 23 de-novo-sequenced isolates.
              RESULTS: When testing the 1862 GenBank files, the method
              identified the resistance genes with an ID = 100\% (100\%
              identity) to the genes in ResFinder. Agreement between in silico
              predictions and phenotypic testing was found when the method was
              further tested on 23 isolates of five different bacterial
              species, with available phenotypes. Furthermore, ResFinder was
              evaluated on WGS chromosomes and plasmids of 30 isolates. Seven
              of these isolates were annotated to have antimicrobial
              resistance, and in all cases, annotations were compatible with
              the ResFinder results. CONCLUSIONS: A web server providing a
              convenient way of identifying acquired antimicrobial resistance
              genes in completely sequenced isolates was created. ResFinder can
              be accessed at www.genomicepidemiology.org. ResFinder will
              continuously be updated as new resistance genes are identified.",
  journal  = "J. Antimicrob. Chemother.",
  volume   =  67,
  number   =  11,
  pages    = "2640--2644",
  month    =  nov,
  year     =  2012,
  language = "en"
}

% Minimap2 (Li, 2018, Bioinformatics).
% The approximation signs in the abstract are written as $\sim$ so that the
% entry is plain ASCII, matching the $\geq$ already used in the same text.
@ARTICLE{Li2018-be,
  title    = "Minimap2: pairwise alignment for nucleotide sequences",
  author   = "Li, Heng",
  abstract = "Motivation: Recent advances in sequencing technologies promise
              ultra-long reads of $\sim$100 kb in average, full-length mRNA or cDNA
              reads in high throughput and genomic contigs over 100 Mb in
              length. Existing alignment programs are unable or inefficient to
              process such data at scale, which presses for the development of
              new alignment algorithms. Results: Minimap2 is a general-purpose
              alignment program to map DNA or long mRNA sequences against a
              large reference database. It works with accurate short reads of
              $\geq$100 bp in length, $\geq$1 kb genomic reads at error rate
              $\sim$15\%, full-length noisy Direct RNA or cDNA reads and assembly
              contigs or closely related full chromosomes of hundreds of
              megabases in length. Minimap2 does split-read alignment, employs
              concave gap cost for long insertions and deletions and introduces
              new heuristics to reduce spurious alignments. It is 3--4 times as
              fast as mainstream short-read mappers at comparable accuracy, and
              is $\geq$30 times faster than long-read genomic or cDNA mappers
              at higher accuracy, surpassing most aligners specialized in one
              type of alignment. Availability and implementation:
              https://github.com/lh3/minimap2. Supplementary information:
              Supplementary data are available at Bioinformatics online.",
  journal  = "Bioinformatics",
  volume   =  34,
  number   =  18,
  pages    = "3094--3100",
  month    =  sep,
  year     =  2018,
  language = "en"
}

% Infernal 1.1 (Nawrocki and Eddy, 2013, Bioinformatics).
% The approximation signs in the abstract are written as $\sim$ so that the
% entry is plain ASCII.
@ARTICLE{Nawrocki2013-ht,
  title    = "Infernal 1.1: 100-fold faster {RNA} homology searches",
  author   = "Nawrocki, Eric P and Eddy, Sean R",
  abstract = "SUMMARY: Infernal builds probabilistic profiles of the sequence
              and secondary structure of an RNA family called covariance models
              (CMs) from structurally annotated multiple sequence alignments
              given as input. Infernal uses CMs to search for new family
              members in sequence databases and to create potentially large
              multiple sequence alignments. Version 1.1 of Infernal introduces
              a new filter pipeline for RNA homology search based on
              accelerated profile hidden Markov model (HMM) methods and
              HMM-banded CM alignment methods. This enables $\sim$100-fold
              acceleration over the previous version and $\sim$10 000-fold
              acceleration over exhaustive non-filtered CM searches.
              AVAILABILITY: Source code, documentation and the benchmark are
              downloadable from http://infernal.janelia.org. Infernal is freely
              licensed under the GNU GPLv3 and should be portable to any
              POSIX-compliant operating system, including Linux and Mac OS/X.
              Documentation includes a user's guide with a tutorial, a
              discussion of file formats and user options and additional
              details on methods implemented in the software. CONTACT:
              nawrockie@janelia.hhmi.org",
  journal  = "Bioinformatics",
  volume   =  29,
  number   =  22,
  pages    = "2933--2935",
  month    =  nov,
  year     =  2013,
  language = "en"
}

% PLSDB (Galata et al., 2019, Nucleic Acids Res.): resource of complete
% bacterial plasmids. The `number` value is non-numeric ("D1"), so it is
% quoted rather than written bare like the numeric fields.
@ARTICLE{Galata2019-wt,
  title    = "{PLSDB}: a resource of complete bacterial plasmids",
  author   = "Galata, Valentina and Fehlmann, Tobias and Backes, Christina and
              Keller, Andreas",
  abstract = "The study of bacterial isolates or communities requires the
              analysis of the therein included plasmids in order to provide an
              extensive characterization of the organisms. Plasmids harboring
              resistance and virulence factors are of especial interest as they
              contribute to the dissemination of antibiotic resistance. As the
              number of newly sequenced bacterial genomes is growing a
              comprehensive resource is required which will allow to browse and
              filter the available plasmids, and to perform sequence analyses.
              Here, we present PLSDB, a resource containing 13 789 plasmid
              records collected from the NCBI nucleotide database. The web
              server provides an interactive view of all obtained plasmids with
              additional meta information such as sequence characteristics,
              sample-related information and taxonomy. Moreover, nucleotide
              sequence data can be uploaded to search for short nucleotide
              sequences (e.g. specific genes) in the plasmids, to compare a
              given plasmid to the records in the collection or to determine
              whether a sample contains one or multiple of the known plasmids
              (containment analysis). The resource is freely accessible under
              https://ccb-microbe.cs.uni-saarland.de/plsdb/.",
  journal  = "Nucleic Acids Res.",
  volume   =  47,
  number   = "D1",
  pages    = "D195--D202",
  month    =  jan,
  year     =  2019,
  language = "en"
}

% AMRFinder (Feldgarden et al., 2019, Antimicrob. Agents Chemother.).
% NOTE(review): no volume, number or pages are recorded for this entry;
% confirm against the published record.
@ARTICLE{Feldgarden2019-il,
  title    = "Validating the {NCBI} {AMRFinder} Tool and Resistance Gene
              Database Using Antimicrobial Resistance {Genotype-Phenotype}
              Correlations in a Collection of {NARMS} Isolates",
  author   = "Feldgarden, Michael and Brover, Vyacheslav and Haft, Daniel H and
              Prasad, Arjun B and Slotta, Douglas J and Tolstoy, Igor and
              Tyson, Gregory H and Zhao, Shaohua and Hsu, Chih-Hao and
              McDermott, Patrick F and Tadesse, Daniel A and Morales, Cesar and
              Simmons, Mustafa and Tillman, Glenn and Wasilenko, Jamie and
              Folster, Jason P and Klimke, William",
  abstract = "Antimicrobial resistance (AMR) is a major public health problem
              that requires publicly available tools for rapid analysis. To
              identify AMR genes in whole genome sequences, the National Center
              for Biotechnology Information (NCBI) has produced AMRFinder, a
              tool that identifies AMR genes using a high-quality curated AMR
              gene reference database. The Bacterial Antimicrobial Resistance
              Reference Gene Database consists of up-to-date gene nomenclature,
              a set of hidden Markov models (HMMs), and a curated protein
              family hierarchy. Currently, it contains 4,579 antimicrobial
              resistance proteins and more than 560 HMMs. Here, we describe
              AMRFinder and its associated database. To assess the predictive
              ability of AMRFinder, we measured the consistency between
              predicted AMR genotypes from AMRFinder and resistance phenotypes
              of 6,242 isolates from the National Antimicrobial Resistance
              Monitoring System (NARMS). This included 5,425 Salmonella
              enterica, 770 Campylobacter spp., and 47 Escherichia coli
              phenotypically tested against various antimicrobial agents. Of
              87,679 susceptibility tests performed, 98.4\% were consistent
              with predictions. To assess the accuracy of AMRFinder, we compared
              its gene symbol output with that of a 2017 version of ResFinder,
              another publicly available resistance gene detection system. Most
              gene calls were identical, but there were 1,229 gene symbol
              differences (8.8\%) between them, with differences due to both
              algorithmic differences and database composition. AMRFinder
              missed 16 loci that Resfinder found, while Resfinder missed 216
              loci AMRFinder identified. Based on these results, AMRFinder
              appears to be a highly accurate AMR gene detection system.",
  journal  = "Antimicrob. Agents Chemother.",
  month    =  aug,
  year     =  2019,
  language = "en"
}

% Velvet (Zerbino and Birney, 2008, Genome Res.).
% "Bruijn" is brace-protected in the title so that sentence-casing styles do
% not lower-case the proper noun; the approximation sign in the abstract is
% written as $\sim$ so that the entry is plain ASCII.
@ARTICLE{Zerbino2008-yg,
  title    = "Velvet: Algorithms for de novo short read assembly using de
              {Bruijn} graphs",
  author   = "Zerbino, Daniel R and Birney, Ewan",
  abstract = "We have developed a new set of algorithms, collectively called
              ``Velvet,'' to manipulate de Bruijn graphs for genomic sequence
              assembly. A de Bruijn graph is a compact representation based on
              short words (k-mers) that is ideal for high coverage, very short
              read (25--50 bp) data sets. Applying Velvet to very short reads
              and paired-ends information only, one can produce contigs of
              significant length, up to 50-kb N50 length in simulations of
              prokaryotic data and 3-kb N50 on simulated mammalian BACs. When
              applied to real Solexa data sets without read pairs, Velvet
              generated contigs of $\sim$8 kb in a prokaryote and 2 kb in a
              mammalian BAC, in close agreement with our simulated results
              without read-pair information. Velvet represents a new approach
              to assembly that can leverage very short reads in combination
              with read pairs to produce useful assemblies.",
  journal  = "Genome Res.",
  volume   =  18,
  number   =  5,
  pages    = "821--829",
  month    =  may,
  year     =  2008
}

% GTDB-Tk (Chaumeil et al., 2019, Bioinformatics).
% The database name in the title is brace-protected so that sentence-casing
% styles keep its capitals.
% NOTE(review): no volume, number or pages are recorded for this entry;
% confirm against the published record.
@ARTICLE{Chaumeil2019-tf,
  title    = "{GTDB-Tk}: a toolkit to classify genomes with the {Genome
              Taxonomy Database}",
  author   = "Chaumeil, Pierre-Alain and Mussig, Aaron J and Hugenholtz, Philip
              and Parks, Donovan H",
  abstract = "SUMMARY: The GTDB Toolkit (GTDB-Tk) provides objective taxonomic
              assignments for bacterial and archaeal genomes based on the
              Genome Taxonomy Database (GTDB). GTDB-Tk is computationally
              efficient and able to classify thousands of draft genomes in
              parallel. Here we demonstrate the accuracy of the GTDB-Tk
              taxonomic assignments by evaluating its performance on a
              phylogenetically diverse set of 10,156 bacterial and archaeal
              metagenome-assembled genomes. AVAILABILITY: GTDB-Tk is
              implemented in Python and licensed under the GNU General Public
              License v3.0. Source code and documentation are available at:
              https://github.com/ecogenomics/gtdbtk. SUPPLEMENTARY INFORMATION:
              Supplementary data are available at Bioinformatics online.",
  journal  = "Bioinformatics",
  month    =  nov,
  year     =  2019,
  language = "en"
}

% fastq-dl (Petit, undated): software citation for a GitHub repository.
% The author uses the "Last, Jr, First" name form so that "III" is parsed as
% a suffix. `howpublished` and `url` give the citation a printable locator,
% because standard styles ignore `institution` on a misc entry. The URL is
% taken from the repository path quoted in the abstract.
@MISC{Petit_undated-mv,
  title        = "fastq-dl -- Download {FASTQ} files from {SRA} or {ENA}
                  repositories",
  author       = "Petit, III, Robert A",
  abstract     = "Download FASTQ files from SRA or ENA repositories. -
                  rpetit3/fastq-dl",
  howpublished = "GitHub repository",
  url          = "https://github.com/rpetit3/fastq-dl",
  institution  = "Github"
}

% PIRATE (Bayliss et al., 2019, Gigascience): pangenomics toolbox for
% clustering diverged orthologues in bacteria.
% NOTE(review): no `pages` field is recorded; confirm whether the journal
% assigns an article number that should be added.
@ARTICLE{Bayliss2019-cv,
  title    = "{PIRATE}: A fast and scalable pangenomics toolbox for clustering
              diverged orthologues in bacteria",
  author   = "Bayliss, Sion C and Thorpe, Harry A and Coyle, Nicola M and
              Sheppard, Samuel K and Feil, Edward J",
  abstract = "BACKGROUND: Cataloguing the distribution of genes within natural
              bacterial populations is essential for understanding evolutionary
              processes and the genetic basis of adaptation. Advances in whole
              genome sequencing technologies have led to a vast expansion in
              the amount of bacterial genomes deposited in public databases.
              There is a pressing need for software solutions which are able to
              cluster, catalogue and characterise genes, or other features, in
              increasingly large genomic datasets. RESULTS: Here we present a
              pangenomics toolbox, PIRATE (Pangenome Iterative Refinement and
              Threshold Evaluation), which identifies and classifies
              orthologous gene families in bacterial pangenomes over a wide
              range of sequence similarity thresholds. PIRATE builds upon
              recent scalable software developments to allow for the rapid
              interrogation of thousands of isolates. PIRATE clusters genes (or
              other annotated features) over a wide range of amino acid or
              nucleotide identity thresholds and uses the clustering
              information to rapidly identify paralogous gene families and
              putative fission/fusion events. Furthermore, PIRATE orders the
              pangenome using a directed graph, provides a measure of allelic
              variation, and estimates sequence divergence for each gene
              family. CONCLUSIONS: We demonstrate that PIRATE scales linearly
              with both number of samples and computation resources, allowing
              for analysis of large genomic datasets, and compares favorably to
              other popular tools. PIRATE provides a robust framework for
              analysing bacterial pangenomes, from largely clonal to panmictic
              species.",
  journal  = "Gigascience",
  volume   =  8,
  number   =  10,
  month    =  oct,
  year     =  2019,
  keywords = "bioinformatics; microbial genomics; next-generation sequencing;
              pangenomics",
  language = "en"
}

% Mash Screen (Ondov et al., 2019, Genome Biol.).
% The tool name in the title is brace-protected so that sentence-casing
% styles do not turn it into "Mash screen".
@ARTICLE{Ondov2019-dw,
  title    = "{Mash Screen}: high-throughput sequence containment estimation
              for genome discovery",
  author   = "Ondov, Brian D and Starrett, Gabriel J and Sappington, Anna and
              Kostic, Aleksandra and Koren, Sergey and Buck, Christopher B and
              Phillippy, Adam M",
  abstract = "The MinHash algorithm has proven effective for rapidly estimating
              the resemblance of two genomes or metagenomes. However, this
              method cannot reliably estimate the containment of a genome
              within a metagenome. Here, we describe an online algorithm
              capable of measuring the containment of genomes and proteomes
              within either assembled or unassembled sequencing read sets. We
              describe several use cases, including contamination screening and
              retrospective analysis of metagenomes for novel genome discovery.
              Using this tool, we provide containment estimates for every NCBI
              RefSeq genome within every SRA metagenome and demonstrate the
              identification of a novel polyomavirus species from a public
              metagenome.",
  journal  = "Genome Biol.",
  volume   =  20,
  number   =  1,
  pages    = "232",
  month    =  nov,
  year     =  2019,
  keywords = "Metagenomics; MinHash; Polyomavirus; SRA; Sequencing; Viral
              Discovery",
  language = "en"
}

% SKESA (Souvorov, Agarwala and Lipman, 2018, Genome Biol.): de novo
% assembler for Illumina reads of microbial genomes.
% NOTE(review): `pages` holds a single number (153), presumably an article
% number rather than a page range; confirm.
@ARTICLE{Souvorov2018-tg,
  title    = "{SKESA}: strategic k-mer extension for scrupulous assemblies",
  author   = "Souvorov, Alexandre and Agarwala, Richa and Lipman, David J",
  abstract = "SKESA is a DeBruijn graph-based de-novo assembler designed for
              assembling reads of microbial genomes sequenced using Illumina.
              Comparison with SPAdes and MegaHit shows that SKESA produces
              assemblies that have high sequence quality and contiguity,
              handles low-level contamination in reads, is fast, and produces
              an identical assembly for the same input when assembled multiple
              times with the same or different compute resources. SKESA has
              been used for assembling over 272,000 read sets in the Sequence
              Read Archive at NCBI and for real-time pathogen detection. Source
              code for SKESA is freely available at
              https://github.com/ncbi/SKESA/releases .",
  journal  = "Genome Biol.",
  volume   =  19,
  number   =  1,
  pages    = "153",
  month    =  oct,
  year     =  2018,
  keywords = "Contamination; De-novo assembly; DeBruijn graphs; Illumina reads;
              Sequence quality",
  language = "en"
}

% VFDB 2016 (Chen et al., 2016, Nucleic Acids Res.).
% The page range is written in full (D694--D697) rather than abbreviated, and
% the unspaced dashes between words are typed as em-dashes (---), since "--"
% sets an en-dash, which is meant for numeric ranges.
% The `number` value is non-numeric ("D1"), so it is quoted.
@ARTICLE{Chen2016-sb,
  title    = "{VFDB} 2016: hierarchical and refined dataset for big data
              analysis---10 years on",
  author   = "Chen, Lihong and Zheng, Dandan and Liu, Bo and Yang, Jian and
              Jin, Qi",
  abstract = "The virulence factor database (VFDB, http://www.mgc.ac.cn/VFs/)
              is dedicated to providing up-to-date knowledge of virulence
              factors (VFs) of various bacterial pathogens. Since its inception
              the VFDB has served as a comprehensive repository of bacterial
              VFs for over a decade. The exponential growth in the amount of
              biological data is challenging to the current database in regard
              to big data analysis. We recently improved two aspects of the
              infrastructural dataset of VFDB: (i) removed the redundancy
              introduced by previous releases and generated two hierarchical
              datasets---one core dataset of experimentally verified VFs only
              and another full dataset including all known and predicted VFs
              and (ii) refined the gene annotation of the core dataset with
              controlled vocabularies. Our efforts enhanced the data quality of
              the VFDB and promoted the usability of the database in the big
              data era for the bioinformatic mining of the explosively growing
              data regarding bacterial VFs.",
  journal  = "Nucleic Acids Res.",
  volume   =  44,
  number   = "D1",
  pages    = "D694--D697",
  month    =  jan,
  year     =  2016,
  language = "en"
}

@ARTICLE{Turner2018-zv,
  title    = "Integrating long-range connectivity information into de Bruijn
              graphs",
  author   = "Turner, Isaac and Garimella, Kiran V and Iqbal, Zamin and McVean,
              Gil",
  abstract = "Motivation: The de Bruijn graph is a simple and efficient data
              structure that is used in many areas of sequence analysis
              including genome assembly, read error correction and variant