forked from bactopia/bactopia
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbactopia-datasets-software.bib
More file actions
2631 lines (2535 loc) · 141 KB
/
bactopia-datasets-software.bib
File metadata and controls
2631 lines (2535 loc) · 141 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% Generated by Paperpile. Check out https://paperpile.com for more information.
% BibTeX export options can be customized via Settings -> BibTeX.
% Barrnap ribosomal RNA predictor (software repository, undated).
% The repository URL is taken from the "tseemann/barrnap" slug in the abstract.
@misc{Seemann_undated-ra,
  author       = {Seemann, Torsten},
  title        = {Barrnap: Bacterial ribosomal {RNA} predictor},
  abstract     = {Bacterial ribosomal RNA predictor - tseemann/barrnap},
  howpublished = {GitHub},
  url          = {https://github.com/tseemann/barrnap},
}
% seqtk sequence-processing toolkit (software repository).
% NOTE(review): url assumed from the author and host -- confirm.
@misc{Li2012-wr,
  author       = {Li, Heng},
  title        = {{seqtk}: Toolkit for processing sequences in {FASTA/Q} formats},
  howpublished = {GitHub},
  url          = {https://github.com/lh3/seqtk},
  year         = {2012},
}
% Lighter: sequencing error correction with Bloom filters (Genome Biol., 2014).
@article{Song2014-hn,
  author   = {Song, Li and Florea, Liliana and Langmead, Ben},
  title    = {Lighter: fast and memory-efficient sequencing error correction
              without counting},
  abstract = {Lighter is a fast, memory-efficient tool for correcting
              sequencing errors. Lighter avoids counting k-mers. Instead, it
              uses a pair of Bloom filters, one holding a sample of the input
              k-mers and the other holding k-mers likely to be correct. As long
              as the sampling fraction is adjusted in inverse proportion to the
              depth of sequencing, Bloom filter size can be held constant while
              maintaining near-constant accuracy. Lighter is parallelized, uses
              no secondary storage, and is both faster and more
              memory-efficient than competing approaches while achieving
              comparable accuracy.},
  journal  = {Genome Biol.},
  volume   = {15},
  number   = {11},
  pages    = {509},
  year     = {2014},
  language = {en},
}
% Roary: pan genome construction for prokaryotes (Bioinformatics, 2015).
@article{Page2015-ob,
  author   = {Page, Andrew J and Cummins, Carla A and Hunt, Martin and Wong,
              Vanessa K and Reuter, Sandra and Holden, Matthew T G and Fookes,
              Maria and Falush, Daniel and Keane, Jacqueline A and Parkhill,
              Julian},
  title    = {Roary: rapid large-scale prokaryote pan genome analysis},
  abstract = {UNLABELLED: A typical prokaryote population sequencing study can
              now consist of hundreds or thousands of isolates. Interrogating
              these datasets can provide detailed insights into the genetic
              structure of prokaryotic genomes. We introduce Roary, a tool that
              rapidly builds large-scale pan genomes, identifying the core and
              accessory genes. Roary makes construction of the pan genome of
              thousands of prokaryote samples possible on a standard desktop
              without compromising on the accuracy of results. Using a single
              CPU Roary can produce a pan genome consisting of 1000 isolates in
              4.5 hours using 13 GB of RAM, with further speedups possible
              using multiple processors. AVAILABILITY AND IMPLEMENTATION: Roary
              is implemented in Perl and is freely available under an open
              source GPLv3 license from http://sanger-pathogens.github.io/Roary
              CONTACT: roary@sanger.ac.uk SUPPLEMENTARY INFORMATION:
              Supplementary data are available at Bioinformatics online.},
  journal  = {Bioinformatics},
  volume   = {31},
  number   = {22},
  pages    = {3691--3693},
  month    = nov,
  year     = {2015},
  language = {en},
}
% FastQC read quality-control tool, distributed by Babraham Bioinformatics.
% NOTE(review): the exported title ended with a stray 2012 while the year
% field says 2016 -- confirm which release date should be cited.
@misc{Andrews2016-kz,
  author       = {Andrews, S and Krueger, F and Segonds-Pichon, A and
                  Biggins, F and Wingett, S},
  title        = {{FastQC}: A Quality Control Tool for High Throughput
                  Sequence Data},
  howpublished = {Babraham Bioinformatics},
  year         = {2016},
}
% UFBoot2: ultrafast bootstrap approximation in IQ-TREE (Mol. Biol. Evol., 2018).
@article{Hoang2018-mu,
  author   = {Hoang, Diep Thi and Chernomor, Olga and von Haeseler, Arndt and
              Minh, Bui Quang and Vinh, Le Sy},
  title    = {{UFBoot2}: Improving the Ultrafast Bootstrap Approximation},
  abstract = {The standard bootstrap (SBS), despite being computationally
              intensive, is widely used in maximum likelihood phylogenetic
              analyses. We recently proposed the ultrafast bootstrap
              approximation (UFBoot) to reduce computing time while achieving
              more unbiased branch supports than SBS under mild model
              violations. UFBoot has been steadily adopted as an efficient
              alternative to SBS and other bootstrap approaches. Here, we
              present UFBoot2, which substantially accelerates UFBoot and
              reduces the risk of overestimating branch supports due to
              polytomies or severe model violations. Additionally, UFBoot2
              provides suitable bootstrap resampling strategies for
              phylogenomic data. UFBoot2 is 778 times (median) faster than SBS
              and 8.4 times (median) faster than RAxML rapid bootstrap on
              tested data sets. UFBoot2 is implemented in the IQ-TREE software
              package version 1.6 and freely available at
              http://www.iqtree.org.},
  journal  = {Mol. Biol. Evol.},
  volume   = {35},
  number   = {2},
  pages    = {518--522},
  month    = feb,
  year     = {2018},
  keywords = {maximum likelihood; model violation; phylogenetic inference;
              polytomies; ultrafast bootstrap},
  language = {en},
}
% DIAMOND: fast protein alignment by double indexing (Nat. Methods, 2015).
@article{Buchfink2015-yu,
  author   = {Buchfink, Benjamin and Xie, Chao and Huson, Daniel H},
  title    = {Fast and sensitive protein alignment using {DIAMOND}},
  abstract = {The alignment of sequencing reads against a protein reference
              database is a major computational bottleneck in metagenomics and
              data-intensive evolutionary projects. Although recent tools offer
              improved performance over the gold standard BLASTX, they exhibit
              only a modest speedup or low sensitivity. We introduce DIAMOND,
              an open-source algorithm based on double indexing that is 20,000
              times faster than BLASTX on short reads and has a similar degree
              of sensitivity.},
  journal  = {Nat. Methods},
  volume   = {12},
  number   = {1},
  pages    = {59--60},
  month    = jan,
  year     = {2015},
  language = {en},
}
% vcflib VCF parsing library (software repository, no listed author).
% The key field gives author-less entries something to sort and label by;
% the repository URL is taken from the "vcflib/vcflib" slug in the abstract.
@misc{noauthor_undated-dt,
  key          = {vcflib},
  title        = {Vcflib: A {C++} library for parsing and manipulating {VCF}
                  files},
  abstract     = {a simple C++ library for parsing and manipulating VCF files, +
                  many command-line utilities - vcflib/vcflib},
  howpublished = {GitHub},
  url          = {https://github.com/vcflib/vcflib},
}
% Nextflow workflow framework (Nat. Biotechnol., 2017).
% The fourth author's compound surname is "Prieto Barja"; the export had
% split it as surname "Barja" with given names "Pablo Prieto".
@article{Di_Tommaso2017-jf,
  author   = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and
              Prieto Barja, Pablo and Palumbo, Emilio and Notredame, Cedric},
  title    = {Nextflow enables reproducible computational workflows},
  journal  = {Nat. Biotechnol.},
  volume   = {35},
  number   = {4},
  pages    = {316--319},
  month    = apr,
  year     = {2017},
  language = {en},
}
% Mashtree: rapid whole-genome comparison (Journal of Open Source Software, 2019).
% Journal is abbreviated to match the other entries in this file; the
% placeholder abstract "Software archive" from the export was dropped.
@article{Katz2019-cp,
  author   = {Katz, Lee and Griswold, Taylor and Morrison, Shatavia and
              Caravas, Jason and Zhang, Shaokang and den Bakker, Henk and Deng,
              Xiangyu and Carleton, Heather},
  title    = {Mashtree: a rapid comparison of whole genome sequence files},
  journal  = {J. Open Source Softw.},
  volume   = {4},
  number   = {44},
  pages    = {1762},
  month    = dec,
  year     = {2019},
  doi      = {10.21105/joss.01762},
}
% ARAGORN: tRNA and tmRNA gene detection (Nucleic Acids Res., 2004).
@article{Laslett2004-li,
  author   = {Laslett, Dean and Canback, Bjorn},
  title    = {{ARAGORN}, a program to detect {tRNA} genes and {tmRNA} genes in
              nucleotide sequences},
  abstract = {A computer program, ARAGORN, identifies tRNA and tmRNA genes. The
              program employs heuristic algorithms to predict tRNA secondary
              structure, based on homology with recognized tRNA consensus
              sequences and ability to form a base-paired cloverleaf. tmRNA
              genes are identified using a modified version of the BRUCE
              program. ARAGORN achieves a detection sensitivity of 99\% from a
              set of 1290 eubacterial, eukaryotic and archaeal tRNA genes and
              detects all complete tmRNA sequences in the tmRNA database,
              improving on the performance of the BRUCE program. Recently
              discovered tmRNA genes in the chloroplasts of two species from
              the 'green' algae lineage are detected. The output of the program
              reports the proposed tRNA secondary structure and, for tmRNA
              genes, the secondary structure of the tRNA domain, the tmRNA gene
              sequence, the tag peptide and a list of organisms with matching
              tmRNA peptide tags.},
  journal  = {Nucleic Acids Res.},
  volume   = {32},
  number   = {1},
  pages    = {11--16},
  month    = jan,
  year     = {2004},
  language = {en},
}
% VSEARCH: open-source alternative to USEARCH (PeerJ, 2016).
@article{Rognes2016-qx,
  author   = {Rognes, Torbj{\o}rn and Flouri, Tom{\'a}{\v s} and Nichols, Ben
              and Quince, Christopher and Mah{\'e}, Fr{\'e}d{\'e}ric},
  title    = {{VSEARCH}: a versatile open source tool for metagenomics},
  abstract = {BACKGROUND: VSEARCH is an open source and free of charge
              multithreaded 64-bit tool for processing and preparing
              metagenomics, genomics and population genomics nucleotide
              sequence data. It is designed as an alternative to the widely
              used USEARCH tool (Edgar, 2010) for which the source code is not
              publicly available, algorithm details are only rudimentarily
              described, and only a memory-confined 32-bit version is freely
              available for academic use. METHODS: When searching nucleotide
              sequences, VSEARCH uses a fast heuristic based on words shared by
              the query and target sequences in order to quickly identify
              similar sequences, a similar strategy is probably used in
              USEARCH. VSEARCH then performs optimal global sequence alignment
              of the query against potential target sequences, using full
              dynamic programming instead of the seed-and-extend heuristic used
              by USEARCH. Pairwise alignments are computed in parallel using
              vectorisation and multiple threads. RESULTS: VSEARCH includes
              most commands for analysing nucleotide sequences available in
              USEARCH version 7 and several of those available in USEARCH
              version 8, including searching (exact or based on global
              alignment), clustering by similarity (using length pre-sorting,
              abundance pre-sorting or a user-defined order), chimera detection
              (reference-based or de novo), dereplication (full length or
              prefix), pairwise alignment, reverse complementation, sorting,
              and subsampling. VSEARCH also includes commands for FASTQ file
              processing, i.e., format detection, filtering, read quality
              statistics, and merging of paired reads. Furthermore, VSEARCH
              extends functionality with several new commands and improvements,
              including shuffling, rereplication, masking of low-complexity
              sequences with the well-known DUST algorithm, a choice among
              different similarity definitions, and FASTQ file format
              conversion. VSEARCH is here shown to be more accurate than
              USEARCH when performing searching, clustering, chimera detection
              and subsampling, while on a par with USEARCH for paired-ends read
              merging. VSEARCH is slower than USEARCH when performing
              clustering and chimera detection, but significantly faster when
              performing paired-end reads merging and dereplication. VSEARCH is
              available at https://github.com/torognes/vsearch under either the
              BSD 2-clause license or the GNU General Public License version
              3.0. DISCUSSION: VSEARCH has been shown to be a fast, accurate
              and full-fledged alternative to USEARCH. A free and open-source
              versatile tool for sequence analysis is now available to the
              metagenomics community.},
  journal  = {PeerJ},
  volume   = {4},
  pages    = {e2584},
  month    = oct,
  year     = {2016},
  keywords = {Alignment; Chimera detection; Clustering; Dereplication; Masking;
              Metagenomics; Parallellization; Searching; Sequences; Shuffling},
  language = {en},
}
% FreeBayes haplotype-based variant detector (arXiv preprint 1207.3907, 2012).
% Typed as misc rather than article: the entry has no journal, which is a
% required field for article entries; the arXiv identifiers are carried by
% the eprint fields.
@misc{Garrison2012-sy,
  author        = {Garrison, Erik and Marth, Gabor},
  title         = {Haplotype-based variant detection from short-read sequencing},
  abstract      = {The direct detection of haplotypes from short-read DNA
                   sequencing data requires changes to existing small-variant
                   detection methods. Here, we develop a Bayesian statistical
                   framework which is capable of modeling multiallelic loci in
                   sets of individuals with non-uniform copy number. We then
                   describe our implementation of this framework in a
                   haplotype-based variant detector, FreeBayes.},
  month         = jul,
  year          = {2012},
  archivePrefix = {arXiv},
  primaryClass  = {q-bio.GN},
  eprint        = {1207.3907},
}
% VirulenceFinder / real-time WGS typing of VTEC (J. Clin. Microbiol., 2014).
% The genus name in the title is brace-protected so sentence-casing styles
% do not lowercase it.
@article{Joensen2014-gi,
  author   = {Joensen, Katrine Grimstrup and Scheutz, Flemming and Lund, Ole
              and Hasman, Henrik and Kaas, Rolf S and Nielsen, Eva M and
              Aarestrup, Frank M},
  title    = {Real-time whole-genome sequencing for routine typing,
              surveillance, and outbreak detection of verotoxigenic
              {Escherichia} coli},
  abstract = {Fast and accurate identification and typing of pathogens are
              essential for effective surveillance and outbreak detection. The
              current routine procedure is based on a variety of techniques,
              making the procedure laborious, time-consuming, and expensive.
              With whole-genome sequencing (WGS) becoming cheaper, it has huge
              potential in both diagnostics and routine surveillance. The aim
              of this study was to perform a real-time evaluation of WGS for
              routine typing and surveillance of verocytotoxin-producing
              Escherichia coli (VTEC). In Denmark, the Statens Serum Institut
              (SSI) routinely receives all suspected VTEC isolates. During a
              7-week period in the fall of 2012, all incoming isolates were
              concurrently subjected to WGS using IonTorrent PGM. Real-time
              bioinformatics analysis was performed using web-tools
              (www.genomicepidemiology.org) for species determination,
              multilocus sequence type (MLST) typing, and determination of
              phylogenetic relationship, and a specific VirulenceFinder for
              detection of E. coli virulence genes was developed as part of
              this study. In total, 46 suspected VTEC isolates were
              characterized in parallel during the study. VirulenceFinder
              proved successful in detecting virulence genes included in
              routine typing, explicitly verocytotoxin 1 (vtx1), verocytotoxin
              2 (vtx2), and intimin (eae), and also detected additional
              virulence genes. VirulenceFinder is also a robust method for
              assigning verocytotoxin (vtx) subtypes. A real-time clustering of
              isolates in agreement with the epidemiology was established from
              WGS, enabling discrimination between sporadic and outbreak
              isolates. Overall, WGS typing produced results faster and at a
              lower cost than the current routine. Therefore, WGS typing is a
              superior alternative to conventional typing strategies. This
              approach may also be applied to typing and surveillance of other
              pathogens.},
  journal  = {J. Clin. Microbiol.},
  volume   = {52},
  number   = {5},
  pages    = {1501--1510},
  month    = may,
  year     = {2014},
  language = {en},
}
% Prodigal: prokaryotic gene prediction (BMC Bioinformatics, 2010).
@article{Hyatt2010-wx,
  author   = {Hyatt, Doug and Chen, Gwo-Liang and Locascio, Philip F and Land,
              Miriam L and Larimer, Frank W and Hauser, Loren J},
  title    = {Prodigal: prokaryotic gene recognition and translation initiation
              site identification},
  abstract = {BACKGROUND: The quality of automated gene prediction in microbial
              organisms has improved steadily over the past decade, but there
              is still room for improvement. Increasing the number of correct
              identifications, both of genes and of the translation initiation
              sites for each gene, and reducing the overall number of false
              positives, are all desirable goals. RESULTS: With our years of
              experience in manually curating genomes for the Joint Genome
              Institute, we developed a new gene prediction algorithm called
              Prodigal (PROkaryotic DYnamic programming Gene-finding
              ALgorithm). With Prodigal, we focused specifically on the three
              goals of improved gene structure prediction, improved translation
              initiation site recognition, and reduced false positives. We
              compared the results of Prodigal to existing gene-finding methods
              to demonstrate that it met each of these objectives. CONCLUSION:
              We built a fast, lightweight, open source gene prediction program
              called Prodigal http://compbio.ornl.gov/prodigal/. Prodigal
              achieved good results compared to existing methods, and we
              believe it will be a valuable asset to automated microbial
              annotation pipelines.},
  journal  = {BMC Bioinformatics},
  volume   = {11},
  pages    = {119},
  month    = mar,
  year     = {2010},
  language = {en},
}
% CARD 2020 antibiotic resistance database update (Nucleic Acids Res., 2020).
% The penultimate author's surname is "Van Domselaar"; the export had turned
% the particle into a trailing initial ("Domselaar, Gary V").
@article{Alcock2020-cx,
  author   = {Alcock, Brian P and Raphenya, Amogelang R and Lau, Tammy T Y and
              Tsang, Kara K and Bouchard, M{\'e}gane and Edalatmand, Arman and
              Huynh, William and Nguyen, Anna-Lisa V and Cheng, Annie A and
              Liu, Sihan and Min, Sally Y and Miroshnichenko, Anatoly and Tran,
              Hiu-Ki and Werfalli, Rafik E and Nasir, Jalees A and Oloni,
              Martins and Speicher, David J and Florescu, Alexandra and Singh,
              Bhavya and Faltyn, Mateusz and Hernandez-Koutoucheva, Anastasia
              and Sharma, Arjun N and Bordeleau, Emily and Pawlowski, Andrew C
              and Zubyk, Haley L and Dooley, Damion and Griffiths, Emma and
              Maguire, Finlay and Winsor, Geoff L and Beiko, Robert G and
              Brinkman, Fiona S L and Hsiao, William W L and Van Domselaar,
              Gary and McArthur, Andrew G},
  title    = {{CARD} 2020: antibiotic resistome surveillance with the
              comprehensive antibiotic resistance database},
  abstract = {The Comprehensive Antibiotic Resistance Database (CARD;
              https://card.mcmaster.ca) is a curated resource providing
              reference DNA and protein sequences, detection models and
              bioinformatics tools on the molecular basis of bacterial
              antimicrobial resistance (AMR). CARD focuses on providing
              high-quality reference data and molecular sequences within a
              controlled vocabulary, the Antibiotic Resistance Ontology (ARO),
              designed by the CARD biocuration team to integrate with software
              development efforts for resistome analysis and prediction, such
              as CARD's Resistance Gene Identifier (RGI) software. Since 2017,
              CARD has expanded through extensive curation of reference
              sequences, revision of the ontological structure, curation of
              over 500 new AMR detection models, development of a new
              classification paradigm and expansion of analytical tools. Most
              notably, a new Resistomes \& Variants module provides analysis
              and statistical summary of in silico predicted resistance
              variants from 82 pathogens and over 100 000 genomes. By adding
              these resistance variants to CARD, we are able to summarize
              predicted resistance using the information included in CARD,
              identify trends in AMR mobility and determine previously
              undescribed and novel resistance variants. Here, we describe
              updates and recent expansions to CARD and its biocuration
              process, including new resources for community biocuration of AMR
              molecular reference data.},
  journal  = {Nucleic Acids Res.},
  volume   = {48},
  number   = {D1},
  pages    = {D517--D525},
  month    = jan,
  year     = {2020},
  language = {en},
}
% Mash: MinHash-based genome distance estimation (Genome Biol., 2016).
@article{Ondov2016-cn,
  author   = {Ondov, Brian D and Treangen, Todd J and Melsted, P{\'a}ll and
              Mallonee, Adam B and Bergman, Nicholas H and Koren, Sergey and
              Phillippy, Adam M},
  title    = {Mash: fast genome and metagenome distance estimation using
              {MinHash}},
  abstract = {Mash extends the MinHash dimensionality-reduction technique to
              include a pairwise mutation distance and P value significance
              test, enabling the efficient clustering and search of massive
              sequence collections. Mash reduces large sequences and sequence
              sets to small, representative sketches, from which global
              mutation distances can be rapidly estimated. We demonstrate
              several use cases, including the clustering of all 54,118 NCBI
              RefSeq genomes in 33 CPU h; real-time database search using
              assembled or unassembled Illumina, Pacific Biosciences, and
              Oxford Nanopore data; and the scalable clustering of hundreds of
              metagenomic samples by composition. Mash is freely released under
              a BSD license ( https://github.com/marbl/mash ).},
  journal  = {Genome Biol.},
  volume   = {17},
  number   = {1},
  pages    = {132},
  month    = jun,
  year     = {2016},
  keywords = {Alignment; Comparative genomics; Genomic distance; Metagenomics;
              Nanopore; Sequencing},
  language = {en},
}
% eggNOG-mapper: orthology-based functional annotation (Mol. Biol. Evol., 2017).
% The export left a raw tilde operator (U+223C) in the abstract; it is now
% written as $\sim$ so the entry is pure ASCII.
@article{Huerta-Cepas2017-sn,
  author   = {Huerta-Cepas, Jaime and Forslund, Kristoffer and Coelho, Luis
              Pedro and Szklarczyk, Damian and Jensen, Lars Juhl and von
              Mering, Christian and Bork, Peer},
  title    = {Fast {Genome-Wide} Functional Annotation through Orthology
              Assignment by {eggNOG-Mapper}},
  abstract = {Orthology assignment is ideally suited for functional inference.
              However, because predicting orthology is computationally
              intensive at large scale, and most pipelines are relatively
              inaccessible (e.g., new assignments only available through
              database updates), less precise homology-based functional
              transfer is still the default for (meta-)genome annotation. We,
              therefore, developed eggNOG-mapper, a tool for functional
              annotation of large sets of sequences based on fast orthology
              assignments using precomputed clusters and phylogenies from the
              eggNOG database. To validate our method, we benchmarked Gene
              Ontology (GO) predictions against two widely used homology-based
              approaches: BLAST and InterProScan. Orthology filters applied to
              BLAST results reduced the rate of false positive assignments by
              11\%, and increased the ratio of experimentally validated terms
              recovered over all terms assigned per protein by 15\%. Compared
              with InterProScan, eggNOG-mapper achieved similar proteome
              coverage and precision while predicting, on average, 41 more
              terms per protein and increasing the rate of experimentally
              validated terms recovered over total term assignments per protein
              by 35\%. EggNOG-mapper predictions scored within the top-5
              methods in the three GO categories using the CAFA2 NK-partial
              benchmark. Finally, we evaluated eggNOG-mapper for functional
              annotation of metagenomics data, yielding better performance than
              interProScan. eggNOG-mapper runs $\sim$15$\times$ faster than BLAST
              and at least 2.5$\times$ faster than InterProScan. The tool is
              available standalone and as an online service at
              http://eggnog-mapper.embl.de.},
  journal  = {Mol. Biol. Evol.},
  volume   = {34},
  number   = {8},
  pages    = {2115--2122},
  month    = aug,
  year     = {2017},
  keywords = {comparative genomics; functional annotation; gene function;
              genomics; metagenomics; orthology},
  language = {en},
}
% Pilon: variant detection and assembly improvement (PLoS One, 2014).
@article{Walker2014-lr,
  author    = {Walker, Bruce J and Abeel, Thomas and Shea, Terrance and Priest,
               Margaret and Abouelliel, Amr and Sakthikumar, Sharadha and
               Cuomo, Christina A and Zeng, Qiandong and Wortman, Jennifer and
               Young, Sarah K and Earl, Ashlee M},
  title     = {Pilon: An Integrated Tool for Comprehensive Microbial Variant
               Detection and Genome Assembly Improvement},
  abstract  = {Advances in modern sequencing technologies allow us to generate
               sufficient data to analyze hundreds of bacterial genomes from a
               single machine in a single day. This potential for sequencing
               massive numbers of genomes calls for fully automated methods to
               produce high-quality assemblies and variant calls. We introduce
               Pilon, a fully automated, all-in-one tool for correcting draft
               assemblies and calling sequence variants of multiple sizes,
               including very large insertions and deletions. Pilon works with
               many types of sequence data, but is particularly strong when
               supplied with paired end data from two Illumina libraries with
               small e.g., 180 bp and large e.g., 3--5 Kb inserts. Pilon
               significantly improves draft genome assemblies by correcting
               bases, fixing mis-assemblies and filling gaps. For both haploid
               and diploid genomes, Pilon produces more contiguous genomes with
               fewer errors, enabling identification of more biologically
               relevant genes. Furthermore, Pilon identifies small variants
               with high accuracy as compared to state-of-the-art tools and is
               unique in its ability to accurately identify large sequence
               variants including duplications and resolve large insertions.
               Pilon is being used to improve the assemblies of thousands of
               new genomes and to identify variants from thousands of
               clinically relevant bacterial strains. Pilon is freely available
               as open source software.},
  journal   = {PLoS One},
  publisher = {Public Library of Science},
  volume    = {9},
  number    = {11},
  pages     = {e112963},
  month     = nov,
  year      = {2014},
}
% FLASH: merging overlapping paired-end reads (Bioinformatics, 2011).
@article{Magoc2011-jn,
  author   = {Mago{\v c}, Tanja and Salzberg, Steven L},
  title    = {{FLASH}: fast length adjustment of short reads to improve genome
              assemblies},
  abstract = {MOTIVATION: Next-generation sequencing technologies generate very
              large numbers of short reads. Even with very deep genome
              coverage, short read lengths cause problems in de novo
              assemblies. The use of paired-end libraries with a fragment size
              shorter than twice the read length provides an opportunity to
              generate much longer reads by overlapping and merging read pairs
              before assembling a genome. RESULTS: We present FLASH, a fast
              computational tool to extend the length of short reads by
              overlapping paired-end reads from fragment libraries that are
              sufficiently short. We tested the correctness of the tool on one
              million simulated read pairs, and we then applied it as a
              pre-processor for genome assemblies of Illumina reads from the
              bacterium Staphylococcus aureus and human chromosome 14. FLASH
              correctly extended and merged reads >99\% of the time on
              simulated reads with an error rate of <1\%. With adequately set
              parameters, FLASH correctly merged reads over 90\% of the time
              even when the reads contained up to 5\% errors. When FLASH was
              used to extend reads prior to assembly, the resulting assemblies
              had substantially greater N50 lengths for both contigs and
              scaffolds. AVAILABILITY AND IMPLEMENTATION: The FLASH system is
              implemented in C and is freely available as open-source code at
              http://www.cbcb.umd.edu/software/flash. CONTACT:
              t.magoc@gmail.com.},
  journal  = {Bioinformatics},
  volume   = {27},
  number   = {21},
  pages    = {2957--2963},
  month    = nov,
  year     = {2011},
  keywords = {PubMed S. aureus 2000-15/2011},
}
% SignalP 4.0: signal peptide prediction (Nat. Methods, 2011).
@article{Petersen2011-nh,
  author   = {Petersen, Thomas Nordahl and Brunak, S{\o}ren and von Heijne,
              Gunnar and Nielsen, Henrik},
  title    = {{SignalP} 4.0: discriminating signal peptides from transmembrane
              regions},
  journal  = {Nat. Methods},
  volume   = {8},
  number   = {10},
  pages    = {785--786},
  month    = sep,
  year     = {2011},
  language = {en},
}
% MinCED CRISPR finder (software repository, undated).
% The repository URL is taken from the "ctSkennerton/minced" slug in the
% abstract.
@misc{Skennerton_undated-gp,
  author       = {Skennerton, Connor T},
  title        = {{MinCED}: Mining {CRISPRs} in Environmental Datasets},
  abstract     = {Mining CRISPRs in Environmental Datasets. Contribute to
                  ctSkennerton/minced development by creating an account on
                  GitHub.},
  howpublished = {GitHub},
  url          = {https://github.com/ctSkennerton/minced},
}
% ModelFinder: phylogenetic model selection (Nat. Methods, 2017).
@article{Kalyaanamoorthy2017-ea,
  author   = {Kalyaanamoorthy, Subha and Minh, Bui Quang and Wong, Thomas K F
              and von Haeseler, Arndt and Jermiin, Lars S},
  title    = {{ModelFinder}: fast model selection for accurate phylogenetic
              estimates},
  abstract = {Model-based molecular phylogenetics plays an important role in
              comparisons of genomic data, and model selection is a key step in
              all such analyses. We present ModelFinder, a fast model-selection
              method that greatly improves the accuracy of phylogenetic
              estimates by incorporating a model of rate heterogeneity across
              sites not previously considered in this context and by allowing
              concurrent searches of model space and tree space.},
  journal  = {Nat. Methods},
  volume   = {14},
  number   = {6},
  pages    = {587--589},
  month    = jun,
  year     = {2017},
  language = {en},
}
% FastTree 2: approximate maximum-likelihood trees (PLoS One, 2010).
@article{Price2010-bv,
  author    = {Price, Morgan N. and Dehal, Paramvir S. and Arkin, Adam P.},
  title     = {{FastTree} 2 -- Approximately {Maximum-Likelihood} Trees for
               Large Alignments},
  abstract  = {Background We recently described FastTree, a tool for inferring
               phylogenies for alignments with up to hundreds of thousands of
               sequences. Here, we describe improvements to FastTree that
               improve its accuracy without sacrificing scalability.
               Methodology/Principal Findings Where FastTree 1 used
               nearest-neighbor interchanges (NNIs) and the minimum-evolution
               criterion to improve the tree, FastTree 2 adds minimum-evolution
               subtree-pruning-regrafting (SPRs) and maximum-likelihood NNIs.
               FastTree 2 uses heuristics to restrict the search for better
               trees and estimates a rate of evolution for each site (the
               ``CAT'' approximation). Nevertheless, for both simulated and
               genuine alignments, FastTree 2 is slightly more accurate than a
               standard implementation of maximum-likelihood NNIs (PhyML 3 with
               default settings). Although FastTree 2 is not quite as accurate
               as methods that use maximum-likelihood SPRs, most of the splits
               that disagree are poorly supported, and for large alignments,
               FastTree 2 is 100--1,000 times faster. FastTree 2 inferred a
               topology and likelihood-based local support values for 237,882
               distinct 16S ribosomal RNAs on a desktop computer in 22 hours
               and 5.8 gigabytes of memory. Conclusions/Significance FastTree 2
               allows the inference of maximum-likelihood phylogenies for huge
               alignments. FastTree 2 is freely available at
               http://www.microbesonline.org/fasttree .},
  journal   = {PLoS One},
  publisher = {Public Library of Science},
  volume    = {5},
  number    = {3},
  pages     = {e9490},
  month     = mar,
  year      = {2010},
  keywords  = {00-Ungrouped},
}
% ResFinder: web-based, BLAST-backed identification of acquired antimicrobial
% resistance genes in whole-genome data (J. Antimicrob. Chemother., 2012).
@article{Zankari2012-jn,
  author    = {Zankari, Ea and Hasman, Henrik and Cosentino, Salvatore and
               Vestergaard, Martin and Rasmussen, Simon and Lund, Ole and
               Aarestrup, Frank M and Larsen, Mette Voldby},
  title     = {Identification of acquired antimicrobial resistance genes},
  journal   = {J. Antimicrob. Chemother.},
  volume    = 67,
  number    = 11,
  pages     = {2640--2644},
  month     = nov,
  year      = 2012,
  language  = {en},
  abstract  = {OBJECTIVES: Identification of antimicrobial resistance genes is
               important for understanding the underlying mechanisms and the
               epidemiology of antimicrobial resistance. As the costs of
               whole-genome sequencing (WGS) continue to decline, it becomes
               increasingly available in routine diagnostic laboratories and is
               anticipated to substitute traditional methods for resistance gene
               identification. Thus, the current challenge is to extract the
               relevant information from the large amount of generated data.
               METHODS: We developed a web-based method, ResFinder that uses
               BLAST for identification of acquired antimicrobial resistance
               genes in whole-genome data. As input, the method can use both
               pre-assembled, complete or partial genomes, and short sequence
               reads from four different sequencing platforms. The method was
               evaluated on 1862 GenBank files containing 1411 different
               resistance genes, as well as on 23 de-novo-sequenced isolates.
               RESULTS: When testing the 1862 GenBank files, the method
               identified the resistance genes with an ID = 100\% (100\%
               identity) to the genes in ResFinder. Agreement between in silico
               predictions and phenotypic testing was found when the method was
               further tested on 23 isolates of five different bacterial
               species, with available phenotypes. Furthermore, ResFinder was
               evaluated on WGS chromosomes and plasmids of 30 isolates. Seven
               of these isolates were annotated to have antimicrobial
               resistance, and in all cases, annotations were compatible with
               the ResFinder results. CONCLUSIONS: A web server providing a
               convenient way of identifying acquired antimicrobial resistance
               genes in completely sequenced isolates was created. ResFinder can
               be accessed at www.genomicepidemiology.org. ResFinder will
               continuously be updated as new resistance genes are identified.}
}
% Minimap2: pairwise aligner for DNA and long mRNA sequences
% (Bioinformatics, 2018).
% The abstract is pure ASCII: approximate values are written as $\sim$,
% alongside the $\geq$ math escapes already used in the text.
@article{Li2018-be,
  author    = {Li, Heng},
  title     = {Minimap2: pairwise alignment for nucleotide sequences},
  journal   = {Bioinformatics},
  volume    = 34,
  number    = 18,
  pages     = {3094--3100},
  month     = sep,
  year      = 2018,
  language  = {en},
  abstract  = {Motivation: Recent advances in sequencing technologies promise
               ultra-long reads of $\sim$100 kb in average, full-length mRNA or
               cDNA reads in high throughput and genomic contigs over 100 Mb in
               length. Existing alignment programs are unable or inefficient to
               process such data at scale, which presses for the development of
               new alignment algorithms. Results: Minimap2 is a general-purpose
               alignment program to map DNA or long mRNA sequences against a
               large reference database. It works with accurate short reads of
               $\geq$100 bp in length, $\geq$1 kb genomic reads at error rate
               $\sim$15\%, full-length noisy Direct RNA or cDNA reads and
               assembly contigs or closely related full chromosomes of hundreds
               of megabases in length. Minimap2 does split-read alignment,
               employs concave gap cost for long insertions and deletions and
               introduces new heuristics to reduce spurious alignments. It is
               3-4 times as fast as mainstream short-read mappers at comparable
               accuracy, and is $\geq$30 times faster than long-read genomic or
               cDNA mappers at higher accuracy, surpassing most aligners
               specialized in one type of alignment. Availability and
               implementation: https://github.com/lh3/minimap2. Supplementary
               information: Supplementary data are available at Bioinformatics
               online.}
}
% Infernal 1.1: covariance-model based RNA homology search
% (Bioinformatics, 2013).
% The abstract is pure ASCII: approximate values are written as $\sim$.
@article{Nawrocki2013-ht,
  author    = {Nawrocki, Eric P and Eddy, Sean R},
  title     = {Infernal 1.1: 100-fold faster {RNA} homology searches},
  journal   = {Bioinformatics},
  volume    = 29,
  number    = 22,
  pages     = {2933--2935},
  month     = nov,
  year      = 2013,
  language  = {en},
  abstract  = {SUMMARY: Infernal builds probabilistic profiles of the sequence
               and secondary structure of an RNA family called covariance models
               (CMs) from structurally annotated multiple sequence alignments
               given as input. Infernal uses CMs to search for new family
               members in sequence databases and to create potentially large
               multiple sequence alignments. Version 1.1 of Infernal introduces
               a new filter pipeline for RNA homology search based on
               accelerated profile hidden Markov model (HMM) methods and
               HMM-banded CM alignment methods. This enables $\sim$100-fold
               acceleration over the previous version and $\sim$10 000-fold
               acceleration over exhaustive non-filtered CM searches.
               AVAILABILITY: Source code, documentation and the benchmark are
               downloadable from http://infernal.janelia.org. Infernal is freely
               licensed under the GNU GPLv3 and should be portable to any
               POSIX-compliant operating system, including Linux and Mac OS/X.
               Documentation includes a user's guide with a tutorial, a
               discussion of file formats and user options and additional
               details on methods implemented in the software. CONTACT:
               nawrockie@janelia.hhmi.org}
}
% PLSDB: resource of complete bacterial plasmid records collected from the
% NCBI nucleotide database (Nucleic Acids Res., 2019 database issue).
@article{Galata2019-wt,
  author    = {Galata, Valentina and Fehlmann, Tobias and Backes, Christina and
               Keller, Andreas},
  title     = {{PLSDB}: a resource of complete bacterial plasmids},
  journal   = {Nucleic Acids Res.},
  volume    = 47,
  number    = {D1},
  pages     = {D195--D202},
  month     = jan,
  year      = 2019,
  language  = {en},
  abstract  = {The study of bacterial isolates or communities requires the
               analysis of the therein included plasmids in order to provide an
               extensive characterization of the organisms. Plasmids harboring
               resistance and virulence factors are of especial interest as they
               contribute to the dissemination of antibiotic resistance. As the
               number of newly sequenced bacterial genomes is growing a
               comprehensive resource is required which will allow to browse and
               filter the available plasmids, and to perform sequence analyses.
               Here, we present PLSDB, a resource containing 13 789 plasmid
               records collected from the NCBI nucleotide database. The web
               server provides an interactive view of all obtained plasmids with
               additional meta information such as sequence characteristics,
               sample-related information and taxonomy. Moreover, nucleotide
               sequence data can be uploaded to search for short nucleotide
               sequences (e.g. specific genes) in the plasmids, to compare a
               given plasmid to the records in the collection or to determine
               whether a sample contains one or multiple of the known plasmids
               (containment analysis). The resource is freely accessible under
               https://ccb-microbe.cs.uni-saarland.de/plsdb/.}
}
% NCBI AMRFinder: validation of the tool and its resistance gene database
% against NARMS isolates (Antimicrob. Agents Chemother., 2019).
% No volume/number/pages are recorded for this entry.
% Only acronyms and the tool name are brace-protected in the title.
@article{Feldgarden2019-il,
  author    = {Feldgarden, Michael and Brover, Vyacheslav and Haft, Daniel H and
               Prasad, Arjun B and Slotta, Douglas J and Tolstoy, Igor and
               Tyson, Gregory H and Zhao, Shaohua and Hsu, Chih-Hao and
               McDermott, Patrick F and Tadesse, Daniel A and Morales, Cesar and
               Simmons, Mustafa and Tillman, Glenn and Wasilenko, Jamie and
               Folster, Jason P and Klimke, William},
  title     = {Validating the {NCBI} {AMRFinder} Tool and Resistance Gene
               Database Using Antimicrobial Resistance Genotype-Phenotype
               Correlations in a Collection of {NARMS} Isolates},
  journal   = {Antimicrob. Agents Chemother.},
  month     = aug,
  year      = 2019,
  language  = {en},
  abstract  = {Antimicrobial resistance (AMR) is a major public health problem
               that requires publicly available tools for rapid analysis. To
               identify AMR genes in whole genome sequences, the National Center
               for Biotechnology Information (NCBI) has produced AMRFinder, a
               tool that identifies AMR genes using a high-quality curated AMR
               gene reference database. The Bacterial Antimicrobial Resistance
               Reference Gene Database consists of up-to-date gene nomenclature,
               a set of hidden Markov models (HMMs), and a curated protein
               family hierarchy. Currently, it contains 4,579 antimicrobial
               resistance proteins and more than 560 HMMs. Here, we describe
               AMRFinder and its associated database. To assess the predictive
               ability of AMRFinder, we measured the consistency between
               predicted AMR genotypes from AMRFinder and resistance phenotypes
               of 6,242 isolates from the National Antimicrobial Resistance
               Monitoring System (NARMS). This included 5,425 Salmonella
               enterica, 770 Campylobacter spp., and 47 Escherichia coli
               phenotypically tested against various antimicrobial agents. Of
               87,679 susceptibility tests performed, 98.4\% were consistent
               with predictions. To assess the accuracy of AMRFinder, we compared
               its gene symbol output with that of a 2017 version of ResFinder,
               another publicly available resistance gene detection system. Most
               gene calls were identical, but there were 1,229 gene symbol
               differences (8.8\%) between them, with differences due to both
               algorithmic differences and database composition. AMRFinder
               missed 16 loci that Resfinder found, while Resfinder missed 216
               loci AMRFinder identified. Based on these results, AMRFinder
               appears to be a highly accurate AMR gene detection system.}
}
% Velvet: de Bruijn graph algorithms for de novo short-read assembly
% (Genome Res., 2008).
% The abstract is pure ASCII (approximate values written as $\sim$), and the
% proper noun "Bruijn" is brace-protected so sentence-casing styles keep it.
@article{Zerbino2008-yg,
  author    = {Zerbino, Daniel R and Birney, Ewan},
  title     = {Velvet: Algorithms for de novo short read assembly using de
               {Bruijn} graphs},
  journal   = {Genome Res.},
  volume    = 18,
  number    = 5,
  pages     = {821--829},
  month     = may,
  year      = 2008,
  abstract  = {We have developed a new set of algorithms, collectively called
               ``Velvet,'' to manipulate de Bruijn graphs for genomic sequence
               assembly. A de Bruijn graph is a compact representation based on
               short words (k-mers) that is ideal for high coverage, very short
               read (25--50 bp) data sets. Applying Velvet to very short reads
               and paired-ends information only, one can produce contigs of
               significant length, up to 50-kb N50 length in simulations of
               prokaryotic data and 3-kb N50 on simulated mammalian BACs. When
               applied to real Solexa data sets without read pairs, Velvet
               generated contigs of $\sim$8 kb in a prokaryote and 2 kb in a
               mammalian BAC, in close agreement with our simulated results
               without read-pair information. Velvet represents a new approach
               to assembly that can leverage very short reads in combination
               with read pairs to produce useful assemblies.}
}
% GTDB-Tk: toolkit for taxonomic classification of bacterial and archaeal
% genomes against the Genome Taxonomy Database (Bioinformatics, 2019).
% No volume/number/pages are recorded for this entry.
@article{Chaumeil2019-tf,
  author    = {Chaumeil, Pierre-Alain and Mussig, Aaron J and Hugenholtz, Philip
               and Parks, Donovan H},
  title     = {{GTDB-Tk}: a toolkit to classify genomes with the Genome Taxonomy
               Database},
  journal   = {Bioinformatics},
  month     = nov,
  year      = 2019,
  language  = {en},
  abstract  = {SUMMARY: The GTDB Toolkit (GTDB-Tk) provides objective taxonomic
               assignments for bacterial and archaeal genomes based on the
               Genome Taxonomy Database (GTDB). GTDB-Tk is computationally
               efficient and able to classify thousands of draft genomes in
               parallel. Here we demonstrate the accuracy of the GTDB-Tk
               taxonomic assignments by evaluating its performance on a
               phylogenetically diverse set of 10,156 bacterial and archaeal
               metagenome-assembled genomes. AVAILABILITY: GTDB-Tk is
               implemented in Python and licensed under the GNU General Public
               License v3.0. Source code and documentation are available at:
               https://github.com/ecogenomics/gtdbtk. SUPPLEMENTARY INFORMATION:
               Supplementary data are available at Bioinformatics online.}
}
% fastq-dl: software for downloading FASTQ files from SRA or ENA. No
% publication year is recorded for this entry.
% The hosting site is given in `howpublished`, which standard styles print
% for misc entries. The `url` is built from the "rpetit3/fastq-dl" slug in
% the abstract -- NOTE(review): confirm the repository address.
@misc{Petit_undated-mv,
  author       = {Petit, III, Robert A},
  title        = {fastq-dl - Download {FASTQ} files from {SRA} or {ENA}
                  repositories},
  howpublished = {GitHub},
  url          = {https://github.com/rpetit3/fastq-dl},
  abstract     = {Download FASTQ files from SRA or ENA repositories. -
                  rpetit3/fastq-dl}
}
% PIRATE: pangenomics toolbox for clustering diverged orthologues in bacteria
% (Gigascience, 2019). No page range is recorded for this entry.
@article{Bayliss2019-cv,
  author    = {Bayliss, Sion C and Thorpe, Harry A and Coyle, Nicola M and
               Sheppard, Samuel K and Feil, Edward J},
  title     = {{PIRATE}: A fast and scalable pangenomics toolbox for clustering
               diverged orthologues in bacteria},
  journal   = {Gigascience},
  volume    = 8,
  number    = 10,
  month     = oct,
  year      = 2019,
  keywords  = {bioinformatics; microbial genomics; next-generation sequencing;
               pangenomics},
  language  = {en},
  abstract  = {BACKGROUND: Cataloguing the distribution of genes within natural
               bacterial populations is essential for understanding evolutionary
               processes and the genetic basis of adaptation. Advances in whole
               genome sequencing technologies have led to a vast expansion in
               the amount of bacterial genomes deposited in public databases.
               There is a pressing need for software solutions which are able to
               cluster, catalogue and characterise genes, or other features, in
               increasingly large genomic datasets. RESULTS: Here we present a
               pangenomics toolbox, PIRATE (Pangenome Iterative Refinement and
               Threshold Evaluation), which identifies and classifies
               orthologous gene families in bacterial pangenomes over a wide
               range of sequence similarity thresholds. PIRATE builds upon
               recent scalable software developments to allow for the rapid
               interrogation of thousands of isolates. PIRATE clusters genes (or
               other annotated features) over a wide range of amino acid or
               nucleotide identity thresholds and uses the clustering
               information to rapidly identify paralogous gene families and
               putative fission/fusion events. Furthermore, PIRATE orders the
               pangenome using a directed graph, provides a measure of allelic
               variation, and estimates sequence divergence for each gene
               family. CONCLUSIONS: We demonstrate that PIRATE scales linearly
               with both number of samples and computation resources, allowing
               for analysis of large genomic datasets, and compares favorably to
               other popular tools. PIRATE provides a robust framework for
               analysing bacterial pangenomes, from largely clonal to panmictic
               species.}
}
% Mash Screen: containment estimation of genomes within sequencing read sets
% and metagenomes (Genome Biol., 2019).
@article{Ondov2019-dw,
  author    = {Ondov, Brian D and Starrett, Gabriel J and Sappington, Anna and
               Kostic, Aleksandra and Koren, Sergey and Buck, Christopher B and
               Phillippy, Adam M},
  title     = {Mash Screen: high-throughput sequence containment estimation for
               genome discovery},
  journal   = {Genome Biol.},
  volume    = 20,
  number    = 1,
  pages     = {232},
  month     = nov,
  year      = 2019,
  keywords  = {Metagenomics; MinHash; Polyomavirus; SRA; Sequencing; Viral
               Discovery},
  language  = {en},
  abstract  = {The MinHash algorithm has proven effective for rapidly estimating
               the resemblance of two genomes or metagenomes. However, this
               method cannot reliably estimate the containment of a genome
               within a metagenome. Here, we describe an online algorithm
               capable of measuring the containment of genomes and proteomes
               within either assembled or unassembled sequencing read sets. We
               describe several use cases, including contamination screening and
               retrospective analysis of metagenomes for novel genome discovery.
               Using this tool, we provide containment estimates for every NCBI
               RefSeq genome within every SRA metagenome and demonstrate the
               identification of a novel polyomavirus species from a public
               metagenome.}
}
% SKESA: de-novo assembler for Illumina reads of microbial genomes
% (Genome Biol., 2018).
@article{Souvorov2018-tg,
  author    = {Souvorov, Alexandre and Agarwala, Richa and Lipman, David J},
  title     = {{SKESA}: strategic k-mer extension for scrupulous assemblies},
  journal   = {Genome Biol.},
  volume    = 19,
  number    = 1,
  pages     = {153},
  month     = oct,
  year      = 2018,
  keywords  = {Contamination; De-novo assembly; DeBruijn graphs; Illumina reads;
               Sequence quality},
  language  = {en},
  abstract  = {SKESA is a DeBruijn graph-based de-novo assembler designed for
               assembling reads of microbial genomes sequenced using Illumina.
               Comparison with SPAdes and MegaHit shows that SKESA produces
               assemblies that have high sequence quality and contiguity,
               handles low-level contamination in reads, is fast, and produces
               an identical assembly for the same input when assembled multiple
               times with the same or different compute resources. SKESA has
               been used for assembling over 272,000 read sets in the Sequence
               Read Archive at NCBI and for real-time pathogen detection. Source
               code for SKESA is freely available at
               https://github.com/ncbi/SKESA/releases .}
}
% VFDB 2016: virulence factor database update (Nucleic Acids Res., 2016
% database issue).
% The page range is written out in full (not abbreviated as "D694--7").
% Em dashes in running text are "---"; "--" is kept for numeric ranges only.
@article{Chen2016-sb,
  author    = {Chen, Lihong and Zheng, Dandan and Liu, Bo and Yang, Jian and
               Jin, Qi},
  title     = {{VFDB} 2016: hierarchical and refined dataset for big data
               analysis---10 years on},
  journal   = {Nucleic Acids Res.},
  volume    = 44,
  number    = {D1},
  pages     = {D694--D697},
  month     = jan,
  year      = 2016,
  language  = {en},
  abstract  = {The virulence factor database (VFDB, http://www.mgc.ac.cn/VFs/)
               is dedicated to providing up-to-date knowledge of virulence
               factors (VFs) of various bacterial pathogens. Since its inception
               the VFDB has served as a comprehensive repository of bacterial
               VFs for over a decade. The exponential growth in the amount of
               biological data is challenging to the current database in regard
               to big data analysis. We recently improved two aspects of the
               infrastructural dataset of VFDB: (i) removed the redundancy
               introduced by previous releases and generated two hierarchical
               datasets---one core dataset of experimentally verified VFs only
               and another full dataset including all known and predicted VFs
               and (ii) refined the gene annotation of the core dataset with
               controlled vocabularies. Our efforts enhanced the data quality of
               the VFDB and promoted the usability of the database in the big
               data era for the bioinformatic mining of the explosively growing
               data regarding bacterial VFs.}
}
@ARTICLE{Turner2018-zv,
title = "Integrating long-range connectivity information into de Bruijn
graphs",
author = "Turner, Isaac and Garimella, Kiran V and Iqbal, Zamin and McVean,
Gil",
abstract = "Motivation: The de Bruijn graph is a simple and efficient data
structure that is used in many areas of sequence analysis
including genome assembly, read error correction and variant