Skip to content

Assertion failure #132

Description

@drtconway

Hi Oxbow.

Thanks for your work on this project!

Some of my VCF inputs cause an assertion failure. The VCFs may not conform to the specification, but it would be nice to receive an error rather than an assertion failure.

To reproduce:

With the files below, I get the following behaviour:

$ ./target/debug/oxbow-read-vcf 

thread 'main' panicked at /Users/tom.conway/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/arrow-array-54.3.1/src/builder/fixed_size_list_builder.rs:174:9:
assertion `left == right` failed: Length of the child array (6) must be the multiple of the value length (2) and the array length (4).
  left: 6
 right: 8
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

Cargo.toml:

[package]
name = "oxbow-read-vcf"
version = "0.1.0"
edition = "2024"

[dependencies]
oxbow = "0.4.0"
noodles = { version = "0.90.0", features = ["core", "vcf"] }

main.rs:

use std::fs::File;
use std::io::BufReader;
use oxbow::variant::format::vcf::Scanner;
/// Minimal reproducer: reads `x.vcf`, scans it with oxbow's VCF `Scanner`,
/// and prints the schema of each resulting batch to stderr.
///
/// Every fallible step is unwrapped, so any failure surfaces as a panic.
fn main() {
    // Open the input and buffer it before handing it to the noodles VCF reader.
    let file = File::open("x.vcf").unwrap();
    let mut fmt_reader = noodles::vcf::io::Reader::new(BufReader::new(file));
    let header = fmt_reader.read_header().unwrap();

    // Every optional scan argument is `None` except the last, `Some(1000)`.
    let vcf_scanner = Scanner::new(header);
    let batches = vcf_scanner
        .scan(fmt_reader, None, None, None, None, None, None, Some(1000))
        .unwrap();

    for result in batches {
        let current = result.unwrap();
        let schema = current.schema();
        eprintln!("{:?}", schema.as_ref());
    }
}

x.vcf:

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GP,Number=G,Type=Float,Description="genotype posterior in Phred Scale">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PG,Number=G,Type=Float,Description="genotype priors in Phred Scale">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phasing set (typically the position of the first variant in the set)">
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DRAGstrInfo,Number=2,Type=Integer,Description="Indicates the period and repeat count">
##INFO=<ID=DRAGstrParams,Number=3,Type=Float,Description="Parameters used (GOP, GCP, API)">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
##contig=<ID=chrY,length=57227415>
##contig=<ID=chrM,length=16569>
##INFO=<ID=ANN,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE|ENSP|REFSEQ_MATCH|REFSEQ_OFFSET|GIVEN_REF|USED_REF|BAM_EDIT|SIFT|PolyPhen|HGVS_OFFSET|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|MAX_AF|MAX_AF_POPS|CLIN_SIG|SOMATIC|PHENO|PUBMED">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA24143	NA24149	NA24385
chr1	10421	.	ACCCTAACCCTAACCCTAAC	A	13.58	.	AC=1;AF=0.167;AN=6;BaseQRankSum=0.366;DP=141;DRAGstrInfo=1;DRAGstrParams=36;ExcessHet=0;FS=6.419;MLEAC=1;MLEAF=0.167;MQ=20.57;MQRankSum=-1.534;QD=1.7;ReadPosRankSum=0.366;SOR=2.833;ANN=-|downstream_gene_variant|MODIFIER|WASH7P|653635|Transcript|NR_024540.1|transcribed_pseudogene||||||||||rs1557426865|1|3922|-1||deletion|EntrezGene||YES|||||CCCTAACCCTAACCCTAAC|CCCTAACCCTAACCCTAAC|OK|||||||||||||||||||||||||,-|upstream_gene_variant|MODIFIER|DDX11L1|100287102|Transcript|NR_046018.2|transcribed_pseudogene||||||||||rs1557426865|1|1434|1||deletion|EntrezGene||YES|||||CCCTAACCCTAACCCTAAC|CCCTAACCCTAACCCTAAC||||||||||||||||||||||||||	GT:AD:DP:GQ:PGT:PID:PL:PS	0/1:7,1:8:21:.:.:21,0,34:.	0/0:14,0:14:0:.:.:0,0,248:.	0|0:9,0:11:53:0|1:10418_CT_C:0,53,93:10418
chr1	10439	rs112766696	AC	A,*	30.89	.	AC=2,1;AF=0.333,0.167;AN=6;BaseQRankSum=1.09;DB;DP=153;DRAGstrInfo=1;DRAGstrParams=32;ExcessHet=3.9794;FS=0;MLEAC=2,1;MLEAF=0.333,0.167;MQ=24.89;MQRankSum=-1.068;QD=1.72;ReadPosRankSum=0.536;SOR=1.075;ANN=-|downstream_gene_variant|MODIFIER|WASH7P|653635|Transcript|NR_024540.1|transcribed_pseudogene||||||||||rs112766696|1|3922|-1||sequence_alteration|EntrezGene||YES|||||C|C|OK|||||||||||||||||||||||||,-|upstream_gene_variant|MODIFIER|DDX11L1|100287102|Transcript|NR_046018.2|transcribed_pseudogene||||||||||rs112766696|1|1434|1||sequence_alteration|EntrezGene||YES|||||C|C||||||||||||||||||||||||||	GT:AD:DP:GQ:PL	0/2:7,0,1:8:21:21,42,76,0,206,34	0/1:6,2,0:8:34:34,0,40,52,179,92	0/1:7,2,1:10:7:7,0,40,30,212,70
chr1	10492	rs55998931	C	T	101.58	.	AC=2;AF=0.333;AN=6;BaseQRankSum=-0.319;DB;DP=47;ExcessHet=1.7609;FS=7.16;MLEAC=2;MLEAF=0.333;MQ=31.71;MQRankSum=0;QD=7.26;ReadPosRankSum=-0.414;SOR=3.737;ANN=T|downstream_gene_variant|MODIFIER|WASH7P|653635|Transcript|NR_024540.1|transcribed_pseudogene||||||||||rs55998931|1|3870|-1||SNV|EntrezGene||YES|||||C|C|OK|||||||||||||||||||||||||,T|upstream_gene_variant|MODIFIER|DDX11L1|100287102|Transcript|NR_046018.2|transcribed_pseudogene||||||||||rs55998931|1|1382|1||SNV|EntrezGene||YES|||||C|C||||||||||||||||||||||||||	GT:AD:DP:GQ:PL	0/1:2,4:6:30:57,0,30	0/0:6,0:6:0:0,0,149	0/1:5,3:8:40:53,0,40
chr1	13273	.	G	C	91.88	.	AC=2;AF=0.5;AN=4;DP=38;ExcessHet=0;FS=0;MLEAC=2;MLEAF=0.5;MQ=5.35;QD=5.1;SOR=1.179;ANN=C|downstream_gene_variant|MODIFIER|WASH7P|653635|Transcript|NR_024540.1|transcribed_pseudogene||||||||||rs531730856|1|1089|-1||SNV|EntrezGene||YES|||||G|G|OK||||0.0204|0.1455|0.0625|0.1471|0.1401||||||||||||0.1471|EUR||||,C|non_coding_transcript_exon_variant|MODIFIER|DDX11L1|100287102|Transcript|NR_046018.2|transcribed_pseudogene|3/3||NR_046018.2:n.516G>C||516|||||rs531730856|1||1||SNV|EntrezGene||YES|||||G|G|||||0.0204|0.1455|0.0625|0.1471|0.1401||||||||||||0.1471|EUR||||,C|downstream_gene_variant|MODIFIER|MIR6859-1|102466751|Transcript|NR_106918.1|miRNA||||||||||rs531730856|1|4096|-1||SNV|EntrezGene||YES|||||G|G|||||0.0204|0.1455|0.0625|0.1471|0.1401||||||||||||0.1471|EUR||||	GT:AD:DP:GQ:PL	1/1:0,18:18:54:106,54,0	0/0:17,0:17:51:0,51,540	./.:0,0:0:0:0,0,0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions