diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index bd34ea1..11041de 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -5,13 +5,17 @@ on: [push, pull_request] jobs: build: runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12','3.13'] + name: Set up Python ${{ matrix.python-version }} steps: - uses: actions/checkout@v4 - - name: Set up Python 3.10 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | diff --git a/checkQC/parsers/illumina.py b/checkQC/parsers/illumina.py index da55c69..0fe0705 100644 --- a/checkQC/parsers/illumina.py +++ b/checkQC/parsers/illumina.py @@ -12,24 +12,33 @@ def from_bclconvert(cls, runfolder_path, parser_config): runfolder_path = pathlib.Path(runfolder_path) assert runfolder_path.is_dir() - summary, index_summary = _read_interop_summary(runfolder_path) - quality_metrics = _read_quality_metrics( + summary, index_summary, run_info = _read_interop_summary(runfolder_path) + quality_metrics = _read_demultiplexing_metrics( runfolder_path / parser_config["reports_location"] / "Quality_Metrics.csv" ) - top_unknown_barcodes = _read_top_unknown_barcodes( + top_unknown_barcodes = _read_demultiplexing_metrics( runfolder_path / parser_config["reports_location"] / "Top_Unknown_Barcodes.csv" ) + + demultiplex_stats = _read_demultiplexing_metrics( + runfolder_path + / parser_config["reports_location"] + / "Demultiplex_Stats.csv" + ) samplesheet = _read_samplesheet(runfolder_path) instrument, read_length = _read_run_metadata(runfolder_path) sequencing_metrics = { lane + 1: { - "total_cluster_pf": summary.at(0).at(lane).reads_pf(), + "total_reads_pf": summary.at(0).at(lane).reads_pf(), + "total_reads": summary.at(0).at(lane).reads(), + "raw_density":summary.at(0).at(lane).density().mean(), + "pf_density":summary.at(0).at(lane).density_pf().mean(), "yield": sum( int(row["Yield"]) for row in quality_metrics @@ -69,6 +78,36 @@ def from_bclconvert(cls, runfolder_path, parser_config): sample_summary := index_summary.at(lane).at(sample_no) ).sample_id(), "cluster_count": sample_summary.cluster_count(), + "percent_of_lane": next( + round(float(sample_stat["% Reads"]) * 100, 2) + for sample_stat in demultiplex_stats + if sample_stat["Lane"] == str(lane + 1) and + sample_stat["SampleID"] == sample_summary.sample_id() + ), + "percent_perfect_index_reads": next( + round(float(sample_stat["% Perfect Index Reads"]) * 100, 2) + for sample_stat in demultiplex_stats + if sample_stat["Lane"] == str(lane + 1) and + sample_stat["SampleID"] == sample_summary.sample_id() + ), + "mean_q30": next( + float(row["Mean Quality Score (PF)"]) + for row in quality_metrics + if ( + row["Lane"] == str(lane + 1) + and row["SampleID"] == sample_summary.sample_id() + ) + ), + "percent_q30": next( + float(row["% Q30"]) * 100 + for row in quality_metrics + if ( + row["Lane"] == str(lane + 1) + and row["SampleID"] == sample_summary.sample_id() + ) + ) + + } for sample_no in range(index_summary.at(lane).size()) ], @@ -104,24 +143,16 @@ def _read_interop_summary(runfolder_path): index_summary = interop.py_interop_summary.index_flowcell_summary() interop.py_interop_summary.summarize_index_metrics(run_metrics, index_summary) - return run_summary, index_summary + return run_summary, index_summary, run_info -def _read_quality_metrics(quality_metrics_path): +def _read_demultiplexing_metrics(metrics_path): """ - Read quality metrics file + Read demultiplexing metrics file """ - with open(quality_metrics_path, encoding="utf-8") as csvfile: + with open(metrics_path, encoding="utf-8") as csvfile: return list(csv.DictReader(csvfile)) - - -def _read_top_unknown_barcodes(top_unknown_barcodes_path): - """ - Read top unknown barcodes file - """ - with open(top_unknown_barcodes_path, encoding="utf-8") as csvfile: - return list(csv.DictReader(csvfile)) - + def _read_run_metadata(runfolder_path): """ diff --git a/checkQC/qc_checkers/cluster_pf.py b/checkQC/qc_checkers/cluster_pf.py index 867516b..2e4eb0c 100644 --- a/checkQC/qc_checkers/cluster_pf.py +++ b/checkQC/qc_checkers/cluster_pf.py @@ -17,26 +17,26 @@ def cluster_pf( if warning_threshold != "unknown": warning_threshold = int(warning_threshold * 10**6) - def format_msg(total_cluster_pf, threshold, lane, **kwargs): - return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}" + def format_msg(total_reads_pf, threshold, lane, **kwargs): + return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}" - def _qualify_error(total_cluster_pf, lane): + def _qualify_error(total_reads_pf, lane): data = { "lane": lane, - "total_cluster_pf": total_cluster_pf, + "total_reads_pf": total_reads_pf, "qc_checker": "cluster_pf", } - match total_cluster_pf: - case total_cluster_pf if ( + match total_reads_pf: + case total_reads_pf if ( error_threshold != "unknown" - and total_cluster_pf < error_threshold + and total_reads_pf < error_threshold ): data["threshold"] = error_threshold return QCErrorFatal(format_msg(**data), data=data) - case total_cluster_pf if ( + case total_reads_pf if ( warning_threshold != "unknown" - and total_cluster_pf < warning_threshold + and total_reads_pf < warning_threshold ): data["threshold"] = warning_threshold return QCErrorWarning(format_msg(**data), data=data) @@ -44,6 +44,6 @@ def _qualify_error(total_cluster_pf, lane): return [ qc_report for lane, lane_data in qc_data.sequencing_metrics.items() - if (qc_report := _qualify_error(lane_data["total_cluster_pf"], lane)) + if (qc_report := _qualify_error(lane_data["total_reads_pf"], lane)) ] diff --git a/checkQC/qc_checkers/unidentified_index.py b/checkQC/qc_checkers/unidentified_index.py index ce12352..c824c0d 100644 --- a/checkQC/qc_checkers/unidentified_index.py +++ b/checkQC/qc_checkers/unidentified_index.py @@ -40,7 +40,7 @@ def unidentified_index( qc_errors = [] for lane, lane_data in qc_data.sequencing_metrics.items(): for barcode in lane_data["top_unknown_barcodes"]: - significance = barcode["count"] / lane_data["total_cluster_pf"] * 100. + significance = barcode["count"] / lane_data["total_reads_pf"] * 100. if significance < significance_threshold: continue index = ( diff --git a/checkQC/qc_data_utils.py b/checkQC/qc_data_utils.py new file mode 100644 index 0000000..55f7456 --- /dev/null +++ b/checkQC/qc_data_utils.py @@ -0,0 +1,204 @@ +import numpy as np +from checkQC.parsers.illumina import _read_interop_summary + + +def bclconvert_test_runfolder(qc_data, runfolder_path): + _, _, run_info = _read_interop_summary(runfolder_path) + flowcell_id = run_info.flowcell_id() + if "HMTFYDRXX" in flowcell_id: + return { + "qc_data": qc_data, + "expected_instrument": "novaseq_SP", + "expected_read_length": 36, + "expected_samplesheet": { + "len": 4, + "head": [ + { + "lane": 1, + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", + "index": "GAACTGAGCG", + "index2": "TCGTGGAGCG", + "sample_project": "AB-1234", + "overridecycles": "Y36;I10;I10", + "custom_description": "LIBRARY_NAME:test", + }, + { + "lane": 1, + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", + "index": "AGGTCAGATA", + "index2": "CTACAAGATA", + "sample_project": "CD-5678", + "overridecycles": "Y36;I10;I10", + "custom_description": "LIBRARY_NAME:test", + }, + { + "lane": 2, + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", + "index": "GAACTGAGCG", + "index2": "TCGTGGAGCG", + "sample_project": "AB-1234", + "overridecycles": "Y36;I10;I10", + "custom_description": "LIBRARY_NAME:test", + }, + { + "lane": 2, + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", + "index": "AGGTCAGATA", + "index2": "CTACAAGATA", + "sample_project": "CD-5678", + "overridecycles": "Y36;I10;I10", + "custom_description": "LIBRARY_NAME:test", + }, + ], + }, + "expected_sequencing_metrics": { + 1: { + "total_reads_pf": 532_464_327, + "total_reads": 638_337_024, + "raw_density": 2_961_270.5, + "pf_density": 2_470_118.25, + "yield": 122_605_416, + "yield_undetermined": 121_940_136, + "top_unknown_barcodes": { + "len": 1029, + "head": [ + { + 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', + 'count': 12857, + }, + { + 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', + 'count': 12406, + }, + { + 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', + 'count': 12177, + }, + { + 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', + 'count': 11590, + }, + { + 'index': 'GGTCCGCTTC', 'index2': 'CTCACACAAG', + 'count': 11509, + }, + ], + }, + "reads": { + 1: { + "mean_error_rate": np.nan, + "percent_q30": 95.70932006835938, + "is_index": False, + "mean_percent_phix_aligned": 0., + }, + 2: { + "mean_error_rate": np.nan, + "percent_q30": 92.57965850830078, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + 3: { + "mean_error_rate": np.nan, + "percent_q30": 90.3790283203125, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + }, + "reads_per_sample": [ + { + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", + "cluster_count": 9920, + "percent_of_lane": 0.29, + "percent_perfect_index_reads": 97.96, + "mean_q30": 36.37, + "percent_q30": 96, + }, + { + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", + "cluster_count": 8560, + "percent_of_lane": 0.25, + "percent_perfect_index_reads": 98.15, + "mean_q30": 36.43, + "percent_q30": 96, + }, + ], + }, + 2: { + "total_reads_pf": 530_917_565, + "total_reads": 638_337_024, + "raw_density": 2_961_270.5, + "pf_density": 2_462_942.5, + "yield": 124_497_108, + "yield_undetermined": 123_817_428, + "top_unknown_barcodes": { + "len": 1055, + "head": [ + { + 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', + 'count': 13176, + }, + { + 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', + 'count': 12395, + }, + { + 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', + 'count': 12247, + }, + { + 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', + 'count': 11909, + }, + { + 'index': 'TAATTAGCGT', 'index2': 'TGGTTAAGAA', + 'count': 11330, + }, + ], + }, + "reads": { + 1: { + "mean_error_rate": np.nan, + "percent_q30": 95.75276184082031, + "is_index": False, + "mean_percent_phix_aligned": 0., + }, + 2: { + "mean_error_rate": np.nan, + "percent_q30": 92.60448455810547, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + 3: { + "mean_error_rate": np.nan, + "percent_q30": 90.2811050415039, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + }, + "reads_per_sample": [ + { + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", + "cluster_count": 10208, + "percent_of_lane": 0.3, + "percent_perfect_index_reads": 98.2, + "mean_q30": 36.4, + "percent_q30": 96, + }, + { + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", + "cluster_count": 8672, + "percent_of_lane": 0.25, + "percent_perfect_index_reads": 98.29, + "mean_q30": 36.48, + "percent_q30": 97, + }, + ], + }, + }, + } + else: + raise Exception( + "This function is only compatible with the run with flowcell_id: 'HMTFYDRXX', " + f"the supplied runfolder has flowcell_id: {flowcell_id}" + ) + diff --git a/requirements/prod b/requirements/prod index 07cc4bb..777b608 100644 --- a/requirements/prod +++ b/requirements/prod @@ -1,10 +1,10 @@ click~=8.1.1 PyYAML~=6.0 -interop~=1.3.2 +interop~=1.4.0 xmltodict~=0.13.0 tornado~=6.3.2 sample_sheet~=0.13.0 pandas~=2.2.2 -numpy~=1.26.4 +numpy~=2.2.4 samshee~=0.2.3 jsonschema~=4.23.0 diff --git a/setup.py b/setup.py index 0c36f37..7b4c546 100644 --- a/setup.py +++ b/setup.py @@ -11,14 +11,19 @@ author_email='johan.dahlberg@medsci.uu.se', url="https://www.github.com/Molmed/checkQC", download_url='https://github.com/Molmed/checkQC/archive/{}.tar.gz'.format(__version__), - python_requires='>3.10, <3.11', + python_requires='>3.10', install_requires=[ - "click", - "PyYAML>=6.0", - "interop>=1.2.4", - "xmltodict", - "tornado", - "sample_sheet"], + "click~=8.1.1", + "PyYAML~=6.0", + "interop~=1.4.0", + " xmltodict~=0.13.0", + "tornado~=6.3.2", + " sample_sheet~=0.13.0", + " pandas~=2.2.2", + "numpy~=2.2.4", + "samshee~=0.2.3", + "jsonschema~=4.23.0", + ], packages=find_packages(exclude=["tests*"]), test_suite="tests", package_data={ diff --git a/tests/parsers/test_illumina_parser.py b/tests/parsers/test_illumina_parser.py index 2071cab..7f10343 100644 --- a/tests/parsers/test_illumina_parser.py +++ b/tests/parsers/test_illumina_parser.py @@ -4,8 +4,7 @@ from checkQC.parsers.illumina import ( _read_interop_summary, - _read_quality_metrics, - _read_top_unknown_barcodes, + _read_demultiplexing_metrics, _read_run_metadata, _read_samplesheet, ) @@ -20,17 +19,17 @@ def runfolder_path(): def test_read_interop_summary(runfolder_path): - run_summary, index_summary = _read_interop_summary(runfolder_path) + run_summary, index_summary, _ = _read_interop_summary(runfolder_path) - total_cluster_pf = run_summary.at(0).at(0).reads_pf() - assert total_cluster_pf == 532464327 + total_reads_pf = run_summary.at(0).at(0).reads_pf() + assert total_reads_pf == 532464327 sample_id = index_summary.at(0).at(0).sample_id() assert sample_id == "Sample_14574-Qiagen-IndexSet1-SP-Lane1" def test_read_quality_metrics(runfolder_path): - quality_metrics = _read_quality_metrics( + quality_metrics = _read_demultiplexing_metrics( runfolder_path / "Reports/Quality_Metrics.csv") assert len(quality_metrics) == 6 @@ -50,7 +49,7 @@ def test_read_quality_metrics(runfolder_path): def test_read_to_unknown_barcodes(runfolder_path): - top_unknown_barcodes = _read_top_unknown_barcodes( + top_unknown_barcodes = _read_demultiplexing_metrics( runfolder_path / "Reports/Top_Unknown_Barcodes.csv") assert len(top_unknown_barcodes) == 2084 @@ -108,4 +107,6 @@ def test_read_samplesheet(runfolder_path): 'lane': 1, 'sample_id': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1', 'sample_project': 'AB-1234', + "overridecycles": "Y36;I10;I10", + 'custom_description': 'LIBRARY_NAME:test', } diff --git a/tests/qc_checkers/test_cluster_pf.py b/tests/qc_checkers/test_cluster_pf.py index e8fce39..ec613fb 100644 --- a/tests/qc_checkers/test_cluster_pf.py +++ b/tests/qc_checkers/test_cluster_pf.py @@ -8,16 +8,16 @@ def qc_data(): return namedtuple("QCData", "sequencing_metrics")( { - 1: {"total_cluster_pf": 1_000_000_000}, - 2: {"total_cluster_pf": 10_000_000}, - 3: {"total_cluster_pf": 100_000_000}, - 4: {"total_cluster_pf": 10_000_000_000}, + 1: {"total_reads_pf": 1_000_000_000}, + 2: {"total_reads_pf": 10_000_000}, + 3: {"total_reads_pf": 100_000_000}, + 4: {"total_reads_pf": 10_000_000_000}, } ) -def format_msg(total_cluster_pf, threshold, lane, **kwargs): - return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}" +def format_msg(total_reads_pf, threshold, lane, **kwargs): + return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}" def test_cluster_pf(qc_data): @@ -34,7 +34,7 @@ def test_cluster_pf(qc_data): match lane: case 2: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 50_000_000, "lane": lane, "qc_checker": "cluster_pf", @@ -44,7 +44,7 @@ def test_cluster_pf(qc_data): assert report.data == exp_data case 3: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 500_500_000, "lane": lane, "qc_checker": "cluster_pf", @@ -69,7 +69,7 @@ def test_cluster_pf_error_unknown(qc_data): match lane: case 2: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 500_000_000, "lane": lane, "qc_checker": "cluster_pf", @@ -79,7 +79,7 @@ def test_cluster_pf_error_unknown(qc_data): assert report.data == exp_data case 3: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 500_000_000, "lane": lane, "qc_checker": "cluster_pf", @@ -104,7 +104,7 @@ def test_cluster_pf_warning_unknown(qc_data): match lane: case 2: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 50_000_000, "lane": lane, "qc_checker": "cluster_pf", diff --git a/tests/qc_checkers/test_unidentified_index.py b/tests/qc_checkers/test_unidentified_index.py index 5dbe481..4c81282 100644 --- a/tests/qc_checkers/test_unidentified_index.py +++ b/tests/qc_checkers/test_unidentified_index.py @@ -165,7 +165,7 @@ def qc_data(): return namedtuple("QCData", ["sequencing_metrics", "samplesheet"])( { 1: { - "total_cluster_pf": 100, + "total_reads_pf": 100, "top_unknown_barcodes": [ {"lane": 1, "index": "ACCT", "count": 10}, {"lane": 1, "index": "AC", "count": 50}, diff --git a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/Reports/Demultiplex_Stats.csv b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/Reports/Demultiplex_Stats.csv new file mode 100644 index 0000000..f8afe72 --- /dev/null +++ b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/Reports/Demultiplex_Stats.csv @@ -0,0 +1,7 @@ +Lane,SampleID,Sample_Project,Index,# Reads,# Perfect Index Reads,# One Mismatch Index Reads,# Two Mismatch Index Reads,% Reads,% Perfect Index Reads,% One Mismatch Index Reads,% Two Mismatch Index Reads +1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,AB-1234,GAACTGAGCG-TCGTGGAGCG,9920,9718,202,0,0.0029,0.9796,0.0204,0.0000 +1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,CD-5678,AGGTCAGATA-CTACAAGATA,8560,8402,158,0,0.0025,0.9815,0.0185,0.0000 +1,Undetermined,Undetermined,,3387226,3387226,0,0,0.9946,1.0000,0.0000,0.0000 +2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,AB-1234,GAACTGAGCG-TCGTGGAGCG,10208,10024,184,0,0.0030,0.9820,0.0180,0.0000 +2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,CD-5678,AGGTCAGATA-CTACAAGATA,8672,8524,148,0,0.0025,0.9829,0.0171,0.0000 +2,Undetermined,Undetermined,,3439373,3439373,0,0,0.9945,1.0000,0.0000,0.0000 diff --git a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv index abb88cc..c051b24 100755 --- a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv +++ b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv @@ -4,10 +4,10 @@ Date,6/24/2020,, Application,Illumina DRAGEN COVIDSeq Test Pipeline,, Instrument Type,NovaSeq6000,, Assay,Illumina COVIDSeq Test,, -Index Adapters,IDT-ILMN DNA-RNA UDP Indexes ,, +Index Adapters,"IDT-ILMN DNA-RNA UDP Indexes ",, Chemistry,Amplicon,, ,,, -[Reads],,,,,, +[Reads],,,,, Read1Cycles,36,, Index1Cycles,10,, Index2Cycles,10,, @@ -18,8 +18,8 @@ FastqCompressionFormat,gzip,, SoftwareVersion,4.1.5,, ,,, [BCLConvert_Data],,, -Lane,Sample_ID,Index,Index2,Sample_Project -1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234 -1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678 -2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234 -2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,AGGTC AGATA,C TACAA GATA,CD-5678 +Lane,Sample_ID,Index,Index2,Sample_Project,OverrideCycles,custom_Description +1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234,Y36;I10;I10,LIBRARY_NAME:test +1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678,Y36;I10;I10,LIBRARY_NAME:test +2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234,Y36;I10;I10,LIBRARY_NAME:test +2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,AGGTC AGATA,C TACAA GATA,CD-5678,Y36;I10;I10,LIBRARY_NAME:test diff --git a/tests/test_qc_data.py b/tests/test_qc_data.py index 35e3dee..79776b0 100644 --- a/tests/test_qc_data.py +++ b/tests/test_qc_data.py @@ -1,11 +1,8 @@ from pathlib import Path -from unittest import mock - -import numpy as np import pytest +from checkQC.qc_data_utils import bclconvert_test_runfolder from checkQC.qc_data import QCData -from checkQC.handlers.qc_handler import QCErrorFatal, QCErrorWarning from tests.test_utils import float_eq @@ -15,174 +12,14 @@ def bclconvert_runfolder(): parser_config = { "reports_location": "Reports" } - + runfolder_path = Path(__file__).parent / f"resources/bclconvert/200624_A00834_0183_BHMTFYTINY" qc_data = QCData.from_bclconvert( - Path(__file__).parent / "resources/bclconvert/200624_A00834_0183_BHMTFYTINY", + runfolder_path, parser_config, ) + return bclconvert_test_runfolder(qc_data, runfolder_path) - return { - "qc_data": qc_data, - "expected_instrument": "novaseq_SP", - "expected_read_length": 36, - "expected_samplesheet": { - "len": 4, - "head": [ - { - "lane": 1, - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", - "index": "GAACTGAGCG", - "index2": "TCGTGGAGCG", - "sample_project": "AB-1234", - }, - { - "lane": 1, - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", - "index": "AGGTCAGATA", - "index2": "CTACAAGATA", - "sample_project": "CD-5678", - }, - { - "lane": 2, - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", - "index": "GAACTGAGCG", - "index2": "TCGTGGAGCG", - "sample_project": "AB-1234", - }, - { - "lane": 2, - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", - "index": "AGGTCAGATA", - "index2": "CTACAAGATA", - "sample_project": "CD-5678", - }, - ], - }, - "expected_sequencing_metrics": { - 1: { - "total_cluster_pf": 532_464_327, - "yield": 122_605_416, - "yield_undetermined": 121_940_136, - "top_unknown_barcodes": { - "len": 1029, - "head": [ - { - 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', - 'count': 12857, - }, - { - 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', - 'count': 12406, - }, - { - 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', - 'count': 12177, - }, - { - 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', - 'count': 11590, - }, - { - 'index': 'GGTCCGCTTC', 'index2': 'CTCACACAAG', - 'count': 11509, - }, - ], - }, - "reads": { - 1: { - "mean_error_rate": np.nan, - "percent_q30": 95.70932006835938, - "is_index": False, - "mean_percent_phix_aligned": 0., - }, - 2: { - "mean_error_rate": np.nan, - "percent_q30": 92.57965850830078, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - 3: { - "mean_error_rate": np.nan, - "percent_q30": 90.3790283203125, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - }, - "reads_per_sample": [ - { - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", - "cluster_count": 9920, - }, - { - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", - "cluster_count": 8560, - }, - ], - }, - 2: { - "total_cluster_pf": 530_917_565, - "yield": 124_497_108, - "yield_undetermined": 123_817_428, - "top_unknown_barcodes": { - "len": 1055, - "head": [ - { - 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', - 'count': 13176, - }, - { - 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', - 'count': 12395, - }, - { - 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', - 'count': 12247, - }, - { - 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', - 'count': 11909, - }, - { - 'index': 'TAATTAGCGT', 'index2': 'TGGTTAAGAA', - 'count': 11330, - }, - ], - }, - "reads": { - 1: { - "mean_error_rate": np.nan, - "percent_q30": 95.75276184082031, - "is_index": False, - "mean_percent_phix_aligned": 0., - }, - 2: { - "mean_error_rate": np.nan, - "percent_q30": 92.60448455810547, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - 3: { - "mean_error_rate": np.nan, - "percent_q30": 90.2811050415039, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - }, - "reads_per_sample": [ - { - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", - "cluster_count": 10208, - }, - { - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", - "cluster_count": 8672, - }, - ], - }, - }, - } - def test_qc_data(bclconvert_runfolder): qc_data = bclconvert_runfolder["qc_data"]