This is the initial overview of the CATH AlphaFold Nextflow workflow.
Name: CREATE_ALL_AF2_CHAIN_FASTA
Input:
PARAM:AF2_DOWNLOAD_URL-- AlphaFold Download (gs://public-datasets-deepmind-alphafold/sequences.fasta)
Output:
FILE:ALL_AF2_CHAIN_FASTA
Name: CREATE_DOMAIN_S95_PDB_DIR
Input:
FILE:DOMAIN_LIST_S95
DIRECTORY:CATH_PDB_DOMAINS-- v4.3
PARAM:S95_DIR-- folder containing S95 PDB files
Output:
DIRECTORY:FOLDSEEK_S95_PDB_DB
Process:
rsync --files-from <FILE:DOMAIN_LIST_S95> <DIRECTORY:CATH_PDB_DOMAINS> <DIRECTORY:FOLDSEEK_S95_PDB_DB>
Name: CREATE_FOLDSEEK_S95_LIBRARY
Input:
PARAM:S95_DIR-- folder containing S95 PDB files
Output:
FILE:FOLDSEEK_S95_LIBRARY
Process:
foldseek createdb <s95_dir> <s95_db>
Notes:
- currently in
/SAN/cath/cath_v4_3_0/databases/foldseek/cath_s95/
Name: CREATE_DATASET_UNIPROT_IDS
Input:
SOURCE:OracleDB--GENE3D_21.CATH_DOMAIN_PREDICTIONS{_EXTRA}
FILTER-- CONDITIONAL_EVALUE <= 1e-50, top 10k hits (ordered by increasing evalue)
Output:
FILE:CSV_UNIPROT_IDS--UniProt_ID
Name: CREATE_DATASET_CATH_FILES
Input:
SOURCE:OracleDB--GENE3D_21.CATH_DOMAIN_PREDICTIONS{_EXTRA}
FILE:CSV_UNIPROT_IDS--(UniProt_ID)
Output:
FILE:CSV_UNIPROT_MD5--(UniProt_ID, md5)
FILE:CRH_OUTPUT--(md5, CATH_domain_ID, bitscore, boundaries, resolved_boundaries)
FILE:AF2_DOMAIN_LIST--(AF_domain_ID_orig)
FILE:AF2_CHAIN_LIST--(AF_chain_ID)
FILE:AF2_CATH_ORIG_ANNOTATIONS--(AF_domain_ID_orig, CATH_domain_ID, UniProt_ID, md5, bitscore, resolved_boundaries, sfam_id, class_id)
Name: CREATE_DATASET_AF2_FILES
Input:
FILE:CSV_UNIPROT_MD5
SOURCE:HPC_ENVIRONMENT-- Google Cloud Storage (GCS), Computer Science (CS)
FILE:AF2_CHAIN_LIST
Output:
FILE:ALL_CHAIN_FASTA-- one FASTA file containing all AF2 chains
DIRECTORY:AF2_CHAIN_MMCIF-- one mmCIF file per AF2 chain
Process:
GCS-- cat [manifest file] | gsutil -m cp -I .
CS-- (local): symlinking (check if it exists)
Name: CREATE_ANNOTATION_CHAIN_DISORDER
Input:
FILE:ALL_CHAIN_FASTA
Output:
FILE:SETH_CHAIN_OUTPUT_FILE-- Per-residue SETH scores (disordered 0 -> 1 ordered)
Process:
SETH_1.py -i <your input fasta file name> -o <the desired name of your output file>
Name: FILTER_DOMAINLIST_BY_CHAIN_DISORDER
Input:
FILE:SETH_CHAIN_OUTPUT_FILE
FILTER--(SETH_GLOBAL_CHAIN_DISORDER, SETH_LOCAL_DOMAIN_DISORDER)
FILE:AF2_DOMAIN_LIST
Output:
FILE:AF2_DOMAIN_LIST
FILE:SETH_GLOBAL_CHAIN_DISORDER--(AF_domain_ID_orig, seth_global_disorder_score)
Process:
python filter_disorder.py
Name: OPTIMISE_DOMAIN_BOUNDARIES
Input:
FILE:AF2_CATH_ORIG_ANNOTATIONS
FILE:AF2_CHAIN_MMCIF
FILE:AF2_DOMAIN_LIST
Output:
FILE:AF2_DOMAIN_LIST_POST_TAILCHOP--(AF_domain_ID_tailchop)
FILE:AF2_DOMAIN_MAPPING_POST_TAILCHOP--(AF_domain_ID_orig, AF_domain_ID_tailchop)
Process:
python optimize_boundaries.py
Name: CREATE_ANNOTATION_DOMAIN_DISORDER
Input:
FILE:ALL_CHAIN_FASTA
FILE:AF2_DOMAIN_LIST_POST_TAILCHOP
Output:
FILE:SETH_DOMAIN_OUTPUT_FILE-- Per-residue SETH scores (disordered 0 -> 1 ordered)
Process:
SETH_1.py -i <your input fasta file name> -o <the desired name of your output file>
Name: FILTER_DOMAINLIST_BY_DISORDER
Input:
FILE:SETH_DOMAIN_OUTPUT_FILE
FILTER--(SETH_GLOBAL_CHAIN_DISORDER, SETH_LOCAL_DOMAIN_DISORDER)
FILE:AF2_DOMAIN_LIST_POST_TAILCHOP
Output:
FILE:AF2_DOMAIN_LIST_POST_DOMAIN_DISORDER
FILE:AF2_SETH_ANNOTATIONS--(AF_domain_ID_tailchop, seth_global_disorder_score)
Process:
python filter_disorder.py
Name: FILTER_DOMAINLIST_BY_AF2_QUALITY
Input:
FILE:AF2_CHAIN_MMCIF
FILE:AF2_DOMAIN_LIST_POST_DOMAIN_DISORDER
Output:
FILE:AF2_DOMAIN_LIST_POST_AF_QUALITY
FILE:AF2_QUALITY_ANNOTATIONS--(AF_domain_ID_tailchop, PLDDT_average, LUR_score)
Process:
python filter_af2_quality.py
Name: FILTER_DOMAINLIST_BY_AF2_PACKING
Input:
FILE:AF2_CHAIN_MMCIF
FILE:AF2_DOMAIN_LIST_POST_AF_QUALITY
Output:
FILE:AF2_DOMAIN_LIST_POST_AF_PACKING
FILE:AF2_PACKING_ANNOTATIONS--(AF_domain_ID_tailchop, packing_score, surf_vol_score)
Process:
python filter_af2_packing.py
Name: FILTER_DOMAINLIST_BY_SSE
Input:
FILE:AF2_CHAIN_MMCIF
FILE:AF2_DOMAIN_LIST_POST_AF_PACKING
Output:
FILE:AF2_DOMAIN_LIST_POST_SSE
FILE:AF2_SSE_ANNOTATIONS--(AF_domain_ID_tailchop, sse_number)
Process:
python3 filter_sse.py
Name: CHOP_AF2_DOMAINS
Input:
FILE:AF2_DOMAIN_LIST_POST_SSE--af_<Uniprot_ID>/<start>-<stop>
DIRECTORY:AF2_CHAIN_MMCIF
Output:
DIRECTORY:AF2_DOMAIN_MMCIF
FILE:AF2_DOMAIN_MAPPING_POST_CHOPPING--(AF_domain_ID_tailchop, AF_domain_ID_chopped)
Process:
submit.sh
/home/ucbcisi/work/2022_06_29.alphafold_rechop_corrected_domains/submit.sh
Name: CREATE_ANNOTATION_FOLDSEEK_S95
Input:
FILE:FOLDSEEK_S95_DB
FILE:AF2_DOMAIN_LIST_POST_SSE
DIRECTORY:AF2_DOMAIN_MMCIF
Output:
FILE:AF2_FOLDSEEK_ANNOTATIONS--(AF_domain_ID_tailchop, foldseek_bitscore, foldseek_overlap)
Process:
foldseek createdb AF2_DOMAIN_MMCIF AF2_DOMAIN_MMCIF_DB
foldseek search AF2_DOMAIN_MMCIF_DB FOLDSEEK_S95_DB
foldseek convertalis AF2_DOMAIN_MMCIF_DB FOLDSEEK_S95_DB
Name: CREATE_RESULTS_TABLE
Input:
FILE:AF2_DOMAIN_LIST_POST_SSE
FILE:AF2_SETH_ANNOTATIONS
FILE:AF2_QUALITY_ANNOTATIONS
FILE:AF2_PACKING_ANNOTATIONS
FILE:AF2_CATH_ORIG_ANNOTATIONS
FILE:AF2_SSE_ANNOTATIONS
FILE:AF2_FOLDSEEK_ANNOTATIONS
Output:
FILE:CATH_AF2_TABLE
Process:
python collate_data_to_table.py