# AlphaFoldRC/run_alphafold-gpu_2.3.1.py at main · EpiGenomicsCode/AlphaFoldRC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#!/usr/bin/python3

"""Singularity/Apptainer launch script for Alphafold."""

def parse_args(argv=None):
  """Parse the launcher's command-line options.

  Args:
    argv: Optional list of argument strings to parse. When None (the
      default) argparse reads the process command line (sys.argv[1:]).

  Returns:
    argparse.Namespace holding one attribute per option defined below.
    Boolean options are converted with str_to_bool and
    num_multimer_predictions_per_model is converted to int, so malformed
    values are rejected here instead of being forwarded to the container.
  """
  import argparse
  import os

  parser = argparse.ArgumentParser(description='Run AlphaFold structure prediction using SIF image.')

  parser.add_argument(
    '--fasta_paths', required=True,
    help='Paths to FASTA files, each containing a prediction '
    'target that will be folded one after another. If a FASTA file contains '
    'multiple sequences, then it will be folded as a multimer. Paths should be '
    'separated by commas. All FASTA paths must have a unique basename as the '
    'basename is used to name the output directories for each prediction.')

  parser.add_argument(
    '--use_gpu', type=str_to_bool, default=True,
    help='Enable NVIDIA runtime to run with GPUs.')

  # The default device list comes from the SGE_GPU environment variable when
  # it is set, otherwise device 0 is used.
  parser.add_argument(
    '--gpu_devices', default=os.environ.get('SGE_GPU', '0'),
    help='Comma separated list GPU identifiers to set environment variable CUDA_VISIBLE_DEVICES.')

  parser.add_argument(
    '--run_relax', type=str_to_bool, default=True,
    help='Whether to do OpenMM energy minimization of each predicted structure.')

  parser.add_argument(
    '--use_gpu_relax', type=str_to_bool, default=True,
    help='Whether to do OpenMM energy minimization using GPU.')

  parser.add_argument(
    '--output_dir', default='./OUTPUT_ALPHAFOLD',
    help='Path to a directory that will store the results.')

  parser.add_argument(
    '--data_dir', default='/storage/icds/RISE/sw8/alphafold/alphafold_2.3_db',
    help='Path to directory with supporting data: AlphaFold parameters and genetic '
    'and template databases. Set to the target of download_all_databases.sh.')

  parser.add_argument(
    '--mount_data_dir', default='/storage/icds/RISE/sw8/alphafold/alphafold_2.3_db',
    help='Path to directory where databases reside.')

  parser.add_argument(
    '--singularity_image_path', default='/storage/icds/RISE/sw8/alphafold/singularity/alphafold_2.3.1',
    help='Path to the AlphaFold singularity image.')

  parser.add_argument(
    '--max_template_date', default='2040-01-01',
    help='Maximum template release date to consider (ISO-8601 format: YYYY-MM-DD). '
    'Important if folding historical test sets.')

  parser.add_argument(
    '--db_preset', default='full_dbs', choices=['full_dbs', 'reduced_dbs'],
    help='Choose preset MSA database configuration - smaller genetic database '
    'config (reduced_dbs) or full genetic database config (full_dbs)')

  parser.add_argument(
    '--model_preset', default='multimer',
    choices=['monomer', 'monomer_casp14', 'monomer_ptm', 'multimer'],
    help='Choose preset model configuration - the monomer model, the monomer model '
    'with extra ensembling, monomer model with pTM head, or multimer model')

  # type=int rejects non-numeric counts at parse time.
  parser.add_argument(
    '--num_multimer_predictions_per_model', type=int, default=1,
    help='How many predictions (each with a different random seed) will be '
    'generated per model. E.g. if this is 2 and there are 5 '
    'models then there will be 10 predictions per input. '
    'Note: this FLAG only applies if model_preset=multimer')

  # The two options below use str_to_bool like the other boolean options, so
  # every boolean accepts the same spellings (yes/no, true/false, 1/0, ...).
  parser.add_argument(
    '--benchmark', type=str_to_bool, default=False,
    help='Run multiple JAX model evaluations to obtain a timing that excludes the '
    'compilation time, which should be more indicative of the time required '
    'for inferencing many proteins.')

  parser.add_argument(
    '--use_precomputed_msas', type=str_to_bool, default=True,
    help='Whether to read MSAs that have been written to disk. WARNING: This will '
    'not check if the sequence, database or configuration have changed.')

  return parser.parse_args(argv)

def str_to_bool(v):
    """Interpret a command-line token as a boolean.

    Written to be used as an argparse ``type=`` callable. A value that is
    already a ``bool`` is returned unchanged; any other value is lower-cased
    and compared against the accepted spellings.

    Returns:
        True for 'yes', 'true', 't', 'y', '1' and False for 'no', 'false',
        'f', 'n', '0' (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: if the text is none of the spellings above.
    """
    if isinstance(v, bool):
        return v

    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')

    token = v.lower()
    if token in truthy:
        return True
    if token in falsy:
        return False

    import argparse
    raise argparse.ArgumentTypeError('Boolean value expected.')

def main():
  """Build the AlphaFold command line and run it inside the Singularity image.

  Reads the options from parse_args(), derives the individual database paths
  from args.data_dir, assembles the flags forwarded to the image, prints the
  final command and executes it.

  Raises:
    subprocess.CalledProcessError: if the container command exits with a
      non-zero status (check=True).
  """
  import os
  import shlex
  import subprocess

  args = parse_args()
  data_dir = args.data_dir

  # You can individually override the following paths if you have placed the
  # data in locations other than args.data_dir.

  # Path to the Uniref90 database for use by JackHMMER.
  uniref90_database_path = os.path.join(
      data_dir, 'uniref90', 'uniref90.fasta')

  # Path to the Uniprot database for use by JackHMMER.
  uniprot_database_path = os.path.join(
      data_dir, 'uniprot', 'uniprot.fasta')

  # Path to the MGnify database for use by JackHMMER.
  mgnify_database_path = os.path.join(
      data_dir, 'mgnify', 'mgy_clusters_2022_05.fa')

  # Path to the BFD database for use by HHblits.
  bfd_database_path = os.path.join(
      data_dir, 'bfd',
      'bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt')

  # Path to the Small BFD database for use by JackHMMER.
  small_bfd_database_path = os.path.join(
      data_dir, 'small_bfd', 'bfd-first_non_consensus_sequences.fasta')

  # Path to the Uniref30 database for use by HHblits.
  uniref30_database_path = os.path.join(
      data_dir, 'uniref30', 'UniRef30_2021_03')

  # Path to the PDB70 database for use by HHsearch.
  pdb70_database_path = os.path.join(data_dir, 'pdb70', 'pdb70')

  # Path to the PDB seqres database for use by hmmsearch.
  pdb_seqres_database_path = os.path.join(
      data_dir, 'pdb_seqres', 'pdb_seqres.txt')

  # Path to a directory with template mmCIF structures, each named <pdb_id>.cif.
  template_mmcif_dir = os.path.join(data_dir, 'pdb_mmcif', 'mmcif_files')

  # Path to a file mapping obsolete PDB IDs to their replacements.
  obsolete_pdbs_path = os.path.join(data_dir, 'pdb_mmcif', 'obsolete.dat')

  # Databases that are passed for every preset.
  database_paths = [
      ('uniref90_database_path', uniref90_database_path),
      ('mgnify_database_path', mgnify_database_path),
      ('data_dir', data_dir),
      ('template_mmcif_dir', template_mmcif_dir),
      ('obsolete_pdbs_path', obsolete_pdbs_path),
  ]

  # The multimer preset gets Uniprot and PDB seqres; every other model preset
  # gets PDB70.
  if args.model_preset == 'multimer':
    database_paths.append(('uniprot_database_path', uniprot_database_path))
    database_paths.append(('pdb_seqres_database_path',
                           pdb_seqres_database_path))
  else:
    database_paths.append(('pdb70_database_path', pdb70_database_path))

  # reduced_dbs gets Small BFD; full_dbs gets Uniref30 and the full BFD.
  if args.db_preset == 'reduced_dbs':
    database_paths.append(('small_bfd_database_path', small_bfd_database_path))
  else:
    database_paths.append(('uniref30_database_path', uniref30_database_path))
    database_paths.append(('bfd_database_path', bfd_database_path))

  # Flags forwarded to the image: FASTA paths, then databases (empty paths are
  # skipped), then the run options.
  command_args = [f'--fasta_paths={args.fasta_paths}']
  command_args.extend(
      f'--{name}={path}' for name, path in database_paths if path)
  command_args.extend([
      f'--output_dir={args.output_dir}',
      f'--max_template_date={args.max_template_date}',
      f'--db_preset={args.db_preset}',
      f'--model_preset={args.model_preset}',
      f'--num_multimer_predictions_per_model={args.num_multimer_predictions_per_model}',
      f'--run_relax={args.run_relax}',
      f'--use_gpu_relax={args.use_gpu_relax}',
      f'--benchmark={args.benchmark}',
      f'--use_precomputed_msas={args.use_precomputed_msas}',
      '--logtostderr',
  ])

  env_vars = {
      'CUDA_VISIBLE_DEVICES': args.gpu_devices,
      'NVIDIA_VISIBLE_DEVICES': args.gpu_devices,
      # The following flags allow us to make predictions on proteins that
      # would typically be too long to fit into GPU memory.
      'TF_FORCE_UNIFIED_MEMORY': '1',
      'XLA_PYTHON_CLIENT_MEM_FRACTION': '4.0',
  }

  # AlphaFold uses Python tempfile which uses TMPDIR env variable
  tempdir = os.environ.get('TMPDIR', '/tmp')

  # The command is kept as an argument list (no shell), so paths containing
  # spaces or shell metacharacters reach singularity unchanged and no
  # particular shell has to be installed.
  cmd = ['singularity', 'run', '--nv']  # --nv: Nvidia container library for CUDA

  bind_paths = (
      args.mount_data_dir,  # Mount AlphaFold databases
      os.getcwd(),          # Mount current directory for sequence
      tempdir,              # Mount scratch directory
  )
  for bind_path in bind_paths:
    cmd.extend(['-B', bind_path])

  # One --env flag per variable: gpu_devices may itself be a comma separated
  # list, so the KEY=VALUE pairs are not joined into one comma separated value.
  for key, value in env_vars.items():
    cmd.extend(['--env', f'{key}={value}'])

  cmd.append(args.singularity_image_path)
  cmd.extend(command_args)

  # Show a copy-pasteable, shell-quoted form of the command. Flush so the line
  # is emitted before the child process starts writing to the same stream.
  print(' '.join(shlex.quote(part) for part in cmd), flush=True)

  # The child inherits this process's stdout/stderr; check=True raises on a
  # non-zero exit status.
  subprocess.run(cmd, check=True)

# Run the launcher only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
  main()