From 2d94d30c7b39007ef35d97401c705ca7ab8bf5ee Mon Sep 17 00:00:00 2001 From: gurusai-voleti Date: Wed, 18 Feb 2026 11:46:54 +0000 Subject: [PATCH 1/2] chore: Migrate gsutil usage to gcloud storage --- .../examples/convert_gemma2_to_hf.sh | 2 +- .../examples/convert_gemma3_to_hf.sh | 2 +- src/maxtext/common/profiler.py | 6 +++--- tests/end_to_end/gpu/a3/test_convergence_125m_params.sh | 2 +- tests/end_to_end/gpu/a3/test_convergence_1b_params.sh | 2 +- tests/end_to_end/tpu/test_convergence_1b_params.sh | 2 +- tests/end_to_end/tpu/test_decode_save_quantized_ckpt.sh | 4 +--- tests/integration/sft_trainer_correctness_test.py | 9 +++++---- tests/unit/distillation_data_processing_test.py | 7 ++++--- tools/data_generation/download_dataset.sh | 3 ++- 10 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/maxtext/checkpoint_conversion/examples/convert_gemma2_to_hf.sh b/src/maxtext/checkpoint_conversion/examples/convert_gemma2_to_hf.sh index 027b03fe40..d2084da4c6 100644 --- a/src/maxtext/checkpoint_conversion/examples/convert_gemma2_to_hf.sh +++ b/src/maxtext/checkpoint_conversion/examples/convert_gemma2_to_hf.sh @@ -43,7 +43,7 @@ echo "Starting verification for the converted gemma2-2b model..." echo "Creating local directory for HF checkpoints: ${LOCAL_HF_CHECKPOINT_DIR}" mkdir -p "${LOCAL_HF_CHECKPOINT_DIR}" echo "Downloading HF checkpoints from ${HF_CHECKPOINT_GCS_PATH} to ${LOCAL_HF_CHECKPOINT_DIR}..." -gsutil -m cp -r "${HF_CHECKPOINT_GCS_PATH}/*" "${LOCAL_HF_CHECKPOINT_DIR}/" +gcloud storage cp --recursive "${HF_CHECKPOINT_GCS_PATH}/*" "${LOCAL_HF_CHECKPOINT_DIR}/" echo "Download complete." 
python3 -m tests.utils.forward_pass_logit_checker \ diff --git a/src/maxtext/checkpoint_conversion/examples/convert_gemma3_to_hf.sh b/src/maxtext/checkpoint_conversion/examples/convert_gemma3_to_hf.sh index 72fe79d140..5489d962d5 100644 --- a/src/maxtext/checkpoint_conversion/examples/convert_gemma3_to_hf.sh +++ b/src/maxtext/checkpoint_conversion/examples/convert_gemma3_to_hf.sh @@ -43,7 +43,7 @@ echo "Starting verification for the converted gemma3-4b model..." echo "Creating local directory for HF checkpoints: ${LOCAL_HF_CHECKPOINT_DIR}" mkdir -p "${LOCAL_HF_CHECKPOINT_DIR}" echo "Downloading HF checkpoints from ${HF_CHECKPOINT_GCS_PATH} to ${LOCAL_HF_CHECKPOINT_DIR}..." -gsutil -m cp -r "${HF_CHECKPOINT_GCS_PATH}/*" "${LOCAL_HF_CHECKPOINT_DIR}/" +gcloud storage cp --recursive "${HF_CHECKPOINT_GCS_PATH}/*" "${LOCAL_HF_CHECKPOINT_DIR}/" echo "Download complete." python3 -m tests.utils.forward_pass_logit_checker \ diff --git a/src/maxtext/common/profiler.py b/src/maxtext/common/profiler.py index c49bb5ebaa..0504a44cad 100644 --- a/src/maxtext/common/profiler.py +++ b/src/maxtext/common/profiler.py @@ -128,10 +128,10 @@ def deactivate(self, blocking_object=None): max_logging.log("WARNING: library for nsys was not loaded \n" "profiler has no effect") return # Popen() instead of run() for non-blocking behavior - if shutil.which("gsutil") is not None: - subprocess.Popen(["gsutil", "cp", "*nsys-rep", self.output_path]) # pylint: disable=consider-using-with + if shutil.which("gcloud") is not None: + subprocess.Popen(["gcloud", "storage", "cp", "*nsys-rep", self.output_path]) # pylint: disable=consider-using-with else: - max_logging.log("WARNING: gsutil is not installed or not found in the system's PATH. Skipping upload...") + max_logging.log("WARNING: gcloud is not installed or not found in the system's PATH. 
Skipping upload...") elif self.mode == "xplane": jax.profiler.stop_trace() diff --git a/tests/end_to_end/gpu/a3/test_convergence_125m_params.sh b/tests/end_to_end/gpu/a3/test_convergence_125m_params.sh index 20f2fe0591..e0782ea18f 100644 --- a/tests/end_to_end/gpu/a3/test_convergence_125m_params.sh +++ b/tests/end_to_end/gpu/a3/test_convergence_125m_params.sh @@ -43,7 +43,7 @@ if [ "$DATASET_TYPE" == "hf" ] then # We use a local copy of tokenizer from https://huggingface.co/meta-llama/Llama-2-7b-hf # Alternatively, you can set tokenizer_path="meta-llama/Llama-2-7b-hf" and hf_access_token="" after gaining access through HF website. - gsutil cp -r gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}" + gcloud storage cp --recursive gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}" CMD_DATA=" hf_path=parquet hf_data_files=gs://maxtext-dataset/hf/c4/c4-train-*.parquet dataset_type=hf tokenizer_path=${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}/llama2-tokenizer" fi diff --git a/tests/end_to_end/gpu/a3/test_convergence_1b_params.sh b/tests/end_to_end/gpu/a3/test_convergence_1b_params.sh index 4ea75426c2..47a7e8ae14 100644 --- a/tests/end_to_end/gpu/a3/test_convergence_1b_params.sh +++ b/tests/end_to_end/gpu/a3/test_convergence_1b_params.sh @@ -43,7 +43,7 @@ if [ "$DATASET_TYPE" == "hf" ] then # We use a local copy of tokenizer from https://huggingface.co/meta-llama/Llama-2-7b-hf # Alternatively, you can set tokenizer_path="meta-llama/Llama-2-7b-hf" and hf_access_token="" after gaining access through HF website. 
- gsutil cp -r gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}" + gcloud storage cp --recursive gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}" CMD_DATA=" hf_path=parquet hf_data_files=gs://maxtext-dataset/hf/c4/c4-train-*.parquet dataset_type=hf tokenizer_path=${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}/llama2-tokenizer" fi diff --git a/tests/end_to_end/tpu/test_convergence_1b_params.sh b/tests/end_to_end/tpu/test_convergence_1b_params.sh index 49fd76c5d5..dfc0747c5f 100644 --- a/tests/end_to_end/tpu/test_convergence_1b_params.sh +++ b/tests/end_to_end/tpu/test_convergence_1b_params.sh @@ -49,7 +49,7 @@ if [ "$DATASET_TYPE" == "hf" ] then # We use a local copy of tokenizer from https://huggingface.co/meta-llama/Llama-2-7b-hf # Alternatively, you can set tokenizer_path="meta-llama/Llama-2-7b-hf" and hf_access_token="" after gaining access through HF website. 
- gsutil cp -r gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}" + gcloud storage cp --recursive gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}" CMD_DATA=" hf_path=parquet tokenizer_path=${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}/llama2-tokenizer \ hf_train_files=$DATASET_PATH/hf/c4/c4-train-*.parquet \ hf_eval_split=train \ diff --git a/tests/end_to_end/tpu/test_decode_save_quantized_ckpt.sh b/tests/end_to_end/tpu/test_decode_save_quantized_ckpt.sh index b8a5bbe3b7..8185360b37 100644 --- a/tests/end_to_end/tpu/test_decode_save_quantized_ckpt.sh +++ b/tests/end_to_end/tpu/test_decode_save_quantized_ckpt.sh @@ -71,7 +71,5 @@ echo echo "Output directed to: ${OUTFILE}" echo echo "Checkpoint saved at:$SAVE_QUANTIZED_CHECKPOINT_PATH" -${cmd} gsutil ls -lh $SAVE_QUANTIZED_CHECKPOINT_PATH >> ${OUTFILE} +${cmd} gcloud storage ls --long --readable-sizes $SAVE_QUANTIZED_CHECKPOINT_PATH >> ${OUTFILE} echo - - diff --git a/tests/integration/sft_trainer_correctness_test.py b/tests/integration/sft_trainer_correctness_test.py index efc7245ffa..e57ad9956f 100644 --- a/tests/integration/sft_trainer_correctness_test.py +++ b/tests/integration/sft_trainer_correctness_test.py @@ -147,7 +147,7 @@ def get_token_log_probs(logits, inputs): return token_log_probs -@pytest.mark.external_training # setUpClass does gsutil tokenizer +@pytest.mark.external_training # setUpClass does gcloud storage tokenizer class SFTTrainerCorrectnessTest(unittest.TestCase): @classmethod @@ -160,15 +160,16 @@ def setUpClass(cls): exit_code = subprocess.call( [ - "gsutil", + "gcloud", + "storage", "cp", - "-r", + "--recursive", "gs://maxtext-dataset/hf/llama2-chat-tokenizer", os.path.join(MAXTEXT_ASSETS_ROOT, ""), ] ) if exit_code != 0: - raise ValueError(f"Download 
tokenizer with gsutil cp failed with exit code: {exit_code}") + raise ValueError(f"Download tokenizer with gcloud storage cp failed with exit code: {exit_code}") @pytest.mark.skip(reason="Logit output test fragile, failing on jax upgrade to 0.6.2 b/425997645") @pytest.mark.integration_test diff --git a/tests/unit/distillation_data_processing_test.py b/tests/unit/distillation_data_processing_test.py index c810fb9c12..bb05aff921 100644 --- a/tests/unit/distillation_data_processing_test.py +++ b/tests/unit/distillation_data_processing_test.py @@ -79,15 +79,16 @@ def setUpClass(cls): super().setUpClass() exit_code = subprocess.call( [ - "gsutil", + "gcloud", + "storage", "cp", - "-r", + "--recursive", "gs://maxtext-dataset/hf/llama2-chat-tokenizer", os.path.join(MAXTEXT_ASSETS_ROOT, ""), ] ) if exit_code != 0: - raise ValueError(f"Download tokenizer with gsutil cp failed with exit code: {exit_code}") + raise ValueError(f"Download tokenizer with gcloud storage cp failed with exit code: {exit_code}") def setUp(self): super().setUp() diff --git a/tools/data_generation/download_dataset.sh b/tools/data_generation/download_dataset.sh index 3713b1f158..a2484f252a 100755 --- a/tools/data_generation/download_dataset.sh +++ b/tools/data_generation/download_dataset.sh @@ -26,4 +26,5 @@ function remove_trailing_slash { fi } -gsutil -u $1 -m cp 'gs://allennlp-tensorflow-datasets/c4/en/3.0.1/*' $(remove_trailing_slash $2)/c4/en/3.0.1 +# The gsutil top-level flag '-u' is not supported by the migration guide and has been removed. 
+gcloud storage cp --billing-project="$1" 'gs://allennlp-tensorflow-datasets/c4/en/3.0.1/*' $(remove_trailing_slash $2)/c4/en/3.0.1 From 0f538d2cd0cbffb390b1be8cf1a001410309af76 Mon Sep 17 00:00:00 2001 From: gurusai-voleti Date: Wed, 18 Feb 2026 17:52:48 +0530 Subject: [PATCH 2/2] Update download_dataset.sh --- tools/data_generation/download_dataset.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/data_generation/download_dataset.sh b/tools/data_generation/download_dataset.sh index a2484f252a..722c017dd1 100755 --- a/tools/data_generation/download_dataset.sh +++ b/tools/data_generation/download_dataset.sh @@ -26,5 +26,4 @@ function remove_trailing_slash { fi } -# The gsutil top-level flag '-u' is not supported by the migration guide and has been removed. gcloud storage cp --billing-project="$1" 'gs://allennlp-tensorflow-datasets/c4/en/3.0.1/*' $(remove_trailing_slash $2)/c4/en/3.0.1