Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ echo "Starting verification for the converted gemma2-2b model..."
echo "Creating local directory for HF checkpoints: ${LOCAL_HF_CHECKPOINT_DIR}"
mkdir -p "${LOCAL_HF_CHECKPOINT_DIR}"
echo "Downloading HF checkpoints from ${HF_CHECKPOINT_GCS_PATH} to ${LOCAL_HF_CHECKPOINT_DIR}..."
gsutil -m cp -r "${HF_CHECKPOINT_GCS_PATH}/*" "${LOCAL_HF_CHECKPOINT_DIR}/"
gcloud storage cp --recursive "${HF_CHECKPOINT_GCS_PATH}/*" "${LOCAL_HF_CHECKPOINT_DIR}/"
echo "Download complete."

python3 -m tests.utils.forward_pass_logit_checker \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ echo "Starting verification for the converted gemma3-4b model..."
echo "Creating local directory for HF checkpoints: ${LOCAL_HF_CHECKPOINT_DIR}"
mkdir -p "${LOCAL_HF_CHECKPOINT_DIR}"
echo "Downloading HF checkpoints from ${HF_CHECKPOINT_GCS_PATH} to ${LOCAL_HF_CHECKPOINT_DIR}..."
gsutil -m cp -r "${HF_CHECKPOINT_GCS_PATH}/*" "${LOCAL_HF_CHECKPOINT_DIR}/"
gcloud storage cp --recursive "${HF_CHECKPOINT_GCS_PATH}/*" "${LOCAL_HF_CHECKPOINT_DIR}/"
echo "Download complete."

python3 -m tests.utils.forward_pass_logit_checker \
Expand Down
6 changes: 3 additions & 3 deletions src/maxtext/common/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,10 @@ def deactivate(self, blocking_object=None):
max_logging.log("WARNING: library for nsys was not loaded \n" "profiler has no effect")
return
# Popen() instead of run() for non-blocking behavior
if shutil.which("gsutil") is not None:
subprocess.Popen(["gsutil", "cp", "*nsys-rep", self.output_path]) # pylint: disable=consider-using-with
if shutil.which("gcloud") is not None:
subprocess.Popen(["gcloud", "storage", "cp", "*nsys-rep", self.output_path]) # pylint: disable=consider-using-with
else:
max_logging.log("WARNING: gsutil is not installed or not found in the system's PATH. Skipping upload...")
max_logging.log("WARNING: gcloud is not installed or not found in the system's PATH. Skipping upload...")
elif self.mode == "xplane":
jax.profiler.stop_trace()

Expand Down
2 changes: 1 addition & 1 deletion tests/end_to_end/gpu/a3/test_convergence_125m_params.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ if [ "$DATASET_TYPE" == "hf" ]
then
# We use a local copy of tokenizer from https://huggingface.co/meta-llama/Llama-2-7b-hf
# Alternatively, you can set tokenizer_path="meta-llama/Llama-2-7b-hf" and hf_access_token="<your-token>" after gaining access through HF website.
gsutil cp -r gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"
gcloud storage cp --recursive gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"
CMD_DATA=" hf_path=parquet hf_data_files=gs://maxtext-dataset/hf/c4/c4-train-*.parquet dataset_type=hf tokenizer_path=${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}/llama2-tokenizer"
fi

Expand Down
2 changes: 1 addition & 1 deletion tests/end_to_end/gpu/a3/test_convergence_1b_params.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ if [ "$DATASET_TYPE" == "hf" ]
then
# We use a local copy of tokenizer from https://huggingface.co/meta-llama/Llama-2-7b-hf
# Alternatively, you can set tokenizer_path="meta-llama/Llama-2-7b-hf" and hf_access_token="<your-token>" after gaining access through HF website.
gsutil cp -r gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"
gcloud storage cp --recursive gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"
CMD_DATA=" hf_path=parquet hf_data_files=gs://maxtext-dataset/hf/c4/c4-train-*.parquet dataset_type=hf tokenizer_path=${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}/llama2-tokenizer"
fi

Expand Down
2 changes: 1 addition & 1 deletion tests/end_to_end/tpu/test_convergence_1b_params.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ if [ "$DATASET_TYPE" == "hf" ]
then
# We use a local copy of tokenizer from https://huggingface.co/meta-llama/Llama-2-7b-hf
# Alternatively, you can set tokenizer_path="meta-llama/Llama-2-7b-hf" and hf_access_token="<your-token>" after gaining access through HF website.
gsutil cp -r gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"
gcloud storage cp --recursive gs://maxtext-dataset/hf/llama2-tokenizer "${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"
CMD_DATA=" hf_path=parquet tokenizer_path=${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}/llama2-tokenizer \
hf_train_files=$DATASET_PATH/hf/c4/c4-train-*.parquet \
hf_eval_split=train \
Expand Down
4 changes: 1 addition & 3 deletions tests/end_to_end/tpu/test_decode_save_quantized_ckpt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,5 @@ echo
echo "Output directed to: ${OUTFILE}"
echo
echo "Checkpoint saved at:$SAVE_QUANTIZED_CHECKPOINT_PATH"
${cmd} gsutil ls -lh $SAVE_QUANTIZED_CHECKPOINT_PATH >> ${OUTFILE}
${cmd} gcloud storage ls --long --readable-sizes $SAVE_QUANTIZED_CHECKPOINT_PATH >> ${OUTFILE}
echo


9 changes: 5 additions & 4 deletions tests/integration/sft_trainer_correctness_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def get_token_log_probs(logits, inputs):
return token_log_probs


@pytest.mark.external_training # setUpClass does gsutil tokenizer
@pytest.mark.external_training # setUpClass does gcloud storage tokenizer
class SFTTrainerCorrectnessTest(unittest.TestCase):

@classmethod
Expand All @@ -160,15 +160,16 @@ def setUpClass(cls):

exit_code = subprocess.call(
[
"gsutil",
"gcloud",
"storage",
"cp",
"-r",
"--recursive",
"gs://maxtext-dataset/hf/llama2-chat-tokenizer",
os.path.join(MAXTEXT_ASSETS_ROOT, ""),
]
)
if exit_code != 0:
raise ValueError(f"Download tokenizer with gsutil cp failed with exit code: {exit_code}")
raise ValueError(f"Download tokenizer with gcloud storage cp failed with exit code: {exit_code}")

@pytest.mark.skip(reason="Logit output test fragile, failing on jax upgrade to 0.6.2 b/425997645")
@pytest.mark.integration_test
Expand Down
7 changes: 4 additions & 3 deletions tests/unit/distillation_data_processing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,16 @@ def setUpClass(cls):
super().setUpClass()
exit_code = subprocess.call(
[
"gsutil",
"gcloud",
"storage",
"cp",
"-r",
"--recursive",
"gs://maxtext-dataset/hf/llama2-chat-tokenizer",
os.path.join(MAXTEXT_ASSETS_ROOT, ""),
]
)
if exit_code != 0:
raise ValueError(f"Download tokenizer with gsutil cp failed with exit code: {exit_code}")
raise ValueError(f"Download tokenizer with gcloud storage cp failed with exit code: {exit_code}")

def setUp(self):
super().setUp()
Expand Down
2 changes: 1 addition & 1 deletion tools/data_generation/download_dataset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ function remove_trailing_slash {
fi
}

gsutil -u $1 -m cp 'gs://allennlp-tensorflow-datasets/c4/en/3.0.1/*' $(remove_trailing_slash $2)/c4/en/3.0.1
# Bill the transfer to the project given as $1 (the gcloud equivalent of
# gsutil's top-level `-u` flag). No `-m` counterpart is needed: gcloud storage
# parallelises copies by default.
gcloud storage cp --billing-project="$1" 'gs://allennlp-tensorflow-datasets/c4/en/3.0.1/*' $(remove_trailing_slash $2)/c4/en/3.0.1