From eac74078775e74595cdc994178dbb3a19ed76eff Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Fri, 6 Mar 2026 12:10:19 -0800 Subject: [PATCH] Update torchmultimodal test golden values after trunc_normal_ change Summary: D94908913 changed `torch.nn.init.trunc_normal_` from an erfinv-based implementation to a rejection-sampling one, which produces different numerical outputs with the same seed. This updates the hardcoded golden values in torchmultimodal tests to match the new implementation. Updated tests: - test_albef.py: image_embeddings, image_embeddings_momentum, multimodal_embeddings, multimodal_embeddings_momentum - test_image_encoder.py: test_vision_transformer - test_omnivore.py: swin_t, swin_s, swin_b forward tests - test_swin_transformer_3d_encoder.py: test_swin_transformer_3d_encoder - test_vision_transformer.py: test_vision_transformer_patch_drop (NOTE: this file is not among the 4 files changed in this patch) Partial fix for T258469553 Differential Revision: D95591588 --- tests/models/albef/test_albef.py | 16 ++++++++-------- tests/models/albef/test_image_encoder.py | 4 ++-- tests/models/test_omnivore.py | 18 +++++++++--------- .../test_swin_transformer_3d_encoder.py | 2 +- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/models/albef/test_albef.py b/tests/models/albef/test_albef.py index 922712a39..8b0663151 100644 --- a/tests/models/albef/test_albef.py +++ b/tests/models/albef/test_albef.py @@ -82,8 +82,8 @@ def albef_model_output(albef_model): def test_albef_image_embeddings(albef_model_output): expected = Tensor( [ - [[1.364883, -1.003092, -0.361791], [-0.634884, 1.411830, -0.776947]], - [[1.401580, -0.537510, -0.864071], [1.378901, -0.417473, -0.961429]], + [[1.337287, -0.270253, -1.067032], [1.414201, -0.705347, -0.708854]], + [[1.402044, -0.540827, -0.861216], [-1.410284, 0.613923, 0.796362]], ] ) assert_expected(albef_model_output.image_embeddings, expected, rtol=0, atol=1e-4) @@ -92,8 +92,8 @@ def test_albef_image_embeddings(albef_model_output): def test_albef_image_embeddings_momentum(albef_model_output): 
expected = Tensor( [ - [[1.364883, -1.003092, -0.361791], [-0.634884, 1.411830, -0.776947]], - [[1.401580, -0.537510, -0.864070], [1.378902, -0.417473, -0.961429]], + [[1.337286, -0.270253, -1.067033], [1.414201, -0.705347, -0.708854]], + [[1.402043, -0.540827, -0.861217], [-1.410284, 0.613922, 0.796362]], ] ) assert_expected(albef_model_output.image_embeddings_m, expected, rtol=0, atol=1e-4) @@ -122,8 +122,8 @@ def test_albef_text_embeddings_momentum(albef_model_output): def test_albef_multimodal_embeddings(albef_model_output): expected = Tensor( [ - [[-0.068738, 1.257666, -1.188928], [1.409873, -0.609056, -0.800817]], - [[-1.402520, 0.544084, 0.858435], [1.202279, -1.246038, 0.043760]], + [[1.228663, -0.007874, -1.220789], [1.401657, -0.863674, -0.537983]], + [[-1.021501, 1.357746, -0.336245], [1.409910, -0.800425, -0.609485]], ] ) assert_expected( @@ -134,8 +134,8 @@ def test_albef_multimodal_embeddings(albef_model_output): def test_albef_multimodal_embeddings_momentum(albef_model_output): expected = Tensor( [ - [[-0.068738, 1.257666, -1.188928], [1.409873, -0.609056, -0.800817]], - [[-1.402520, 0.544084, 0.858435], [1.202279, -1.246038, 0.043760]], + [[1.228662, -0.007872, -1.220790], [1.401657, -0.863674, -0.537983]], + [[-1.021501, 1.357746, -0.336245], [1.409910, -0.800426, -0.609485]], ] ) assert_expected( diff --git a/tests/models/albef/test_image_encoder.py b/tests/models/albef/test_image_encoder.py index df580c2a2..ed461d27d 100644 --- a/tests/models/albef/test_image_encoder.py +++ b/tests/models/albef/test_image_encoder.py @@ -31,8 +31,8 @@ def test_vision_transformer(self): output = vit(input) expected = Tensor( [ - [1.399478, -0.875986, -0.523492], - [-0.869867, 1.400589, -0.530722], + [1.407929, -0.819281, -0.588648], + [-0.692709, 1.414116, -0.721407], ] ).unsqueeze(0) assert_expected(output, expected, rtol=0, atol=1e-4) diff --git a/tests/models/test_omnivore.py b/tests/models/test_omnivore.py index 1134d20da..458e41a6d 100644 --- 
a/tests/models/test_omnivore.py +++ b/tests/models/test_omnivore.py @@ -40,19 +40,19 @@ def test_omnivore_swin_t_forward(omnivore_swin_t_model, device): assert_expected(image_score.size(), torch.Size((1, 1000))) assert_expected( - image_score.abs().sum(), torch.tensor(184.01417), rtol=1e-3, atol=1e-3 + image_score.abs().sum(), torch.tensor(208.64470), rtol=1e-3, atol=1e-3 ) rgbd = torch.randn((1, 4, 1, 112, 112), device=device) rgbd_score = model(rgbd, input_type="rgbd") assert_expected(rgbd_score.size(), torch.Size((1, 19))) - assert_expected(rgbd_score.abs().sum(), torch.tensor(3.60813), rtol=1e-3, atol=1e-3) + assert_expected(rgbd_score.abs().sum(), torch.tensor(4.42536), rtol=1e-3, atol=1e-3) video = torch.randn((1, 3, 4, 112, 112), device=device) video_score = model(video, input_type="video") assert_expected(video_score.size(), torch.Size((1, 400))) assert_expected( - video_score.abs().sum(), torch.tensor(110.70048), rtol=1e-3, atol=1e-3 + video_score.abs().sum(), torch.tensor(71.27126), rtol=1e-3, atol=1e-3 ) @@ -64,19 +64,19 @@ def test_omnivore_swin_s_forward(omnivore_swin_s_model, device): assert_expected(image_score.size(), torch.Size((1, 1000))) assert_expected( - image_score.abs().sum(), torch.tensor(239.73104), rtol=1e-3, atol=1e-3 + image_score.abs().sum(), torch.tensor(215.43909), rtol=1e-3, atol=1e-3 ) rgbd = torch.randn((1, 4, 1, 112, 112), device=device) rgbd_score = model(rgbd, input_type="rgbd") assert_expected(rgbd_score.size(), torch.Size((1, 19))) - assert_expected(rgbd_score.abs().sum(), torch.tensor(5.80919), rtol=1e-3, atol=1e-3) + assert_expected(rgbd_score.abs().sum(), torch.tensor(4.23604), rtol=1e-3, atol=1e-3) video = torch.randn((1, 3, 4, 112, 112), device=device) video_score = model(video, input_type="video") assert_expected(video_score.size(), torch.Size((1, 400))) assert_expected( - video_score.abs().sum(), torch.tensor(136.49894), rtol=1e-3, atol=1e-3 + video_score.abs().sum(), torch.tensor(136.16075), rtol=1e-3, atol=1e-3 ) @@ 
-88,19 +88,19 @@ def test_omnivore_swin_b_forward(omnivore_swin_b_model, device): assert_expected(image_score.size(), torch.Size((1, 1000))) assert_expected( - image_score.abs().sum(), torch.tensor(278.06488), rtol=1e-3, atol=1e-3 + image_score.abs().sum(), torch.tensor(251.88954), rtol=1e-3, atol=1e-3 ) rgbd = torch.randn((1, 4, 1, 112, 112), device=device) rgbd_score = model(rgbd, input_type="rgbd") assert_expected(rgbd_score.size(), torch.Size((1, 19))) - assert_expected(rgbd_score.abs().sum(), torch.tensor(4.52186), rtol=1e-3, atol=1e-3) + assert_expected(rgbd_score.abs().sum(), torch.tensor(3.82680), rtol=1e-3, atol=1e-3) video = torch.randn((1, 3, 4, 112, 112), device=device) video_score = model(video, input_type="video") assert_expected(video_score.size(), torch.Size((1, 400))) assert_expected( - video_score.abs().sum(), torch.tensor(138.22859), rtol=1e-3, atol=1e-3 + video_score.abs().sum(), torch.tensor(140.93663), rtol=1e-3, atol=1e-3 ) diff --git a/tests/modules/encoders/test_swin_transformer_3d_encoder.py b/tests/modules/encoders/test_swin_transformer_3d_encoder.py index d41544cb2..ed33d331b 100644 --- a/tests/modules/encoders/test_swin_transformer_3d_encoder.py +++ b/tests/modules/encoders/test_swin_transformer_3d_encoder.py @@ -38,7 +38,7 @@ def test_swin_transformer_3d_encoder(self): scores = self.encoder(image) self.assertEqual(scores.size(), torch.Size([1, 768])) - self.assertAlmostEqual(scores.abs().sum().item(), 247.14674, 2) + self.assertAlmostEqual(scores.abs().sum().item(), 279.50031, 2) def test_swin_transformer_3d_scripting(self): torch.jit.script(self.encoder)