import gc
import unittest

import numpy as np
import torch

from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu


# Turn TF32 matmuls off at import time; the tests below compare outputs
# against hard-coded reference values with a 1e-2 tolerance.
torch.backends.cuda.matmul.allow_tf32 = False


class PipelineFastTests(unittest.TestCase):
    """CPU test of ``DanceDiffusionPipeline`` driven by a small seeded UNet."""

    def tearDown(self):
        """Run garbage collection and empty the CUDA cache after each test."""
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_unet(self):
        """Build a new small ``UNet1DModel`` after seeding torch with 0.

        Every access re-seeds the global torch RNG and constructs a fresh
        model, so callers should keep a reference instead of reading the
        property repeatedly.
        """
        torch.manual_seed(0)
        model = UNet1DModel(
            block_out_channels=(32, 32, 64),
            extra_in_channels=16,
            sample_size=512,
            sample_rate=16_000,
            in_channels=2,
            out_channels=2,
            flip_sin_to_cos=True,
            use_timestep_embedding=False,
            time_embedding_type="fourier",
            mid_block_type="UNetMidBlock1D",
            down_block_types=["DownBlock1DNoSkip"] + ["DownBlock1D"] + ["AttnDownBlock1D"],
            up_block_types=["AttnUpBlock1D"] + ["UpBlock1D"] + ["UpBlock1DNoSkip"],
        )
        return model

    def test_dance_diffusion(self):
        """Check output shape and values for both return styles of the pipeline.

        The pipeline is run twice with identically seeded generators: once
        reading ``.audios`` from the returned object and once with
        ``return_dict=False``, where the audio is the first tuple element.
        Both results are compared against the same reference slice.
        """
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        scheduler = IPNDMScheduler()

        pipe = DanceDiffusionPipeline(unet=self.dummy_unet, scheduler=scheduler)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(0)
        output = pipe(generator=generator, num_inference_steps=4)
        audio = output.audios

        # Fresh generator with the same seed so the second run is comparable.
        generator = torch.Generator(device=device).manual_seed(0)
        output = pipe(generator=generator, num_inference_steps=4, return_dict=False)
        audio_from_tuple = output[0]

        audio_slice = audio[0, -3:, -3:]
        audio_from_tuple_slice = audio_from_tuple[0, -3:, -3:]

        # Read the size from the model the pipeline actually holds; going
        # through ``self.dummy_unet`` again would build another UNet and
        # re-seed the global RNG just to look up one attribute.
        assert audio.shape == (1, 2, pipe.unet.sample_size)
        expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000])
        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(audio_from_tuple_slice.flatten() - expected_slice).max() < 1e-2


@slow
@require_torch_gpu
class PipelineIntegrationTests(unittest.TestCase):
    """Tests of the pretrained ``harmonai/maestro-150k`` checkpoint on ``torch_device``."""

    def tearDown(self):
        """Run garbage collection and empty the CUDA cache after each test."""
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_dance_diffusion(self):
        """Run 100 inference steps with default dtype and compare to reference values."""
        device = torch_device

        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(0)
        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
        audio = output.audios

        audio_slice = audio[0, -3:, -3:]

        assert audio.shape == (1, 2, pipe.unet.sample_size)
        expected_slice = np.array([-0.1576, -0.1526, -0.127, -0.2699, -0.2762, -0.2487])
        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2

    def test_dance_diffusion_fp16(self):
        """Same as ``test_dance_diffusion`` but loading the checkpoint as ``torch.float16``."""
        device = torch_device

        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(0)
        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
        audio = output.audios

        audio_slice = audio[0, -3:, -3:]

        assert audio.shape == (1, 2, pipe.unet.sample_size)
        expected_slice = np.array([-0.1693, -0.1698, -0.1447, -0.3044, -0.3203, -0.2937])
        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2