import gc
import random
import unittest

import numpy as np
import torch

from diffusers import AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, UNet2DConditionModel, UNet2DModel, VQModel
from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from ...test_pipelines_common import PipelineTesterMixin


# Disable TF32 matmuls so the numeric comparisons below are made against
# full-precision results.
torch.backends.cuda.matmul.allow_tf32 = False


class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Fast CycleDiffusionPipeline tests built from tiny, seeded dummy components."""

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_image(self):
        """Return a seeded random float tensor of shape (1, 3, 32, 32) on ``torch_device``."""
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_uncond_unet(self):
        """Return a small ``UNet2DModel`` (3 -> 3 channels) created under torch seed 0."""
        torch.manual_seed(0)
        model = UNet2DModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=3,
            out_channels=3,
            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
        )
        return model

    @property
    def dummy_cond_unet(self):
        """Return a small ``UNet2DConditionModel`` (4 -> 4 channels) created under torch seed 0."""
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_cond_unet_inpaint(self):
        """Return a small ``UNet2DConditionModel`` (9 -> 4 channels) created under torch seed 0."""
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=9,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vq_model(self):
        """Return a small ``VQModel`` with 3 latent channels created under torch seed 0."""
        torch.manual_seed(0)
        model = VQModel(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=3,
        )
        return model

    @property
    def dummy_vae(self):
        """Return a small ``AutoencoderKL`` with 4 latent channels created under torch seed 0."""
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        """Return a tiny ``CLIPTextModel`` (hidden size 32, 5 layers) created under torch seed 0."""
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config)

    @property
    def dummy_extractor(self):
        """Return a stub feature extractor.

        The stub is a callable that ignores its arguments and returns an
        object exposing an empty ``pixel_values`` tensor and a ``to(device)``
        method that returns the object itself.
        """

        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    # Tensor.to() returns a new tensor rather than moving the
                    # existing one in place, so the result must be kept.
                    self.pixel_values = self.pixel_values.to(device)
                    return self

            return Out()

        return extract

    def test_stable_diffusion_cycle(self):
        """Run the dummy pipeline on CPU for two steps and compare an output slice."""
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            num_train_timesteps=1000,
            clip_sample=False,
            set_alpha_to_one=False,
        )

        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # assemble the pipeline without a safety checker; the feature
        # extractor is the stub defined above
        sd_pipe = CycleDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        source_prompt = "An astronaut riding a horse"
        prompt = "An astronaut riding an elephant"
        init_image = self.dummy_image.to(device)

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt=prompt,
            source_prompt=source_prompt,
            generator=generator,
            num_inference_steps=2,
            init_image=init_image,
            eta=0.1,
            strength=0.8,
            guidance_scale=3,
            source_guidance_scale=1,
            output_type="np",
        )
        images = output.images

        # compare the bottom-right 3x3 patch of the last channel
        image_slice = images[0, -3:, -3:, -1]

        assert images.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4459, 0.4943, 0.4544, 0.6643, 0.5474, 0.4327, 0.5701, 0.5959, 0.5179])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_cycle_fp16(self):
        """Run the dummy pipeline in half precision on ``torch_device`` and compare an output slice."""
        unet = self.dummy_cond_unet
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            num_train_timesteps=1000,
            clip_sample=False,
            set_alpha_to_one=False,
        )

        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # cast every model component to fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # assemble the pipeline without a safety checker; the feature
        # extractor is the stub defined above
        sd_pipe = CycleDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        source_prompt = "An astronaut riding a horse"
        prompt = "An astronaut riding an elephant"
        init_image = self.dummy_image.to(torch_device)

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = sd_pipe(
            prompt=prompt,
            source_prompt=source_prompt,
            generator=generator,
            num_inference_steps=2,
            init_image=init_image,
            eta=0.1,
            strength=0.8,
            guidance_scale=3,
            source_guidance_scale=1,
            output_type="np",
        )
        images = output.images

        # compare the bottom-right 3x3 patch of the last channel
        image_slice = images[0, -3:, -3:, -1]

        assert images.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.3506, 0.4543, 0.446, 0.4575, 0.5195, 0.4155, 0.5273, 0.518, 0.4116])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2


@slow
@require_torch_gpu
class CycleDiffusionPipelineIntegrationTests(unittest.TestCase):
    """Slow GPU tests that run the pretrained pipeline against stored reference images."""

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_cycle_diffusion_pipeline_fp16(self):
        """Edit a black car into a blue car with fp16 weights and compare to the stored fp16 result."""
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/cycle-diffusion/black_colored_car.png"
        )
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car_fp16.npy"
        )
        init_image = init_image.resize((512, 512))

        model_id = "CompVis/stable-diffusion-v1-4"
        scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
        pipe = CycleDiffusionPipeline.from_pretrained(
            model_id, scheduler=scheduler, safety_checker=None, torch_dtype=torch.float16, revision="fp16"
        )

        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        source_prompt = "A black colored car"
        prompt = "A blue colored car"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            source_prompt=source_prompt,
            init_image=init_image,
            num_inference_steps=100,
            eta=0.1,
            strength=0.85,
            guidance_scale=3,
            source_guidance_scale=1,
            generator=generator,
            output_type="np",
        )
        image = output.images

        # the values aren't exactly equal, but the images look the same visually
        assert np.abs(image - expected_image).max() < 5e-1

    def test_cycle_diffusion_pipeline(self):
        """Edit a black car into a blue car with default weights and compare to the stored result."""
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/cycle-diffusion/black_colored_car.png"
        )
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car.npy"
        )
        init_image = init_image.resize((512, 512))

        model_id = "CompVis/stable-diffusion-v1-4"
        scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
        pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None)

        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        source_prompt = "A black colored car"
        prompt = "A blue colored car"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            source_prompt=source_prompt,
            init_image=init_image,
            num_inference_steps=100,
            eta=0.1,
            strength=0.85,
            guidance_scale=3,
            source_guidance_scale=1,
            generator=generator,
            output_type="np",
        )
        image = output.images

        assert np.abs(image - expected_image).max() < 1e-2