optimum-habana
1950 строк · 68.7 Кб
1# coding=utf-8
2# Copyright 2022 The HuggingFace Inc. team.
3# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17import json
18import os
19import re
20import subprocess
21import tempfile
22from io import BytesIO
23from pathlib import Path
24from unittest import TestCase, skipUnless
25
26import numpy as np
27import requests
28import torch
29from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, UniPCMultistepScheduler
30from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
31from diffusers.utils import load_image
32from diffusers.utils.torch_utils import randn_tensor
33from huggingface_hub import snapshot_download
34from parameterized import parameterized
35from PIL import Image
36from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
37from transformers.testing_utils import parse_flag_from_env, slow
38
39from optimum.habana import GaudiConfig
40from optimum.habana.diffusers import (
41GaudiDDIMScheduler,
42GaudiDiffusionPipeline,
43GaudiEulerAncestralDiscreteScheduler,
44GaudiEulerDiscreteScheduler,
45GaudiStableDiffusionControlNetPipeline,
46GaudiStableDiffusionLDM3DPipeline,
47GaudiStableDiffusionPipeline,
48GaudiStableDiffusionUpscalePipeline,
49GaudiStableDiffusionXLPipeline,
50)
51from optimum.habana.utils import set_seed
52
53from .clip_coco_utils import download_files
54
55
56if os.environ.get("GAUDI2_CI", "0") == "1":
57THROUGHPUT_BASELINE_BF16 = 1.016
58THROUGHPUT_BASELINE_AUTOCAST = 0.394
59TEXTUAL_INVERSION_THROUGHPUT = 104.29806
60TEXTUAL_INVERSION_RUNTIME = 114.1344320399221
61CONTROLNET_THROUGHPUT = 92.886919836857
62CONTROLNET_RUNTIME = 537.4276602957398
63else:
64THROUGHPUT_BASELINE_BF16 = 0.309
65THROUGHPUT_BASELINE_AUTOCAST = 0.114
66TEXTUAL_INVERSION_THROUGHPUT = 58.17508958300077
67TEXTUAL_INVERSION_RUNTIME = 202.94231038199996
68CONTROLNET_THROUGHPUT = 44.412012818816905
69CONTROLNET_RUNTIME = 1124.0202105600001
70
71
72_run_custom_bf16_ops_test_ = parse_flag_from_env("CUSTOM_BF16_OPS", default=False)
73
74
75def custom_bf16_ops(test_case):
76"""
77Decorator marking a test as needing custom bf16 ops.
78Custom bf16 ops must be declared before `habana_frameworks.torch.core` is imported, which is not possible if some other tests are executed before.
79
80Such tests are skipped by default. Set the CUSTOM_BF16_OPS environment variable to a truthy value to run them.
81
82"""
83return skipUnless(_run_custom_bf16_ops_test_, "test requires custom bf16 ops")(test_case)
84
85
86class GaudiPipelineUtilsTester(TestCase):
87"""
88Tests the features added on top of diffusers/pipeline_utils.py.
89"""
90
91def test_use_hpu_graphs_raise_error_without_habana(self):
92with self.assertRaises(ValueError):
93_ = GaudiDiffusionPipeline(
94use_habana=False,
95use_hpu_graphs=True,
96)
97
98def test_gaudi_config_raise_error_without_habana(self):
99with self.assertRaises(ValueError):
100_ = GaudiDiffusionPipeline(
101use_habana=False,
102gaudi_config=GaudiConfig(),
103)
104
105def test_device(self):
106pipeline_1 = GaudiDiffusionPipeline(
107use_habana=True,
108gaudi_config=GaudiConfig(),
109)
110self.assertEqual(pipeline_1._device.type, "hpu")
111
112pipeline_2 = GaudiDiffusionPipeline(
113use_habana=False,
114)
115self.assertEqual(pipeline_2._device.type, "cpu")
116
117def test_gaudi_config_types(self):
118# gaudi_config is a string
119_ = GaudiDiffusionPipeline(
120use_habana=True,
121gaudi_config="Habana/stable-diffusion",
122)
123
124# gaudi_config is instantiated beforehand
125gaudi_config = GaudiConfig.from_pretrained("Habana/stable-diffusion")
126_ = GaudiDiffusionPipeline(
127use_habana=True,
128gaudi_config=gaudi_config,
129)
130
131def test_default(self):
132pipeline = GaudiDiffusionPipeline(
133use_habana=True,
134gaudi_config=GaudiConfig(),
135)
136
137self.assertTrue(hasattr(pipeline, "htcore"))
138
139def test_use_hpu_graphs(self):
140pipeline = GaudiDiffusionPipeline(
141use_habana=True,
142use_hpu_graphs=True,
143gaudi_config=GaudiConfig(),
144)
145
146self.assertTrue(hasattr(pipeline, "ht"))
147self.assertTrue(hasattr(pipeline, "hpu_stream"))
148self.assertTrue(hasattr(pipeline, "cache"))
149
150def test_save_pretrained(self):
151model_name = "hf-internal-testing/tiny-stable-diffusion-torch"
152scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
153pipeline = GaudiStableDiffusionPipeline.from_pretrained(
154model_name,
155scheduler=scheduler,
156use_habana=True,
157gaudi_config=GaudiConfig(),
158)
159
160with tempfile.TemporaryDirectory() as tmp_dir:
161pipeline.save_pretrained(tmp_dir)
162self.assertTrue(Path(tmp_dir, "gaudi_config.json").is_file())
163
164
165class GaudiStableDiffusionPipelineTester(TestCase):
166"""
167Tests the StableDiffusionPipeline for Gaudi.
168"""
169
170def get_dummy_components(self, time_cond_proj_dim=None):
171torch.manual_seed(0)
172unet = UNet2DConditionModel(
173block_out_channels=(4, 8),
174layers_per_block=1,
175sample_size=32,
176time_cond_proj_dim=time_cond_proj_dim,
177in_channels=4,
178out_channels=4,
179down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
180up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
181cross_attention_dim=32,
182norm_num_groups=2,
183)
184scheduler = GaudiDDIMScheduler(
185beta_start=0.00085,
186beta_end=0.012,
187beta_schedule="scaled_linear",
188clip_sample=False,
189set_alpha_to_one=False,
190)
191torch.manual_seed(0)
192vae = AutoencoderKL(
193block_out_channels=[4, 8],
194in_channels=3,
195out_channels=3,
196down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
197up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
198latent_channels=4,
199norm_num_groups=2,
200)
201torch.manual_seed(0)
202text_encoder_config = CLIPTextConfig(
203bos_token_id=0,
204eos_token_id=2,
205hidden_size=32,
206intermediate_size=64,
207layer_norm_eps=1e-05,
208num_attention_heads=8,
209num_hidden_layers=3,
210pad_token_id=1,
211vocab_size=1000,
212)
213text_encoder = CLIPTextModel(text_encoder_config)
214tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
215
216components = {
217"unet": unet,
218"scheduler": scheduler,
219"vae": vae,
220"text_encoder": text_encoder,
221"tokenizer": tokenizer,
222"safety_checker": None,
223"feature_extractor": None,
224}
225return components
226
227def get_dummy_inputs(self, device, seed=0):
228generator = torch.Generator(device=device).manual_seed(seed)
229inputs = {
230"prompt": "A painting of a squirrel eating a burger",
231"generator": generator,
232"num_inference_steps": 2,
233"guidance_scale": 6.0,
234"output_type": "numpy",
235}
236return inputs
237
238def test_stable_diffusion_ddim(self):
239device = "cpu"
240
241components = self.get_dummy_components()
242gaudi_config = GaudiConfig(use_torch_autocast=False)
243
244sd_pipe = GaudiStableDiffusionPipeline(
245use_habana=True,
246gaudi_config=gaudi_config,
247**components,
248)
249sd_pipe.set_progress_bar_config(disable=None)
250
251inputs = self.get_dummy_inputs(device)
252output = sd_pipe(**inputs)
253image = output.images[0]
254
255image_slice = image[-3:, -3:, -1]
256
257self.assertEqual(image.shape, (64, 64, 3))
258expected_slice = np.array([0.3203, 0.4555, 0.4711, 0.3505, 0.3973, 0.4650, 0.5137, 0.3392, 0.4045])
259
260self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-2)
261
262def test_stable_diffusion_no_safety_checker(self):
263gaudi_config = GaudiConfig()
264scheduler = GaudiDDIMScheduler(
265beta_start=0.00085,
266beta_end=0.012,
267beta_schedule="scaled_linear",
268clip_sample=False,
269set_alpha_to_one=False,
270)
271pipe = GaudiStableDiffusionPipeline.from_pretrained(
272"hf-internal-testing/tiny-stable-diffusion-pipe",
273scheduler=scheduler,
274safety_checker=None,
275use_habana=True,
276gaudi_config=gaudi_config,
277)
278self.assertIsInstance(pipe, GaudiStableDiffusionPipeline)
279self.assertIsInstance(pipe.scheduler, GaudiDDIMScheduler)
280self.assertIsNone(pipe.safety_checker)
281
282image = pipe("example prompt", num_inference_steps=2).images[0]
283self.assertIsNotNone(image)
284
285# Check that there's no error when saving a pipeline with one of the models being None
286with tempfile.TemporaryDirectory() as tmpdirname:
287pipe.save_pretrained(tmpdirname)
288pipe = GaudiStableDiffusionPipeline.from_pretrained(
289tmpdirname,
290use_habana=True,
291gaudi_config=tmpdirname,
292)
293
294# Sanity check that the pipeline still works
295self.assertIsNone(pipe.safety_checker)
296image = pipe("example prompt", num_inference_steps=2).images[0]
297self.assertIsNotNone(image)
298
299@parameterized.expand(["pil", "np", "latent"])
300def test_stable_diffusion_output_types(self, output_type):
301components = self.get_dummy_components()
302gaudi_config = GaudiConfig()
303
304sd_pipe = GaudiStableDiffusionPipeline(
305use_habana=True,
306gaudi_config=gaudi_config,
307**components,
308)
309sd_pipe.set_progress_bar_config(disable=None)
310
311prompt = "A painting of a squirrel eating a burger"
312num_prompts = 2
313num_images_per_prompt = 3
314
315outputs = sd_pipe(
316num_prompts * [prompt],
317num_images_per_prompt=num_images_per_prompt,
318num_inference_steps=2,
319output_type=output_type,
320)
321
322self.assertEqual(len(outputs.images), 2 * 3)
323# TODO: enable safety checker
324# if output_type == "latent":
325# self.assertIsNone(outputs.nsfw_content_detected)
326# else:
327# self.assertEqual(len(outputs.nsfw_content_detected), 2 * 3)
328
329# TODO: enable this test when PNDMScheduler is adapted to Gaudi
330# def test_stable_diffusion_negative_prompt(self):
331# device = "cpu" # ensure determinism for the device-dependent torch.Generator
332# unet = self.dummy_cond_unet
333# scheduler = PNDMScheduler(skip_prk_steps=True)
334# vae = self.dummy_vae
335# bert = self.dummy_text_encoder
336# tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
337
338# # make sure here that pndm scheduler skips prk
339# sd_pipe = StableDiffusionPipeline(
340# unet=unet,
341# scheduler=scheduler,
342# vae=vae,
343# text_encoder=bert,
344# tokenizer=tokenizer,
345# safety_checker=None,
346# feature_extractor=self.dummy_extractor,
347# )
348# sd_pipe = sd_pipe.to(device)
349# sd_pipe.set_progress_bar_config(disable=None)
350
351# prompt = "A painting of a squirrel eating a burger"
352# negative_prompt = "french fries"
353# generator = torch.Generator(device=device).manual_seed(0)
354# output = sd_pipe(
355# prompt,
356# negative_prompt=negative_prompt,
357# generator=generator,
358# guidance_scale=6.0,
359# num_inference_steps=2,
360# output_type="np",
361# )
362
363# image = output.images
364# image_slice = image[0, -3:, -3:, -1]
365
366# assert image.shape == (1, 128, 128, 3)
367# expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719])
368# assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
369
370def test_stable_diffusion_num_images_per_prompt(self):
371components = self.get_dummy_components()
372gaudi_config = GaudiConfig()
373
374sd_pipe = GaudiStableDiffusionPipeline(
375use_habana=True,
376gaudi_config=gaudi_config,
377**components,
378)
379sd_pipe.set_progress_bar_config(disable=None)
380
381prompt = "A painting of a squirrel eating a burger"
382
383# Test num_images_per_prompt=1 (default)
384images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images
385
386self.assertEqual(len(images), 1)
387self.assertEqual(images[0].shape, (64, 64, 3))
388
389# Test num_images_per_prompt=1 (default) for several prompts
390num_prompts = 3
391images = sd_pipe([prompt] * num_prompts, num_inference_steps=2, output_type="np").images
392
393self.assertEqual(len(images), num_prompts)
394self.assertEqual(images[-1].shape, (64, 64, 3))
395
396# Test num_images_per_prompt for single prompt
397num_images_per_prompt = 2
398images = sd_pipe(
399prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
400).images
401
402self.assertEqual(len(images), num_images_per_prompt)
403self.assertEqual(images[-1].shape, (64, 64, 3))
404
405# Test num_images_per_prompt for several prompts
406num_prompts = 2
407images = sd_pipe(
408[prompt] * num_prompts,
409num_inference_steps=2,
410output_type="np",
411num_images_per_prompt=num_images_per_prompt,
412).images
413
414self.assertEqual(len(images), num_prompts * num_images_per_prompt)
415self.assertEqual(images[-1].shape, (64, 64, 3))
416
417def test_stable_diffusion_batch_sizes(self):
418components = self.get_dummy_components()
419gaudi_config = GaudiConfig()
420
421sd_pipe = GaudiStableDiffusionPipeline(
422use_habana=True,
423gaudi_config=gaudi_config,
424**components,
425)
426sd_pipe.set_progress_bar_config(disable=None)
427
428prompt = "A painting of a squirrel eating a burger"
429
430# Test batch_size > 1 where batch_size is a divider of the total number of generated images
431batch_size = 3
432num_images_per_prompt = batch_size**2
433images = sd_pipe(
434prompt,
435num_inference_steps=2,
436output_type="np",
437batch_size=batch_size,
438num_images_per_prompt=num_images_per_prompt,
439).images
440
441self.assertEqual(len(images), num_images_per_prompt)
442self.assertEqual(images[-1].shape, (64, 64, 3))
443
444# Same test for several prompts
445num_prompts = 3
446images = sd_pipe(
447[prompt] * num_prompts,
448num_inference_steps=2,
449output_type="np",
450batch_size=batch_size,
451num_images_per_prompt=num_images_per_prompt,
452).images
453
454self.assertEqual(len(images), num_prompts * num_images_per_prompt)
455self.assertEqual(images[-1].shape, (64, 64, 3))
456
457# Test batch_size when it is not a divider of the toal number of generated images for a single prompt
458num_images_per_prompt = 7
459images = sd_pipe(
460prompt,
461num_inference_steps=2,
462output_type="np",
463batch_size=batch_size,
464num_images_per_prompt=num_images_per_prompt,
465).images
466
467self.assertEqual(len(images), num_images_per_prompt)
468self.assertEqual(images[-1].shape, (64, 64, 3))
469
470# Same test for several prompts
471num_prompts = 2
472images = sd_pipe(
473[prompt] * num_prompts,
474num_inference_steps=2,
475output_type="np",
476batch_size=batch_size,
477num_images_per_prompt=num_images_per_prompt,
478).images
479
480self.assertEqual(len(images), num_prompts * num_images_per_prompt)
481self.assertEqual(images[-1].shape, (64, 64, 3))
482
483def test_stable_diffusion_bf16(self):
484"""Test that stable diffusion works with bf16"""
485components = self.get_dummy_components()
486gaudi_config = GaudiConfig()
487
488sd_pipe = GaudiStableDiffusionPipeline(
489use_habana=True,
490gaudi_config=gaudi_config,
491**components,
492)
493sd_pipe.set_progress_bar_config(disable=None)
494
495prompt = "A painting of a squirrel eating a burger"
496generator = torch.Generator(device="cpu").manual_seed(0)
497image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images[0]
498
499self.assertEqual(image.shape, (64, 64, 3))
500
501def test_stable_diffusion_default(self):
502components = self.get_dummy_components()
503
504sd_pipe = GaudiStableDiffusionPipeline(
505use_habana=True,
506gaudi_config="Habana/stable-diffusion",
507**components,
508)
509sd_pipe.set_progress_bar_config(disable=None)
510
511prompt = "A painting of a squirrel eating a burger"
512generator = torch.Generator(device="cpu").manual_seed(0)
513images = sd_pipe(
514[prompt] * 2,
515generator=generator,
516num_inference_steps=2,
517output_type="np",
518batch_size=3,
519num_images_per_prompt=5,
520).images
521
522self.assertEqual(len(images), 10)
523self.assertEqual(images[-1].shape, (64, 64, 3))
524
525def test_stable_diffusion_hpu_graphs(self):
526components = self.get_dummy_components()
527
528sd_pipe = GaudiStableDiffusionPipeline(
529use_habana=True,
530use_hpu_graphs=True,
531gaudi_config="Habana/stable-diffusion",
532**components,
533)
534sd_pipe.set_progress_bar_config(disable=None)
535
536prompt = "A painting of a squirrel eating a burger"
537generator = torch.Generator(device="cpu").manual_seed(0)
538images = sd_pipe(
539[prompt] * 2,
540generator=generator,
541num_inference_steps=2,
542output_type="np",
543batch_size=3,
544num_images_per_prompt=5,
545).images
546
547self.assertEqual(len(images), 10)
548self.assertEqual(images[-1].shape, (64, 64, 3))
549
550@slow
551def test_no_throughput_regression_bf16(self):
552prompts = [
553"An image of a squirrel in Picasso style",
554"High quality photo of an astronaut riding a horse in space",
555]
556num_images_per_prompt = 11
557batch_size = 4
558model_name = "runwayml/stable-diffusion-v1-5"
559scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
560
561pipeline = GaudiStableDiffusionPipeline.from_pretrained(
562model_name,
563scheduler=scheduler,
564use_habana=True,
565use_hpu_graphs=True,
566gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion"),
567torch_dtype=torch.bfloat16,
568)
569set_seed(27)
570outputs = pipeline(
571prompt=prompts,
572num_images_per_prompt=num_images_per_prompt,
573batch_size=batch_size,
574)
575self.assertEqual(len(outputs.images), num_images_per_prompt * len(prompts))
576self.assertGreaterEqual(outputs.throughput, 0.95 * THROUGHPUT_BASELINE_BF16)
577
578@custom_bf16_ops
579@slow
580def test_no_throughput_regression_autocast(self):
581prompts = [
582"An image of a squirrel in Picasso style",
583"High quality photo of an astronaut riding a horse in space",
584]
585num_images_per_prompt = 11
586batch_size = 4
587model_name = "stabilityai/stable-diffusion-2-1"
588scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
589
590pipeline = GaudiStableDiffusionPipeline.from_pretrained(
591model_name,
592scheduler=scheduler,
593use_habana=True,
594use_hpu_graphs=True,
595gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion-2"),
596)
597set_seed(27)
598outputs = pipeline(
599prompt=prompts,
600num_images_per_prompt=num_images_per_prompt,
601batch_size=batch_size,
602height=768,
603width=768,
604)
605self.assertEqual(len(outputs.images), num_images_per_prompt * len(prompts))
606self.assertGreaterEqual(outputs.throughput, 0.95 * THROUGHPUT_BASELINE_AUTOCAST)
607
608@slow
609def test_no_generation_regression(self):
610model_name = "CompVis/stable-diffusion-v1-4"
611# fp32
612scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
613pipeline = GaudiStableDiffusionPipeline.from_pretrained(
614model_name,
615scheduler=scheduler,
616safety_checker=None,
617use_habana=True,
618use_hpu_graphs=True,
619gaudi_config=GaudiConfig(use_torch_autocast=False),
620)
621set_seed(27)
622outputs = pipeline(
623prompt="An image of a squirrel in Picasso style",
624output_type="np",
625)
626
627if os.environ.get("GAUDI2_CI", "0") == "1":
628expected_slice = np.array(
629[
6300.68306947,
6310.6812112,
6320.67309505,
6330.70057267,
6340.6582885,
6350.6325019,
6360.6708976,
6370.6226433,
6380.58038336,
639]
640)
641else:
642expected_slice = np.array(
643[0.70760196, 0.7136303, 0.7000798, 0.714934, 0.6776865, 0.6800843, 0.6923707, 0.6653969, 0.6408076]
644)
645image = outputs.images[0]
646
647self.assertEqual(image.shape, (512, 512, 3))
648self.assertLess(np.abs(expected_slice - image[-3:, -3:, -1].flatten()).max(), 5e-3)
649
650@slow
651def test_no_generation_regression_ldm3d(self):
652model_name = "Intel/ldm3d-4c"
653# fp32
654scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
655pipeline = GaudiStableDiffusionLDM3DPipeline.from_pretrained(
656model_name,
657scheduler=scheduler,
658safety_checker=None,
659use_habana=True,
660use_hpu_graphs=True,
661gaudi_config=GaudiConfig(),
662)
663set_seed(27)
664outputs = pipeline(
665prompt="An image of a squirrel in Picasso style",
666output_type="np",
667)
668
669if os.environ.get("GAUDI2_CI", "0") == "1":
670expected_slice_rgb = np.array(
671[
6720.2099357,
6730.16664368,
6740.08352646,
6750.20643419,
6760.16748399,
6770.08781305,
6780.21379063,
6790.19943115,
6800.04389626,
681]
682)
683expected_slice_depth = np.array(
684[
6850.68369114,
6860.6827824,
6870.6852779,
6880.6836072,
6890.6888298,
6900.6895473,
6910.6853674,
6920.67561126,
6930.660434,
694]
695)
696else:
697expected_slice_rgb = np.array([0.7083766, 1.0, 1.0, 0.70610344, 0.9867363, 1.0, 0.7214538, 1.0, 1.0])
698expected_slice_depth = np.array(
699[
7000.919621,
7010.92072034,
7020.9184986,
7030.91994286,
7040.9242079,
7050.93387043,
7060.92345214,
7070.93558526,
7080.9223714,
709]
710)
711rgb = outputs.rgb[0]
712depth = outputs.depth[0]
713
714self.assertEqual(rgb.shape, (512, 512, 3))
715self.assertEqual(depth.shape, (512, 512, 1))
716self.assertLess(np.abs(expected_slice_rgb - rgb[-3:, -3:, -1].flatten()).max(), 5e-3)
717self.assertLess(np.abs(expected_slice_depth - depth[-3:, -3:, -1].flatten()).max(), 5e-3)
718
719@slow
720def test_no_generation_regression_upscale(self):
721model_name = "stabilityai/stable-diffusion-x4-upscaler"
722# fp32
723scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
724pipeline = GaudiStableDiffusionUpscalePipeline.from_pretrained(
725model_name,
726scheduler=scheduler,
727use_habana=True,
728use_hpu_graphs=True,
729gaudi_config=GaudiConfig(use_torch_autocast=False),
730)
731set_seed(27)
732
733url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
734response = requests.get(url)
735low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
736low_res_img = low_res_img.resize((128, 128))
737prompt = "a white cat"
738upscaled_image = pipeline(prompt=prompt, image=low_res_img, output_type="np").images[0]
739if os.environ.get("GAUDI2_CI", "0") == "1":
740expected_slice = np.array(
741[
7420.16527882,
7430.161616,
7440.15665859,
7450.1660901,
7460.1594379,
7470.14936888,
7480.1578255,
7490.15342498,
7500.14590919,
751]
752)
753else:
754expected_slice = np.array(
755[
7560.1652787,
7570.16161594,
7580.15665877,
7590.16608998,
7600.1594378,
7610.14936894,
7620.15782538,
7630.15342498,
7640.14590913,
765]
766)
767self.assertEqual(upscaled_image.shape, (512, 512, 3))
768self.assertLess(np.abs(expected_slice - upscaled_image[-3:, -3:, -1].flatten()).max(), 5e-3)
769
770@slow
771def test_textual_inversion(self):
772path_to_script = (
773Path(os.path.dirname(__file__)).parent
774/ "examples"
775/ "stable-diffusion"
776/ "training"
777/ "textual_inversion.py"
778)
779
780with tempfile.TemporaryDirectory() as data_dir:
781snapshot_download(
782"diffusers/cat_toy_example", local_dir=data_dir, repo_type="dataset", ignore_patterns=".gitattributes"
783)
784with tempfile.TemporaryDirectory() as run_dir:
785cmd_line = [
786"python3",
787f"{path_to_script.parent.parent.parent / 'gaudi_spawn.py'}",
788"--use_mpi",
789"--world_size",
790"8",
791f"{path_to_script}",
792"--pretrained_model_name_or_path runwayml/stable-diffusion-v1-5",
793f"--train_data_dir {data_dir}",
794'--learnable_property "object"',
795'--placeholder_token "<cat-toy>"',
796'--initializer_token "toy"',
797"--resolution 512",
798"--train_batch_size 4",
799"--max_train_steps 375",
800"--learning_rate 5.0e-04",
801"--scale_lr",
802'--lr_scheduler "constant"',
803"--lr_warmup_steps 0",
804f"--output_dir {run_dir}",
805"--save_as_full_pipeline",
806"--gaudi_config_name Habana/stable-diffusion",
807"--throughput_warmup_steps 3",
808"--seed 27",
809]
810pattern = re.compile(r"([\"\'].+?[\"\'])|\s")
811cmd_line = [x for y in cmd_line for x in re.split(pattern, y) if x]
812
813# Run textual inversion
814p = subprocess.Popen(cmd_line)
815return_code = p.wait()
816
817# Ensure the run finished without any issue
818self.assertEqual(return_code, 0)
819
820# Assess throughput
821with open(Path(run_dir) / "speed_metrics.json") as fp:
822results = json.load(fp)
823self.assertGreaterEqual(results["train_samples_per_second"], 0.95 * TEXTUAL_INVERSION_THROUGHPUT)
824self.assertLessEqual(results["train_runtime"], 1.05 * TEXTUAL_INVERSION_RUNTIME)
825
826# Assess generated image
827pipe = GaudiStableDiffusionPipeline.from_pretrained(
828run_dir,
829torch_dtype=torch.bfloat16,
830use_habana=True,
831use_hpu_graphs=True,
832gaudi_config=GaudiConfig(use_habana_mixed_precision=False),
833)
834prompt = "A <cat-toy> backpack"
835set_seed(27)
836image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5, output_type="np").images[0]
837
838# TODO: see how to generate images in a reproducible way
839# expected_slice = np.array(
840# [0.57421875, 0.5703125, 0.58203125, 0.58203125, 0.578125, 0.5859375, 0.578125, 0.57421875, 0.56640625]
841# )
842self.assertEqual(image.shape, (512, 512, 3))
843# self.assertLess(np.abs(expected_slice - image[-3:, -3:, -1].flatten()).max(), 5e-3)
844
845
846class GaudiStableDiffusionXLPipelineTester(TestCase):
847"""
848Tests the StableDiffusionXLPipeline for Gaudi.
849"""
850
851def get_dummy_components(self, time_cond_proj_dim=None, timestep_spacing="leading"):
852torch.manual_seed(0)
853unet = UNet2DConditionModel(
854block_out_channels=(2, 4),
855layers_per_block=2,
856time_cond_proj_dim=time_cond_proj_dim,
857sample_size=32,
858in_channels=4,
859out_channels=4,
860down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
861up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
862# SD2-specific config below
863attention_head_dim=(2, 4),
864use_linear_projection=True,
865addition_embed_type="text_time",
866addition_time_embed_dim=8,
867transformer_layers_per_block=(1, 2),
868projection_class_embeddings_input_dim=80, # 6 * 8 + 32
869cross_attention_dim=64,
870norm_num_groups=1,
871)
872scheduler = GaudiEulerDiscreteScheduler(
873beta_start=0.00085,
874beta_end=0.012,
875steps_offset=1,
876beta_schedule="scaled_linear",
877timestep_spacing=timestep_spacing,
878)
879torch.manual_seed(0)
880vae = AutoencoderKL(
881block_out_channels=[32, 64],
882in_channels=3,
883out_channels=3,
884down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
885up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
886latent_channels=4,
887sample_size=128,
888)
889torch.manual_seed(0)
890text_encoder_config = CLIPTextConfig(
891bos_token_id=0,
892eos_token_id=2,
893hidden_size=32,
894intermediate_size=37,
895layer_norm_eps=1e-05,
896num_attention_heads=4,
897num_hidden_layers=5,
898pad_token_id=1,
899vocab_size=1000,
900# SD2-specific config below
901hidden_act="gelu",
902projection_dim=32,
903)
904text_encoder = CLIPTextModel(text_encoder_config)
905tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
906
907text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
908tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
909
910components = {
911"unet": unet,
912"scheduler": scheduler,
913"vae": vae,
914"text_encoder": text_encoder,
915"tokenizer": tokenizer,
916"text_encoder_2": text_encoder_2,
917"tokenizer_2": tokenizer_2,
918"image_encoder": None,
919"feature_extractor": None,
920}
921return components
922
923def get_dummy_inputs(self, device, seed=0):
924generator = torch.Generator(device=device).manual_seed(seed)
925inputs = {
926"prompt": "A painting of a squirrel eating a burger",
927"generator": generator,
928"num_inference_steps": 2,
929"guidance_scale": 5.0,
930"output_type": "np",
931}
932return inputs
933
934def test_stable_diffusion_xl_euler(self):
935device = "cpu" # ensure determinism for the device-dependent torch.Generator
936components = self.get_dummy_components()
937gaudi_config = GaudiConfig(use_torch_autocast=False)
938sd_pipe = GaudiStableDiffusionXLPipeline(use_habana=True, gaudi_config=gaudi_config, **components)
939sd_pipe.set_progress_bar_config(disable=None)
940
941inputs = self.get_dummy_inputs(device)
942image = sd_pipe(**inputs).images[0]
943
944image_slice = image[-3:, -3:, -1]
945
946self.assertEqual(image.shape, (64, 64, 3))
947expected_slice = np.array([0.5552, 0.5569, 0.4725, 0.4348, 0.4994, 0.4632, 0.5142, 0.5012, 0.47])
948
949# The threshold should be 1e-2 below but it started failing
950# from Diffusers v0.24. However, generated images still look similar.
951self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-1)
952
953def test_stable_diffusion_xl_euler_ancestral(self):
954device = "cpu" # ensure determinism for the device-dependent torch.Generator
955components = self.get_dummy_components()
956gaudi_config = GaudiConfig(use_torch_autocast=False)
957sd_pipe = GaudiStableDiffusionXLPipeline(use_habana=True, gaudi_config=gaudi_config, **components)
958sd_pipe.scheduler = GaudiEulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
959sd_pipe.set_progress_bar_config(disable=None)
960
961inputs = self.get_dummy_inputs(device)
962image = sd_pipe(**inputs).images[0]
963
964image_slice = image[-3:, -3:, -1]
965
966self.assertEqual(image.shape, (64, 64, 3))
967expected_slice = np.array([0.4675, 0.5173, 0.4611, 0.4067, 0.5250, 0.4674, 0.5446, 0.5094, 0.4791])
968self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-2)
969
970def test_stable_diffusion_xl_turbo_euler_ancestral(self):
971device = "cpu" # ensure determinism for the device-dependent torch.Generator
972components = self.get_dummy_components(timestep_spacing="trailing")
973gaudi_config = GaudiConfig(use_torch_autocast=False)
974
975sd_pipe = GaudiStableDiffusionXLPipeline(use_habana=True, gaudi_config=gaudi_config, **components)
976sd_pipe.scheduler = GaudiEulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
977
978sd_pipe.set_progress_bar_config(disable=None)
979
980inputs = self.get_dummy_inputs(device)
981image = sd_pipe(**inputs).images[0]
982
983image_slice = image[-3:, -3:, -1]
984
985self.assertEqual(image.shape, (64, 64, 3))
986expected_slice = np.array([0.4675, 0.5173, 0.4611, 0.4067, 0.5250, 0.4674, 0.5446, 0.5094, 0.4791])
987self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-2)
988
989@parameterized.expand(["pil", "np", "latent"])
990def test_stable_diffusion_xl_output_types(self, output_type):
991components = self.get_dummy_components()
992gaudi_config = GaudiConfig()
993
994sd_pipe = GaudiStableDiffusionXLPipeline(
995use_habana=True,
996gaudi_config=gaudi_config,
997**components,
998)
999sd_pipe.set_progress_bar_config(disable=None)
1000
1001prompt = "A painting of a squirrel eating a burger"
1002num_prompts = 2
1003num_images_per_prompt = 3
1004
1005outputs = sd_pipe(
1006num_prompts * [prompt],
1007num_images_per_prompt=num_images_per_prompt,
1008num_inference_steps=2,
1009output_type=output_type,
1010)
1011
1012self.assertEqual(len(outputs.images), 2 * 3)
1013
1014def test_stable_diffusion_xl_num_images_per_prompt(self):
1015components = self.get_dummy_components()
1016gaudi_config = GaudiConfig()
1017
1018sd_pipe = GaudiStableDiffusionXLPipeline(
1019use_habana=True,
1020gaudi_config=gaudi_config,
1021**components,
1022)
1023sd_pipe.set_progress_bar_config(disable=None)
1024
1025prompt = "A painting of a squirrel eating a burger"
1026
1027# Test num_images_per_prompt=1 (default)
1028images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images
1029
1030self.assertEqual(len(images), 1)
1031self.assertEqual(images[0].shape, (64, 64, 3))
1032
1033# Test num_images_per_prompt=1 (default) for several prompts
1034num_prompts = 3
1035images = sd_pipe([prompt] * num_prompts, num_inference_steps=2, output_type="np").images
1036
1037self.assertEqual(len(images), num_prompts)
1038self.assertEqual(images[-1].shape, (64, 64, 3))
1039
1040# Test num_images_per_prompt for single prompt
1041num_images_per_prompt = 2
1042images = sd_pipe(
1043prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
1044).images
1045
1046self.assertEqual(len(images), num_images_per_prompt)
1047self.assertEqual(images[-1].shape, (64, 64, 3))
1048
1049# Test num_images_per_prompt for several prompts
1050num_prompts = 2
1051images = sd_pipe(
1052[prompt] * num_prompts,
1053num_inference_steps=2,
1054output_type="np",
1055num_images_per_prompt=num_images_per_prompt,
1056).images
1057
1058self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1059self.assertEqual(images[-1].shape, (64, 64, 3))
1060
1061def test_stable_diffusion_xl_batch_sizes(self):
1062components = self.get_dummy_components()
1063gaudi_config = GaudiConfig()
1064
1065sd_pipe = GaudiStableDiffusionXLPipeline(
1066use_habana=True,
1067gaudi_config=gaudi_config,
1068**components,
1069)
1070sd_pipe.set_progress_bar_config(disable=None)
1071
1072prompt = "A painting of a squirrel eating a burger"
1073
1074# Test batch_size > 1 where batch_size is a divider of the total number of generated images
1075batch_size = 3
1076num_images_per_prompt = batch_size**2
1077images = sd_pipe(
1078prompt,
1079num_inference_steps=2,
1080output_type="np",
1081batch_size=batch_size,
1082num_images_per_prompt=num_images_per_prompt,
1083).images
1084self.assertEqual(len(images), num_images_per_prompt)
1085self.assertEqual(images[-1].shape, (64, 64, 3))
1086
1087# Same test for several prompts
1088num_prompts = 3
1089images = sd_pipe(
1090[prompt] * num_prompts,
1091num_inference_steps=2,
1092output_type="np",
1093batch_size=batch_size,
1094num_images_per_prompt=num_images_per_prompt,
1095).images
1096
1097self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1098self.assertEqual(images[-1].shape, (64, 64, 3))
1099
1100# Test batch_size when it is not a divider of the total number of generated images for a single prompt
1101num_images_per_prompt = 7
1102images = sd_pipe(
1103prompt,
1104num_inference_steps=2,
1105output_type="np",
1106batch_size=batch_size,
1107num_images_per_prompt=num_images_per_prompt,
1108).images
1109
1110self.assertEqual(len(images), num_images_per_prompt)
1111self.assertEqual(images[-1].shape, (64, 64, 3))
1112
1113# Same test for several prompts
1114num_prompts = 2
1115images = sd_pipe(
1116[prompt] * num_prompts,
1117num_inference_steps=2,
1118output_type="np",
1119batch_size=batch_size,
1120num_images_per_prompt=num_images_per_prompt,
1121).images
1122
1123self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1124self.assertEqual(images[-1].shape, (64, 64, 3))
1125
1126def test_stable_diffusion_xl_bf16(self):
1127"""Test that stable diffusion works with bf16"""
1128components = self.get_dummy_components()
1129gaudi_config = GaudiConfig()
1130
1131sd_pipe = GaudiStableDiffusionXLPipeline(
1132use_habana=True,
1133gaudi_config=gaudi_config,
1134**components,
1135)
1136sd_pipe.set_progress_bar_config(disable=None)
1137
1138prompt = "A painting of a squirrel eating a burger"
1139generator = torch.Generator(device="cpu").manual_seed(0)
1140image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images[0]
1141
1142self.assertEqual(image.shape, (64, 64, 3))
1143
1144def test_stable_diffusion_xl_default(self):
1145components = self.get_dummy_components()
1146
1147sd_pipe = GaudiStableDiffusionXLPipeline(
1148use_habana=True,
1149gaudi_config="Habana/stable-diffusion",
1150**components,
1151)
1152sd_pipe.set_progress_bar_config(disable=None)
1153
1154prompt = "A painting of a squirrel eating a burger"
1155generator = torch.Generator(device="cpu").manual_seed(0)
1156images = sd_pipe(
1157[prompt] * 2,
1158generator=generator,
1159num_inference_steps=2,
1160output_type="np",
1161batch_size=3,
1162num_images_per_prompt=5,
1163).images
1164
1165self.assertEqual(len(images), 10)
1166self.assertEqual(images[-1].shape, (64, 64, 3))
1167
1168def test_stable_diffusion_xl_hpu_graphs(self):
1169components = self.get_dummy_components()
1170
1171sd_pipe = GaudiStableDiffusionXLPipeline(
1172use_habana=True,
1173use_hpu_graphs=True,
1174gaudi_config="Habana/stable-diffusion",
1175**components,
1176)
1177sd_pipe.set_progress_bar_config(disable=None)
1178
1179prompt = "A painting of a squirrel eating a burger"
1180generator = torch.Generator(device="cpu").manual_seed(0)
1181images = sd_pipe(
1182[prompt] * 2,
1183generator=generator,
1184num_inference_steps=2,
1185output_type="np",
1186batch_size=3,
1187num_images_per_prompt=5,
1188).images
1189
1190self.assertEqual(len(images), 10)
1191self.assertEqual(images[-1].shape, (64, 64, 3))
1192
1193
1194class GaudiStableDiffusionControlNetPipelineTester(TestCase):
1195"""
1196Tests the StableDiffusionControlNetPipeline for Gaudi.
1197"""
1198
1199def get_dummy_components(self, time_cond_proj_dim=None):
1200torch.manual_seed(0)
1201unet = UNet2DConditionModel(
1202block_out_channels=(4, 8),
1203layers_per_block=2,
1204sample_size=32,
1205time_cond_proj_dim=time_cond_proj_dim,
1206in_channels=4,
1207out_channels=4,
1208down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
1209up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
1210cross_attention_dim=32,
1211norm_num_groups=1,
1212)
1213
1214def init_weights(m):
1215if isinstance(m, torch.nn.Conv2d):
1216torch.nn.init.normal(m.weight)
1217m.bias.data.fill_(1.0)
1218
1219torch.manual_seed(0)
1220controlnet = ControlNetModel(
1221block_out_channels=(4, 8),
1222layers_per_block=2,
1223in_channels=4,
1224down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
1225cross_attention_dim=32,
1226conditioning_embedding_out_channels=(16, 32),
1227norm_num_groups=1,
1228)
1229controlnet.controlnet_down_blocks.apply(init_weights)
1230
1231scheduler = GaudiDDIMScheduler(
1232beta_start=0.00085,
1233beta_end=0.012,
1234beta_schedule="scaled_linear",
1235clip_sample=False,
1236set_alpha_to_one=False,
1237)
1238torch.manual_seed(0)
1239vae = AutoencoderKL(
1240block_out_channels=[4, 8],
1241in_channels=3,
1242out_channels=3,
1243down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
1244up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
1245latent_channels=4,
1246norm_num_groups=2,
1247)
1248torch.manual_seed(0)
1249text_encoder_config = CLIPTextConfig(
1250bos_token_id=0,
1251eos_token_id=2,
1252hidden_size=32,
1253intermediate_size=37,
1254layer_norm_eps=1e-05,
1255num_attention_heads=4,
1256num_hidden_layers=5,
1257pad_token_id=1,
1258vocab_size=1000,
1259)
1260text_encoder = CLIPTextModel(text_encoder_config)
1261tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
1262
1263components = {
1264"unet": unet,
1265"controlnet": controlnet,
1266"scheduler": scheduler,
1267"vae": vae,
1268"text_encoder": text_encoder,
1269"tokenizer": tokenizer,
1270"safety_checker": None,
1271"feature_extractor": None,
1272}
1273return components
1274
1275def get_dummy_inputs(self, device, seed=0):
1276generator = torch.Generator(device=device).manual_seed(seed)
1277controlnet_embedder_scale_factor = 2
1278images = [
1279randn_tensor(
1280(1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
1281generator=generator,
1282device=torch.device(device),
1283),
1284]
1285inputs = {
1286"prompt": "A painting of a squirrel eating a burger",
1287"generator": generator,
1288"num_inference_steps": 2,
1289"guidance_scale": 6.0,
1290"output_type": "np",
1291"image": images,
1292}
1293return inputs
1294
1295def test_stable_diffusion_controlnet_num_images_per_prompt(self):
1296components = self.get_dummy_components()
1297gaudi_config = GaudiConfig()
1298
1299sd_pipe = GaudiStableDiffusionControlNetPipeline(
1300use_habana=True,
1301gaudi_config=gaudi_config,
1302**components,
1303)
1304sd_pipe.set_progress_bar_config(disable=None)
1305
1306inputs = self.get_dummy_inputs(device="cpu")
1307prompt = inputs["prompt"]
1308# Test num_images_per_prompt=1 (default)
1309images = sd_pipe(**inputs).images
1310
1311self.assertEqual(len(images), 1)
1312self.assertEqual(images[0].shape, (64, 64, 3))
1313
1314# Test num_images_per_prompt=1 (default) for several prompts
1315num_prompts = 3
1316inputs["prompt"] = [prompt] * num_prompts
1317images = sd_pipe(**inputs).images
1318
1319self.assertEqual(len(images), num_prompts)
1320self.assertEqual(images[-1].shape, (64, 64, 3))
1321
1322# Test num_images_per_prompt for single prompt
1323num_images_per_prompt = 2
1324inputs["prompt"] = prompt
1325images = sd_pipe(num_images_per_prompt=num_images_per_prompt, **inputs).images
1326
1327self.assertEqual(len(images), num_images_per_prompt)
1328self.assertEqual(images[-1].shape, (64, 64, 3))
1329
1330## Test num_images_per_prompt for several prompts
1331num_prompts = 2
1332inputs["prompt"] = [prompt] * num_prompts
1333images = sd_pipe(num_images_per_prompt=num_images_per_prompt, **inputs).images
1334
1335self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1336self.assertEqual(images[-1].shape, (64, 64, 3))
1337
1338def test_stable_diffusion_controlnet_batch_sizes(self):
1339components = self.get_dummy_components()
1340gaudi_config = GaudiConfig()
1341
1342sd_pipe = GaudiStableDiffusionControlNetPipeline(
1343use_habana=True,
1344gaudi_config=gaudi_config,
1345**components,
1346)
1347sd_pipe.set_progress_bar_config(disable=None)
1348
1349inputs = self.get_dummy_inputs(device="cpu")
1350prompt = inputs["prompt"]
1351# Test batch_size > 1 where batch_size is a divider of the total number of generated images
1352batch_size = 3
1353num_images_per_prompt = batch_size**2
1354images = sd_pipe(
1355batch_size=batch_size,
1356num_images_per_prompt=num_images_per_prompt,
1357**inputs,
1358).images
1359self.assertEqual(len(images), num_images_per_prompt)
1360self.assertEqual(images[-1].shape, (64, 64, 3))
1361
1362# Same test for several prompts
1363num_prompts = 3
1364inputs["prompt"] = [prompt] * num_prompts
1365
1366images = sd_pipe(
1367batch_size=batch_size,
1368num_images_per_prompt=num_images_per_prompt,
1369**inputs,
1370).images
1371
1372self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1373self.assertEqual(images[-1].shape, (64, 64, 3))
1374
1375inputs["prompt"] = prompt
1376# Test batch_size when it is not a divider of the total number of generated images for a single prompt
1377num_images_per_prompt = 7
1378images = sd_pipe(
1379batch_size=batch_size,
1380num_images_per_prompt=num_images_per_prompt,
1381**inputs,
1382).images
1383
1384self.assertEqual(len(images), num_images_per_prompt)
1385self.assertEqual(images[-1].shape, (64, 64, 3))
1386
1387# Same test for several prompts
1388num_prompts = 2
1389inputs["prompt"] = [prompt] * num_prompts
1390images = sd_pipe(batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, **inputs).images
1391
1392self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1393self.assertEqual(images[-1].shape, (64, 64, 3))
1394
1395def test_stable_diffusion_controlnet_bf16(self):
1396"""Test that stable diffusion works with bf16"""
1397components = self.get_dummy_components()
1398gaudi_config = GaudiConfig()
1399
1400sd_pipe = GaudiStableDiffusionControlNetPipeline(
1401use_habana=True,
1402gaudi_config=gaudi_config,
1403**components,
1404)
1405sd_pipe.set_progress_bar_config(disable=None)
1406
1407inputs = self.get_dummy_inputs(device="cpu")
1408image = sd_pipe(**inputs).images[0]
1409
1410self.assertEqual(image.shape, (64, 64, 3))
1411
1412def test_stable_diffusion_controlnet_default(self):
1413components = self.get_dummy_components()
1414
1415sd_pipe = GaudiStableDiffusionControlNetPipeline(
1416use_habana=True,
1417gaudi_config="Habana/stable-diffusion",
1418**components,
1419)
1420sd_pipe.set_progress_bar_config(disable=None)
1421
1422inputs = self.get_dummy_inputs(device="cpu")
1423inputs["prompt"] = [inputs["prompt"]] * 2
1424images = sd_pipe(
1425batch_size=3,
1426num_images_per_prompt=5,
1427**inputs,
1428).images
1429
1430self.assertEqual(len(images), 10)
1431self.assertEqual(images[-1].shape, (64, 64, 3))
1432
1433def test_stable_diffusion_controlnet_hpu_graphs(self):
1434components = self.get_dummy_components()
1435
1436sd_pipe = GaudiStableDiffusionControlNetPipeline(
1437use_habana=True,
1438use_hpu_graphs=True,
1439gaudi_config="Habana/stable-diffusion",
1440**components,
1441)
1442sd_pipe.set_progress_bar_config(disable=None)
1443
1444inputs = self.get_dummy_inputs(device="cpu")
1445inputs["prompt"] = [inputs["prompt"]] * 2
1446
1447images = sd_pipe(
1448batch_size=3,
1449num_images_per_prompt=5,
1450**inputs,
1451).images
1452
1453self.assertEqual(len(images), 10)
1454self.assertEqual(images[-1].shape, (64, 64, 3))
1455
1456
1457class GaudiStableDiffusionMultiControlNetPipelineTester(TestCase):
1458"""
1459Tests the StableDiffusionControlNetPipeline for Gaudi.
1460"""
1461
1462def get_dummy_components(self, time_cond_proj_dim=None):
1463torch.manual_seed(0)
1464unet = UNet2DConditionModel(
1465block_out_channels=(4, 8),
1466layers_per_block=2,
1467sample_size=32,
1468time_cond_proj_dim=time_cond_proj_dim,
1469in_channels=4,
1470out_channels=4,
1471down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
1472up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
1473cross_attention_dim=32,
1474norm_num_groups=1,
1475)
1476
1477def init_weights(m):
1478if isinstance(m, torch.nn.Conv2d):
1479torch.nn.init.normal(m.weight)
1480m.bias.data.fill_(1.0)
1481
1482torch.manual_seed(0)
1483controlnet1 = ControlNetModel(
1484block_out_channels=(4, 8),
1485layers_per_block=2,
1486in_channels=4,
1487down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
1488cross_attention_dim=32,
1489conditioning_embedding_out_channels=(16, 32),
1490norm_num_groups=1,
1491)
1492controlnet1.controlnet_down_blocks.apply(init_weights)
1493
1494torch.manual_seed(0)
1495controlnet2 = ControlNetModel(
1496block_out_channels=(4, 8),
1497layers_per_block=2,
1498in_channels=4,
1499down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
1500cross_attention_dim=32,
1501conditioning_embedding_out_channels=(16, 32),
1502norm_num_groups=1,
1503)
1504controlnet2.controlnet_down_blocks.apply(init_weights)
1505
1506scheduler = GaudiDDIMScheduler(
1507beta_start=0.00085,
1508beta_end=0.012,
1509beta_schedule="scaled_linear",
1510clip_sample=False,
1511set_alpha_to_one=False,
1512)
1513torch.manual_seed(0)
1514vae = AutoencoderKL(
1515block_out_channels=[4, 8],
1516in_channels=3,
1517out_channels=3,
1518down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
1519up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
1520latent_channels=4,
1521norm_num_groups=2,
1522)
1523torch.manual_seed(0)
1524text_encoder_config = CLIPTextConfig(
1525bos_token_id=0,
1526eos_token_id=2,
1527hidden_size=32,
1528intermediate_size=37,
1529layer_norm_eps=1e-05,
1530num_attention_heads=4,
1531num_hidden_layers=5,
1532pad_token_id=1,
1533vocab_size=1000,
1534)
1535text_encoder = CLIPTextModel(text_encoder_config)
1536tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
1537
1538controlnet = MultiControlNetModel([controlnet1, controlnet2])
1539
1540components = {
1541"unet": unet,
1542"controlnet": controlnet,
1543"scheduler": scheduler,
1544"vae": vae,
1545"text_encoder": text_encoder,
1546"tokenizer": tokenizer,
1547"safety_checker": None,
1548"feature_extractor": None,
1549}
1550return components
1551
1552def get_dummy_inputs(self, device, seed=0):
1553generator = torch.Generator(device=device).manual_seed(seed)
1554controlnet_embedder_scale_factor = 2
1555images = [
1556randn_tensor(
1557(1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
1558generator=generator,
1559device=torch.device(device),
1560),
1561randn_tensor(
1562(1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
1563generator=generator,
1564device=torch.device(device),
1565),
1566]
1567inputs = {
1568"prompt": "A painting of a squirrel eating a burger",
1569"generator": generator,
1570"num_inference_steps": 2,
1571"guidance_scale": 6.0,
1572"output_type": "np",
1573"image": images,
1574}
1575return inputs
1576
1577def test_stable_diffusion_multicontrolnet_num_images_per_prompt(self):
1578components = self.get_dummy_components()
1579gaudi_config = GaudiConfig()
1580
1581sd_pipe = GaudiStableDiffusionControlNetPipeline(
1582use_habana=True,
1583gaudi_config=gaudi_config,
1584**components,
1585)
1586sd_pipe.set_progress_bar_config(disable=None)
1587
1588inputs = self.get_dummy_inputs(device="cpu")
1589prompt = inputs["prompt"]
1590# Test num_images_per_prompt=1 (default)
1591images = sd_pipe(**inputs).images
1592
1593self.assertEqual(len(images), 1)
1594self.assertEqual(images[0].shape, (64, 64, 3))
1595
1596# Test num_images_per_prompt=1 (default) for several prompts
1597num_prompts = 3
1598inputs["prompt"] = [prompt] * num_prompts
1599images = sd_pipe(**inputs).images
1600
1601self.assertEqual(len(images), num_prompts)
1602self.assertEqual(images[-1].shape, (64, 64, 3))
1603
1604# Test num_images_per_prompt for single prompt
1605num_images_per_prompt = 2
1606inputs["prompt"] = prompt
1607images = sd_pipe(num_images_per_prompt=num_images_per_prompt, **inputs).images
1608
1609self.assertEqual(len(images), num_images_per_prompt)
1610self.assertEqual(images[-1].shape, (64, 64, 3))
1611
1612## Test num_images_per_prompt for several prompts
1613num_prompts = 2
1614inputs["prompt"] = [prompt] * num_prompts
1615images = sd_pipe(num_images_per_prompt=num_images_per_prompt, **inputs).images
1616
1617self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1618self.assertEqual(images[-1].shape, (64, 64, 3))
1619
1620def test_stable_diffusion_multicontrolnet_batch_sizes(self):
1621components = self.get_dummy_components()
1622gaudi_config = GaudiConfig()
1623
1624sd_pipe = GaudiStableDiffusionControlNetPipeline(
1625use_habana=True,
1626gaudi_config=gaudi_config,
1627**components,
1628)
1629sd_pipe.set_progress_bar_config(disable=None)
1630
1631inputs = self.get_dummy_inputs(device="cpu")
1632prompt = inputs["prompt"]
1633# Test batch_size > 1 where batch_size is a divider of the total number of generated images
1634batch_size = 3
1635num_images_per_prompt = batch_size**2
1636images = sd_pipe(
1637batch_size=batch_size,
1638num_images_per_prompt=num_images_per_prompt,
1639**inputs,
1640).images
1641self.assertEqual(len(images), num_images_per_prompt)
1642self.assertEqual(images[-1].shape, (64, 64, 3))
1643
1644# Same test for several prompts
1645num_prompts = 3
1646inputs["prompt"] = [prompt] * num_prompts
1647
1648images = sd_pipe(
1649batch_size=batch_size,
1650num_images_per_prompt=num_images_per_prompt,
1651**inputs,
1652).images
1653
1654self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1655self.assertEqual(images[-1].shape, (64, 64, 3))
1656
1657inputs["prompt"] = prompt
1658# Test batch_size when it is not a divider of the total number of generated images for a single prompt
1659num_images_per_prompt = 7
1660images = sd_pipe(
1661batch_size=batch_size,
1662num_images_per_prompt=num_images_per_prompt,
1663**inputs,
1664).images
1665
1666self.assertEqual(len(images), num_images_per_prompt)
1667self.assertEqual(images[-1].shape, (64, 64, 3))
1668
1669# Same test for several prompts
1670num_prompts = 2
1671inputs["prompt"] = [prompt] * num_prompts
1672images = sd_pipe(batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, **inputs).images
1673
1674self.assertEqual(len(images), num_prompts * num_images_per_prompt)
1675self.assertEqual(images[-1].shape, (64, 64, 3))
1676
1677def test_stable_diffusion_multicontrolnet_bf16(self):
1678"""Test that stable diffusion works with bf16"""
1679components = self.get_dummy_components()
1680gaudi_config = GaudiConfig()
1681
1682sd_pipe = GaudiStableDiffusionControlNetPipeline(
1683use_habana=True,
1684gaudi_config=gaudi_config,
1685**components,
1686)
1687sd_pipe.set_progress_bar_config(disable=None)
1688
1689inputs = self.get_dummy_inputs(device="cpu")
1690image = sd_pipe(**inputs).images[0]
1691
1692self.assertEqual(image.shape, (64, 64, 3))
1693
1694def test_stable_diffusion_multicontrolnet_default(self):
1695components = self.get_dummy_components()
1696
1697sd_pipe = GaudiStableDiffusionControlNetPipeline(
1698use_habana=True,
1699gaudi_config="Habana/stable-diffusion",
1700**components,
1701)
1702sd_pipe.set_progress_bar_config(disable=None)
1703
1704inputs = self.get_dummy_inputs(device="cpu")
1705inputs["prompt"] = [inputs["prompt"]] * 2
1706images = sd_pipe(
1707batch_size=3,
1708num_images_per_prompt=5,
1709**inputs,
1710).images
1711
1712self.assertEqual(len(images), 10)
1713self.assertEqual(images[-1].shape, (64, 64, 3))
1714
1715def test_stable_diffusion_multicontrolnet_hpu_graphs(self):
1716components = self.get_dummy_components()
1717
1718sd_pipe = GaudiStableDiffusionControlNetPipeline(
1719use_habana=True,
1720use_hpu_graphs=True,
1721gaudi_config="Habana/stable-diffusion",
1722**components,
1723)
1724sd_pipe.set_progress_bar_config(disable=None)
1725
1726inputs = self.get_dummy_inputs(device="cpu")
1727inputs["prompt"] = [inputs["prompt"]] * 2
1728
1729images = sd_pipe(
1730batch_size=3,
1731num_images_per_prompt=5,
1732**inputs,
1733).images
1734
1735self.assertEqual(len(images), 10)
1736self.assertEqual(images[-1].shape, (64, 64, 3))
1737
1738
1739class TrainTextToImage(TestCase):
1740"""
1741Tests the Stable Diffusion text_to_image Training for Gaudi.
1742"""
1743
1744def test_train_text_to_image_script(self):
1745path_to_script = (
1746Path(os.path.dirname(__file__)).parent
1747/ "examples"
1748/ "stable-diffusion"
1749/ "training"
1750/ "train_text_to_image_sdxl.py"
1751)
1752
1753cmd_line = f"""ls {path_to_script}""".split()
1754
1755# check find existence
1756p = subprocess.Popen(cmd_line)
1757return_code = p.wait()
1758
1759# Ensure the run finished without any issue
1760self.assertEqual(return_code, 0)
1761
1762@slow
1763def test_train_text_to_image_sdxl(self):
1764with tempfile.TemporaryDirectory() as tmpdir:
1765path_to_script = (
1766Path(os.path.dirname(__file__)).parent
1767/ "examples"
1768/ "stable-diffusion"
1769/ "training"
1770/ "train_text_to_image_sdxl.py"
1771)
1772
1773cmd_line = f"""
1774python3
1775{path_to_script}
1776--pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0
1777--pretrained_vae_model_name_or_path stabilityai/sdxl-vae
1778--dataset_name lambdalabs/pokemon-blip-captions
1779--resolution 64
1780--center_crop
1781--random_flip
1782--proportion_empty_prompts=0.2
1783--train_batch_size 1
1784--gradient_accumulation_steps 4
1785--learning_rate 1e-05
1786--max_grad_norm 1
1787--lr_scheduler constant
1788--lr_warmup_steps 0
1789--gaudi_config_name Habana/stable-diffusion
1790--throughput_warmup_steps 3
1791--use_hpu_graphs
1792--bf16
1793--max_train_steps 2
1794--output_dir {tmpdir}
1795""".split()
1796
1797# Run train_text_to_image_sdxl.y
1798p = subprocess.Popen(cmd_line)
1799return_code = p.wait()
1800
1801# Ensure the run finished without any issue
1802self.assertEqual(return_code, 0)
1803
1804# save_pretrained smoke test
1805self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
1806self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
1807
1808@slow
1809def test_train_text_to_image_sdxl_lora(self):
1810with tempfile.TemporaryDirectory() as tmpdir:
1811path_to_script = (
1812Path(os.path.dirname(__file__)).parent
1813/ "examples"
1814/ "stable-diffusion"
1815/ "training"
1816/ "train_text_to_image_sdxl.py"
1817)
1818
1819cmd_line = f"""
1820python3
1821{path_to_script}
1822--pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0
1823--pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix
1824--dataset_name=lambdalabs/pokemon-blip-captions
1825--caption_column=text
1826--resolution=64
1827--random_flip
1828--train_batch_size=1
1829--learning_rate=1e-04
1830--lr_scheduler=constant
1831--lr_warmup_steps=0
1832--seed=42
1833--finetuning_method=lora
1834--gaudi_config_name=Habana/stable-diffusion
1835--throughput_warmup_steps=3
1836--use_hpu_graphs
1837--bf16
1838--max_train_steps 2
1839--output_dir {tmpdir}
1840""".split()
1841
1842# Run train_text_to_image_lora.py
1843p = subprocess.Popen(cmd_line)
1844return_code = p.wait()
1845
1846# Ensure the run finished without any issue
1847self.assertEqual(return_code, 0)
1848
1849# save_pretrained smoke test
1850self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
1851
1852
1853class TrainControlNet(TestCase):
1854"""
1855Tests the train_controlnet.py script for Gaudi.
1856"""
1857
1858def test_train_controlnet_script(self):
1859path_to_script = (
1860Path(os.path.dirname(__file__)).parent
1861/ "examples"
1862/ "stable-diffusion"
1863/ "training"
1864/ "train_controlnet.py"
1865)
1866
1867cmd_line = f"""ls {path_to_script}""".split()
1868
1869# check find existence
1870p = subprocess.Popen(cmd_line)
1871return_code = p.wait()
1872
1873# Ensure the run finished without any issue
1874self.assertEqual(return_code, 0)
1875
1876@slow
1877def test_train_controlnet(self):
1878with tempfile.TemporaryDirectory() as tmpdir:
1879path_to_script = (
1880Path(os.path.dirname(__file__)).parent
1881/ "examples"
1882/ "stable-diffusion"
1883/ "training"
1884/ "train_controlnet.py"
1885)
1886
1887download_files(
1888[
1889"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png",
1890"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png",
1891],
1892path=tmpdir,
1893)
1894
1895cmd_line = f"""
1896python3
1897{path_to_script.parent.parent.parent / 'gaudi_spawn.py'}
1898--use_mpi
1899--world_size 8
1900{path_to_script}
1901--pretrained_model_name_or_path runwayml/stable-diffusion-v1-5
1902--dataset_name fusing/fill50k
1903--resolution 512
1904--train_batch_size 4
1905--learning_rate 1e-05
1906--validation_steps 1000
1907--validation_image "{tmpdir}/conditioning_image_1.png" "{tmpdir}/conditioning_image_2.png"
1908--validation_prompt "red circle with blue background" "cyan circle with brown floral background"
1909--checkpointing_steps 1000
1910--throughput_warmup_steps 3
1911--use_hpu_graphs
1912--bf16
1913--num_train_epochs 1
1914--output_dir {tmpdir}
1915""".split()
1916
1917# Run train_controlnet.y
1918p = subprocess.Popen(cmd_line)
1919return_code = p.wait()
1920
1921# Ensure the run finished without any issue
1922self.assertEqual(return_code, 0)
1923
1924# Assess throughput
1925with open(Path(tmpdir) / "speed_metrics.json") as fp:
1926results = json.load(fp)
1927self.assertGreaterEqual(results["train_samples_per_second"], 0.95 * CONTROLNET_THROUGHPUT)
1928self.assertLessEqual(results["train_runtime"], 1.05 * CONTROLNET_RUNTIME)
1929
1930# Assess generated image
1931controlnet = ControlNetModel.from_pretrained(tmpdir, torch_dtype=torch.bfloat16)
1932pipe = GaudiStableDiffusionControlNetPipeline.from_pretrained(
1933"runwayml/stable-diffusion-v1-5",
1934controlnet=controlnet,
1935torch_dtype=torch.bfloat16,
1936use_habana=True,
1937use_hpu_graphs=True,
1938gaudi_config=GaudiConfig(use_habana_mixed_precision=False),
1939)
1940pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
1941
1942control_image = load_image(f"{tmpdir}/conditioning_image_1.png")
1943prompt = "pale golden rod circle with old lace background"
1944
1945generator = set_seed(27)
1946image = pipe(
1947prompt, num_inference_steps=20, generator=generator, image=control_image, output_type="np"
1948).images[0]
1949
1950self.assertEqual(image.shape, (512, 512, 3))
1951