
Commit 5975c2a

Add TextToImage and StableDiffusion3TextToImage
1 parent 49def20 commit 5975c2a

15 files changed: +834, -40 lines changed

keras_nlp/api/models/__init__.py

Lines changed: 7 additions & 0 deletions

@@ -220,9 +220,16 @@
 )
 from keras_nlp.src.models.roberta.roberta_tokenizer import RobertaTokenizer
 from keras_nlp.src.models.seq_2_seq_lm import Seq2SeqLM
+from keras_nlp.src.models.stable_diffusion_v3.stable_diffusion_3_backbone import (
+    StableDiffusion3Backbone,
+)
+from keras_nlp.src.models.stable_diffusion_v3.stable_diffusion_3_text_to_image import (
+    StableDiffusion3TextToImage,
+)
 from keras_nlp.src.models.t5.t5_backbone import T5Backbone
 from keras_nlp.src.models.t5.t5_tokenizer import T5Tokenizer
 from keras_nlp.src.models.task import Task
+from keras_nlp.src.models.text_to_image import TextToImage
 from keras_nlp.src.models.vgg.vgg_backbone import VGGBackbone
 from keras_nlp.src.models.vgg.vgg_image_classifier import VGGImageClassifier
 from keras_nlp.src.models.vit_det.vit_det_backbone import ViTDetBackbone
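For orientation, a minimal usage sketch of the newly exported task. The preset name and the from_preset/generate calls below follow KerasNLP's usual task conventions and are assumptions, not something shown in this diff:

from keras_nlp.models import StableDiffusion3TextToImage

# Hypothetical preset name; presets are not part of this commit.
text_to_image = StableDiffusion3TextToImage.from_preset("stable_diffusion_3_medium")

# Generate an image for a text prompt.
image = text_to_image.generate("a photograph of an astronaut riding a horse")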

keras_nlp/src/models/stable_diffusion_v3/clip_encoder_block.py

Lines changed: 17 additions & 3 deletions

@@ -19,6 +19,20 @@ def quick_gelu(x):
     return x * ops.sigmoid(1.702 * x)


+class CLIPMultiHeadAttention(layers.MultiHeadAttention):
+    # We should set compute_dtype to be float32 in Softmax.
+    # TODO: We can fix this upstream.
+    def _build_attention(self, rank):
+        super()._build_attention(rank)
+        self._softmax.dtype_policy = "float32"
+
+    def _masked_softmax(self, attention_scores, attention_mask=None):
+        attention_scores = super()._masked_softmax(
+            attention_scores, attention_mask
+        )
+        return ops.cast(attention_scores, self.compute_dtype)
+
+
 class CLIPEncoderBlock(layers.Layer):
     def __init__(
         self,
@@ -43,16 +57,16 @@ def __init__(
             intermediate_activation = quick_gelu

         self.layer_norm_1 = layers.LayerNormalization(
-            epsilon=0.00001, dtype=self.dtype_policy, name="layer_norm_1"
+            epsilon=1e-5, dtype="float32", name="layer_norm_1"
         )
-        self.attention = layers.MultiHeadAttention(
+        self.attention = CLIPMultiHeadAttention(
             num_heads,
             hidden_dim // num_heads,
             dtype=self.dtype_policy,
             name="attention",
         )
         self.layer_norm_2 = layers.LayerNormalization(
-            epsilon=0.00001, dtype=self.dtype_policy, name="layer_norm_2"
+            epsilon=1e-5, dtype="float32", name="layer_norm_2"
         )
         self.dense_1 = layers.Dense(
             self.intermediate_dim, dtype=self.dtype_policy, name="dense_1"
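The softmax over attention logits is where half precision loses accuracy, so the subclass above pins only that sub-layer to float32 and casts the probabilities back to the compute dtype. A quick check sketch, not part of the commit (shapes and the mixed-precision setup are illustrative):

import numpy as np
import keras
from keras_nlp.src.models.stable_diffusion_v3.clip_encoder_block import (
    CLIPMultiHeadAttention,
)

keras.mixed_precision.set_global_policy("mixed_float16")

attention = CLIPMultiHeadAttention(num_heads=8, key_dim=64)
x = np.random.rand(1, 77, 512).astype("float16")
y = attention(x, x)

print(attention._softmax.dtype_policy)  # pinned to float32 by _build_attention
print(y.dtype)  # float16: the layer's compute dtype is unchanged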

keras_nlp/src/models/stable_diffusion_v3/clip_preprocessor.py

Lines changed: 2 additions & 2 deletions

@@ -36,9 +36,9 @@ def __init__(
         tokenizer,
         sequence_length=77,
         add_start_token=True,
-        add_end_token=False,
+        add_end_token=True,
         to_lower=True,
-        pad_with_end_token=True,
+        pad_with_end_token=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
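The defaults flip here: a prompt now gets an explicit end token appended, and padding no longer repeats the end token. On toy token ids (invented values; the real CLIP start/end/pad ids differ), the before/after looks roughly like this:

# Toy illustration of the changed defaults; ids are made up.
start, end, pad = 1, 2, 0
prompt_ids = [5, 6, 7]
sequence_length = 8

# Old defaults: add_end_token=False, pad_with_end_token=True.
old = [start] + prompt_ids
old += [end] * (sequence_length - len(old))   # [1, 5, 6, 7, 2, 2, 2, 2]

# New defaults: add_end_token=True, pad_with_end_token=False.
new = [start] + prompt_ids + [end]
new += [pad] * (sequence_length - len(new))   # [1, 5, 6, 7, 2, 0, 0, 0]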

keras_nlp/src/models/stable_diffusion_v3/clip_text_encoder.py

Lines changed: 45 additions & 15 deletions

@@ -23,6 +23,44 @@
 )


+class Projection(layers.Layer):
+    def __init__(self, hidden_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_dim = int(hidden_dim)
+
+        self.text_projection = layers.Dense(
+            hidden_dim,
+            use_bias=False,
+            dtype=self.dtype_policy,
+            name="text_projection",
+        )
+
+    def build(self, inputs_shape, token_ids_shape):
+        inputs_shape = list(inputs_shape)
+        self.text_projection.build([None, inputs_shape[-1]])
+        self.text_projection._kernel.assign(
+            ops.transpose(ops.eye(self.hidden_dim), (1, 0))
+        )
+
+    def call(self, inputs, token_ids):
+        indices = ops.expand_dims(
+            ops.cast(ops.argmax(token_ids, axis=-1), "int32"), axis=-1
+        )
+        pooled_output = ops.take_along_axis(inputs, indices[:, :, None], axis=1)
+        pooled_output = ops.squeeze(pooled_output, axis=1)
+        projection_output = self.text_projection(pooled_output)
+        return projection_output, pooled_output
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+            }
+        )
+        return config
+
+
 class CLIPTextEncoder(keras.Model):
     def __init__(
         self,
@@ -63,13 +101,10 @@ def __init__(
             for _ in range(num_layers)
         ]
         self.layer_norm = layers.LayerNormalization(
-            epsilon=0.00001, dtype=dtype, name="layer_norm"
+            epsilon=1e-6, dtype="float32", name="layer_norm"
         )
-        self.text_projection = layers.Dense(
-            hidden_dim,
-            use_bias=False,
-            dtype=dtype,
-            name="text_projection",
+        self.text_projection = Projection(
+            hidden_dim, dtype=dtype, name="text_projection"
         )

         # === Functional Model ===
@@ -78,24 +113,19 @@ def __init__(
         )
         x = self.embedding(encoder_token_ids)
         encoder_intermediate_output = None
+
         # Encoder.
         for i, block in enumerate(self.encoder_layers):
             x = block(x)
             if i == intermediate_output_index:
                 encoder_intermediate_output = x
         x = self.layer_norm(x)
         encoder_output = x
-        if encoder_intermediate_output is not None:
-            encoder_intermediate_output = self.layer_norm(
-                encoder_intermediate_output
-            )
+
         # Projection.
-        indices = ops.expand_dims(
-            ops.cast(ops.argmax(encoder_token_ids, axis=-1), "int32"), axis=-1
+        projection_output, pooled_output = self.text_projection(
+            x, encoder_token_ids
         )
-        pooled_output = ops.take_along_axis(x, indices[:, :, None], axis=1)
-        pooled_output = ops.squeeze(pooled_output, axis=1)
-        projection_output = self.text_projection(pooled_output)

         outputs = {
             "encoder_sequence_output": encoder_output,
Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+# Copyright 2024 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from keras import ops
+
+
+class FlowMatchEulerDiscreteScheduler:
+    def __init__(self, num_train_timesteps=1000, shift=1.0):
+        self.num_train_timesteps = int(num_train_timesteps)
+        self.shift = float(shift)
+
+        timesteps = ops.linspace(
+            1, num_train_timesteps, num_train_timesteps, dtype="float32"
+        )
+        timesteps = ops.flip(timesteps, axis=0)
+        sigmas = self.timestep_to_sigma(timesteps)
+
+        self.timesteps = ops.multiply(sigmas, num_train_timesteps)
+        self.sigma_min = sigmas[-1]
+        self.sigma_max = sigmas[0]
+
+    def sigma_to_timestep(self, sigma):
+        return sigma * self.num_train_timesteps
+
+    def timestep_to_sigma(self, timestep):
+        sigma = ops.divide(timestep, self.num_train_timesteps)
+        if self.shift != 1.0:
+            sigma = ops.divide(
+                ops.multiply(self.shift, sigma),
+                ops.add(1, ops.multiply(self.shift - 1.0, sigma)),
+            )
+        return sigma
+
+    def get_sigma(self, step, num_steps):
+        start = self.sigma_to_timestep(self.sigma_max)
+        end = self.sigma_to_timestep(self.sigma_min)
+        step_size = ops.divide(
+            ops.subtract(end, start), ops.subtract(num_steps, 1)
+        )
+        result_timestep = ops.add(start, ops.multiply(step, step_size))
+        result_sigma = self.timestep_to_sigma(result_timestep)
+        return ops.maximum(result_sigma, 0.0)
+
+    def step(self, latents, noise_residual, sigma, sigma_next):
+        return latents + (sigma_next - sigma) * noise_residual
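The scheduler maps a timestep t to a sigma via sigma = t / num_train_timesteps, optionally shifted to shift * sigma / (1 + (shift - 1) * sigma), and step is a plain Euler update from sigma toward sigma_next. A minimal sampling-loop sketch, assuming the class is importable and with a placeholder standing in for the MMDiT noise prediction (the real loop lives in the text-to-image task, not here):

import numpy as np
from keras import ops

def predict_noise(latents, timestep):
    # Placeholder for the diffusion model's forward pass.
    return ops.zeros_like(latents)

# shift=3.0 and the latent shape are illustrative values, not taken from this diff.
scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0)
num_steps = 28
latents = np.random.normal(size=(1, 64, 64, 16)).astype("float32")

for step in range(num_steps):
    sigma = scheduler.get_sigma(step, num_steps)
    sigma_next = scheduler.get_sigma(step + 1, num_steps)  # get_sigma floors the result at 0.0
    timestep = scheduler.sigma_to_timestep(sigma)
    noise_residual = predict_noise(latents, timestep)
    # Euler step: move the latents from sigma toward sigma_next.
    latents = scheduler.step(latents, noise_residual, sigma, sigma_next)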

keras_nlp/src/models/stable_diffusion_v3/mmdit.py

Lines changed: 2 additions & 2 deletions

@@ -186,7 +186,7 @@ def __init__(self, hidden_dim, output_dim, **kwargs):
             epsilon=1e-6,
             center=False,
             scale=False,
-            dtype=self.dtype_policy,
+            dtype="float32",
             name="norm",
         )
         self.output_dense = layers.Dense(
@@ -274,7 +274,7 @@ def __init__(
         output_dim,
         mlp_ratio=4.0,
         latent_shape=(64, 64, 16),
-        context_shape=(1024, 4096),
+        context_shape=(None, 4096),
         pooled_projection_shape=(2048,),
         data_format=None,
         dtype=None,

keras_nlp/src/models/stable_diffusion_v3/mmdit_block.py

Lines changed: 6 additions & 3 deletions

@@ -55,7 +55,7 @@ def __init__(
             epsilon=1e-6,
             center=False,
             scale=False,
-            dtype=self.dtype_policy,
+            dtype="float32",
             name="norm1",
         )
         self.attention_qkv = layers.Dense(
@@ -69,7 +69,7 @@
             epsilon=1e-6,
             center=False,
             scale=False,
-            dtype=self.dtype_policy,
+            dtype="float32",
             name="norm2",
         )
         self.mlp = models.Sequential(
@@ -230,6 +230,7 @@ def __init__(
             dtype=self.dtype_policy,
             name="context_block",
         )
+        self.softmax = layers.Softmax(dtype="float32")

     def build(self, inputs_shape, context_shape, timestep_embedding_shape):
         self.x_block.build(inputs_shape, timestep_embedding_shape)
@@ -240,7 +241,9 @@ def _compute_attention(self, query, key, value):
             query, ops.cast(self._inverse_sqrt_key_dim, query.dtype)
         )
         attention_scores = ops.einsum(self._dot_product_equation, key, query)
-        attention_scores = ops.nn.softmax(attention_scores, axis=-1)
+        original_dtype = attention_scores.dtype
+        attention_scores = self.softmax(attention_scores)
+        attention_scores = ops.cast(attention_scores, original_dtype)
         attention_output = ops.einsum(
             self._combine_equation, attention_scores, value
         )
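This is the same precision fix applied to the block's hand-rolled einsum attention: probabilities come from a dedicated layers.Softmax pinned to float32 and are then cast back so the value einsum stays in the compute dtype. A standalone sketch of the pattern (shapes are illustrative):

import numpy as np
from keras import layers, ops

# Softmax sub-layer pinned to float32, as in the diff above.
softmax = layers.Softmax(dtype="float32")

# Half-precision attention scores with shape (batch, heads, q_len, k_len).
scores = np.random.normal(size=(1, 8, 77, 77)).astype("float16")

probs = softmax(scores)                 # computed in float32
probs = ops.cast(probs, scores.dtype)   # cast back to float16 for the value einsum
print(probs.dtype)  # float16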
