Skip to content

Commit 0a15908

Browse files
committed
add logging
1 parent 3575636 commit 0a15908

File tree

2 files changed

+12
-12
lines changed

2 files changed

+12
-12
lines changed

keras_hub/src/models/smollm3/smollm3_backbone.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,15 @@ def __init__(
7777
output_dim=hidden_dim,
7878
name="token_embedding",
7979
)
80+
81+
self.rotary_embedding = SmolLM3RotaryEmbedding(
82+
hidden_size=hidden_dim,
83+
num_attention_heads=num_attention_heads,
84+
max_position_embeddings=max_position_embeddings,
85+
rope_theta=rope_theta,
86+
partial_rotary_factor=partial_rotary_factor,
87+
)
88+
8089
self.transformer_layers = []
8190
for i in range(num_layers):
8291
layer = SmolLM3DecoderLayer(
@@ -100,14 +109,6 @@ def __init__(
100109
name="sequence_output_layernorm",
101110
)
102111

103-
self.rotary_embedding = SmolLM3RotaryEmbedding(
104-
hidden_size=hidden_dim,
105-
num_attention_heads=num_attention_heads,
106-
max_position_embeddings=max_position_embeddings,
107-
rope_theta=rope_theta,
108-
partial_rotary_factor=partial_rotary_factor,
109-
)
110-
111112
# === Functional Model ===
112113
token_id_input = keras.Input(
113114
shape=(None,), dtype="int32", name="token_ids"

keras_hub/src/models/smollm3/smollm3_causal_lm.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,11 @@ def call_with_cache(
7070
x = self.backbone.token_embedding(token_ids)
7171

7272
# Each decoder layer has a cache; we update them separately.
73-
7473
updated_cache = []
74+
position_embeddings = self.backbone.rotary_embedding(x, start_index=cache_update_index)
7575
for i in range(self.backbone.num_layers):
76-
position_embeddings = self.backbone.rotary_embedding(x, start_index=cache_update_index)
7776
current_cache = cache[:, i, ...]
77+
print(x.shape)
7878
x, next_cache = self.backbone.transformer_layers[i](
7979
x,
8080
position_embeddings=position_embeddings,
@@ -103,9 +103,8 @@ def _build_cache(self, token_ids):
103103
head_dim,
104104
]
105105
cache = ops.zeros(shape, dtype=self.compute_dtype)
106-
index = ops.convert_to_tensor(0, dtype="int32")
107106
# Seed the cache.
108-
_, hidden_states, cache = self.call_with_cache(token_ids, cache, index)
107+
_, hidden_states, cache = self.call_with_cache(token_ids, cache, 0)
109108
return hidden_states, cache
110109

111110
def generate_step(

0 commit comments

Comments (0)