Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions torchchat/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,9 @@ def decode_n_tokens(
**sampling_kwargs,
)
input_pos += 1
if os.getenv('DEBUG_CACHE'):
print(f"final token input_pos: {input_pos}")
yield cur_token.clone(), next_prob.clone()
break

if not encountered_eos:
Expand Down Expand Up @@ -1170,6 +1173,7 @@ def callback(x, *, done_generating=False):
prof = torch.profiler.profile()
t0 = time.perf_counter()
num_tokens_generated = 0
local_token_tensor = []
with prof:
generator_func = self.generate(
self.model,
Expand All @@ -1191,6 +1195,9 @@ def callback(x, *, done_generating=False):
start_pos += encoded.size(0)
for token_tensor, metrics in generator_func:
if token_tensor is not None:
if os.getenv('DEBUG_CACHE'):
print(f"Token tensor: {token_tensor}")
local_token_tensor.append(token_tensor.tolist()[0])
start_pos += token_tensor.size(0)
num_tokens_generated += token_tensor.size(0)
if metrics is not None:
Expand All @@ -1199,6 +1206,9 @@ def callback(x, *, done_generating=False):
jit_compile = is_first_sample and (
generator_args.compile or generator_args.compile_prefill
)
if os.getenv('DEBUG_CACHE'):
print(f"local_token_tensor: {local_token_tensor}")
print(self.tokenizer.decode(local_token_tensor))
compilation_time = time.perf_counter() - t0
device_sync(device=self.builder_args.device)
t = time.perf_counter() - t0
Expand Down
2 changes: 2 additions & 0 deletions torchchat/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,8 @@ def distribute(self, device_mesh: DeviceMesh):

def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int = 0) -> Tensor:
assert self.freqs_cis is not None, "Caches must be initialized first"
if os.getenv('DEBUG_CACHE'):
print("Transformer forward input pos", input_pos)
mask = self.causal_mask[None, None, input_pos]
freqs_cis = self.freqs_cis[input_pos]
if self.tok_embeddings:
Expand Down