Description
In the latest commit to mosaicml/mpt-7b, https://huggingface.co/mosaicml/mpt-7b/commit/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984, BigDL throws the error below when generating text.
INFO 2024-02-20 06:41:05,962 proxy 172.17.0.2 0dfd2310-daba-4d40-8b27-6ccbbd608fd2 /mpt-7b-bigdl router.py:959 - Using router <class 'ray.serve._private.router.PowerOfTwoChoicesReplicaScheduler'>.
INFO 2024-02-20 06:41:05,978 proxy 172.17.0.2 router.py:496 - Got updated replicas for deployment 'PredictorDeployment' in application 'mpt-7b-bigdl': {'mpt-7b-bigdl#PredictorDeployment#jBsKpA'}.
ERROR 2024-02-20 06:41:06,031 proxy 172.17.0.2 0dfd2310-daba-4d40-8b27-6ccbbd608fd2 /mpt-7b-bigdl proxy.py:1045 - ray::ServeReplica:mpt-7b-bigdl:PredictorDeployment.handle_request_streaming() (pid=8543, ip=172.17.0.2, actor_id=55a8f57651e95e4bfc9348b101000000, repr=<ray.serve._private.replica.ServeReplica:mpt-7b-bigdl:PredictorDeployment object at 0x7fa349fc2d30>)
async for result in generator:
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 338, in _handle_http_request_generator
raise e from None
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 914, in call_user_method
raise e from None
ray.exceptions.RayTaskError: ray::ServeReplica:mpt-7b-bigdl:PredictorDeployment.handle_request_streaming() (pid=8543, ip=172.17.0.2)
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/utils.py", line 165, in wrap_to_ray_error
raise exception
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 895, in call_user_method
result = await method_to_call(*request_args, **request_kwargs)
File "/root/llm-on-ray/inference/predictor_deployment.py", line 121, in call
return self.predictor.generate(prompts, **config)
File "/root/llm-on-ray/inference/transformer_predictor.py", line 122, in generate
gen_tokens = self.model.generate(
File "/opt/conda/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 1538, in generate
return self.greedy_search(
File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 2362, in greedy_search
outputs = self(
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/modeling_mpt.py", line 436, in forward
outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/modeling_mpt.py", line 357, in forward
(x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/blocks.py", line 40, in forward
(b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
TypeError: mpt_multihead_attention_forward() got an unexpected keyword argument 'rotary_emb_w_meta_info'
Traceback (most recent call last):
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy.py", line 979, in send_request_to_replica
async for asgi_message_batch in response_generator:
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy_response_generator.py", line 111, in anext
raise e from None
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy_response_generator.py", line 91, in anext
result = await self._get_next_streaming_result()
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy_response_generator.py", line 134, in _get_next_streaming_result
return next_result_task.result()
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/proxy_response_generator.py", line 116, in _await_response_anext
return await self._response.__anext__()
File "/opt/conda/lib/python3.9/site-packages/ray/serve/handle.py", line 781, in __anext__
return await next_obj_ref
ray.exceptions.RayTaskError(TypeError): ray::ServeReplica:mpt-7b-bigdl:PredictorDeployment.handle_request_streaming() (pid=8543, ip=172.17.0.2, actor_id=55a8f57651e95e4bfc9348b101000000, repr=<ray.serve._private.replica.ServeReplica:mpt-7b-bigdl:PredictorDeployment object at 0x7fa349fc2d30>)
async for result in generator:
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 338, in _handle_http_request_generator
raise e from None
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 914, in call_user_method
raise e from None
ray.exceptions.RayTaskError: ray::ServeReplica:mpt-7b-bigdl:PredictorDeployment.handle_request_streaming() (pid=8543, ip=172.17.0.2)
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/utils.py", line 165, in wrap_to_ray_error
raise exception
File "/opt/conda/lib/python3.9/site-packages/ray/serve/_private/replica.py", line 895, in call_user_method
result = await method_to_call(*request_args, **request_kwargs)
File "/root/llm-on-ray/inference/predictor_deployment.py", line 121, in call
return self.predictor.generate(prompts, **config)
File "/root/llm-on-ray/inference/transformer_predictor.py", line 122, in generate
gen_tokens = self.model.generate(
File "/opt/conda/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 1538, in generate
return self.greedy_search(
File "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py", line 2362, in greedy_search
outputs = self(
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/modeling_mpt.py", line 436, in forward
outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, inputs_embeds=inputs_embeds)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/modeling_mpt.py", line 357, in forward
(x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/67cf22a4e6809edb7308dd0a2ae2c1ffb86f4984/blocks.py", line 40, in forward
(b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, alibi_slopes=alibi_slopes, flash_attn_padding_info=flash_attn_padding_info)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
TypeError: mpt_multihead_attention_forward() got an unexpected keyword argument 'rotary_emb_w_meta_info'
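The failure mode is visible at the bottom of both tracebacks: the 67cf22a4 revision of modeling_mpt.py now forwards a `rotary_emb_w_meta_info` keyword into every attention block, while BigDL's patched attention entry point, `mpt_multihead_attention_forward`, predates that keyword and rejects it. Until the BigDL patch is updated, one workaround is to pin the model to an earlier revision so the remote code still matches the patched signature. A minimal sketch, assuming a known-good commit SHA (the placeholder below is not a real revision):

```python
# Minimal sketch: pin mosaicml/mpt-7b to a revision that predates the
# `rotary_emb_w_meta_info` keyword so BigDL's patched attention still matches.
from transformers import AutoModelForCausalLM, AutoTokenizer

revision = "<last-known-good-sha>"  # assumption: replace with a real commit hash

tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b", revision=revision)
model = AutoModelForCausalLM.from_pretrained(
    "mosaicml/mpt-7b",
    trust_remote_code=True,  # MPT ships custom modeling code on the Hub
    revision=revision,
)
```

Pinning `revision` also protects the deployment from future remote-code changes landing mid-flight.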
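Alternatively, the crash could be avoided on the BigDL side by making the patched forward tolerant of keyword arguments it does not recognize. The sketch below only illustrates the shape of such a shim, it is not BigDL's actual fix, and silently dropping `rotary_emb_w_meta_info` is only reasonable when the model is not configured to use rotary embeddings (MPT-7B's default attention uses ALiBi, in which case the new argument carries nothing):

```python
import functools
import inspect

def tolerate_unknown_kwargs(fn):
    """Wrap `fn` so keyword arguments absent from its signature are dropped
    instead of raising TypeError. Illustrative only."""
    accepted = set(inspect.signature(fn).parameters)

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        return fn(*args, **{k: v for k, v in kwargs.items() if k in accepted})

    return wrapper

# Hypothetical usage against the symbol named in the traceback; the import
# path is an assumption and may differ between BigDL versions:
# import bigdl.llm.transformers.models.mpt as mpt_patch
# mpt_patch.mpt_multihead_attention_forward = tolerate_unknown_kwargs(
#     mpt_patch.mpt_multihead_attention_forward
# )
```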