15 changes: 9 additions & 6 deletions examples/inference/api_server_openai/query_http_requests.py
@@ -35,17 +35,21 @@
     help="Whether to enable streaming response",
 )
 parser.add_argument(
-    "--max_new_tokens", default=None, help="The maximum numbers of tokens to generate"
+    "--max_new_tokens", default=256, help="The maximum numbers of tokens to generate"
 )
 parser.add_argument(
-    "--temperature", default=None, help="The value used to modulate the next token probabilities"
+    "--temperature", default=0.2, help="The value used to modulate the next token probabilities"
 )
 parser.add_argument(
     "--top_p",
-    default=None,
+    default=0.7,
     help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation",
 )
-
+parser.add_argument(
+    "--input_text",
+    default="Tell me a long story with many words.",
+    help="question to ask model",
+)
 args = parser.parse_args()

 s = requests.Session()
@@ -54,8 +58,7 @@
 body = {
     "model": args.model_name,
     "messages": [
-        {"role": "assistant", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "Tell me a long story with many words."},
+        {"role": "user", "content": args.input_text},
     ],
     "stream": args.streaming_response,
     "max_tokens": args.max_new_tokens,
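With these changes the prompt and the sampling parameters are all ordinary CLI arguments instead of hardcoded values, and the hardcoded "assistant"-role message (which had been playing the part of a system prompt) is dropped rather than converted to a "system" message. The sketch below shows the kind of request the script now assembles when run with its new defaults; the endpoint URL and model name are placeholders for illustration, not values taken from this PR, and the response handling assumes an OpenAI-compatible chat completion payload.

import requests

# Placeholder values: the real script reads the endpoint and model name from
# its own CLI arguments; only the body fields below come from this diff.
endpoint = "http://localhost:8000/v1/chat/completions"

body = {
    "model": "my-model",    # args.model_name (placeholder value)
    "messages": [
        # The prompt is no longer hardcoded; it comes from --input_text.
        {"role": "user", "content": "Tell me a long story with many words."},
    ],
    "stream": False,        # --streaming_response
    "max_tokens": 256,      # new default for --max_new_tokens
    "temperature": 0.2,     # new default for --temperature
    "top_p": 0.7,           # new default for --top_p
}

response = requests.post(endpoint, json=body)
print(response.json()["choices"][0]["message"]["content"])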
2 changes: 1 addition & 1 deletion inference/api_openai_backend/request_handler.py
@@ -38,7 +38,7 @@
 from fastapi import status, HTTPException, Request
 from starlette.responses import JSONResponse
 from pydantic import ValidationError as PydanticValidationError
-from logger import get_logger
+from inference.logger import get_logger
 from .openai_protocol import Prompt, ModelResponse, ErrorResponse, FinishReason

 logger = get_logger(__name__)
2 changes: 1 addition & 1 deletion inference/deepspeed_predictor.py
@@ -13,7 +13,7 @@
 from typing import List
 import os
 from predictor import Predictor
-from utils import get_torch_dtype
+from inference.utils import get_torch_dtype
 from inference.inference_config import (
     InferenceConfig,
     GenerateResult,
2 changes: 1 addition & 1 deletion inference/serve.py
@@ -16,7 +16,7 @@

 import ray
 import sys
-from utils import get_deployment_actor_options
+from inference.utils import get_deployment_actor_options
 from pydantic_yaml import parse_yaml_raw_as
 from api_server_simple import serve_run
 from api_server_openai import openai_serve_run
2 changes: 1 addition & 1 deletion tests/inference/test_utils.py
@@ -1,7 +1,7 @@
 import pytest
 import torch

-from utils import (
+from inference.utils import (
     get_deployment_actor_options,
     StoppingCriteriaSub,
     max_input_len,
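The four remaining files all make the same fix: bare, directory-relative imports (from utils import ..., from logger import ...) become package-qualified imports rooted at the inference package, so they resolve when the repository root rather than the inference/ directory is on sys.path, which is the situation the tests under tests/ run in. The snippet below is a self-contained toy that mimics that layout and shows why the qualified form is the more robust one; the directory name and stub function body are hypothetical stand-ins, not code from this repository.

import sys
from pathlib import Path

# Build a throwaway package that mimics the assumed repository layout:
#   demo_repo/inference/__init__.py
#   demo_repo/inference/utils.py
pkg = Path("demo_repo") / "inference"
pkg.mkdir(parents=True, exist_ok=True)
(pkg / "__init__.py").write_text("")
(pkg / "utils.py").write_text("def get_torch_dtype():\n    return 'stub'\n")  # stub, not the real helper

# With only the repository root on sys.path, the bare form fails...
sys.path.insert(0, "demo_repo")
try:
    from utils import get_torch_dtype  # directory-relative import
except ModuleNotFoundError:
    print("bare import fails when only the repo root is on sys.path")

# ...while the package-qualified form used in this PR resolves cleanly.
from inference.utils import get_torch_dtype
print(get_torch_dtype())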