diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 5a8e32720..854732bff 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -85,7 +85,7 @@ jobs: docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external" CMD=$(cat << EOF import yaml - conf_path = "finetune/finetune.yaml" + conf_path = "llm_on_ray/finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['base_model'] = "${{ matrix.model }}" @@ -113,14 +113,14 @@ jobs: EOF ) docker exec "finetune" python -c "$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" + docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml" - name: Run PEFT-LoRA Test run: | docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*" CMD=$(cat << EOF import yaml - conf_path = "finetune/finetune.yaml" + conf_path = "llm_on_ray/finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['lora_config'] = { @@ -138,7 +138,7 @@ jobs: EOF ) docker exec "finetune" python -c "$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" + docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml" - name: Run Deltatuner Test on DENAS-LoRA Model run: | @@ -150,7 +150,7 @@ jobs: import os import yaml os.system("cp -r $(python -m pip show deltatuner | grep Location | cut -d: -f2)/deltatuner/conf/best_structure examples/") - conf_path = "finetune/finetune.yaml" + conf_path = "llm_on_ray/finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['lora_config'] = { @@ -168,7 +168,7 @@ jobs: yaml.dump(result, output, sort_keys=False) EOF) docker exec "finetune" python -c "$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" + docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml" fi - name: Stop Ray diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index 6a5617a66..7ea4359be 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -118,14 +118,14 @@ jobs: CMD=$(cat << EOF import yaml if ("${{ matrix.model }}" == "starcoder"): - conf_path = "inference/models/starcoder.yaml" + conf_path = "llm_on_ray/inference/models/starcoder.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" with open(conf_path, 'w') as output: yaml.dump(result, output, sort_keys=False) if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"): - conf_path = "inference/models/llama-2-7b-chat-hf.yaml" + conf_path = "llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) 
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" @@ -135,11 +135,11 @@ jobs: ) docker exec "${TARGET}" python -c "$CMD" if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple" + docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/bigdl/mpt-7b-bigdl.yaml --simple" elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --simple" + docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --simple" else - docker exec "${TARGET}" bash -c "python inference/serve.py --simple --models ${{ matrix.model }}" + docker exec "${TARGET}" bash -c "llm_on_ray-serve --simple --models ${{ matrix.model }}" fi echo Non-streaming query: docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" @@ -150,7 +150,7 @@ jobs: if: ${{ matrix.dtuner_model }} run: | TARGET=${{steps.target.outputs.target}} - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml --simple" + docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file .github/workflows/config/mpt_deltatuner.yaml --simple" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" @@ -160,8 +160,8 @@ jobs: if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then echo ${{ matrix.model }} is not supported! elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then - docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed" - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed --simple" + docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file llm_on_ray/inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed" + docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file \"${{ matrix.model }}\".yaml.deepspeed --simple" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" fi @@ -173,7 +173,7 @@ jobs: if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then echo ${{ matrix.model }} is not supported! 
else - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple" + docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" fi @@ -182,9 +182,9 @@ jobs: run: | TARGET=${{steps.target.outputs.target}} if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then - docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml" + docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/bigdl/mpt-7b-bigdl.yaml" elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then - docker exec "${TARGET}" bash -c "python inference/serve.py --models ${{ matrix.model }}" + docker exec "${TARGET}" bash -c "llm_on_ray-serve --models ${{ matrix.model }}" docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }}" fi @@ -202,9 +202,3 @@ jobs: TARGET=${{steps.target.outputs.target}} cid=$(docker ps -q --filter "name=${TARGET}") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - - - name: Test Summary - run: echo "to be continued" - - - diff --git a/.github/workflows/workflow_orders_on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml index a057f8ea6..d491baca1 100644 --- a/.github/workflows/workflow_orders_on_merge.yml +++ b/.github/workflows/workflow_orders_on_merge.yml @@ -7,11 +7,11 @@ on: paths: - '.github/**' - 'docker/**' - - 'common/**' - 'dev/docker/**' - - 'finetune/**' - - 'inference/**' - - 'rlhf/**' + - 'llm_on_ray/common/**' + - 'llm_on_ray/finetune/**' + - 'llm_on_ray/inference/**' + - 'llm_on_ray/rlhf/**' - 'tools/**' - 'pyproject.toml' - 'tests/**' diff --git a/.github/workflows/workflow_orders_on_pr.yml b/.github/workflows/workflow_orders_on_pr.yml index 0fdb9bb01..9f5df5d83 100644 --- a/.github/workflows/workflow_orders_on_pr.yml +++ b/.github/workflows/workflow_orders_on_pr.yml @@ -7,11 +7,11 @@ on: paths: - '.github/**' - 'docker/**' - - 'common/**' - 'dev/docker/**' - - 'finetune/**' - - 'inference/**' - - 'rlhf/**' + - 'llm_on_ray/common/**' + - 'llm_on_ray/finetune/**' + - 'llm_on_ray/inference/**' + - 'llm_on_ray/rlhf/**' - 'tools/**' - 'pyproject.toml' - 'tests/**' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eef34287b..c539326c1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: rev: v0.0.289 hooks: - id: ruff - args: [ --fix, --exit-non-zero-on-fix, --ignore=E402, --ignore=E501, --ignore=E731] + args: [ --fix, --exit-non-zero-on-fix, --ignore=E402, --ignore=E501, --ignore=E731, --ignore=F401] # Black needs to be ran after ruff with --fix - repo: https://github.com/psf/black diff --git a/README.md b/README.md index c544f7cc7..7c9419783 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ LLM-on-Ray's modular workflow structure is designed to comprehensively cater to This guide will assist you in setting up LLM-on-Ray on Intel CPU locally, covering the initial setup, finetuning models, and deploying them for serving. ### Setup -#### 1. 
Clone the repository and install dependencies. +#### 1. Clone the repository, install llm-on-ray and its dependencies. Software requirement: Git and Conda ```bash git clone https://github.com/intel/llm-on-ray.git @@ -62,14 +62,14 @@ ray start --head Use the following command to finetune a model using an example dataset and default configurations. The finetuned model will be stored in `/tmp/llm-ray/output` by default. To customize the base model, dataset and configurations, please see the [finetuning document](#finetune): ```bash -python finetune/finetune.py --config_file finetune/finetune.yaml +llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml ``` ### Serving Deploy a model on Ray and expose an endpoint for serving. This command uses GPT2 as an example, but more model configuration examples can be found in the [inference/models](inference/models) directory: ```bash -python inference/serve.py --config_file inference/models/gpt2.yaml +llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml ``` The default served method is to provide an OpenAI-compatible API server ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)), you can access and test it in many ways: @@ -95,7 +95,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py ``` Or you can serve specific model to a simple endpoint according to the `port` and `route_prefix` parameters in configuration file, ```bash -python inference/serve.py --config_file inference/models/gpt2.yaml --simple +llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml --simple ``` After deploying the model endpoint, you can access and test it by using the script below: ```bash diff --git a/common/__init__.py b/common/__init__.py deleted file mode 100644 index 3960d2f50..000000000 --- a/common/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -from .logging import logger -from .load import * # noqa: F403 # unable to detect undefined names -from . 
import agentenv -from .torch_config import TorchConfig # noqa: F401 -from typing import Dict, Any -import sys -from .config import Config # noqa: F401 -from .init import init # noqa: F401 - - -@load_check_decorator # noqa: F405 # may be undefined, or defined from star imports -def get_agentenv(config: Dict[str, Any]): - logger.info(f"{sys._getframe().f_code.co_name} config: {config}") - agentenv_type = config.get("type", None) - Factory = agentenv.AgentEnv.registory.get(agentenv_type) - if Factory is None: - raise ValueError(f"there is no {agentenv_type} AgentEnv.") - try: - _ = Factory(config) - except Exception as e: - logger.critical(f"{Factory.__name__} init error: {e}", exc_info=True) - exit(1) - return _ diff --git a/common/agentenv/__init__.py b/common/agentenv/__init__.py deleted file mode 100644 index fe05d28ec..000000000 --- a/common/agentenv/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from .agentenv import AgentEnv -from ..common import import_all_module - -realpath = os.path.realpath(__file__) -basedir = os.path.dirname(realpath) -import_all_module(basedir, "common.agentenv") - -__all__ = ["AgentEnv"] diff --git a/common/dataprocesser/__init__.py b/common/dataprocesser/__init__.py deleted file mode 100644 index 7e74e6a13..000000000 --- a/common/dataprocesser/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from .dataprocesser import DataProcesser -from ..common import import_all_module - -realpath = os.path.realpath(__file__) -basedir = os.path.dirname(realpath) -import_all_module(basedir, "common.dataprocesser") - -__all__ = ["DataProcesser"] diff --git a/common/dataset/__init__.py b/common/dataset/__init__.py deleted file mode 100644 index 9b04a188b..000000000 --- a/common/dataset/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from .dataset import Dataset -from ..common import import_all_module - -realpath = os.path.realpath(__file__) -basedir = os.path.dirname(realpath) -import_all_module(basedir, "common.dataset") - -__all__ = ["Dataset"] diff --git a/common/initializer/__init__.py b/common/initializer/__init__.py deleted file mode 100644 index 2cdc27adb..000000000 --- a/common/initializer/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from .initializer import Initializer -from ..common import import_all_module - -realpath = os.path.realpath(__file__) -basedir = os.path.dirname(realpath) -import_all_module(basedir, "common.initializer") - -__all__ = ["Initializer"] diff --git a/common/model/__init__.py b/common/model/__init__.py deleted file mode 100644 index df7989ceb..000000000 --- a/common/model/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from .model import Model -from ..common import import_all_module - -realpath = os.path.realpath(__file__) -basedir = os.path.dirname(realpath) -import_all_module(basedir, "common.model") - -__all__ = ["Model"] diff --git a/common/optimizer/__init__.py b/common/optimizer/__init__.py deleted file mode 100644 index 122acc90f..000000000 --- a/common/optimizer/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from .optimizer import Optimizer -from ..common import import_all_module - -realpath = os.path.realpath(__file__) -basedir = os.path.dirname(realpath) -import_all_module(basedir, "common.optimizer") - -__all__ = ["Optimizer"] diff --git a/common/tokenizer/__init__.py b/common/tokenizer/__init__.py deleted file mode 100644 index 63c281496..000000000 --- a/common/tokenizer/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from .tokenizer import Tokenizer -from ..common import 
import_all_module - -realpath = os.path.realpath(__file__) -basedir = os.path.dirname(realpath) -import_all_module(basedir, "common.tokenizer") - -__all__ = ["Tokenizer"] diff --git a/common/trainer/__init__.py b/common/trainer/__init__.py deleted file mode 100644 index b33b565a5..000000000 --- a/common/trainer/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from .trainer import Trainer -from ..common import import_all_module - -realpath = os.path.realpath(__file__) -basedir = os.path.dirname(realpath) -import_all_module(basedir, "common.trainer") - -__all__ = ["Trainer"] diff --git a/dev/docker/Dockerfile.bigdl-cpu b/dev/docker/Dockerfile.bigdl-cpu index 411449e41..3838b3382 100644 --- a/dev/docker/Dockerfile.bigdl-cpu +++ b/dev/docker/Dockerfile.bigdl-cpu @@ -27,7 +27,8 @@ RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ COPY ./pyproject.toml . COPY ./MANIFEST.in . -RUN mkdir ./finetune && mkdir ./inference +# create llm_on_ray package directory to bypass the following 'pip install -e' command +RUN mkdir ./llm_on_ray RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[bigdl-cpu] --extra-index-url https://download.pytorch.org/whl/cpu \ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ diff --git a/dev/docker/Dockerfile.cpu_and_deepspeed b/dev/docker/Dockerfile.cpu_and_deepspeed index 5371fae78..3e4fe5ff0 100644 --- a/dev/docker/Dockerfile.cpu_and_deepspeed +++ b/dev/docker/Dockerfile.cpu_and_deepspeed @@ -27,7 +27,8 @@ RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ COPY ./pyproject.toml . COPY ./MANIFEST.in . -RUN mkdir ./finetune && mkdir ./inference +# create llm_on_ray package directory to bypass the following 'pip install -e' command +RUN mkdir ./llm_on_ray RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] --extra-index-url https://download.pytorch.org/whl/cpu \ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ diff --git a/dev/docker/Dockerfile.vllm b/dev/docker/Dockerfile.vllm index 4585ccef0..3f298ba69 100644 --- a/dev/docker/Dockerfile.vllm +++ b/dev/docker/Dockerfile.vllm @@ -28,7 +28,8 @@ COPY ./pyproject.toml . COPY ./MANIFEST.in . COPY ./dev/scripts/install-vllm-cpu.sh . -RUN mkdir ./finetune && mkdir ./inference +# create llm_on_ray package directory to bypass the following 'pip install -e' command +RUN mkdir ./llm_on_ray RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu \ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ diff --git a/docs/finetune.md b/docs/finetune.md index dda3505f2..ee05cc52e 100755 --- a/docs/finetune.md +++ b/docs/finetune.md @@ -65,5 +65,5 @@ The following models have been verified on Intel CPUs or GPUs. ## Finetune the model To finetune your model, execute the following command. The finetuned model will be saved in /tmp/llm-ray/output by default. ``` bash -python finetune/finetune.py --config_file +llm_on_ray-finetune --config_file ``` diff --git a/docs/pretrain.md b/docs/pretrain.md index 2b3667523..cf4a5931d 100644 --- a/docs/pretrain.md +++ b/docs/pretrain.md @@ -122,28 +122,28 @@ Set up `megatron_deepspeed_path` in the configuration. 
```bash cd /home/user/workspace/llm-on-ray -#Bloom-7B -python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf -#llama-7B -python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +# Bloom-7B +llm_on_ray-megatron_deepspeed_pretrain --config_file llm_on_ray/pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +# llama-7B +llm_on_ray-megatron_deepspeed_pretrain --config_file llm_on_ray/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf ``` ##### Huggingface Trainer ```bash cd /home/user/workspace/llm-on-ray -#llama-7B -python pretrain/pretrain.py --config_file pretrain/config/llama_7b_8Guadi_pretrain.conf +# llama-7B +llm_on_ray-pretrain --config_file llm_on_ray/pretrain/config/llama_7b_8Guadi_pretrain.conf ``` ##### Nvidia GPU: ###### Megatron-DeepSpeed ```bash cd /home/user/workspace/llm-on-ray -#llama2-7B -python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/llama2_3b_megatron_deepspeed_zs0_8gpus_pretrain.conf +# llama2-7B +llm_on_ray-megatron_deepspeed_pretrain --config_file llm_on_ray/pretrain/config/llama2_3b_megatron_deepspeed_zs0_8gpus_pretrain.conf ``` ##### Huggingface Trainer ```bash cd /home/user/workspace/llm-on-ray -#llama-7B -python pretrain/pretrain.py --config_file pretrain/config/llama_7b_8gpu_pretrain.conf +# llama-7B +llm_on_ray-pretrain --config_file llm_on_ray/pretrain/config/llama_7b_8gpu_pretrain.conf ``` \ No newline at end of file diff --git a/docs/serve.md b/docs/serve.md index 2beed2b18..831774b6c 100644 --- a/docs/serve.md +++ b/docs/serve.md @@ -30,22 +30,22 @@ LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP We support three methods to specify the models to be served, and they have the following priorities. 1. Use inference configuration file if config_file is set. ``` -python inference/serve.py --config_file inference/models/gpt2.yaml +llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml ``` 2. Use relevant configuration parameters if model_id_or_path is set. ``` -python inference/serve.py --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix ...] +llm_on_ray-serve --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix ...] ``` 3. If --config_file and --model_id_or_path are both None, it will serve all pre-defined models in inference/models/*.yaml, or part of them if models is set. ``` -python inference/serve.py --models gpt2 gpt-j-6b +llm_on_ray-serve --models gpt2 gpt-j-6b ``` ### OpenAI-compatible API To deploy your model, execute the following command with the model's configuration file. This will create an OpenAI-compatible API ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)) for serving. ```bash -python inference/serve.py --config_file +llm_on_ray-serve --config_file ``` -To deploy and serve multiple models concurrently, place all models' configuration files under `inference/models` and directly run `python inference/serve.py` without passing any conf file. +To deploy and serve multiple models concurrently, place all models' configuration files under `llm_on_ray/inference/models` and directly run `llm_on_ray-serve` without passing any conf file. 
After deploying the model, you can access and test it in many ways: ```bash @@ -71,7 +71,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py ### Serving Model to a Simple Endpoint This will create a simple endpoint for serving according to the `port` and `route_prefix` parameters in conf file, for example: http://127.0.0.1:8000/gpt2. ```bash -python inference/serve.py --config_file --simple +llm_on_ray-serve --config_file --simple ``` After deploying the model endpoint, you can access and test it by using the script below: ```bash diff --git a/docs/vllm.md b/docs/vllm.md index 58393a9ae..426d2c305 100644 --- a/docs/vllm.md +++ b/docs/vllm.md @@ -9,7 +9,7 @@ vLLM for CPU currently supports Intel® 4th Gen Xeon® Scalable Performance proc Please run the following script to install vLLM for CPU into your current environment. Currently a GNU C++ compiler with >=12.3 version is required to build and install. ```bash -$ dev/scripts/install-vllm-cpu.sh +dev/scripts/install-vllm-cpu.sh ``` ## Setup @@ -23,7 +23,7 @@ Please follow [Deploying and Serving LLMs on Intel CPU/GPU/Gaudi](serve.md) docu To serve model with vLLM, run the following: ```bash -$ python serve.py --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --simple --keep_serve_terminal +llm_on_ray-serve --config_file llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --simple --keep_serve_terminal ``` In the above example, `vllm` property is set to `true` in the config file for enabling vLLM. @@ -33,11 +33,11 @@ In the above example, `vllm` property is set to `true` in the config file for en To start a non-streaming query, run the following: ```bash -$ python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/llama-2-7b-chat-hf +python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/llama-2-7b-chat-hf ``` To start a streaming query, run the following: ```bash -$ python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/llama-2-7b-chat-hf --streaming_response +python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/llama-2-7b-chat-hf --streaming_response ``` \ No newline at end of file diff --git a/docs/web_ui.md b/docs/web_ui.md index db0c0824f..5207c736f 100644 --- a/docs/web_ui.md +++ b/docs/web_ui.md @@ -8,13 +8,13 @@ Please follow [setup.md](setup.md) to setup the base environment first. ## Setup UI Environment After activating the environment installed from the previous step, please run the following script to install environment for Web UI. ```bash -$ dev/scripts/install-ui.sh +dev/scripts/install-ui.sh ``` ## Start Web UI ```bash -python -u ui/start_ui.py --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379" +python -m llm_on_ray.ui.start_ui --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379" ``` You will get URL from the command line output (E.g. http://0.0.0.0:8080 for local network and https://180cd5f7c31a1cfd3c.gradio.live for public network) and use the web browser to open it. 
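
The hunks above update the workflows, Dockerfiles and docs to call the new `llm_on_ray-finetune` / `llm_on_ray-serve` console commands and the `llm_on_ray/...` config paths; the hunks below perform the underlying move of `common/`, `finetune/`, `inference/` and `pretrain/` into an installable `llm_on_ray` package with absolute imports. As a minimal sketch (not part of the patch), this is roughly how the relocated modules are used after `pip install -e .`; the re-exported names are taken from the new `__init__.py` files below, while `InferenceConfig().port` is an assumption based on the model YAMLs, so treat the snippet as illustrative:

```python
# Illustrative only: package-absolute imports after this refactor. Previously,
# `import common` or `from inference.inference_config import ...` resolved only
# when running scripts from the repository root (or via the sys.path.insert hack
# that this PR removes from finetune.py).
from llm_on_ray import common                                  # re-exports logger, Config, init, TorchConfig
from llm_on_ray.common.trainer import Trainer                  # base classes now live under llm_on_ray.common.*
from llm_on_ray.inference.inference_config import InferenceConfig

infer_conf = InferenceConfig()                                 # defaults, as in export_inference_config_to_yaml.py
common.logger.info(f"default serving port: {infer_conf.port}")  # `port` assumed to match the model YAMLs (port: 8000)
```

Packaging the code this way is also what backs the `llm_on_ray-*` console commands used throughout the docs and workflows; the corresponding entry-point declarations presumably live in `pyproject.toml`, which is not shown in this section.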
diff --git a/llm_on_ray/common/__init__.py b/llm_on_ray/common/__init__.py new file mode 100644 index 000000000..dadeefdda --- /dev/null +++ b/llm_on_ray/common/__init__.py @@ -0,0 +1,5 @@ +from llm_on_ray.common.logging import logger +from llm_on_ray.common.torch_config import TorchConfig +from llm_on_ray.common.config import Config +from llm_on_ray.common.init import init +from llm_on_ray.common import agentenv, dataset, initializer, model, optimizer, tokenizer, trainer diff --git a/llm_on_ray/common/agentenv/__init__.py b/llm_on_ray/common/agentenv/__init__.py new file mode 100644 index 000000000..0ec64973b --- /dev/null +++ b/llm_on_ray/common/agentenv/__init__.py @@ -0,0 +1,5 @@ +from llm_on_ray.common.agentenv.agentenv import AgentEnv +from llm_on_ray.common.agentenv.rlhf_env import RLHFEnv + + +__all__ = ["AgentEnv"] diff --git a/common/agentenv/agentenv.py b/llm_on_ray/common/agentenv/agentenv.py similarity index 100% rename from common/agentenv/agentenv.py rename to llm_on_ray/common/agentenv/agentenv.py diff --git a/common/agentenv/rlhf_env.py b/llm_on_ray/common/agentenv/rlhf_env.py similarity index 98% rename from common/agentenv/rlhf_env.py rename to llm_on_ray/common/agentenv/rlhf_env.py index d4b0a5833..7c3e08ca1 100644 --- a/common/agentenv/rlhf_env.py +++ b/llm_on_ray/common/agentenv/rlhf_env.py @@ -7,8 +7,8 @@ from ray.rllib.utils.spaces.repeated import Repeated import gymnasium.spaces as sp -from .agentenv import AgentEnv -from ..load import load_dataset, load_model, load_tokenizer +from llm_on_ray.common.agentenv import AgentEnv +from llm_on_ray.common.load import load_dataset, load_model, load_tokenizer def generate_response( diff --git a/common/common.py b/llm_on_ray/common/common.py similarity index 74% rename from common/common.py rename to llm_on_ray/common/common.py index b846ea75a..590c5e4eb 100644 --- a/common/common.py +++ b/llm_on_ray/common/common.py @@ -2,10 +2,10 @@ import glob import importlib -from .logging import logger +from llm_on_ray.common.logging import logger -def import_all_module(basedir, prefix=None): +def import_all_modules(basedir, prefix=None): all_py_files = glob.glob(basedir + "/*.py") modules = [os.path.basename(f) for f in all_py_files] @@ -19,4 +19,4 @@ def import_all_module(basedir, prefix=None): try: importlib.import_module(module_name) except Exception: - logger.warning(f"import {module_name} erro", exc_info=True) + logger.warning(f"import {module_name} error", exc_info=True) diff --git a/common/config.py b/llm_on_ray/common/config.py similarity index 100% rename from common/config.py rename to llm_on_ray/common/config.py diff --git a/llm_on_ray/common/dataprocesser/__init__.py b/llm_on_ray/common/dataprocesser/__init__.py new file mode 100644 index 000000000..99ff999fd --- /dev/null +++ b/llm_on_ray/common/dataprocesser/__init__.py @@ -0,0 +1,6 @@ +from llm_on_ray.common.dataprocesser.dataprocesser import DataProcesser +from llm_on_ray.common.dataprocesser.general_processer import GeneralProcesser +from llm_on_ray.common.dataprocesser.rm_dataprocesser import RMDataProcesser + + +__all__ = ["DataProcesser"] diff --git a/common/dataprocesser/dataprocesser.py b/llm_on_ray/common/dataprocesser/dataprocesser.py similarity index 100% rename from common/dataprocesser/dataprocesser.py rename to llm_on_ray/common/dataprocesser/dataprocesser.py diff --git a/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py similarity index 99% rename from common/dataprocesser/general_processer.py rename to 
llm_on_ray/common/dataprocesser/general_processer.py index 4873b4594..cd09064a6 100644 --- a/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -5,7 +5,7 @@ import datasets import transformers -from .dataprocesser import DataProcesser +from llm_on_ray.common.dataprocesser import DataProcesser INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." INSTRUCTION_KEY = "### Instruction:" diff --git a/common/dataprocesser/rm_dataprocesser.py b/llm_on_ray/common/dataprocesser/rm_dataprocesser.py similarity index 96% rename from common/dataprocesser/rm_dataprocesser.py rename to llm_on_ray/common/dataprocesser/rm_dataprocesser.py index 36ead7d8b..10bfea6ff 100644 --- a/common/dataprocesser/rm_dataprocesser.py +++ b/llm_on_ray/common/dataprocesser/rm_dataprocesser.py @@ -1,8 +1,8 @@ import torch import transformers -from .dataprocesser import DataProcesser -from ..logging import logger +from llm_on_ray.common.dataprocesser import DataProcesser +from llm_on_ray.common.logging import logger class RMDataProcesser(DataProcesser): diff --git a/llm_on_ray/common/dataset/__init__.py b/llm_on_ray/common/dataset/__init__.py new file mode 100644 index 000000000..00477c05b --- /dev/null +++ b/llm_on_ray/common/dataset/__init__.py @@ -0,0 +1,5 @@ +from llm_on_ray.common.dataset.dataset import Dataset +from llm_on_ray.common.dataset.huggingface_dataset import HuggingfaceDataset + + +__all__ = ["Dataset"] diff --git a/common/dataset/dataset.py b/llm_on_ray/common/dataset/dataset.py similarity index 100% rename from common/dataset/dataset.py rename to llm_on_ray/common/dataset/dataset.py diff --git a/common/dataset/huggingface_dataset.py b/llm_on_ray/common/dataset/huggingface_dataset.py similarity index 97% rename from common/dataset/huggingface_dataset.py rename to llm_on_ray/common/dataset/huggingface_dataset.py index 3b9214aaf..dddcc995e 100644 --- a/common/dataset/huggingface_dataset.py +++ b/llm_on_ray/common/dataset/huggingface_dataset.py @@ -1,7 +1,7 @@ import os import datasets -from .dataset import Dataset +from llm_on_ray.common.dataset import Dataset def local_load(name, **load_config): diff --git a/common/init.py b/llm_on_ray/common/init.py similarity index 95% rename from common/init.py rename to llm_on_ray/common/init.py index 63715f18f..6ee077b0c 100644 --- a/common/init.py +++ b/llm_on_ray/common/init.py @@ -1,7 +1,7 @@ import torch import accelerate -from .logging import logger +from llm_on_ray.common.logging import logger def check_config(config): diff --git a/llm_on_ray/common/initializer/__init__.py b/llm_on_ray/common/initializer/__init__.py new file mode 100644 index 000000000..e1f5b0613 --- /dev/null +++ b/llm_on_ray/common/initializer/__init__.py @@ -0,0 +1,4 @@ +from llm_on_ray.common.initializer.initializer import Initializer + + +__all__ = ["Initializer"] diff --git a/common/initializer/initializer.py b/llm_on_ray/common/initializer/initializer.py similarity index 100% rename from common/initializer/initializer.py rename to llm_on_ray/common/initializer/initializer.py diff --git a/common/load.py b/llm_on_ray/common/load.py similarity index 83% rename from common/load.py rename to llm_on_ray/common/load.py index 16fcfd1c5..e598e5534 100644 --- a/common/load.py +++ b/llm_on_ray/common/load.py @@ -1,13 +1,8 @@ import sys from typing import Any, Dict -from .logging import logger -from . import dataset -from . import tokenizer -from . import model -from . 
import optimizer -from . import trainer -from . import initializer +from llm_on_ray.common import logger +from llm_on_ray.common import agentenv, dataset, initializer, model, optimizer, tokenizer, trainer def load_check_decorator(func): @@ -120,3 +115,18 @@ def get_initializer(config: Dict[str, Any]): logger.critical(f"{Factory.__name__} init error: {e}", exc_info=True) exit(1) return _ + + +@load_check_decorator # type: ignore # noqa: F405 # may be undefined, or defined from star imports +def get_agentenv(config: Dict[str, Any]): + logger.info(f"{sys._getframe().f_code.co_name} config: {config}") + agentenv_type = config.get("type", None) + Factory = agentenv.AgentEnv.registory.get(agentenv_type) + if Factory is None: + raise ValueError(f"there is no {agentenv_type} AgentEnv.") + try: + _ = Factory(config) + except Exception as e: + logger.critical(f"{Factory.__name__} init error: {e}", exc_info=True) + exit(1) + return _ diff --git a/common/logging.py b/llm_on_ray/common/logging.py similarity index 100% rename from common/logging.py rename to llm_on_ray/common/logging.py diff --git a/llm_on_ray/common/model/__init__.py b/llm_on_ray/common/model/__init__.py new file mode 100644 index 000000000..6f41c8214 --- /dev/null +++ b/llm_on_ray/common/model/__init__.py @@ -0,0 +1,6 @@ +from llm_on_ray.common.model.model import Model +from llm_on_ray.common.model.huggingface_model_for_causal_lm import HuggingFaceModelForCausalLM +from llm_on_ray.common.model.reward_model import HuggingFaceRewardModel + + +__all__ = ["Model"] diff --git a/common/model/huggingface_model_for_causal_lm.py b/llm_on_ray/common/model/huggingface_model_for_causal_lm.py similarity index 96% rename from common/model/huggingface_model_for_causal_lm.py rename to llm_on_ray/common/model/huggingface_model_for_causal_lm.py index cc2ce6234..2716ec897 100644 --- a/common/model/huggingface_model_for_causal_lm.py +++ b/llm_on_ray/common/model/huggingface_model_for_causal_lm.py @@ -1,6 +1,6 @@ import transformers -from .model import Model +from llm_on_ray.common.model import Model from peft import get_peft_model, LoraConfig import deltatuner diff --git a/common/model/model.py b/llm_on_ray/common/model/model.py similarity index 100% rename from common/model/model.py rename to llm_on_ray/common/model/model.py diff --git a/common/model/reward_model.py b/llm_on_ray/common/model/reward_model.py similarity index 98% rename from common/model/reward_model.py rename to llm_on_ray/common/model/reward_model.py index a4aa237ef..eaf5501d1 100644 --- a/common/model/reward_model.py +++ b/llm_on_ray/common/model/reward_model.py @@ -4,7 +4,7 @@ import torch import torch.nn as nn -from .model import Model +from llm_on_ray.common.model import Model class HuggingFaceRewardModel(Model): diff --git a/llm_on_ray/common/optimizer/__init__.py b/llm_on_ray/common/optimizer/__init__.py new file mode 100644 index 000000000..f71a85785 --- /dev/null +++ b/llm_on_ray/common/optimizer/__init__.py @@ -0,0 +1,6 @@ +from llm_on_ray.common.optimizer.optimizer import Optimizer +from llm_on_ray.common.optimizer.default_optimizer import DefaultOptimizer +from llm_on_ray.common.optimizer.group_optimizer import GroupOptimizer + + +__all__ = ["Optimizer"] diff --git a/common/optimizer/default_optimizer.py b/llm_on_ray/common/optimizer/default_optimizer.py similarity index 89% rename from common/optimizer/default_optimizer.py rename to llm_on_ray/common/optimizer/default_optimizer.py index dab5803a2..fef023e62 100644 --- a/common/optimizer/default_optimizer.py +++ 
b/llm_on_ray/common/optimizer/default_optimizer.py @@ -1,5 +1,5 @@ -import torch # noqa: F401 -from .optimizer import Optimizer +import torch +from llm_on_ray.common.optimizer import Optimizer class DefaultOptimizer(Optimizer): diff --git a/common/optimizer/group_optimizer.py b/llm_on_ray/common/optimizer/group_optimizer.py similarity index 94% rename from common/optimizer/group_optimizer.py rename to llm_on_ray/common/optimizer/group_optimizer.py index 0e07878db..5816639a9 100644 --- a/common/optimizer/group_optimizer.py +++ b/llm_on_ray/common/optimizer/group_optimizer.py @@ -1,5 +1,5 @@ -import torch # noqa: F401 -from .optimizer import Optimizer +import torch +from llm_on_ray.common.optimizer import Optimizer class GroupOptimizer(Optimizer): diff --git a/common/optimizer/optimizer.py b/llm_on_ray/common/optimizer/optimizer.py similarity index 100% rename from common/optimizer/optimizer.py rename to llm_on_ray/common/optimizer/optimizer.py diff --git a/llm_on_ray/common/tokenizer/__init__.py b/llm_on_ray/common/tokenizer/__init__.py new file mode 100644 index 000000000..3f2c40136 --- /dev/null +++ b/llm_on_ray/common/tokenizer/__init__.py @@ -0,0 +1,6 @@ +from llm_on_ray.common.tokenizer.tokenizer import Tokenizer +from llm_on_ray.common.tokenizer.empty_tokenizer import EmptyTokenizer +from llm_on_ray.common.tokenizer.huggingface_tokenizer import HuggingFaceTokenizer + + +__all__ = ["Tokenizer"] diff --git a/common/tokenizer/empty_tokenizer.py b/llm_on_ray/common/tokenizer/empty_tokenizer.py similarity index 86% rename from common/tokenizer/empty_tokenizer.py rename to llm_on_ray/common/tokenizer/empty_tokenizer.py index c2684aca0..50f5ca6f2 100644 --- a/common/tokenizer/empty_tokenizer.py +++ b/llm_on_ray/common/tokenizer/empty_tokenizer.py @@ -1,4 +1,4 @@ -from .tokenizer import Tokenizer +from llm_on_ray.common.tokenizer import Tokenizer class _EmptyTokenizer: diff --git a/common/tokenizer/huggingface_tokenizer.py b/llm_on_ray/common/tokenizer/huggingface_tokenizer.py similarity index 85% rename from common/tokenizer/huggingface_tokenizer.py rename to llm_on_ray/common/tokenizer/huggingface_tokenizer.py index a6a60bc7f..59905aef7 100644 --- a/common/tokenizer/huggingface_tokenizer.py +++ b/llm_on_ray/common/tokenizer/huggingface_tokenizer.py @@ -1,6 +1,6 @@ import transformers -from .tokenizer import Tokenizer +from llm_on_ray.common.tokenizer import Tokenizer class HuggingFaceTokenizer(Tokenizer): diff --git a/common/tokenizer/tokenizer.py b/llm_on_ray/common/tokenizer/tokenizer.py similarity index 100% rename from common/tokenizer/tokenizer.py rename to llm_on_ray/common/tokenizer/tokenizer.py diff --git a/common/torch_config.py b/llm_on_ray/common/torch_config.py similarity index 91% rename from common/torch_config.py rename to llm_on_ray/common/torch_config.py index 5a63ab565..a051de56f 100644 --- a/common/torch_config.py +++ b/llm_on_ray/common/torch_config.py @@ -26,15 +26,15 @@ def backend_cls(self): def libs_import(): """try to import IPEX and oneCCL.""" try: - import intel_extension_for_pytorch # noqa: F401 + import intel_extension_for_pytorch except ImportError: raise ImportError("Please install intel_extension_for_pytorch") try: ccl_version = importlib_metadata.version("oneccl_bind_pt") if ccl_version >= "1.12": - import oneccl_bindings_for_pytorch # noqa: F401 + import oneccl_bindings_for_pytorch else: - import torch_ccl # noqa: F401 + import torch_ccl except ImportError as ccl_not_exist: raise ImportError("Please install torch-ccl") from ccl_not_exist diff --git 
a/llm_on_ray/common/trainer/__init__.py b/llm_on_ray/common/trainer/__init__.py new file mode 100644 index 000000000..71ff4b808 --- /dev/null +++ b/llm_on_ray/common/trainer/__init__.py @@ -0,0 +1,6 @@ +from llm_on_ray.common.trainer.trainer import Trainer +from llm_on_ray.common.trainer.default_trainer import DefaultTrainer +from llm_on_ray.common.trainer.rm_trainer import RMTrainer + + +__all__ = ["Trainer"] diff --git a/common/trainer/default_trainer.py b/llm_on_ray/common/trainer/default_trainer.py similarity index 98% rename from common/trainer/default_trainer.py rename to llm_on_ray/common/trainer/default_trainer.py index f266a9cb9..18582caaf 100644 --- a/common/trainer/default_trainer.py +++ b/llm_on_ray/common/trainer/default_trainer.py @@ -10,10 +10,9 @@ from ray.train import report, Checkpoint -from .. import dataprocesser -from .trainer import Trainer - -from ..logging import logger +from llm_on_ray.common import dataprocesser +from llm_on_ray.common.trainer import Trainer +from llm_on_ray.common.logging import logger class DefaultTrainer(Trainer): diff --git a/common/trainer/rm_trainer.py b/llm_on_ray/common/trainer/rm_trainer.py similarity index 97% rename from common/trainer/rm_trainer.py rename to llm_on_ray/common/trainer/rm_trainer.py index 1cc64d93e..83bf0a673 100644 --- a/common/trainer/rm_trainer.py +++ b/llm_on_ray/common/trainer/rm_trainer.py @@ -4,8 +4,8 @@ import math import time -from .default_trainer import DefaultTrainer -from ..logging import logger +from llm_on_ray.common.trainer.default_trainer import DefaultTrainer +from llm_on_ray.common.logging import logger class RMTrainer(DefaultTrainer): diff --git a/common/trainer/trainer.py b/llm_on_ray/common/trainer/trainer.py similarity index 100% rename from common/trainer/trainer.py rename to llm_on_ray/common/trainer/trainer.py diff --git a/finetune/__init__.py b/llm_on_ray/finetune/__init__.py similarity index 100% rename from finetune/__init__.py rename to llm_on_ray/finetune/__init__.py diff --git a/finetune/finetune.py b/llm_on_ray/finetune/finetune.py similarity index 98% rename from finetune/finetune.py rename to llm_on_ray/finetune/finetune.py index 3557b25a4..e38596915 100644 --- a/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -21,11 +21,8 @@ FullStateDictConfig, ) -import sys - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -import common -from finetune.finetune_config import FinetuneConfig +from llm_on_ray import common +from llm_on_ray.finetune.finetune_config import FinetuneConfig def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any], None]) -> dict: @@ -315,8 +312,8 @@ def main(external_config=None): run_config=run_config, ) results = trainer.fit() - - return results + if external_config is not None: + return results if __name__ == "__main__": diff --git a/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml similarity index 100% rename from finetune/finetune.yaml rename to llm_on_ray/finetune/finetune.yaml diff --git a/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py similarity index 100% rename from finetune/finetune_config.py rename to llm_on_ray/finetune/finetune_config.py diff --git a/finetune/models/bloom-560m.yaml b/llm_on_ray/finetune/models/bloom-560m.yaml similarity index 100% rename from finetune/models/bloom-560m.yaml rename to llm_on_ray/finetune/models/bloom-560m.yaml diff --git a/finetune/models/finetune_config_template.yaml b/llm_on_ray/finetune/models/finetune_config_template.yaml similarity index 
100% rename from finetune/models/finetune_config_template.yaml rename to llm_on_ray/finetune/models/finetune_config_template.yaml diff --git a/finetune/models/gpt-j-6b.yaml b/llm_on_ray/finetune/models/gpt-j-6b.yaml similarity index 100% rename from finetune/models/gpt-j-6b.yaml rename to llm_on_ray/finetune/models/gpt-j-6b.yaml diff --git a/finetune/models/gpt2.yaml b/llm_on_ray/finetune/models/gpt2.yaml similarity index 100% rename from finetune/models/gpt2.yaml rename to llm_on_ray/finetune/models/gpt2.yaml diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/finetune/models/llama-2-7b-chat-hf.yaml similarity index 100% rename from finetune/models/llama-2-7b-chat-hf.yaml rename to llm_on_ray/finetune/models/llama-2-7b-chat-hf.yaml diff --git a/finetune/models/llama-7b.yaml b/llm_on_ray/finetune/models/llama-7b.yaml similarity index 100% rename from finetune/models/llama-7b.yaml rename to llm_on_ray/finetune/models/llama-7b.yaml diff --git a/finetune/models/mistral-7b-v0.1.yaml b/llm_on_ray/finetune/models/mistral-7b-v0.1.yaml similarity index 100% rename from finetune/models/mistral-7b-v0.1.yaml rename to llm_on_ray/finetune/models/mistral-7b-v0.1.yaml diff --git a/finetune/models/mpt-7b-chat.yaml b/llm_on_ray/finetune/models/mpt-7b-chat.yaml similarity index 100% rename from finetune/models/mpt-7b-chat.yaml rename to llm_on_ray/finetune/models/mpt-7b-chat.yaml diff --git a/finetune/models/opt-125m.yaml b/llm_on_ray/finetune/models/opt-125m.yaml similarity index 100% rename from finetune/models/opt-125m.yaml rename to llm_on_ray/finetune/models/opt-125m.yaml diff --git a/inference/__init__.py b/llm_on_ray/inference/__init__.py similarity index 100% rename from inference/__init__.py rename to llm_on_ray/inference/__init__.py diff --git a/inference/api_openai_backend/openai_protocol.py b/llm_on_ray/inference/api_openai_backend/openai_protocol.py similarity index 100% rename from inference/api_openai_backend/openai_protocol.py rename to llm_on_ray/inference/api_openai_backend/openai_protocol.py diff --git a/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py similarity index 95% rename from inference/api_openai_backend/query_client.py rename to llm_on_ray/inference/api_openai_backend/query_client.py index fbfbb65b1..9e8c6656e 100644 --- a/inference/api_openai_backend/query_client.py +++ b/llm_on_ray/inference/api_openai_backend/query_client.py @@ -34,8 +34,8 @@ from typing import Dict from fastapi import HTTPException -from .openai_protocol import ModelCard, Prompt -from .request_handler import handle_request +from llm_on_ray.inference.api_openai_backend.openai_protocol import ModelCard, Prompt +from llm_on_ray.inference.api_openai_backend.request_handler import handle_request class RouterQueryClient: diff --git a/inference/api_openai_backend/request_handler.py b/llm_on_ray/inference/api_openai_backend/request_handler.py similarity index 96% rename from inference/api_openai_backend/request_handler.py rename to llm_on_ray/inference/api_openai_backend/request_handler.py index dd5a1189d..202f92538 100644 --- a/inference/api_openai_backend/request_handler.py +++ b/llm_on_ray/inference/api_openai_backend/request_handler.py @@ -38,8 +38,13 @@ from fastapi import status, HTTPException, Request from starlette.responses import JSONResponse from pydantic import ValidationError as PydanticValidationError -from inference.logger import get_logger -from .openai_protocol import Prompt, ModelResponse, ErrorResponse, FinishReason +from 
llm_on_ray.inference.logger import get_logger +from llm_on_ray.inference.api_openai_backend.openai_protocol import ( + Prompt, + ModelResponse, + ErrorResponse, + FinishReason, +) logger = get_logger(__name__) diff --git a/inference/api_openai_backend/router_app.py b/llm_on_ray/inference/api_openai_backend/router_app.py similarity index 97% rename from inference/api_openai_backend/router_app.py rename to llm_on_ray/inference/api_openai_backend/router_app.py index 236819392..ecaed39b8 100644 --- a/inference/api_openai_backend/router_app.py +++ b/llm_on_ray/inference/api_openai_backend/router_app.py @@ -40,16 +40,17 @@ from fastapi import Response as FastAPIResponse from fastapi.middleware.cors import CORSMiddleware from starlette.responses import Response, StreamingResponse -from inference.logger import get_logger -from .request_handler import OpenAIHTTPException, openai_exception_handler -from .query_client import RouterQueryClient -from .openai_protocol import ( +from llm_on_ray.inference.logger import get_logger +from llm_on_ray.inference.api_openai_backend.request_handler import ( + OpenAIHTTPException, + openai_exception_handler, +) +from llm_on_ray.inference.api_openai_backend.query_client import RouterQueryClient +from llm_on_ray.inference.api_openai_backend.openai_protocol import ( Prompt, ModelResponse, CompletionRequest, ChatCompletionRequest, -) -from .openai_protocol import ( ChatCompletionResponse, CompletionResponse, DeltaChoices, diff --git a/inference/api_server_openai.py b/llm_on_ray/inference/api_server_openai.py similarity index 94% rename from inference/api_server_openai.py rename to llm_on_ray/inference/api_server_openai.py index 77831a9d2..2ba821075 100644 --- a/inference/api_server_openai.py +++ b/llm_on_ray/inference/api_server_openai.py @@ -34,8 +34,8 @@ import os from ray import serve -from inference.api_openai_backend.query_client import RouterQueryClient -from inference.api_openai_backend.router_app import Router, router_app +from llm_on_ray.inference.api_openai_backend.query_client import RouterQueryClient +from llm_on_ray.inference.api_openai_backend.router_app import Router, router_app def router_application(deployments): diff --git a/inference/api_server_simple.py b/llm_on_ray/inference/api_server_simple.py similarity index 100% rename from inference/api_server_simple.py rename to llm_on_ray/inference/api_server_simple.py diff --git a/inference/chat_process.py b/llm_on_ray/inference/chat_process.py similarity index 100% rename from inference/chat_process.py rename to llm_on_ray/inference/chat_process.py diff --git a/inference/deepspeed_predictor.py b/llm_on_ray/inference/deepspeed_predictor.py similarity index 98% rename from inference/deepspeed_predictor.py rename to llm_on_ray/inference/deepspeed_predictor.py index ef75c6118..dbdbca06f 100644 --- a/inference/deepspeed_predictor.py +++ b/llm_on_ray/inference/deepspeed_predictor.py @@ -12,9 +12,9 @@ from ray.air import ScalingConfig from typing import List import os -from predictor import Predictor -from inference.utils import get_torch_dtype -from inference.inference_config import ( +from llm_on_ray.inference.predictor import Predictor +from llm_on_ray.inference.utils import get_torch_dtype +from llm_on_ray.inference.inference_config import ( InferenceConfig, GenerateResult, DEVICE_CPU, diff --git a/inference/inference_config.py b/llm_on_ray/inference/inference_config.py similarity index 97% rename from inference/inference_config.py rename to llm_on_ray/inference/inference_config.py index 
631b0eea0..57c1f54ac 100644 --- a/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -161,8 +161,3 @@ def _check_workers_per_group(cls, v: int): _models[m.name] = m all_models = _models.copy() - -_gpt2_key = "gpt2" -_gpt_j_6b = "gpt-j-6b" -base_models[_gpt2_key] = _models[_gpt2_key] -base_models[_gpt_j_6b] = _models[_gpt_j_6b] diff --git a/inference/logger.py b/llm_on_ray/inference/logger.py similarity index 100% rename from inference/logger.py rename to llm_on_ray/inference/logger.py diff --git a/inference/mllm_predictor.py b/llm_on_ray/inference/mllm_predictor.py similarity index 94% rename from inference/mllm_predictor.py rename to llm_on_ray/inference/mllm_predictor.py index a50db97e5..895e00514 100644 --- a/inference/mllm_predictor.py +++ b/llm_on_ray/inference/mllm_predictor.py @@ -1,9 +1,8 @@ import torch from transformers import TextIteratorStreamer -from inference.inference_config import InferenceConfig, GenerateResult, PRECISION_BF16 -from predictor import Predictor -from inference.utils import module_import -from inference.utils import get_torch_dtype +from llm_on_ray.inference.inference_config import InferenceConfig, GenerateResult, PRECISION_BF16 +from llm_on_ray.inference.utils import get_torch_dtype, module_import +from llm_on_ray.inference.predictor import Predictor class MllmPredictor(Predictor): diff --git a/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml similarity index 100% rename from inference/models/CodeLlama-7b-hf.yaml rename to llm_on_ray/inference/models/CodeLlama-7b-hf.yaml diff --git a/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml b/llm_on_ray/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml similarity index 100% rename from inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml rename to llm_on_ray/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml diff --git a/inference/models/bigdl/mpt-7b-bigdl.yaml b/llm_on_ray/inference/models/bigdl/mpt-7b-bigdl.yaml similarity index 100% rename from inference/models/bigdl/mpt-7b-bigdl.yaml rename to llm_on_ray/inference/models/bigdl/mpt-7b-bigdl.yaml diff --git a/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml similarity index 95% rename from inference/models/bloom-560m.yaml rename to llm_on_ray/inference/models/bloom-560m.yaml index 8bc661557..19a5a7deb 100644 --- a/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -2,7 +2,7 @@ port: 8000 name: bloom-560m route_prefix: /bloom-560m num_replicas: 1 -cpus_per_worker: 10 +cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 diff --git a/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml similarity index 97% rename from inference/models/deplot.yaml rename to llm_on_ray/inference/models/deplot.yaml index e293bed54..ac6451c16 100644 --- a/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -1,6 +1,7 @@ port: 8000 name: deplot route_prefix: /deplot +num_replicas: 1 cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false diff --git a/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml similarity index 100% rename from inference/models/falcon-7b.yaml rename to llm_on_ray/inference/models/falcon-7b.yaml diff --git a/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml similarity index 96% rename from inference/models/fuyu8b.yaml rename to llm_on_ray/inference/models/fuyu8b.yaml index 1ad9faa98..561114cd0 100644 --- 
a/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -1,6 +1,7 @@ port: 8000 name: fuyu-8b route_prefix: /fuyu-8b +num_replicas: 1 cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false diff --git a/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml similarity index 100% rename from inference/models/gpt-j-6b.yaml rename to llm_on_ray/inference/models/gpt-j-6b.yaml diff --git a/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml similarity index 100% rename from inference/models/gpt2.yaml rename to llm_on_ray/inference/models/gpt2.yaml diff --git a/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml similarity index 100% rename from inference/models/llama-2-7b-chat-hf.yaml rename to llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml diff --git a/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml similarity index 100% rename from inference/models/mistral-7b-v0.1.yaml rename to llm_on_ray/inference/models/mistral-7b-v0.1.yaml diff --git a/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml similarity index 100% rename from inference/models/mpt-7b.yaml rename to llm_on_ray/inference/models/mpt-7b.yaml diff --git a/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml similarity index 100% rename from inference/models/neural-chat-7b-v3-1.yaml rename to llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml diff --git a/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml similarity index 100% rename from inference/models/opt-125m.yaml rename to llm_on_ray/inference/models/opt-125m.yaml diff --git a/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml similarity index 100% rename from inference/models/starcoder.yaml rename to llm_on_ray/inference/models/starcoder.yaml diff --git a/inference/models/template/export_inference_config_to_yaml.py b/llm_on_ray/inference/models/template/export_inference_config_to_yaml.py similarity index 72% rename from inference/models/template/export_inference_config_to_yaml.py rename to llm_on_ray/inference/models/template/export_inference_config_to_yaml.py index 62cfd4b75..493d3b6f5 100644 --- a/inference/models/template/export_inference_config_to_yaml.py +++ b/llm_on_ray/inference/models/template/export_inference_config_to_yaml.py @@ -1,6 +1,6 @@ import yaml import os -from inference.inference_config import InferenceConfig +from llm_on_ray.inference.inference_config import InferenceConfig ic = InferenceConfig() diff --git a/inference/models/template/inference_config_template.yaml b/llm_on_ray/inference/models/template/inference_config_template.yaml similarity index 100% rename from inference/models/template/inference_config_template.yaml rename to llm_on_ray/inference/models/template/inference_config_template.yaml diff --git a/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml similarity index 100% rename from inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml rename to llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml diff --git a/inference/predictor.py b/llm_on_ray/inference/predictor.py similarity index 96% rename from inference/predictor.py rename to llm_on_ray/inference/predictor.py index a69a9407e..d0f5daeee 100644 --- a/inference/predictor.py +++ b/llm_on_ray/inference/predictor.py @@ -1,9 +1,9 @@ import re import torch from 
transformers import AutoTokenizer, StoppingCriteriaList -from inference.inference_config import InferenceConfig, GenerateResult -from inference.utils import StoppingCriteriaSub from typing import List, AsyncGenerator, Union +from llm_on_ray.inference.inference_config import InferenceConfig, GenerateResult +from llm_on_ray.inference.utils import StoppingCriteriaSub class Predictor: diff --git a/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py similarity index 94% rename from inference/predictor_deployment.py rename to llm_on_ray/inference/predictor_deployment.py index a16cea7c5..094e41a56 100644 --- a/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -19,16 +19,16 @@ import asyncio import functools from ray import serve -from starlette.requests import Request from queue import Empty import torch from transformers import TextIteratorStreamer -from inference.inference_config import InferenceConfig from typing import Union, Dict, Any +from starlette.requests import Request from starlette.responses import StreamingResponse, JSONResponse from fastapi import HTTPException -from inference.api_openai_backend.openai_protocol import ModelResponse -from inference.utils import get_prompt_format, PromptFormat +from llm_on_ray.inference.inference_config import InferenceConfig +from llm_on_ray.inference.api_openai_backend.openai_protocol import ModelResponse +from llm_on_ray.inference.utils import get_prompt_format, PromptFormat @serve.deployment @@ -59,20 +59,20 @@ def __init__(self, infer_conf: InferenceConfig): self.is_mllm = True if chat_processor_name in ["ChatModelwithImage"] else False if self.use_deepspeed: - from deepspeed_predictor import DeepSpeedPredictor + from llm_on_ray.inference.deepspeed_predictor import DeepSpeedPredictor self.predictor = DeepSpeedPredictor(infer_conf) self.streamer = self.predictor.get_streamer() elif self.use_vllm: - from vllm_predictor import VllmPredictor + from llm_on_ray.inference.vllm_predictor import VllmPredictor self.predictor = VllmPredictor(infer_conf) elif self.is_mllm: - from mllm_predictor import MllmPredictor + from llm_on_ray.inference.mllm_predictor import MllmPredictor self.predictor = MllmPredictor(infer_conf) else: - from transformer_predictor import TransformerPredictor + from llm_on_ray.inference.transformer_predictor import TransformerPredictor self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() diff --git a/inference/serve.py b/llm_on_ray/inference/serve.py similarity index 94% rename from inference/serve.py rename to llm_on_ray/inference/serve.py index 598ab247c..6d87bd247 100644 --- a/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -16,12 +16,12 @@ import ray import sys -from inference.utils import get_deployment_actor_options from pydantic_yaml import parse_yaml_raw_as -from api_server_simple import serve_run -from api_server_openai import openai_serve_run -from predictor_deployment import PredictorDeployment -from inference.inference_config import ModelDescription, InferenceConfig, all_models +from llm_on_ray.inference.utils import get_deployment_actor_options +from llm_on_ray.inference.api_server_simple import serve_run +from llm_on_ray.inference.api_server_openai import openai_serve_run +from llm_on_ray.inference.predictor_deployment import PredictorDeployment +from llm_on_ray.inference.inference_config import ModelDescription, InferenceConfig, all_models def get_deployed_models(args): diff --git 
a/inference/transformer_predictor.py b/llm_on_ray/inference/transformer_predictor.py similarity index 94% rename from inference/transformer_predictor.py rename to llm_on_ray/inference/transformer_predictor.py index c1e83e432..8c1b74f08 100644 --- a/inference/transformer_predictor.py +++ b/llm_on_ray/inference/transformer_predictor.py @@ -1,9 +1,8 @@ import torch -from transformers import AutoModelForCausalLM, AutoConfig -from transformers import TextIteratorStreamer -from inference.inference_config import InferenceConfig, GenerateResult, PRECISION_BF16 -from inference.utils import get_torch_dtype -from predictor import Predictor +from transformers import AutoModelForCausalLM, AutoConfig, TextIteratorStreamer +from llm_on_ray.inference.inference_config import InferenceConfig, GenerateResult, PRECISION_BF16 +from llm_on_ray.inference.utils import get_torch_dtype +from llm_on_ray.inference.predictor import Predictor class TransformerPredictor(Predictor): diff --git a/inference/utils.py b/llm_on_ray/inference/utils.py similarity index 96% rename from inference/utils.py rename to llm_on_ray/inference/utils.py index 07d928449..855916d29 100644 --- a/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -16,10 +16,10 @@ from transformers import StoppingCriteria import torch -from inference.inference_config import InferenceConfig, DEVICE_CPU -from inference.api_openai_backend.openai_protocol import ChatMessage from typing import Dict, Any, List, Union from enum import Enum +from llm_on_ray.inference.inference_config import InferenceConfig, DEVICE_CPU +from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage def get_deployment_actor_options(infer_conf: InferenceConfig): diff --git a/inference/vllm_predictor.py b/llm_on_ray/inference/vllm_predictor.py similarity index 96% rename from inference/vllm_predictor.py rename to llm_on_ray/inference/vllm_predictor.py index 54ec4c110..d4ab10c44 100644 --- a/inference/vllm_predictor.py +++ b/llm_on_ray/inference/vllm_predictor.py @@ -1,11 +1,11 @@ import asyncio from typing import AsyncGenerator, List, Union -from predictor import Predictor -from inference.inference_config import InferenceConfig, GenerateResult, PRECISION_BF16 from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid +from llm_on_ray.inference.predictor import Predictor +from llm_on_ray.inference.inference_config import InferenceConfig, GenerateResult, PRECISION_BF16 class VllmPredictor(Predictor): diff --git a/pretrain/__init__.py b/llm_on_ray/pretrain/__init__.py similarity index 100% rename from pretrain/__init__.py rename to llm_on_ray/pretrain/__init__.py diff --git a/pretrain/backend/deepspeed_backend.py b/llm_on_ray/pretrain/backend/deepspeed_backend.py similarity index 100% rename from pretrain/backend/deepspeed_backend.py rename to llm_on_ray/pretrain/backend/deepspeed_backend.py diff --git a/pretrain/backend/habana_backend.py b/llm_on_ray/pretrain/backend/habana_backend.py similarity index 93% rename from pretrain/backend/habana_backend.py rename to llm_on_ray/pretrain/backend/habana_backend.py index 125987ba2..ca1240577 100644 --- a/pretrain/backend/habana_backend.py +++ b/llm_on_ray/pretrain/backend/habana_backend.py @@ -13,7 +13,7 @@ def backend_cls(self): def habana_import(): try: - import habana_frameworks.torch # noqa: F401 + import habana_frameworks.torch except ImportError as habana_not_exist: raise ImportError("Please 
install habana_frameworks") from habana_not_exist diff --git a/pretrain/config/bloom1b7_8gpus_pretrain.conf b/llm_on_ray/pretrain/config/bloom1b7_8gpus_pretrain.conf similarity index 100% rename from pretrain/config/bloom1b7_8gpus_pretrain.conf rename to llm_on_ray/pretrain/config/bloom1b7_8gpus_pretrain.conf diff --git a/pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf b/llm_on_ray/pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf similarity index 100% rename from pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf rename to llm_on_ray/pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf diff --git a/pretrain/config/llama2_7b_megatron_deepspeed_zs0_8gpus_pretrain.conf b/llm_on_ray/pretrain/config/llama2_7b_megatron_deepspeed_zs0_8gpus_pretrain.conf similarity index 100% rename from pretrain/config/llama2_7b_megatron_deepspeed_zs0_8gpus_pretrain.conf rename to llm_on_ray/pretrain/config/llama2_7b_megatron_deepspeed_zs0_8gpus_pretrain.conf diff --git a/pretrain/config/llama2_7b_megatron_deepspeed_zs3_8gpus_pretrain.conf b/llm_on_ray/pretrain/config/llama2_7b_megatron_deepspeed_zs3_8gpus_pretrain.conf similarity index 100% rename from pretrain/config/llama2_7b_megatron_deepspeed_zs3_8gpus_pretrain.conf rename to llm_on_ray/pretrain/config/llama2_7b_megatron_deepspeed_zs3_8gpus_pretrain.conf diff --git a/pretrain/config/llama_7b_8Guadi_pretrain.conf b/llm_on_ray/pretrain/config/llama_7b_8Guadi_pretrain.conf similarity index 100% rename from pretrain/config/llama_7b_8Guadi_pretrain.conf rename to llm_on_ray/pretrain/config/llama_7b_8Guadi_pretrain.conf diff --git a/pretrain/config/llama_7b_8gpu_pretrain.conf b/llm_on_ray/pretrain/config/llama_7b_8gpu_pretrain.conf similarity index 100% rename from pretrain/config/llama_7b_8gpu_pretrain.conf rename to llm_on_ray/pretrain/config/llama_7b_8gpu_pretrain.conf diff --git a/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf b/llm_on_ray/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf similarity index 100% rename from pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf rename to llm_on_ray/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf diff --git a/pretrain/docker/Dockerfile.megatron.habana b/llm_on_ray/pretrain/docker/Dockerfile.megatron.habana similarity index 100% rename from pretrain/docker/Dockerfile.megatron.habana rename to llm_on_ray/pretrain/docker/Dockerfile.megatron.habana diff --git a/pretrain/docker/Dockerfile.nvidia b/llm_on_ray/pretrain/docker/Dockerfile.nvidia similarity index 100% rename from pretrain/docker/Dockerfile.nvidia rename to llm_on_ray/pretrain/docker/Dockerfile.nvidia diff --git a/pretrain/docker/Dockerfile.optimum.habana b/llm_on_ray/pretrain/docker/Dockerfile.optimum.habana similarity index 100% rename from pretrain/docker/Dockerfile.optimum.habana rename to llm_on_ray/pretrain/docker/Dockerfile.optimum.habana diff --git a/pretrain/docker/build-image.sh b/llm_on_ray/pretrain/docker/build-image.sh similarity index 100% rename from pretrain/docker/build-image.sh rename to llm_on_ray/pretrain/docker/build-image.sh diff --git a/pretrain/megatron_deepspeed_pretrain.py b/llm_on_ray/pretrain/megatron_deepspeed_pretrain.py similarity index 96% rename from pretrain/megatron_deepspeed_pretrain.py rename to llm_on_ray/pretrain/megatron_deepspeed_pretrain.py index aa5002711..6190974be 100644 --- a/pretrain/megatron_deepspeed_pretrain.py +++ 
b/llm_on_ray/pretrain/megatron_deepspeed_pretrain.py @@ -7,14 +7,13 @@ from ray.air.config import ScalingConfig from ray.air import RunConfig, FailureConfig -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -import common +from llm_on_ray import common import importlib loader = importlib.util.find_spec("habana_frameworks") if loader is not None: - from backend.habana_backend import TorchConfig + from llm_on_ray.pretrain.backend.habana_backend import TorchConfig else: from ray.train.torch import TorchConfig diff --git a/pretrain/patch/gpu/0001-Add-init.py-to-include-the-megatron.model.vision-int.patch b/llm_on_ray/pretrain/patch/gpu/0001-Add-init.py-to-include-the-megatron.model.vision-int.patch similarity index 100% rename from pretrain/patch/gpu/0001-Add-init.py-to-include-the-megatron.model.vision-int.patch rename to llm_on_ray/pretrain/patch/gpu/0001-Add-init.py-to-include-the-megatron.model.vision-int.patch diff --git a/pretrain/patch/gpu/0001-Change-the-sample-s-column-name.patch b/llm_on_ray/pretrain/patch/gpu/0001-Change-the-sample-s-column-name.patch similarity index 100% rename from pretrain/patch/gpu/0001-Change-the-sample-s-column-name.patch rename to llm_on_ray/pretrain/patch/gpu/0001-Change-the-sample-s-column-name.patch diff --git a/pretrain/patch/gpu/0001-hot-fix-for-megatron-deepspeed-for-gpu-version.patch b/llm_on_ray/pretrain/patch/gpu/0001-hot-fix-for-megatron-deepspeed-for-gpu-version.patch similarity index 100% rename from pretrain/patch/gpu/0001-hot-fix-for-megatron-deepspeed-for-gpu-version.patch rename to llm_on_ray/pretrain/patch/gpu/0001-hot-fix-for-megatron-deepspeed-for-gpu-version.patch diff --git a/pretrain/patch/hpu/0001-Init-megatron-deepspeed-with-Ray-cluster.patch b/llm_on_ray/pretrain/patch/hpu/0001-Init-megatron-deepspeed-with-Ray-cluster.patch similarity index 100% rename from pretrain/patch/hpu/0001-Init-megatron-deepspeed-with-Ray-cluster.patch rename to llm_on_ray/pretrain/patch/hpu/0001-Init-megatron-deepspeed-with-Ray-cluster.patch diff --git a/pretrain/patch/hpu/0002-Add-the-Huggingface-tokenizer.patch b/llm_on_ray/pretrain/patch/hpu/0002-Add-the-Huggingface-tokenizer.patch similarity index 100% rename from pretrain/patch/hpu/0002-Add-the-Huggingface-tokenizer.patch rename to llm_on_ray/pretrain/patch/hpu/0002-Add-the-Huggingface-tokenizer.patch diff --git a/pretrain/plugin/group_dataset.py b/llm_on_ray/pretrain/plugin/group_dataset.py similarity index 93% rename from pretrain/plugin/group_dataset.py rename to llm_on_ray/pretrain/plugin/group_dataset.py index 93838f7bf..1d1f9a2d5 100644 --- a/pretrain/plugin/group_dataset.py +++ b/llm_on_ray/pretrain/plugin/group_dataset.py @@ -1,7 +1,7 @@ import os import datasets -from common.dataset import Dataset +from llm_on_ray.common.dataset import Dataset class GroupDataset(Dataset): diff --git a/pretrain/plugin/hf_pretrainer.py b/llm_on_ray/pretrain/plugin/hf_pretrainer.py similarity index 98% rename from pretrain/plugin/hf_pretrainer.py rename to llm_on_ray/pretrain/plugin/hf_pretrainer.py index d9aafdfc5..2c2c5d1f7 100755 --- a/pretrain/plugin/hf_pretrainer.py +++ b/llm_on_ray/pretrain/plugin/hf_pretrainer.py @@ -3,9 +3,6 @@ import logging import sys from torch.utils.data import DataLoader, Dataset -import common -from common import dataprocesser -from common.logging import logger import evaluate from typing import Optional from transformers import ( @@ -16,7 +13,10 @@ from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version, 
send_example_telemetry from transformers import Trainer, TrainingArguments -from common.trainer import Trainer as RayTrainer +from llm_on_ray import common +from llm_on_ray.common import dataprocesser +from llm_on_ray.common.logging import logger +from llm_on_ray.common.trainer import Trainer as RayTrainer use_habana = True import importlib diff --git a/pretrain/plugin/huggingface_model_from_config.py b/llm_on_ray/pretrain/plugin/huggingface_model_from_config.py similarity index 99% rename from pretrain/plugin/huggingface_model_from_config.py rename to llm_on_ray/pretrain/plugin/huggingface_model_from_config.py index 5ce38da8f..fe5c9608f 100644 --- a/pretrain/plugin/huggingface_model_from_config.py +++ b/llm_on_ray/pretrain/plugin/huggingface_model_from_config.py @@ -1,7 +1,7 @@ import torch import math import transformers -from common.model.model import Model +from llm_on_ray.common.model import Model # for huggingface model weight random initialization diff --git a/pretrain/plugin/megatron_dataset.py b/llm_on_ray/pretrain/plugin/megatron_dataset.py similarity index 96% rename from pretrain/plugin/megatron_dataset.py rename to llm_on_ray/pretrain/plugin/megatron_dataset.py index 944c6b53b..0d74906de 100644 --- a/pretrain/plugin/megatron_dataset.py +++ b/llm_on_ray/pretrain/plugin/megatron_dataset.py @@ -2,7 +2,7 @@ from megatron.training import build_train_valid_test_datasets, update_train_iters from megatron.data import gpt_dataset -from common.dataset import Dataset +from llm_on_ray.common.dataset import Dataset class MegatronDataset(Dataset): diff --git a/pretrain/plugin/megatron_pretrainer.py b/llm_on_ray/pretrain/plugin/megatron_pretrainer.py similarity index 98% rename from pretrain/plugin/megatron_pretrainer.py rename to llm_on_ray/pretrain/plugin/megatron_pretrainer.py index 4ee76bfa3..30e6cb815 100644 --- a/pretrain/plugin/megatron_pretrainer.py +++ b/llm_on_ray/pretrain/plugin/megatron_pretrainer.py @@ -11,9 +11,9 @@ from ray.train import Checkpoint from ray.train.torch import TorchCheckpoint -from common import dataprocesser -from .pretrainer import PreTrainer -from common.logging import logger +from llm_on_ray.common import dataprocesser +from llm_on_ray.pretrain.plugin.pretrainer import PreTrainer +from llm_on_ray.common.logging import logger class MegatronPreTrainer(PreTrainer): diff --git a/pretrain/plugin/megatron_processer.py b/llm_on_ray/pretrain/plugin/megatron_processer.py similarity index 96% rename from pretrain/plugin/megatron_processer.py rename to llm_on_ray/pretrain/plugin/megatron_processer.py index 178256ad5..455138399 100644 --- a/pretrain/plugin/megatron_processer.py +++ b/llm_on_ray/pretrain/plugin/megatron_processer.py @@ -2,7 +2,7 @@ from megatron.core import mpu from megatron.data.data_samplers import build_pretraining_data_loader -from common.dataprocesser import DataProcesser +from llm_on_ray.common.dataprocesser import DataProcesser class MegatronProcesser(DataProcesser): diff --git a/pretrain/plugin/megtron_initializer.py b/llm_on_ray/pretrain/plugin/megtron_initializer.py similarity index 85% rename from pretrain/plugin/megtron_initializer.py rename to llm_on_ray/pretrain/plugin/megtron_initializer.py index cad268603..9aad0d402 100644 --- a/pretrain/plugin/megtron_initializer.py +++ b/llm_on_ray/pretrain/plugin/megtron_initializer.py @@ -1,6 +1,6 @@ from megatron.initialize import initialize_megatron -from common.initializer import Initializer -from common.logging import logger +from llm_on_ray.common.initializer import Initializer +from 
llm_on_ray.common.logging import logger class MegatronInitializer(Initializer): diff --git a/pretrain/plugin/plain_id_processer.py b/llm_on_ray/pretrain/plugin/plain_id_processer.py similarity index 94% rename from pretrain/plugin/plain_id_processer.py rename to llm_on_ray/pretrain/plugin/plain_id_processer.py index 20117cdcf..50faa5e15 100644 --- a/pretrain/plugin/plain_id_processer.py +++ b/llm_on_ray/pretrain/plugin/plain_id_processer.py @@ -1,7 +1,7 @@ import torch import transformers -from common.dataprocesser import DataProcesser +from llm_on_ray.common.dataprocesser import DataProcesser class PlainIDProcesser(DataProcesser): diff --git a/pretrain/plugin/pretrainer.py b/llm_on_ray/pretrain/plugin/pretrainer.py similarity index 99% rename from pretrain/plugin/pretrainer.py rename to llm_on_ray/pretrain/plugin/pretrainer.py index 1bde38f62..1e48232c7 100755 --- a/pretrain/plugin/pretrainer.py +++ b/llm_on_ray/pretrain/plugin/pretrainer.py @@ -12,9 +12,9 @@ from ray.train.torch import TorchCheckpoint from pathlib import Path -from common import dataprocesser -from common.trainer import Trainer -from common.logging import logger +from llm_on_ray.common import dataprocesser +from llm_on_ray.common.trainer import Trainer +from llm_on_ray.common.logging import logger class PreTrainer(Trainer): diff --git a/pretrain/pretrain.py b/llm_on_ray/pretrain/pretrain.py similarity index 94% rename from pretrain/pretrain.py rename to llm_on_ray/pretrain/pretrain.py index 3e045c19d..56680b74d 100644 --- a/pretrain/pretrain.py +++ b/llm_on_ray/pretrain/pretrain.py @@ -10,31 +10,28 @@ from ray.air.config import ScalingConfig from ray.air import RunConfig, FailureConfig -import sys - -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -import common +from llm_on_ray import common from importlib import util use_habana = False loader = util.find_spec("habana_frameworks") if loader is not None: - from backend.habana_backend import TorchConfig + from llm_on_ray.pretrain.backend.habana_backend import TorchConfig use_habana = True else: from ray.train.torch import TorchConfig - from backend.deepspeed_backend import TorchConfig as DeepSpeedTorchConfig + from llm_on_ray.pretrain.backend.deepspeed_backend import TorchConfig as DeepSpeedTorchConfig def train_func(config: Dict[str, Any]): cwd = config.get("cwd") if cwd: os.chdir(cwd) - from common.common import import_all_module + from llm_on_ray.common import import_all_modules - import_all_module(f"{os.path.dirname(os.path.realpath(__file__))}/plugin", "plugin") + import_all_modules(f"{os.path.dirname(os.path.realpath(__file__))}/plugin", "plugin") common.init(config) # type: ignore initializer_config = config.get("initializer") if initializer_config: diff --git a/pretrain/pretrain_template.conf b/llm_on_ray/pretrain/pretrain_template.conf similarity index 100% rename from pretrain/pretrain_template.conf rename to llm_on_ray/pretrain/pretrain_template.conf diff --git a/pretrain/pretrain_template_megatron_dataset.conf b/llm_on_ray/pretrain/pretrain_template_megatron_dataset.conf similarity index 100% rename from pretrain/pretrain_template_megatron_dataset.conf rename to llm_on_ray/pretrain/pretrain_template_megatron_dataset.conf diff --git a/pretrain/requirements.optimum-habana.txt b/llm_on_ray/pretrain/requirements.optimum-habana.txt similarity index 100% rename from pretrain/requirements.optimum-habana.txt rename to llm_on_ray/pretrain/requirements.optimum-habana.txt diff --git a/pretrain/requirements.txt b/llm_on_ray/pretrain/requirements.txt 
similarity index 100% rename from pretrain/requirements.txt rename to llm_on_ray/pretrain/requirements.txt diff --git a/rlhf/__init__.py b/llm_on_ray/rlhf/__init__.py similarity index 100% rename from rlhf/__init__.py rename to llm_on_ray/rlhf/__init__.py diff --git a/rlhf/ppo.py b/llm_on_ray/rlhf/ppo.py similarity index 93% rename from rlhf/ppo.py rename to llm_on_ray/rlhf/ppo.py index cc9fab6ae..821e9b4c0 100644 --- a/rlhf/ppo.py +++ b/llm_on_ray/rlhf/ppo.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -import os import ray from ray import air, tune @@ -8,16 +7,11 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.algorithms.ppo import PPOConfig -from rl_algo.ppo.ppo_rlhf import PPORLHF -from rl_algo.ppo.rlhf_ppo_module import RLHFPPOTorchRLModule -from rl_algo.ppo.rlhf_ppo_torch_learner import RLHFPPOTorchLearner - -import sys - -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -import common -from common.agentenv.rlhf_env import RLHFEnv +from llm_on_ray import common +from llm_on_ray.rlhf.rl_algo.ppo.ppo_rlhf import PPORLHF +from llm_on_ray.rlhf.rl_algo.ppo.rlhf_ppo_module import RLHFPPOTorchRLModule +from llm_on_ray.rlhf.rl_algo.ppo.rlhf_ppo_torch_learner import RLHFPPOTorchLearner +from llm_on_ray.common.agentenv.rlhf_env import RLHFEnv class ValueFunctionInitializerCallback(DefaultCallbacks): diff --git a/rlhf/ppo.yaml b/llm_on_ray/rlhf/ppo.yaml similarity index 100% rename from rlhf/ppo.yaml rename to llm_on_ray/rlhf/ppo.yaml diff --git a/rlhf/reward.py b/llm_on_ray/rlhf/reward.py similarity index 98% rename from rlhf/reward.py rename to llm_on_ray/rlhf/reward.py index 7045a6c44..a88e3cb3f 100644 --- a/rlhf/reward.py +++ b/llm_on_ray/rlhf/reward.py @@ -10,11 +10,7 @@ from ray.air.config import ScalingConfig from ray.air import RunConfig, FailureConfig -import sys - -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -import common +from llm_on_ray import common def train_func(config: Dict[str, Any]): diff --git a/rlhf/reward.yaml b/llm_on_ray/rlhf/reward.yaml similarity index 100% rename from rlhf/reward.yaml rename to llm_on_ray/rlhf/reward.yaml diff --git a/rlhf/rl_algo/ppo/ppo_rlhf.py b/llm_on_ray/rlhf/rl_algo/ppo/ppo_rlhf.py similarity index 96% rename from rlhf/rl_algo/ppo/ppo_rlhf.py rename to llm_on_ray/rlhf/rl_algo/ppo/ppo_rlhf.py index 55657a507..10e43cd90 100644 --- a/rlhf/rl_algo/ppo/ppo_rlhf.py +++ b/llm_on_ray/rlhf/rl_algo/ppo/ppo_rlhf.py @@ -12,13 +12,9 @@ ) from ray.rllib.evaluation.metrics import RolloutMetrics -import os -import sys -sys.path.append(os.path.join(os.path.dirname(__file__), "../../../")) - -from common.agentenv.rlhf_env import generate_response -from .rlhf_buffer import Buffer, BufferItem +from llm_on_ray.common.agentenv.rlhf_env import generate_response +from llm_on_ray.rlhf.rl_algo.ppo.rlhf_buffer import Buffer, BufferItem class RLHFSampler: diff --git a/rlhf/rl_algo/ppo/rlhf_buffer.py b/llm_on_ray/rlhf/rl_algo/ppo/rlhf_buffer.py similarity index 100% rename from rlhf/rl_algo/ppo/rlhf_buffer.py rename to llm_on_ray/rlhf/rl_algo/ppo/rlhf_buffer.py diff --git a/rlhf/rl_algo/ppo/rlhf_ppo_module.py b/llm_on_ray/rlhf/rl_algo/ppo/rlhf_ppo_module.py similarity index 100% rename from rlhf/rl_algo/ppo/rlhf_ppo_module.py rename to llm_on_ray/rlhf/rl_algo/ppo/rlhf_ppo_module.py diff --git a/rlhf/rl_algo/ppo/rlhf_ppo_torch_learner.py b/llm_on_ray/rlhf/rl_algo/ppo/rlhf_ppo_torch_learner.py similarity index 98% rename from rlhf/rl_algo/ppo/rlhf_ppo_torch_learner.py rename to 
llm_on_ray/rlhf/rl_algo/ppo/rlhf_ppo_torch_learner.py index 733863703..7c841c1f9 100644 --- a/rlhf/rl_algo/ppo/rlhf_ppo_torch_learner.py +++ b/llm_on_ray/rlhf/rl_algo/ppo/rlhf_ppo_torch_learner.py @@ -11,7 +11,7 @@ from ray.rllib.models.torch.torch_distributions import TorchCategorical -from .util import masked_mean +from llm_on_ray.rlhf.rl_algo.ppo.util import masked_mean torch, nn = try_import_torch() diff --git a/rlhf/rl_algo/ppo/util.py b/llm_on_ray/rlhf/rl_algo/ppo/util.py similarity index 100% rename from rlhf/rl_algo/ppo/util.py rename to llm_on_ray/rlhf/rl_algo/ppo/util.py diff --git a/ui/html_format.py b/llm_on_ray/ui/html_format.py similarity index 100% rename from ui/html_format.py rename to llm_on_ray/ui/html_format.py diff --git a/ui/images/Picture1.png b/llm_on_ray/ui/images/Picture1.png similarity index 100% rename from ui/images/Picture1.png rename to llm_on_ray/ui/images/Picture1.png diff --git a/ui/images/Picture2.png b/llm_on_ray/ui/images/Picture2.png similarity index 100% rename from ui/images/Picture2.png rename to llm_on_ray/ui/images/Picture2.png diff --git a/ui/images/logo.png b/llm_on_ray/ui/images/logo.png similarity index 100% rename from ui/images/logo.png rename to llm_on_ray/ui/images/logo.png diff --git a/ui/start_ui.py b/llm_on_ray/ui/start_ui.py similarity index 98% rename from ui/start_ui.py rename to llm_on_ray/ui/start_ui.py index 420a6fdcd..4b1f0363c 100644 --- a/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -18,24 +18,26 @@ import time import os import sys - -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from inference.inference_config import all_models, ModelDescription, Prompt -from inference.inference_config import InferenceConfig as FinetunedConfig -from inference.chat_process import ChatModelGptJ, ChatModelLLama, ChatModelwithImage # noqa: F401 -from inference.predictor_deployment import PredictorDeployment -from ray import serve -import ray import gradio as gr import argparse +import paramiko +from multiprocessing import Process, Queue +from typing import Dict, List, Any +import ray +from ray import serve from ray.tune import Stopper from ray.train.base_trainer import TrainingFailedError from ray.tune.logger import LoggerCallback -from multiprocessing import Process, Queue from ray.util import queue -import paramiko -from html_format import cpu_memory_html, ray_status_html, custom_css -from typing import Dict, List, Any +from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt +from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig +from llm_on_ray.inference.chat_process import ( + ChatModelGptJ, + ChatModelLLama, + ChatModelwithImage, +) +from llm_on_ray.inference.predictor_deployment import PredictorDeployment +from llm_on_ray.ui.html_format import cpu_memory_html, ray_status_html, custom_css from langchain.vectorstores import FAISS from langchain.embeddings import HuggingFaceEmbeddings from pyrecdp.LLM import TextPipeline @@ -617,7 +619,7 @@ def finetune( if max_train_step != 0: finetune_config["Training"]["max_train_steps"] = max_train_step - from finetune.finetune import main + from llm_on_ray.finetune.finetune import main finetune_config["total_epochs"] = queue.Queue( actor_options={"resources": {"queue_hardware": 1}} @@ -925,13 +927,16 @@ def _init_ui(self): title = "Manage LLM Lifecycle" with gr.Blocks(css=custom_css, title=title) as gr_chat: + logo_path = os.path.join(self.repo_code_path, "ui/images/logo.png") head_content = """
-
+

Manage LLM Lifecycle

Fine-Tune LLMs using workflow on Ray, Deploy and Inference

- """ + """.format( + logo_path=logo_path + ) foot_content = """
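
The change repeated across nearly every hunk above is mechanical: modules that previously reached their siblings through sys.path.append(...) hacks now use absolute imports rooted at the llm_on_ray package. A minimal before/after sketch, assuming the package has been installed so that llm_on_ray is importable (for example via an editable install):

    # Old pattern, removed throughout this diff: it only worked when the script was
    # launched from a directory where the relative path happened to resolve.
    #   import os, sys
    #   sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
    #   import common

    # New pattern: plain absolute imports (assumes the llm_on_ray package is installed).
    from llm_on_ray import common
    from llm_on_ray.common.logging import logger
    from llm_on_ray.inference.inference_config import InferenceConfig, all_models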
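
predictor_deployment.py keeps its backend imports lazy but now spells them out as absolute package paths. A condensed sketch of that dispatch follows; the helper name and boolean parameters are illustrative, since the real deployment derives these flags from its InferenceConfig and instantiates the predictor directly:

    def select_predictor_cls(use_deepspeed: bool, use_vllm: bool, is_mllm: bool):
        # Illustrative helper, not part of the codebase: each backend module is
        # imported inside its own branch, so a plain CPU deployment never needs
        # deepspeed or vllm to be importable at all.
        if use_deepspeed:
            from llm_on_ray.inference.deepspeed_predictor import DeepSpeedPredictor
            return DeepSpeedPredictor
        if use_vllm:
            from llm_on_ray.inference.vllm_predictor import VllmPredictor
            return VllmPredictor
        if is_mllm:
            from llm_on_ray.inference.mllm_predictor import MllmPredictor
            return MllmPredictor
        from llm_on_ray.inference.transformer_predictor import TransformerPredictor
        return TransformerPredictor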
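
serve.py still parses per-model YAML files with pydantic_yaml; only the import paths and the config locations move. A small sketch of loading one of the relocated configs, assuming the repository root as the working directory and that InferenceConfig fields mirror the YAML keys shown above:

    from pydantic_yaml import parse_yaml_raw_as
    from llm_on_ray.inference.inference_config import InferenceConfig

    # Path assumes the repository root as the working directory.
    with open("llm_on_ray/inference/models/bloom-560m.yaml", encoding="utf-8") as reader:
        infer_conf = parse_yaml_raw_as(InferenceConfig, reader.read())

    print(infer_conf.name)  # expected: bloom-560m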
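
Two model configs change content as well as location: bloom-560m.yaml raises cpus_per_worker from 10 to 24, and deplot.yaml and fuyu8b.yaml gain an explicit num_replicas: 1. A quick sanity check of the edited fields, using only the keys visible in the hunks above (the model_description section is omitted):

    import yaml

    # Head of the updated bloom-560m.yaml as shown in the hunk; trailing sections omitted.
    bloom_conf_head = (
        "port: 8000\n"
        "name: bloom-560m\n"
        "route_prefix: /bloom-560m\n"
        "num_replicas: 1\n"
        "cpus_per_worker: 24\n"
        "gpus_per_worker: 0\n"
        "deepspeed: false\n"
        "workers_per_group: 2\n"
    )

    conf = yaml.safe_load(bloom_conf_head)
    assert conf["cpus_per_worker"] == 24 and conf["num_replicas"] == 1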
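
pretrain.py also switches to the renamed plugin loader: import_all_module becomes import_all_modules, imported from llm_on_ray.common rather than common.common. The helper's implementation is not part of this diff; a rough, hypothetical stand-in for what such a loader does, walking a directory and importing each module under a package prefix so plugin classes register themselves on import, would be:

    import importlib
    import pkgutil

    def import_all_modules(basedir: str, prefix: str) -> None:
        # Hypothetical stand-in, not the actual llm_on_ray.common helper: import
        # every module found in basedir under the given package prefix.
        for mod in pkgutil.iter_modules([basedir]):
            importlib.import_module(f"{prefix}.{mod.name}")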
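
Finally, start_ui.py resolves the logo against self.repo_code_path and substitutes it into the header HTML with str.format, which is why the string literal now ends in .format(logo_path=logo_path). The pattern, shown with stand-in markup and a stand-in path (the real head_content block and repo path differ):

    import os

    repo_code_path = "/path/to/llm-on-ray"  # stand-in; the UI derives this at runtime
    logo_path = os.path.join(repo_code_path, "ui/images/logo.png")

    # Stand-in markup; the actual head_content block is longer.
    head_content = '<img src="{logo_path}" alt="logo"/>'.format(logo_path=logo_path)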