Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
609 changes: 609 additions & 0 deletions ARTIFACTORY_BUILD.md

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions Dockerfile-cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# vim: filetype=dockerfile

# FLAVOR selects the per-arch scratch stage ("amd64" or "arm64") consumed by
# the archive stage below; TARGETARCH is supplied automatically by buildx for
# each requested platform.
ARG FLAVOR=${TARGETARCH}

ARG ROCMVERSION=6.3.3
ARG JETPACK5VERSION=r35.4.1
ARG JETPACK6VERSION=r36.4.0
ARG CMAKEVERSION=3.31.2

# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
# yum-utils provides yum-config-manager; it is installed once up front, so the
# dnf step only needs to add ccache and the pinned gcc toolset.
RUN yum install -y yum-utils \
    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
    && dnf install -y ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH

FROM --platform=linux/arm64 almalinux:8 AS base-arm64
# install epel-release for ccache
RUN yum install -y yum-utils epel-release \
    && dnf install -y clang ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
ENV CC=clang CXX=clang++

# Shared build tooling: CMake plus only the CMake config and ggml sources, so
# this layer stays cached until the build configuration or backend changes.
FROM base-${TARGETARCH} AS base
ARG CMAKEVERSION
RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
# Strip symbols from the native libraries to keep the runtime image small.
ENV LDFLAGS=-s

# CPU-only build of the ggml backend libraries.
FROM base AS cpu
RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CPU' \
    && cmake --build --parallel --preset 'CPU' \
    && cmake --install build --component CPU --strip --parallel 2

# Build the ollama binary itself (Go with cgo).
FROM base AS build
ARG GOVERSION=1.24.4
RUN curl -fsSL https://golang.org/dl/go${GOVERSION}.linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
ENV PATH=/usr/local/go/bin:$PATH
WORKDIR /go/src/github.com/ollama/ollama
COPY . .
ARG GOFLAGS="'-ldflags=-w -s'"
ENV CGO_ENABLED=1
RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .

# Empty per-arch stages so ${FLAVOR} can select either platform for archiving.
FROM --platform=linux/amd64 scratch AS amd64

FROM --platform=linux/arm64 scratch AS arm64

FROM ${FLAVOR} AS archive
COPY --from=cpu dist/lib/ollama /lib/ollama
COPY --from=build /bin/ollama /bin/ollama

# Runtime image: CA certificates, the binary, and the native libraries only.
FROM ubuntu:24.04
RUN apt-get update \
    && apt-get install -y ca-certificates curl openssl \
    && apt-get install --only-upgrade -y libpam0g libpam-modules libpam-modules-bin libpam-runtime \
        libsystemd0 libudev1 \
    && update-ca-certificates \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
COPY --from=archive /bin /usr/bin
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
COPY --from=archive /lib/ollama /usr/lib/ollama
ENV OLLAMA_HOST=0.0.0.0:11434
EXPOSE 11434
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
56 changes: 56 additions & 0 deletions build_cpu_artifactory.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
# Build the CPU-only Ollama image (Dockerfile-cpu) for multiple architectures
# and push it to an Artifactory Docker registry via docker buildx.
set -eu

# Artifactory configuration. REGISTRY is required: with the empty default the
# tag would begin with "/" and the push destination would be invalid.
REGISTRY=${REGISTRY:-""}
IMAGE_NAME=${IMAGE_NAME:-"ollama-cpu"}
VERSION=${VERSION:-"latest"}

if [ -z "$REGISTRY" ]; then
    echo "ERROR: REGISTRY must be set (e.g. REGISTRY=artifactory.example.com:5000)" >&2
    exit 1
fi

# Optional Docker repository path within the registry
# If using this path, ensure to add /${DOCKER_REPO} to the image name
# example: FULL_IMAGE_NAME="${REGISTRY}/${DOCKER_REPO}/${IMAGE_NAME}:${VERSION}"
#DOCKER_REPO=${DOCKER_REPO:-""}

# Artifactory credentials (can be set via environment variables)
# Use API Key authentication for Artifactory
ARTIFACTORY_USERNAME=${ARTIFACTORY_USERNAME:-""}
ARTIFACTORY_API_KEY=${ARTIFACTORY_API_KEY:-""}

# Target platforms
PLATFORMS=${PLATFORMS:-"linux/amd64,linux/arm64"}

# Silent login if credentials are provided
if [ -n "$ARTIFACTORY_USERNAME" ] && [ -n "$ARTIFACTORY_API_KEY" ]; then
    echo "Logging in to Artifactory at $REGISTRY as $ARTIFACTORY_USERNAME..."
    echo "$ARTIFACTORY_API_KEY" | docker login -u "$ARTIFACTORY_USERNAME" --password-stdin "$REGISTRY" >/dev/null 2>&1
    echo "Login successful"
else
    echo "Artifactory credentials not provided, assuming you're already logged in"
fi

# Set up buildx if needed (docker-container driver is required for multi-arch).
BUILDER_NAME="multiarch-builder"
if ! docker buildx inspect "${BUILDER_NAME}" &>/dev/null; then
    echo "Creating new buildx builder: ${BUILDER_NAME}"
    docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
else
    echo "Using existing buildx builder: ${BUILDER_NAME}"
    docker buildx use "${BUILDER_NAME}"
fi
docker buildx inspect --bootstrap

# Build and push the multi-arch image. push=true in --output already performs
# the push, so the previously redundant --push flag is dropped;
# registry.insecure=true permits plain-HTTP / self-signed Artifactory hosts.
# --no-cache forces a clean rebuild on every run (intentional for releases).
FULL_IMAGE_NAME="${REGISTRY}/${IMAGE_NAME}:${VERSION}"
echo "Building and pushing ${FULL_IMAGE_NAME} for platforms: ${PLATFORMS}"

docker buildx build \
    --platform "${PLATFORMS}" \
    --output=type=image,push=true,registry.insecure=true \
    --tag "${FULL_IMAGE_NAME}" \
    -f Dockerfile-cpu \
    --no-cache \
    .

echo "Build and push completed successfully!"
echo "Image pushed to: ${FULL_IMAGE_NAME}"
76 changes: 76 additions & 0 deletions build_cpu_dockerhub.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash
# Build the CPU-only Ollama image (Dockerfile-cpu) for Docker Hub. Loads the
# image into the local daemon by default; set PUSH=1 to push multi-arch.
set -eu

# Set your organization and image name
ORG=${ORG:-""}
IMAGE_NAME=${IMAGE_NAME:-"ollama-cpu"}
VERSION=${VERSION:-"latest"}

# Docker Hub credentials (can be set via environment variables)
DOCKER_USERNAME=${DOCKER_USERNAME:-""}
DOCKER_PASSWORD=${DOCKER_PASSWORD:-""}

# Target platforms - same as Ollama's defaults
PLATFORMS=${PLATFORMS:-"linux/arm64,linux/amd64"}

# Silent login if credentials are provided
if [ -n "$DOCKER_USERNAME" ] && [ -n "$DOCKER_PASSWORD" ]; then
    echo "Logging in to Docker Hub as $DOCKER_USERNAME..."
    echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin >/dev/null 2>&1
    echo "Login successful"

    # Fall back to the login name when no organization was supplied.
    # (The old check compared ORG against the literal "yourorg", which could
    # never match the empty default, so this fallback never fired.)
    if [ -z "$ORG" ]; then
        ORG=$DOCKER_USERNAME
        echo "Using Docker username '$ORG' as organization"
    fi
else
    echo "Docker credentials not provided, assuming you're already logged in"
fi

# Docker Hub references require an org/user prefix; fail early otherwise.
if [ -z "$ORG" ]; then
    echo "ERROR: ORG must be set (or provide DOCKER_USERNAME/DOCKER_PASSWORD)" >&2
    exit 1
fi

# Ensure QEMU is installed for cross-platform builds
echo "Setting up QEMU for cross-platform builds..."
docker run --privileged --rm tonistiigi/binfmt --install all

# Set up buildx if needed
BUILDER_NAME="multiarch-builder"
if ! docker buildx inspect "${BUILDER_NAME}" &>/dev/null; then
    echo "Creating new buildx builder: ${BUILDER_NAME}"
    docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
else
    docker buildx use "${BUILDER_NAME}"
fi
docker buildx inspect --bootstrap

# Set PUSH to a non-empty string to trigger push instead of load.
# Attestations (--provenance/--sbom) produce a manifest list that the docker
# exporter cannot --load, so they are only requested when pushing.
PUSH=${PUSH:-""}
ATTEST_FLAGS=""
if [ -z "${PUSH}" ]; then
    echo "Building ${ORG}/${IMAGE_NAME}:${VERSION} locally. Set PUSH=1 to push"
    # --load only supports a single platform; collapse to the host arch.
    if [[ "${PLATFORMS}" == *","* ]]; then
        HOST_ARCH=$(uname -m | sed -e 's/x86_64/amd64/' -e 's/aarch64/arm64/')
        echo "WARNING: --load only works for single platform. Setting platform to linux/${HOST_ARCH}"
        PLATFORMS="linux/${HOST_ARCH}"
    fi
    LOAD_OR_PUSH="--load"
else
    echo "Will be pushing ${ORG}/${IMAGE_NAME}:${VERSION}"
    LOAD_OR_PUSH="--push"
    ATTEST_FLAGS="--provenance=true --sbom=true"
fi

# Build and push/load the multi-arch image
echo "Building for platforms: ${PLATFORMS}"
docker buildx build \
    ${ATTEST_FLAGS} \
    --network=host \
    ${LOAD_OR_PUSH} \
    --platform="${PLATFORMS}" \
    -f Dockerfile-cpu \
    -t "${ORG}/${IMAGE_NAME}:${VERSION}" \
    .

echo "Build completed successfully!"
if [ -n "${PUSH}" ]; then
    echo "Image pushed to: ${ORG}/${IMAGE_NAME}:${VERSION}"
    echo "To pull: docker pull ${ORG}/${IMAGE_NAME}:${VERSION}"
fi
1 change: 1 addition & 0 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -1571,6 +1571,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],
envVars["OLLAMA_SKIP_MEMORY_CHECK"],
})
default:
appendEnvDocs(cmd, envs)
Expand Down
20 changes: 20 additions & 0 deletions docs/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,23 @@ The currently available K/V cache quantization types are:
How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.

You may need to experiment with different quantization types to find the best balance between memory usage and quality.



## How do I bypass the available memory check before loading a model?

By default, Ollama checks whether your system has sufficient available memory before loading a model, to prevent out-of-memory errors that could crash your system or cause instability.
You can bypass this safety check by setting the `OLLAMA_SKIP_MEMORY_CHECK` environment variable to `1`.

### When to use this option

- You have swap space configured and accept slower performance
- You're running on a system with non-standard memory reporting
- You're debugging memory-related issues
- You understand the risks and have adequate system monitoring

### Important Warnings

- System instability: Loading models without sufficient memory can cause system freezes or crashes
- Performance degradation: Your system may become unresponsive due to excessive swapping
- Data loss risk: System crashes could result in unsaved work being lost
9 changes: 9 additions & 0 deletions envconfig/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,12 @@ var (
MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
)

var (
	// AvailableMemoryCheckOverride bypasses the memory check during model load.
	// This is an expert-only setting, to be used in situations where the system
	// is guaranteed to have enough memory, or is able to procure it at runtime
	// by evicting blocks from caches (e.g. the ZFS ARC cache).
	AvailableMemoryCheckOverride = Uint("OLLAMA_SKIP_MEMORY_CHECK", 0)
)

func Uint64(key string, defaultValue uint64) func() uint64 {
return func() uint64 {
if s := Var(key); s != "" {
Expand Down Expand Up @@ -275,6 +281,9 @@ func AsMap() map[string]EnvVar {
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
"NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"},

//Overrides
"OLLAMA_SKIP_MEMORY_CHECK": {"OLLAMA_SKIP_MEMORY_CHECK", AvailableMemoryCheckOverride(), "Bypass checking for available memory before loading models. (e.g. OLLAMA_SKIP_MEMORY_CHECK=1)"},
}

if runtime.GOOS != "windows" {
Expand Down
53 changes: 53 additions & 0 deletions llm/memory.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package llm

import (
"bufio"
"fmt"
"log/slog"
"os"
Expand Down Expand Up @@ -438,3 +439,55 @@ func projectorMemoryRequirements(filename string) (weights uint64) {

return weights
}

// GetZFSReclaimable returns max(0, size – c_min) from ZFS ARC stats.
// Added to fix the arc memory cache issue on zfs
// This will be a no-op is no zfs is involved.
// GetZFSReclaimableMemory returns the number of bytes the ZFS ARC could give
// back under memory pressure: max(0, size - c_min) read from the ARC kstats.
// It lets the memory check treat ARC-held cache as available, avoiding
// spurious "not enough memory" failures on ZFS hosts. On systems without ZFS
// the stats files do not exist and an error is returned, which callers treat
// as a no-op.
func GetZFSReclaimableMemory() (uint64, error) {
	// The kstat location differs between ZFS-on-Linux releases.
	paths := []string{"/proc/spl/kstat/zfs/arcstats", "/proc/zfs/arcstats"}
	var f *os.File
	for _, path := range paths {
		if file, err := os.Open(path); err == nil {
			f = file
			break
		}
	}
	if f == nil {
		return 0, fmt.Errorf("no ZFS ARC stats found")
	}
	defer f.Close()

	// arcstats rows have the shape "name type data"; only the "size" and
	// "c_min" rows are needed. Unparseable rows (e.g. the header) are skipped.
	var size, cmin uint64
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		cols := strings.Fields(scanner.Text())
		if len(cols) < 3 {
			continue
		}
		val, err := strconv.ParseUint(cols[2], 10, 64)
		if err != nil {
			continue
		}
		switch cols[0] {
		case "size":
			size = val
		case "c_min":
			cmin = val
		}
	}
	if err := scanner.Err(); err != nil {
		return 0, err
	}
	// Both counters must have been present and non-zero; uint64 can never be
	// negative, so compare against zero rather than "<= 0".
	if size == 0 || cmin == 0 {
		return 0, fmt.Errorf("failed to read ZFS ARC stats")
	}
	if size > cmin {
		return size - cmin, nil
	}
	return 0, nil
}
32 changes: 24 additions & 8 deletions llm/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,30 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
}
}

// On linux and windows, over-allocating CPU memory will almost always result in an error
// Darwin has fully dynamic swap so has no direct concept of free swap space
if runtime.GOOS != "darwin" {
systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
available := systemFreeMemory + systemSwapFreeMemory
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
// Env variable to bypass ollama's memory check guardrail.
if envconfig.AvailableMemoryCheckOverride() == 1 {
slog.Warn("OLLAMA_SKIP_MEMORY_CHECK set; bypassing memory checks")
} else {
// On linux and windows, over-allocating CPU memory will almost always result in an error
// Darwin has fully dynamic swap so has no direct concept of free swap space
slog.Debug("OLLAMA_SKIP_MEMORY_CHECK not set; running memory checks")
if runtime.GOOS != "darwin" {
systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
available := systemFreeMemory + systemSwapFreeMemory

// On Linux, reclaim ZFS ARC (size – c_min)
if runtime.GOOS == "linux" {
if reclaim, err := GetZFSReclaimableMemory(); err == nil {
slog.Info("reclaiming ZFS Arc cache size:", "size", format.HumanBytes2(reclaim))
available += reclaim
} else {
slog.Warn("failure while computing ZFS Arc cache size:", "error", err)
}
}
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
}
}
}

Expand Down
Loading