Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions .github/scripts/validate_llm_pricing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#!/usr/bin/env python3
import json
import re
import sys
import urllib.request

# Upstream model price/context-window database maintained by LiteLLM (raw JSON, main branch).
LITELLM_URL = "https://raw.githubusercontent.com/BerriAI/litellm/refs/heads/main/model_prices_and_context_window.json"
# Docs page containing the pricing table this script validates.
MDX_PATH = "openhands/usage/llms/openhands-llms.mdx"

# Models to skip from strict validation (absent in LiteLLM DB or intentionally N/A values)
SKIP_MODELS = {
    "qwen3-coder-480b",
    "devstral-medium-2507",
    "devstral-small-2507",
}

# Optional manual key mapping if MDX model name differs from LiteLLM JSON key
MODEL_KEY_MAP: dict[str, str] = {
    # Add mappings here only if necessary
}


def fetch_litellm_db(url: str) -> dict:
    """Download and deserialize the LiteLLM price database JSON from *url*."""
    with urllib.request.urlopen(url, timeout=30) as resp:
        payload = resp.read()
    return json.loads(payload)


def parse_money(s: str) -> float | None:
s = s.strip()
if s.upper() in {"N/A", "NA", "-", "—", "--", ""}:
return None
if s.startswith("$"):
s = s[1:]
try:
return float(s)
except ValueError:
return None


def parse_int(s: str) -> int | None:
s = s.strip()
if s.upper() in {"N/A", "NA", "-", "—", "--", ""}:
return None
s = s.replace(",", "")
try:
return int(s)
except ValueError:
return None


def extract_table_from_mdx(path: str) -> list[dict[str, str | None]]:
rows: list[dict[str, str | None]] = []
with open(path, "r", encoding="utf-8") as f:
lines = f.read().splitlines()

# Find table header
start = None
for i, line in enumerate(lines):
if "| Model |" in line:
start = i
break
if start is None:
raise SystemExit("ERROR: Could not find LLM pricing table header in MDX file.")

i = start + 1
# Skip the separator line (---)
while i < len(lines) and lines[i].strip().startswith("|"):
# Stop when we hit a blank line after table
if not lines[i].strip():
break
# Skip header separator row like |-----|
if re.match(r"^\|\s*-+\s*\|", lines[i]):
i += 1
continue
# Stop when the row clearly ends (non-table line)
if not lines[i].strip().startswith("|"):
break

parts = [p.strip() for p in lines[i].strip().strip("|").split("|")]
if len(parts) == 6 and parts[0] != "Model":
rows.append({
"model": parts[0],
"input_cost": parts[1],
"cached_input_cost": parts[2],
"output_cost": parts[3],
"max_input_tokens": parts[4],
"max_output_tokens": parts[5],
})
i += 1

if not rows:
raise SystemExit("ERROR: Found table header but no data rows parsed.")
return rows


def to_per_million(val_per_token: float | None) -> float | None:
if val_per_token is None:
return None
return val_per_token * 1_000_000.0


def near(a: float | None, b: float | None, tol: float = 1e-3) -> bool:
if a is None and b is None:
return True
if a is None or b is None:
return False
return abs(a - b) <= tol


def main() -> int:
    """Validate the MDX pricing table against the LiteLLM price database.

    Fetches the upstream LiteLLM JSON, parses the docs table, and compares
    per-1M costs and token limits. Returns 0 when all performed checks pass,
    1 otherwise — suitable as a CI exit code.
    """
    db = fetch_litellm_db(LITELLM_URL)
    rows = extract_table_from_mdx(MDX_PATH)

    # Bug fix: `List` (typing) was never imported and would raise NameError at
    # runtime; use the builtin generic instead (PEP 585).
    failures: list[str] = []
    validations = 0

    for row in rows:
        model = row["model"]
        if model in SKIP_MODELS:
            continue

        key = MODEL_KEY_MAP.get(model, model)
        entry = db.get(key)
        if entry is None:
            # Try a few fallbacks (provider-prefixed keys)
            # e.g., openai/gpt-5-codex, google/gemini-2.5-pro
            candidates = [
                f"openai/{model}",
                f"azure/{model}",
                f"anthropic/{model}",
                f"google/{model}",
                f"gemini/{model}",
                f"mistral/{model}",
            ]
            for c in candidates:
                if c in db:
                    entry = db[c]
                    key = c
                    break

        if entry is None:
            # Not in LiteLLM DB; skip but report
            print(f"[skip] {model}: not found in LiteLLM DB")
            continue

        # Parse MDX values
        mdx_input_cost = parse_money(row["input_cost"])  # $ per 1M
        mdx_cached_cost = parse_money(row["cached_input_cost"])  # $ per 1M or None
        mdx_output_cost = parse_money(row["output_cost"])  # $ per 1M
        mdx_max_in = parse_int(row["max_input_tokens"])  # tokens
        mdx_max_out = parse_int(row["max_output_tokens"])  # tokens

        # Compute expected from LiteLLM DB (per-token → per-1M)
        llm_in_per_token = entry.get("input_cost_per_token")
        llm_cached_per_token = entry.get("cache_read_input_token_cost")
        llm_out_per_token = entry.get("output_cost_per_token")

        exp_input_cost = to_per_million(llm_in_per_token)
        exp_cached_cost = to_per_million(llm_cached_per_token)
        exp_output_cost = to_per_million(llm_out_per_token)

        def add_fail(msg: str):
            failures.append(f"{model}: {msg}")

        # Input cost: compared only when BOTH sides provide a number, because
        # LiteLLM data is occasionally incomplete for preview/edge models and
        # we don't want false failures from missing upstream data.
        if exp_input_cost is not None and mdx_input_cost is not None:
            validations += 1
            if not near(mdx_input_cost, exp_input_cost):
                add_fail(f"input_cost mismatch: mdx={mdx_input_cost} vs litellm={exp_input_cost}")

        # Cached input cost: unlike input/output costs, this is both a price
        # and a capability signal (prompt-caching support), so presence parity
        # is enforced in both directions:
        #   both None → OK; exactly one None → fail; both numbers → compare.
        if exp_cached_cost is not None or mdx_cached_cost is not None:
            # If JSON missing but MDX has numeric, that's a mismatch; if MDX N/A and JSON missing, accept
            validations += 1
            if exp_cached_cost is None and mdx_cached_cost is None:
                pass
            elif exp_cached_cost is None and mdx_cached_cost is not None:
                add_fail(f"cached_input_cost present in MDX but missing in LiteLLM: mdx={mdx_cached_cost}")
            elif exp_cached_cost is not None and mdx_cached_cost is None:
                add_fail(f"cached_input_cost missing in MDX but present in LiteLLM: litellm={exp_cached_cost}")
            elif not near(mdx_cached_cost, exp_cached_cost):
                add_fail(f"cached_input_cost mismatch: mdx={mdx_cached_cost} vs litellm={exp_cached_cost}")

        # Output cost: same both-sides-present policy as input cost.
        if exp_output_cost is not None and mdx_output_cost is not None:
            validations += 1
            if not near(mdx_output_cost, exp_output_cost):
                add_fail(f"output_cost mismatch: mdx={mdx_output_cost} vs litellm={exp_output_cost}")

        # Token limits (compare only if LiteLLM provides the field)
        llm_max_in = entry.get("max_input_tokens")
        llm_max_out = entry.get("max_output_tokens")

        if llm_max_in is not None and mdx_max_in is not None:
            validations += 1
            if mdx_max_in != int(llm_max_in):
                add_fail(f"max_input_tokens mismatch: mdx={mdx_max_in} vs litellm={llm_max_in}")

        if llm_max_out is not None and mdx_max_out is not None:
            validations += 1
            if mdx_max_out != int(llm_max_out):
                add_fail(f"max_output_tokens mismatch: mdx={mdx_max_out} vs litellm={llm_max_out}")

    if failures:
        print("\nValidation FAILED:\n" + "\n".join(failures))
        return 1

    print(f"Validation passed. Checks performed: {validations}")
    return 0


if __name__ == "__main__":
    # Propagate the validator's exit code to the shell / CI runner.
    raise SystemExit(main())
22 changes: 22 additions & 0 deletions .github/workflows/validate-llm-pricing.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Validates the docs' LLM pricing table against the upstream LiteLLM price
# database whenever the table or the validator script changes in a PR.
name: Validate LLM pricing table

on:
  pull_request:
    paths:
      - 'openhands/usage/llms/openhands-llms.mdx'
      - '.github/scripts/validate_llm_pricing.py'

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # NOTE(review): the validator uses `X | None` unions, so it needs
          # Python >= 3.10 — '3.x' resolves to a recent release today, but
          # consider pinning a minimum to be explicit.
          python-version: '3.x'

      # Stdlib-only script: no pip install step is needed.
      - name: Run pricing validator
        run: python .github/scripts/validate_llm_pricing.py
19 changes: 11 additions & 8 deletions openhands/usage/llms/openhands-llms.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,20 @@ Pricing follows official API provider rates. Below are the current pricing detai

| Model | Input Cost (per 1M tokens) | Cached Input Cost (per 1M tokens) | Output Cost (per 1M tokens) | Max Input Tokens | Max Output Tokens |
|-------|----------------------------|-----------------------------------|------------------------------|------------------|-------------------|
| claude-opus-4-20250514 | $15.00 | $1.50 | $75.00 | 200,000 | 32,000 |
| claude-sonnet-4-20250514 | $3.00 | $0.30 | $15.00 | 200,000 | 64,000 |
| claude-sonnet-4-5-20250929 | $3.00 | $0.30 | $15.00 | 200,000 | 64,000 |
| claude-sonnet-4-20250514 | $3.00 | $0.30 | $15.00 | 1,000,000 | 64,000 |
| claude-opus-4-20250514 | $15.00 | $1.50 | $75.00 | 200,000 | 32,000 |
| claude-opus-4-1-20250805 | $15.00 | $1.50 | $75.00 | 200,000 | 32,000 |
| claude-haiku-4-5-20251001 | $1.00 | $0.10 | $5.00 | 200,000 | 64,000 |
| gpt-5-codex | $1.25 | $0.125 | $10.00 | 272,000 | 128,000 |
| gpt-5-2025-08-07 | $1.25 | $0.125 | $10.00 | 272,000 | 128,000 |
| gpt-5-mini-2025-08-07 | $0.25 | $0.025 | $2.00 | 272,000 | 128,000 |
| devstral-medium-2507 | $0.40 | N/A | $2.00 | 128,000 | 128,000 |
| devstral-small-2505 | $0.10 | N/A | $0.30 | 128,000 | 128,000 |
| devstral-small-2507 | $0.10 | N/A | $0.30 | 128,000 | 128,000 |
| gemini-2.5-pro | $1.25 | $0.31 | $10.00 | 1,048,576 | 65,535 |
| gpt-5-2025-08-07 | $1.25 | $0.125 | $10.00 | 400,000 | 128,000 |
| gpt-5-mini-2025-08-07 | $0.25 | $0.025 | $2.00 | 400,000 | 128,000 |
| o3 | $2.00 | $0.50 | $8.00 | 200,000 | 100,000 |
| o4-mini | $1.10 | $0.28 | $4.40 | 200,000 | 100,000 |
| o4-mini | $1.10 | $0.275 | $4.40 | 200,000 | 100,000 |
| gemini-2.5-pro | $1.25 | $0.125 | $10.00 | 1,048,576 | 65,535 |
| kimi-k2-0711-preview | $0.60 | $0.15 | $2.50 | 131,072 | 131,072 |
| qwen3-coder-480b | $0.40 | N/A | $1.60 | N/A | N/A |

**Note:** Cached input tokens are charged at a reduced rate when the same content is reused across requests. Models that don't support prompt caching show "N/A" for cached input cost.
**Note:** Prices listed reflect provider rates with no markup, sourced via LiteLLM’s model price database and provider pricing pages. Cached input tokens are charged at a reduced rate when the same content is reused across requests. Models that don't support prompt caching show "N/A" for cached input cost.