Skip to content
This repository was archived by the owner on Oct 10, 2025. It is now read-only.

Commit 0bbf85e

Browse files
committed
fix: [#12] improve deployment reliability with robust container health checks
- Added comprehensive wait_for_services logic to check health status for all containers
- Improved logging with color-coded warnings and debug output for container status
- Added wait_for_system_ready to ensure cloud-init and Docker are ready before deployment
- Updated deployment logic to preserve storage folder across deployments
- Fixed SSH command usage with -n flag for reliability
- Refactored health check detection using docker inspect for accurate status
- Removed duplicate health check logic from E2E test script
- Enhanced container startup validation to wait for all services to be healthy
- Increased health check timeout for better reliability with fresh deployments

This resolves issues where the deployment script would declare success too early, checking only one container instead of waiting for all containers to be healthy. The improvements ensure the MySQL and tracker containers are fully ready before running health checks and E2E tests.
1 parent 537b87e commit 0bbf85e

File tree

2 files changed

+238
-81
lines changed

2 files changed

+238
-81
lines changed

infrastructure/scripts/deploy-app.sh

Lines changed: 235 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ get_vm_ip() {
6666
echo "${vm_ip}"
6767
}
6868

69-
# Test SSH connectivity
69+
# Test SSH connectivity and wait for system readiness
7070
test_ssh_connection() {
7171
local vm_ip="$1"
7272
local max_attempts=5
@@ -93,6 +93,70 @@ test_ssh_connection() {
9393
exit 1
9494
}
9595

96+
# Wait for cloud-init and Docker to be ready
#
# Polls the VM over SSH until `cloud-init status` reports "done", the Docker
# CLI answers `docker --version` and the Docker daemon answers `docker info`.
#
# Arguments:
#   $1 - VM IP address
#   $2 - maximum number of attempts (optional, default: 30)
#   $3 - seconds to wait between attempts (optional, default: 30)
# Returns:
#   0 as soon as the system is ready. On timeout, or as soon as cloud-init
#   reports "status: error", dumps diagnostics through vm_exec and exits the
#   script with status 1.
wait_for_system_ready() {
    local vm_ip="$1"
    local max_attempts="${2:-30}"   # default: 15 minutes (30 * 30 seconds) for cloud-init completion
    local retry_interval="${3:-30}" # seconds between attempts
    local attempt
    local cloud_init_status
    local cloud_init_failed=false
    # -n redirects ssh's stdin from /dev/null so it cannot consume the caller's stdin
    local -a ssh_cmd=(ssh -n -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}")

    log_info "Waiting for system initialization (cloud-init and Docker) to complete..."

    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        log_info "Checking system readiness (attempt ${attempt}/${max_attempts})..."

        # Only the reported text is evaluated; a non-zero exit status (SSH failure
        # or cloud-init signalling a problem) must not abort the script here.
        cloud_init_status=$("${ssh_cmd[@]}" "cloud-init status" 2>/dev/null) || true
        if [[ -z "${cloud_init_status}" ]]; then
            cloud_init_status="failed"
        fi

        if [[ "${cloud_init_status}" == *"done"* ]]; then
            log_info "Cloud-init completed: ${cloud_init_status}"

            # Check if Docker is available
            if "${ssh_cmd[@]}" "docker --version >/dev/null 2>&1" >/dev/null 2>&1; then
                # Check if Docker daemon is running
                if "${ssh_cmd[@]}" "docker info >/dev/null 2>&1" >/dev/null 2>&1; then
                    log_success "System is ready: cloud-init done, Docker available and running"
                    return 0
                fi
                log_info "Docker installed but daemon not running yet, waiting..."
            else
                log_info "Docker not available yet, cloud-init may still be installing it..."
            fi
        elif [[ "${cloud_init_status}" == *"status: error"* ]]; then
            # "done" can no longer be reported once cloud-init has failed, so
            # stop polling and go straight to the diagnostics.
            cloud_init_failed=true
            break
        else
            log_info "Cloud-init status: ${cloud_init_status}, waiting for completion..."
        fi

        log_info "System not ready yet. Retrying in ${retry_interval} seconds..."
        sleep "${retry_interval}"
    done

    if [[ "${cloud_init_failed}" == "true" ]]; then
        log_error "Cloud-init reported an error: ${cloud_init_status}"
    else
        log_error "Timeout waiting for system to be ready after ${max_attempts} attempts ($((max_attempts * retry_interval / 60)) minutes)"
    fi
    log_error "Cloud-init may have failed or Docker installation encountered issues"

    # Show diagnostic information
    vm_exec "${vm_ip}" "
        echo '=== System Diagnostic Information ==='
        echo 'Cloud-init status:'
        cloud-init status --long || echo 'cloud-init command failed'
        echo ''
        echo 'Docker version:'
        docker --version || echo 'Docker not available'
        echo ''
        echo 'Docker service status:'
        systemctl status docker || echo 'Docker service status unavailable'
        echo ''
        echo 'Recent cloud-init logs:'
        tail -20 /var/log/cloud-init.log || echo 'Cloud-init logs unavailable'
    " "Dumping diagnostic information"

    exit 1
}
159+
96160
# Execute command on VM via SSH
97161
vm_exec() {
98162
local vm_ip="$1"
@@ -132,8 +196,31 @@ release_stage() {
132196
# Create target directory structure
133197
vm_exec "${vm_ip}" "mkdir -p /home/torrust/github/torrust" "Creating directory structure"
134198

135-
# Remove existing directory if it exists
136-
vm_exec "${vm_ip}" "test -d /home/torrust/github/torrust/torrust-tracker-demo && rm -rf /home/torrust/github/torrust/torrust-tracker-demo || true" "Removing existing repository"
199+
# Check if we need to preserve storage before removing repository
200+
storage_exists=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}" "
201+
if [ -d /home/torrust/github/torrust/torrust-tracker-demo/application/storage ]; then
202+
echo 'true'
203+
else
204+
echo 'false'
205+
fi
206+
" 2>/dev/null || echo "false")
207+
208+
if [[ "${storage_exists}" == "true" ]]; then
209+
log_warning "Preserving existing storage folder with persistent data"
210+
fi
211+
212+
# Handle existing repository - preserve storage folder if it exists
213+
vm_exec "${vm_ip}" "
214+
if [ -d /home/torrust/github/torrust/torrust-tracker-demo ]; then
215+
if [ -d /home/torrust/github/torrust/torrust-tracker-demo/application/storage ]; then
216+
# Move storage folder to temporary location
217+
mv /home/torrust/github/torrust/torrust-tracker-demo/application/storage /tmp/torrust-storage-backup-\$(date +%s) || true
218+
fi
219+
220+
# Remove the repository directory (excluding storage)
221+
rm -rf /home/torrust/github/torrust/torrust-tracker-demo
222+
fi
223+
" "Removing existing repository (preserving storage)"
137224

138225
# Copy archive to VM
139226
if ! scp -o StrictHostKeyChecking=no "${temp_archive}" "torrust@${vm_ip}:/tmp/"; then
@@ -147,6 +234,28 @@ release_stage() {
147234
vm_exec "${vm_ip}" "cd /home/torrust/github/torrust/torrust-tracker-demo && tar -xzf /tmp/$(basename "${temp_archive}")" "Extracting repository"
148235
vm_exec "${vm_ip}" "rm -f /tmp/$(basename "${temp_archive}")" "Cleaning up temp files"
149236

237+
# Restore storage folder if it was backed up
238+
vm_exec "${vm_ip}" "
239+
storage_backup=\$(ls /tmp/torrust-storage-backup-* 2>/dev/null | head -1 || echo '')
240+
if [ -n \"\$storage_backup\" ] && [ -d \"\$storage_backup\" ]; then
241+
rm -rf /home/torrust/github/torrust/torrust-tracker-demo/application/storage
242+
mv \"\$storage_backup\" /home/torrust/github/torrust/torrust-tracker-demo/application/storage
243+
fi
244+
" "Restoring preserved storage folder"
245+
246+
# Check if storage was restored and log appropriately
247+
storage_restored=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}" "
248+
if [ -d /home/torrust/github/torrust/torrust-tracker-demo/application/storage/mysql ] || [ -d /home/torrust/github/torrust/torrust-tracker-demo/application/storage/tracker ]; then
249+
echo 'true'
250+
else
251+
echo 'false'
252+
fi
253+
" 2>/dev/null || echo "false")
254+
255+
if [[ "${storage_restored}" == "true" ]]; then
256+
log_info "Storage folder restored with existing persistent data"
257+
fi
258+
150259
# Clean up local temp file
151260
rm -f "${temp_archive}"
152261

@@ -182,6 +291,98 @@ release_stage() {
182291
log_success "Release stage completed"
183292
}
184293

294+
# Wait for services to become healthy
#
# Polls the VM over SSH until every service listed by
# `docker compose ps --services` has all of its containers in state "running"
# and, where a health check is defined, with health status "healthy".
#
# Containers are resolved through `docker compose ps --quiet <service>` and
# inspected by ID, so the check does not depend on a container being named
# after its service. Any failure to obtain a status counts as "not healthy".
#
# NOTE(review): depending on the Compose version, `docker compose ps` may list
# only running containers, in which case a service that has already exited
# would not appear in the list at all - confirm against the deployed version.
#
# Arguments:
#   $1 - VM IP address
#   $2 - maximum number of attempts (optional, default: 60)
#   $3 - seconds to wait between attempts (optional, default: 10)
# Returns:
#   0 once all services are healthy. On timeout, dumps `docker compose ps` and
#   `docker compose logs` through vm_exec and exits the script with status 1.
wait_for_services() {
    local vm_ip="$1"
    local max_attempts="${2:-60}"   # default: 10 minutes (60 * 10 seconds) - increased for MySQL initialization
    local retry_interval="${3:-10}" # seconds between attempts
    local app_dir="/home/torrust/github/torrust/torrust-tracker-demo/application"
    # -n redirects ssh's stdin from /dev/null so it cannot consume the here-strings feeding the loops below
    local -a ssh_opts=(-n -o StrictHostKeyChecking=no -o ConnectTimeout=10)
    # Prints "<state> <health>" per container; "no-healthcheck" when none is defined
    local inspect_format='{{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}'
    local attempt list_status services service_name status_query statuses
    local container_state health_status container_count all_healthy
    local -a service_list

    log_info "Waiting for application services to become healthy..."

    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        log_info "Checking container status (attempt ${attempt}/${max_attempts})..."

        # Get the service names. The exit status is captured separately so that
        # partial output from a broken connection is never taken for a service list.
        list_status=0
        services=$(ssh "${ssh_opts[@]}" "torrust@${vm_ip}" "cd ${app_dir} && docker compose ps --services" 2>/dev/null) || list_status=$?

        # ssh itself exits with 255 when the connection fails
        if [[ ${list_status} -eq 255 ]]; then
            log_warning "SSH connection failed while checking container status. Retrying in ${retry_interval} seconds..."
            sleep "${retry_interval}"
            continue
        fi

        if [[ ${list_status} -ne 0 || -z "${services}" ]]; then
            log_warning "Could not get container status. Services might not be running yet. Retrying in ${retry_interval} seconds..."
            sleep "${retry_interval}"
            continue
        fi

        service_list=()
        while IFS= read -r service_name; do
            if [[ -n "${service_name}" ]]; then
                service_list+=("${service_name}")
            fi
        done <<<"${services}"

        log_info "Found services: ${#service_list[@]} services"

        all_healthy=true
        container_count=0

        for service_name in "${service_list[@]}"; do
            # One round-trip per service: resolve its container IDs, then inspect them
            status_query="cd ${app_dir} && ids=\$(docker compose ps --quiet ${service_name}) && [ -n \"\$ids\" ] && docker inspect --format '${inspect_format}' \$ids"
            statuses=$(ssh "${ssh_opts[@]}" "torrust@${vm_ip}" "${status_query}" 2>/dev/null) || statuses=""

            if [[ -z "${statuses}" ]]; then
                # Unknown status must never be mistaken for a healthy container
                log_warning "Service '${service_name}': could not determine container status - waiting..."
                all_healthy=false
                continue
            fi

            while read -r container_state health_status; do
                if [[ -z "${container_state}" ]]; then
                    continue # Skip empty lines
                fi
                container_count=$((container_count + 1))

                # Check if container is running
                if [[ "${container_state}" != "running" ]]; then
                    log_info "Service '${service_name}': ${container_state} - not running yet"
                    all_healthy=false
                    continue
                fi

                # If container is running, check health status
                case "${health_status}" in
                "healthy")
                    log_info "Service '${service_name}': running ✓ (healthy)"
                    ;;
                "no-healthcheck")
                    log_info "Service '${service_name}': running ✓ (no health check)"
                    ;;
                "starting")
                    log_info "Service '${service_name}': running (health check starting) - waiting..."
                    all_healthy=false
                    ;;
                "unhealthy")
                    log_warning "Service '${service_name}': running (unhealthy) - waiting for recovery..."
                    all_healthy=false
                    ;;
                *)
                    log_info "Service '${service_name}': running (health: ${health_status}) - waiting..."
                    all_healthy=false
                    ;;
                esac
            done <<<"${statuses}"
        done

        log_info "Checked ${container_count} containers, all_healthy=${all_healthy}"

        if [[ "${all_healthy}" == "true" ]]; then
            log_success "All application services are healthy and ready."
            return 0
        fi

        log_info "Not all services are healthy. Retrying in ${retry_interval} seconds..."
        sleep "${retry_interval}"
    done

    log_error "Timeout waiting for services to become healthy after ${max_attempts} attempts."
    vm_exec "${vm_ip}" "cd /home/torrust/github/torrust/torrust-tracker-demo/application && docker compose ps && docker compose logs" "Dumping logs on failure"
    exit 1
}
385+
185386
# RUN STAGE: Start application processes
186387
run_stage() {
187388
local vm_ip="$1"
@@ -210,8 +411,7 @@ run_stage() {
210411
" "Starting application services"
211412

212413
# Wait for services to initialize
213-
log_info "Waiting for services to initialize (30 seconds)..."
214-
sleep 30
414+
wait_for_services "${vm_ip}"
215415

216416
log_success "Run stage completed"
217417
}
@@ -222,25 +422,47 @@ validate_deployment() {
222422

223423
log_info "=== DEPLOYMENT VALIDATION ==="
224424

225-
# Check service status
425+
# Check service status with detailed output
226426
vm_exec "${vm_ip}" "
227427
cd /home/torrust/github/torrust/torrust-tracker-demo/application
228-
echo '=== Docker Compose Services ==='
428+
echo '=== Docker Compose Services (Detailed Status) ==='
429+
docker compose ps --format 'table {{.Service}}\t{{.State}}\t{{.Status}}\t{{.Ports}}'
430+
431+
echo ''
432+
echo '=== Docker Compose Services (Default Format) ==='
229433
docker compose ps
230434
231-
echo '=== Service Logs (last 10 lines) ==='
435+
echo ''
436+
echo '=== Container Health Check Details ==='
437+
# Show health status for each container
438+
for container in \$(docker compose ps --format '{{.Name}}'); do
439+
echo \"Container: \$container\"
440+
state=\$(docker inspect \$container --format '{{.State.Status}}')
441+
health=\$(docker inspect \$container --format '{{.State.Health.Status}}' 2>/dev/null || echo 'no-healthcheck')
442+
echo \" State: \$state\"
443+
echo \" Health: \$health\"
444+
445+
# Show health check logs for problematic containers
446+
if [ \"\$health\" = \"unhealthy\" ] || [ \"\$health\" = \"starting\" ]; then
447+
echo \" Health check output (last 3 attempts):\"
448+
docker inspect \$container --format '{{range .State.Health.Log}} {{.Start}}: {{.Output}}{{end}}' 2>/dev/null | tail -3 || echo \" No health check logs available\"
449+
fi
450+
echo ''
451+
done
452+
453+
echo '=== Service Logs (last 10 lines each) ==='
232454
docker compose logs --tail=10
233-
" "Checking service status"
455+
" "Checking detailed service status"
234456

235457
# Test application endpoints
236458
vm_exec "${vm_ip}" "
237459
echo '=== Testing Application Endpoints ==='
238460
239-
# Test health check endpoint (through nginx proxy)
461+
# Test global health check endpoint (through nginx proxy)
240462
if curl -f -s http://localhost/health_check >/dev/null 2>&1; then
241-
echo '✅ Health check endpoint: OK'
463+
echo '✅ Global health check endpoint: OK'
242464
else
243-
echo '❌ Health check endpoint: FAILED'
465+
echo '❌ Global health check endpoint: FAILED'
244466
exit 1
245467
fi
246468
@@ -299,6 +521,7 @@ main() {
299521
vm_ip=$(get_vm_ip)
300522

301523
test_ssh_connection "${vm_ip}"
524+
wait_for_system_ready "${vm_ip}"
302525
release_stage "${vm_ip}"
303526
run_stage "${vm_ip}"
304527

0 commit comments

Comments
 (0)