Skip to content
This repository was archived by the owner on Oct 10, 2025. It is now read-only.

Commit 732a297

Browse files
committed
feat: [#14] improve cloud-init completion detection robustness
- Replace Docker-based detection with a multi-layered approach:
  1. Primary: Official cloud-init status command
  2. Secondary: Custom completion marker file
  3. Tertiary: System service readiness checks
- Add completion marker creation at end of cloud-init setup
- Update E2E tests to use robust detection method
- Update deployment scripts with same detection logic
- Remove dependency on specific software for completion detection

This makes cloud-init detection more reliable and future-proof, working regardless of command order or installed software.
1 parent 8ba6858 commit 732a297

File tree

3 files changed

+62
-34
lines changed

3 files changed

+62
-34
lines changed

infrastructure/cloud-init/user-data.yaml.tpl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,11 @@ runcmd:
191191
# Set up log rotation for Docker
192192
- systemctl restart docker
193193

194+
# Create completion marker for robust cloud-init status detection
195+
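# This file indicates that the runcmd sequence reached its final step (NOTE(review): on its own it does not prove that every earlier command succeeded - confirm how runcmd handles failures)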
196+
- echo "Cloud-init setup completed at $(date)" > /var/lib/cloud/torrust-setup-complete
197+
- chmod 644 /var/lib/cloud/torrust-setup-complete
198+
194199
# Final message
195200
final_message: |
196201
Torrust Tracker Demo VM setup completed!

infrastructure/scripts/deploy-app.sh

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -73,39 +73,49 @@ test_ssh_connection() {
7373
exit 1
7474
}
7575

76-
# Wait for cloud-init and Docker to be ready
76+
# Wait for cloud-init to complete using robust detection method
7777
wait_for_system_ready() {
7878
local vm_ip="$1"
7979
local max_attempts=30 # 15 minutes (30 * 30 seconds) for cloud-init completion
8080
local attempt=1
8181

82-
log_info "Waiting for system initialization (cloud-init and Docker) to complete..."
82+
log_info "Waiting for cloud-init to complete using robust detection method..."
8383

8484
while [[ ${attempt} -le ${max_attempts} ]]; do
8585
log_info "Checking system readiness (attempt ${attempt}/${max_attempts})..."
8686

87-
# Check if cloud-init is done
87+
# Primary check: Official cloud-init status
8888
cloud_init_status=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}" "cloud-init status" 2>/dev/null || echo "failed")
8989

9090
if [[ "${cloud_init_status}" == *"done"* ]]; then
9191
log_info "Cloud-init completed: ${cloud_init_status}"
9292

93-
# Check if Docker is available
94-
docker_available=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}" "docker --version >/dev/null 2>&1 && echo 'available' || echo 'not-available'" 2>/dev/null || echo "not-available")
93+
# Secondary check: Custom completion marker file
94+
completion_marker_exists=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}" "test -f /var/lib/cloud/torrust-setup-complete && echo 'exists' || echo 'not-exists'" 2>/dev/null || echo "not-exists")
9595

96-
if [[ "${docker_available}" == "available" ]]; then
97-
# Check if Docker daemon is running
98-
docker_running=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}" "docker info >/dev/null 2>&1 && echo 'running' || echo 'not-running'" 2>/dev/null || echo "not-running")
96+
if [[ "${completion_marker_exists}" == "exists" ]]; then
97+
log_success "Setup completion marker found - all cloud-init tasks completed"
98+
99+
# Tertiary check: Verify system services are ready (only if needed for deployment)
100+
# Note: This check is deployment-specific, not cloud-init specific
101+
systemd_ready=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}" "systemctl is-system-running --quiet && echo 'ready' || echo 'not-ready'" 2>/dev/null || echo "not-ready")
99102

100-
if [[ "${docker_running}" == "running" ]]; then
101-
log_success "System is ready: cloud-init done, Docker available and running"
103+
if [[ "${systemd_ready}" == "ready" ]]; then
104+
log_success "System is fully ready for application deployment"
102105
return 0
103106
else
104-
log_info "Docker installed but daemon not running yet, waiting..."
107+
log_info "System services still starting up, waiting..."
105108
fi
106109
else
107-
log_info "Docker not available yet, cloud-init may still be installing it..."
110+
log_info "Setup completion marker not found yet, cloud-init tasks may still be running..."
108111
fi
112+
elif [[ "${cloud_init_status}" == *"error"* ]]; then
113+
log_error "Cloud-init failed with error status: ${cloud_init_status}"
114+
115+
# Show detailed error information
116+
detailed_status=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "torrust@${vm_ip}" "cloud-init status --long" 2>/dev/null || echo "unknown")
117+
log_error "Detailed cloud-init status: ${detailed_status}"
118+
return 1
109119
else
110120
log_info "Cloud-init status: ${cloud_init_status}, waiting for completion..."
111121
fi
@@ -116,22 +126,22 @@ wait_for_system_ready() {
116126
done
117127

118128
log_error "Timeout waiting for system to be ready after ${max_attempts} attempts (15 minutes)"
119-
log_error "Cloud-init may have failed or Docker installation encountered issues"
129+
log_error "Cloud-init may have failed or system setup encountered issues"
120130

121-
# Show diagnostic information
131+
# Show diagnostic information using robust detection methods
122132
vm_exec "${vm_ip}" "
123133
echo '=== System Diagnostic Information ==='
124134
echo 'Cloud-init status:'
125135
cloud-init status --long || echo 'cloud-init command failed'
126-
echo ''
127-
echo 'Docker version:'
128-
docker --version || echo 'Docker not available'
129-
echo ''
130-
echo 'Docker service status:'
131-
systemctl status docker || echo 'Docker service status unavailable'
132-
echo ''
133-
echo 'Recent cloud-init logs:'
134-
tail -20 /var/log/cloud-init.log || echo 'Cloud-init logs unavailable'
136+
echo
137+
echo 'Setup completion marker:'
138+
ls -la /var/lib/cloud/torrust-setup-complete 2>/dev/null || echo 'Completion marker not found'
139+
echo
140+
echo 'Cloud-init logs (last 20 lines):'
141+
tail -20 /var/log/cloud-init.log 2>/dev/null || echo 'Cloud-init log not available'
142+
echo
143+
echo 'System service status:'
144+
systemctl is-system-running || echo 'System status check failed'
135145
" "Dumping diagnostic information"
136146

137147
exit 1

tests/test-e2e.sh

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -420,23 +420,36 @@ wait_for_cloud_init_to_finish() {
420420
continue
421421
fi
422422

423-
# Check if cloud-init has finished
423+
# Primary check: Official cloud-init status
424424
local cloud_init_status
425425
cloud_init_status=$(ssh_to_vm "${vm_ip}" "cloud-init status" "2>/dev/null" || echo "unknown")
426426

427427
if [[ "${cloud_init_status}" == *"done"* ]]; then
428-
log_success "Cloud-init completed successfully"
429-
430-
# Check if Docker is available and working
431-
if ssh_to_vm "${vm_ip}" "docker --version && docker compose version"; then
432-
log_success "Docker is ready and available"
433-
log_success "VM is ready for application deployment"
434-
return 0
428+
log_success "Cloud-init reports completion: ${cloud_init_status}"
429+
430+
# Secondary check: Custom completion marker file
431+
if ssh_to_vm "${vm_ip}" "test -f /var/lib/cloud/torrust-setup-complete"; then
432+
log_success "Setup completion marker found"
433+
434+
# Tertiary check: Verify critical services are available
435+
# Note: This is not tied to specific software, just basic system readiness
436+
if ssh_to_vm "${vm_ip}" "systemctl is-active docker >/dev/null 2>&1"; then
437+
log_success "Critical services are active"
438+
log_success "VM is ready for application deployment"
439+
return 0
440+
else
441+
log_info "Critical services not ready yet, waiting 10 seconds..."
442+
fi
435443
else
436-
log_info "Docker not ready yet, waiting 10 seconds..."
444+
log_info "Setup completion marker not found yet, waiting 10 seconds..."
437445
fi
438446
elif [[ "${cloud_init_status}" == *"error"* ]]; then
439-
log_error "Cloud-init failed with error status"
447+
log_error "Cloud-init failed with error status: ${cloud_init_status}"
448+
449+
# Try to get more detailed error information
450+
local cloud_init_result
451+
cloud_init_result=$(ssh_to_vm "${vm_ip}" "cloud-init status --long" "2>/dev/null" || echo "unknown")
452+
log_error "Cloud-init detailed status: ${cloud_init_result}"
440453
return 1
441454
else
442455
log_info "Cloud-init status: ${cloud_init_status}, waiting 10 seconds..."
@@ -447,7 +460,7 @@ wait_for_cloud_init_to_finish() {
447460
done
448461

449462
log_error "Timeout waiting for cloud-init to finish after $((max_attempts * 10)) seconds"
450-
log_error "You can check manually with: ssh torrust@${vm_ip} 'cloud-init status'"
463+
log_error "You can check manually with: ssh torrust@${vm_ip} 'cloud-init status --long'"
451464
return 1
452465
}
453466

0 commit comments

Comments
 (0)