@@ -66,7 +66,7 @@ get_vm_ip() {
6666 echo " ${vm_ip} "
6767}
6868
69- # Test SSH connectivity
69+ # Test SSH connectivity and wait for system readiness
7070test_ssh_connection () {
7171 local vm_ip=" $1 "
7272 local max_attempts=5
@@ -93,6 +93,70 @@ test_ssh_connection() {
9393 exit 1
9494}
9595
# Wait for cloud-init and Docker to be ready
#
# Polls the VM over SSH until `cloud-init status` reports "done", the docker
# CLI is installed and `docker info` succeeds (i.e. the daemon is running).
#
# Arguments:
#   $1 - IP address of the VM (the SSH user is "torrust")
#   $2 - optional: maximum number of attempts (default: 30)
#   $3 - optional: seconds to wait between attempts (default: 30)
# Returns:
#   0 as soon as the system is ready.
#   Exits the script with status 1, after dumping diagnostics through
#   vm_exec, on timeout or as soon as cloud-init reports "status: error".
wait_for_system_ready() {
    local vm_ip="$1"
    local max_attempts="${2:-30}"   # 30 * 30 seconds = 15 minutes by default
    local retry_interval="${3:-30}"
    local attempt=1
    local timed_out=true
    local cloud_init_status docker_available docker_running
    # -n keeps ssh from reading (and swallowing) the caller's stdin.
    local -a ssh_opts=(-n -o StrictHostKeyChecking=no -o ConnectTimeout=10)

    log_info "Waiting for system initialization (cloud-init and Docker) to complete..."

    while [[ ${attempt} -le ${max_attempts} ]]; do
        log_info "Checking system readiness (attempt ${attempt}/${max_attempts})..."

        # Check if cloud-init is done. A failed SSH call or a non-zero exit of
        # `cloud-init status` appends "failed" to whatever was printed.
        cloud_init_status=$(ssh "${ssh_opts[@]}" "torrust@${vm_ip}" "cloud-init status" 2>/dev/null || echo "failed")

        if [[ "${cloud_init_status}" == *"done"* ]]; then
            log_info "Cloud-init completed: ${cloud_init_status}"

            # Check if Docker is available
            docker_available=$(ssh "${ssh_opts[@]}" "torrust@${vm_ip}" "docker --version >/dev/null 2>&1 && echo 'available' || echo 'not-available'" 2>/dev/null || echo "not-available")

            if [[ "${docker_available}" == "available" ]]; then
                # Check if Docker daemon is running
                docker_running=$(ssh "${ssh_opts[@]}" "torrust@${vm_ip}" "docker info >/dev/null 2>&1 && echo 'running' || echo 'not-running'" 2>/dev/null || echo "not-running")

                if [[ "${docker_running}" == "running" ]]; then
                    log_success "System is ready: cloud-init done, Docker available and running"
                    return 0
                fi
                log_info "Docker installed but daemon not running yet, waiting..."
            else
                log_info "Docker not available yet, cloud-init may still be installing it..."
            fi
        elif [[ "${cloud_init_status}" == *"status: error"* ]]; then
            # "error" is a terminal cloud-init state: waiting longer cannot
            # help, so skip the remaining attempts and dump diagnostics now.
            log_error "Cloud-init finished with an error: ${cloud_init_status}"
            timed_out=false
            break
        else
            log_info "Cloud-init status: ${cloud_init_status}, waiting for completion..."
        fi

        # Do not sleep after the last attempt; the timeout is reported right away.
        if [[ ${attempt} -lt ${max_attempts} ]]; then
            log_info "System not ready yet. Retrying in ${retry_interval} seconds..."
            sleep "${retry_interval}"
        fi
        attempt=$((attempt + 1))
    done

    if [[ "${timed_out}" == "true" ]]; then
        log_error "Timeout waiting for system to be ready after ${max_attempts} attempts ($((max_attempts * retry_interval / 60)) minutes)"
    fi
    log_error "Cloud-init may have failed or Docker installation encountered issues"

    # Show diagnostic information
    vm_exec "${vm_ip}" "
        echo '=== System Diagnostic Information ==='
        echo 'Cloud-init status:'
        cloud-init status --long || echo 'cloud-init command failed'
        echo ''
        echo 'Docker version:'
        docker --version || echo 'Docker not available'
        echo ''
        echo 'Docker service status:'
        systemctl status docker || echo 'Docker service status unavailable'
        echo ''
        echo 'Recent cloud-init logs:'
        tail -20 /var/log/cloud-init.log || echo 'Cloud-init logs unavailable'
    " "Dumping diagnostic information"

    exit 1
}

159+
96160# Execute command on VM via SSH
97161vm_exec () {
98162 local vm_ip=" $1 "
@@ -132,8 +196,31 @@ release_stage() {
132196 # Create target directory structure
133197 vm_exec " ${vm_ip} " " mkdir -p /home/torrust/github/torrust" " Creating directory structure"
134198
135- # Remove existing directory if it exists
136- vm_exec " ${vm_ip} " " test -d /home/torrust/github/torrust/torrust-tracker-demo && rm -rf /home/torrust/github/torrust/torrust-tracker-demo || true" " Removing existing repository"
199+ # Check if we need to preserve storage before removing repository
200+ storage_exists=$( ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 " torrust@${vm_ip} " "
201+ if [ -d /home/torrust/github/torrust/torrust-tracker-demo/application/storage ]; then
202+ echo 'true'
203+ else
204+ echo 'false'
205+ fi
206+ " 2> /dev/null || echo " false" )
207+
208+ if [[ " ${storage_exists} " == " true" ]]; then
209+ log_warning " Preserving existing storage folder with persistent data"
210+ fi
211+
212+ # Handle existing repository - preserve storage folder if it exists
213+ vm_exec " ${vm_ip} " "
214+ if [ -d /home/torrust/github/torrust/torrust-tracker-demo ]; then
215+ if [ -d /home/torrust/github/torrust/torrust-tracker-demo/application/storage ]; then
216+ # Move storage folder to temporary location
217+ mv /home/torrust/github/torrust/torrust-tracker-demo/application/storage /tmp/torrust-storage-backup-\$ (date +%s) || true
218+ fi
219+
220+ # Remove the repository directory (excluding storage)
221+ rm -rf /home/torrust/github/torrust/torrust-tracker-demo
222+ fi
223+ " " Removing existing repository (preserving storage)"
137224
138225 # Copy archive to VM
139226 if ! scp -o StrictHostKeyChecking=no " ${temp_archive} " " torrust@${vm_ip} :/tmp/" ; then
@@ -147,6 +234,28 @@ release_stage() {
147234 vm_exec " ${vm_ip} " " cd /home/torrust/github/torrust/torrust-tracker-demo && tar -xzf /tmp/$( basename " ${temp_archive} " ) " " Extracting repository"
148235 vm_exec " ${vm_ip} " " rm -f /tmp/$( basename " ${temp_archive} " ) " " Cleaning up temp files"
149236
237+ # Restore storage folder if it was backed up
238+ vm_exec " ${vm_ip} " "
239+ storage_backup=\$ (ls /tmp/torrust-storage-backup-* 2>/dev/null | head -1 || echo '')
240+ if [ -n \"\$ storage_backup\" ] && [ -d \"\$ storage_backup\" ]; then
241+ rm -rf /home/torrust/github/torrust/torrust-tracker-demo/application/storage
242+ mv \"\$ storage_backup\" /home/torrust/github/torrust/torrust-tracker-demo/application/storage
243+ fi
244+ " " Restoring preserved storage folder"
245+
246+ # Check if storage was restored and log appropriately
247+ storage_restored=$( ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 " torrust@${vm_ip} " "
248+ if [ -d /home/torrust/github/torrust/torrust-tracker-demo/application/storage/mysql ] || [ -d /home/torrust/github/torrust/torrust-tracker-demo/application/storage/tracker ]; then
249+ echo 'true'
250+ else
251+ echo 'false'
252+ fi
253+ " 2> /dev/null || echo " false" )
254+
255+ if [[ " ${storage_restored} " == " true" ]]; then
256+ log_info " Storage folder restored with existing persistent data"
257+ fi
258+
150259 # Clean up local temp file
151260 rm -f " ${temp_archive} "
152261
@@ -182,6 +291,98 @@ release_stage() {
182291 log_success " Release stage completed"
183292}
184293
# Check the state and health of every Docker Compose service exactly once.
#
# Arguments:
#   $1 - IP address of the VM (the SSH user is "torrust")
# Returns:
#   0 only when the service list could be read and every service has at
#   least one listed container, all of them "running" and either "healthy"
#   or without a health check. Any SSH or lookup failure counts as
#   "not ready" so that a broken check can never be mistaken for success.
_check_services_health() {
    local vm_ip="$1"
    local app_dir="/home/torrust/github/torrust/torrust-tracker-demo/application"
    local inspect_format='{{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}'
    # -n is required: without it ssh would consume the here-strings that feed
    # the `while read` loops below.
    local -a ssh_opts=(-n -o StrictHostKeyChecking=no -o ConnectTimeout=10)
    local services service_name container_report container_state health_status
    local all_healthy=true
    local container_count=0

    # Get the service names only
    if ! services=$(ssh "${ssh_opts[@]}" "torrust@${vm_ip}" "cd ${app_dir} && docker compose ps --services" 2>/dev/null); then
        log_warning "SSH connection failed while checking container status."
        return 1
    fi

    if [[ -z "${services}" ]]; then
        log_warning "Could not get container status. Services might not be running yet."
        return 1
    fi

    log_info "Found services: $(grep -c '' <<< "${services}") services"

    while IFS= read -r service_name; do
        [[ -z "${service_name}" ]] && continue # Skip empty lines
        container_count=$((container_count + 1))

        # Resolve the service to its container IDs first: `docker inspect`
        # needs a container name/ID, which is not necessarily the service
        # name. One "<state> <health>" line is printed per container.
        container_report=$(ssh "${ssh_opts[@]}" "torrust@${vm_ip}" \
            "cd ${app_dir} && for cid in \$(docker compose ps -q ${service_name}); do docker inspect --format '${inspect_format}' \"\$cid\" || exit 1; done" \
            2>/dev/null) || container_report=""
        container_report="${container_report//$'\r'/}"

        if [[ -z "${container_report}" ]]; then
            log_info "Service '${service_name}': no running container found - not running yet"
            all_healthy=false
            continue
        fi

        while read -r container_state health_status; do
            [[ -z "${container_state}" ]] && continue

            # Check if container is running
            if [[ "${container_state}" != "running" ]]; then
                log_info "Service '${service_name}': ${container_state} - not running yet"
                all_healthy=false
                continue
            fi

            # If container is running, check health status
            case "${health_status}" in
                "healthy")
                    log_info "Service '${service_name}': running ✓ (healthy)"
                    ;;
                "no-healthcheck")
                    log_info "Service '${service_name}': running ✓ (no health check)"
                    ;;
                "starting")
                    log_info "Service '${service_name}': running (health check starting) - waiting..."
                    all_healthy=false
                    ;;
                "unhealthy")
                    log_warning "Service '${service_name}': running (unhealthy) - waiting for recovery..."
                    all_healthy=false
                    ;;
                *)
                    log_info "Service '${service_name}': running (health: ${health_status}) - waiting..."
                    all_healthy=false
                    ;;
            esac
        done <<< "${container_report}"
    done <<< "${services}"

    log_info "Checked ${container_count} containers, all_healthy=${all_healthy}"

    [[ "${all_healthy}" == "true" ]]
}

# Wait for services to become healthy
#
# Arguments:
#   $1 - IP address of the VM (the SSH user is "torrust")
#   $2 - optional: maximum number of attempts (default: 60)
#   $3 - optional: seconds to wait between attempts (default: 10)
# Returns:
#   0 once all services are running and healthy.
#   Exits the script with status 1, after dumping `docker compose ps` and the
#   compose logs through vm_exec, when the attempts are exhausted.
wait_for_services() {
    local vm_ip="$1"
    local max_attempts="${2:-60}"   # 60 * 10 seconds = 10 minutes by default (MySQL initialization)
    local retry_interval="${3:-10}"
    local attempt=1

    log_info "Waiting for application services to become healthy..."

    while [[ ${attempt} -le ${max_attempts} ]]; do
        log_info "Checking container status (attempt ${attempt}/${max_attempts})..."

        if _check_services_health "${vm_ip}"; then
            log_success "All application services are healthy and ready."
            return 0
        fi

        # Do not sleep after the last attempt; the timeout is reported right away.
        if [[ ${attempt} -lt ${max_attempts} ]]; then
            log_info "Not all services are healthy. Retrying in ${retry_interval} seconds..."
            sleep "${retry_interval}"
        fi
        attempt=$((attempt + 1))
    done

    log_error "Timeout waiting for services to become healthy after ${max_attempts} attempts."
    vm_exec "${vm_ip}" "cd /home/torrust/github/torrust/torrust-tracker-demo/application && docker compose ps && docker compose logs" "Dumping logs on failure"
    exit 1
}

385+
185386# RUN STAGE: Start application processes
186387run_stage () {
187388 local vm_ip=" $1 "
@@ -210,8 +411,7 @@ run_stage() {
210411 " " Starting application services"
211412
212413 # Wait for services to initialize
213- log_info " Waiting for services to initialize (30 seconds)..."
214- sleep 30
414+ wait_for_services " ${vm_ip} "
215415
216416 log_success " Run stage completed"
217417}
@@ -222,25 +422,47 @@ validate_deployment() {
222422
223423 log_info " === DEPLOYMENT VALIDATION ==="
224424
225- # Check service status
425+ # Check service status with detailed output
226426 vm_exec " ${vm_ip} " "
227427 cd /home/torrust/github/torrust/torrust-tracker-demo/application
228- echo '=== Docker Compose Services ==='
428+ echo '=== Docker Compose Services (Detailed Status) ==='
429+ docker compose ps --format 'table {{.Service}}\t{{.State}}\t{{.Status}}\t{{.Ports}}'
430+
431+ echo ''
432+ echo '=== Docker Compose Services (Default Format) ==='
229433 docker compose ps
230434
231- echo '=== Service Logs (last 10 lines) ==='
435+ echo ''
436+ echo '=== Container Health Check Details ==='
437+ # Show health status for each container
438+ for container in \$ (docker compose ps --format '{{.Name}}'); do
439+ echo \" Container: \$ container\"
440+ state=\$ (docker inspect \$ container --format '{{.State.Status}}')
441+ health=\$ (docker inspect \$ container --format '{{.State.Health.Status}}' 2>/dev/null || echo 'no-healthcheck')
442+ echo \" State: \$ state\"
443+ echo \" Health: \$ health\"
444+
445+ # Show health check logs for problematic containers
446+ if [ \"\$ health\" = \" unhealthy\" ] || [ \"\$ health\" = \" starting\" ]; then
447+ echo \" Health check output (last 3 attempts):\"
448+ docker inspect \$ container --format '{{range .State.Health.Log}} {{.Start}}: {{.Output}}{{end}}' 2>/dev/null | tail -3 || echo \" No health check logs available\"
449+ fi
450+ echo ''
451+ done
452+
453+ echo '=== Service Logs (last 10 lines each) ==='
232454 docker compose logs --tail=10
233- " " Checking service status"
455+ " " Checking detailed service status"
234456
235457 # Test application endpoints
236458 vm_exec " ${vm_ip} " "
237459 echo '=== Testing Application Endpoints ==='
238460
239- # Test health check endpoint (through nginx proxy)
461+ # Test global health check endpoint (through nginx proxy)
240462 if curl -f -s http://localhost/health_check >/dev/null 2>&1; then
241- echo '✅ Health check endpoint: OK'
463+ echo '✅ Global health check endpoint: OK'
242464 else
243- echo '❌ Health check endpoint: FAILED'
465+ echo '❌ Global health check endpoint: FAILED'
244466 exit 1
245467 fi
246468
@@ -299,6 +521,7 @@ main() {
299521 vm_ip=$( get_vm_ip)
300522
301523 test_ssh_connection " ${vm_ip} "
524+ wait_for_system_ready " ${vm_ip} "
302525 release_stage " ${vm_ip} "
303526 run_stage " ${vm_ip} "
304527
0 commit comments