From f32b58fc46cbc2b1b9c424618dddcd0779557cb4 Mon Sep 17 00:00:00 2001 From: Evert Romero Date: Fri, 17 Apr 2026 11:11:06 -0600 Subject: [PATCH] fix(pipeline): implementar manejo robusto de state locks de Terraform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problema: - El pipeline fallaba con 'Error acquiring the state lock' cuando un proceso anterior de Terraform no liberaba el lock correctamente - Los locks bloqueados requerían intervención manual Solución implementada: 1. Nuevo script scripts/terraform-lock-cleanup.sh: - Verifica locks existentes en DynamoDB antes de ejecutar Terraform - Calcula antigüedad del lock (default: 30 minutos) - Elimina locks bloqueados automáticamente - Espera si el lock es reciente (operación en curso legítima) 2. Nuevo step 02_pre_terraform_check: - Ejecuta antes del step 03_terraform - Instala AWS CLI y configura credenciales - Limpia locks bloqueados antes de iniciar Terraform 3. Agregado -lock-timeout=5m a comandos Terraform: - terraform plan -lock-timeout=5m - terraform apply -lock-timeout=5m - Permite esperar si hay una operación legítima en curso 4. 
Aplicado a ambas ramas: - developer: cleanup para entorno dev - master: cleanup para entorno prod Beneficios: - Pipeline más robusto y autónomo - Menos intervención manual para locks bloqueados - Mejor manejo de concurrencia entre pipelines - Previene corrupción de estado por locks huérfanos Refs: Build #64 falló por state lock en DynamoDB --- bitbucket-pipelines.yml | 34 ++++++- scripts/terraform-lock-cleanup.sh | 164 ++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+), 4 deletions(-) create mode 100644 scripts/terraform-lock-cleanup.sh diff --git a/bitbucket-pipelines.yml b/bitbucket-pipelines.yml index fe857de..f081ec9 100644 --- a/bitbucket-pipelines.yml +++ b/bitbucket-pipelines.yml @@ -81,6 +81,19 @@ pipelines: - export TELEGRAM_CHAT_ID="${DEV_TELEGRAM_CHAT_ID}" - bash scripts/telegram-pipeline-notify.sh start + - step: + name: 02_pre_terraform_check + oidc: true + script: + - set -euo pipefail + - apt-get update -y && apt-get install -y curl unzip + - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + - unzip -q awscliv2.zip + - ./aws/install + - aws --version + - source scripts/aws-oidc-setup.sh dev + - bash scripts/terraform-lock-cleanup.sh dev proyectosacc/terraform.tfstate + - step: name: 03_terraform oidc: true @@ -99,8 +112,8 @@ pipelines: - terraform version - export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-mx-central-1}" - terraform init -backend-config=backend.dev.hcl - - terraform plan -var-file=environments/dev.tfvars -var="db_password=${DEV_DB_PASSWORD}" -out=dev.tfplan - - terraform apply -auto-approve dev.tfplan + - terraform plan -lock-timeout=5m -var-file=environments/dev.tfvars -var="db_password=${DEV_DB_PASSWORD}" -out=dev.tfplan + - terraform apply -lock-timeout=5m -auto-approve dev.tfplan - terraform output -json > terraform-outputs.json - cat terraform-outputs.json artifacts: @@ -231,6 +244,19 @@ pipelines: - export TELEGRAM_CHAT_ID="${PROD_TELEGRAM_CHAT_ID}" - bash 
scripts/telegram-pipeline-notify.sh start + - step: + name: 02_pre_terraform_check + oidc: true + script: + - set -euo pipefail + - apt-get update -y && apt-get install -y curl unzip + - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + - unzip -q awscliv2.zip + - ./aws/install + - aws --version + - source scripts/aws-oidc-setup.sh prod + - bash scripts/terraform-lock-cleanup.sh prod proyectosacc/terraform.tfstate + - step: name: 03_terraform oidc: true @@ -249,8 +275,8 @@ pipelines: - terraform version - export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-mx-central-1}" - terraform init -backend-config=backend.prod.hcl - - terraform plan -var-file=environments/prod.tfvars -var="db_password=${PROD_DB_PASSWORD}" -out=prod.tfplan - - terraform apply -auto-approve prod.tfplan + - terraform plan -lock-timeout=5m -var-file=environments/prod.tfvars -var="db_password=${PROD_DB_PASSWORD}" -out=prod.tfplan + - terraform apply -lock-timeout=5m -auto-approve prod.tfplan - terraform output -json > terraform-outputs.json - cat terraform-outputs.json artifacts:
diff --git a/scripts/terraform-lock-cleanup.sh b/scripts/terraform-lock-cleanup.sh
new file mode 100644
index 0000000..d26eb9c
--- /dev/null
+++ b/scripts/terraform-lock-cleanup.sh
@@ -0,0 +1,240 @@
+#!/bin/bash
+# ==============================================================================
+# terraform-lock-cleanup.sh - Automatic cleanup of stale Terraform state locks
+#
+# Description:
+#   Looks up the Terraform state lock of one state file in the DynamoDB lock
+#   table and deletes it only when it is older than MAX_LOCK_AGE_MINUTES.
+#   Meant to run before `terraform plan/apply` in CI/CD pipelines so that a
+#   lock left behind by an aborted run does not end in
+#   "Error acquiring the state lock".
+#
+#   A lock that is recent, or whose age cannot be determined, is never
+#   deleted: removing the lock of a running operation lets two Terraform
+#   processes write the same state. Terraform's own -lock-timeout is expected
+#   to wait for such locks.
+#
+# Usage:
+#   bash scripts/terraform-lock-cleanup.sh [environment] [state_key]
+#   Example: bash scripts/terraform-lock-cleanup.sh dev proyectosacc/terraform.tfstate
+#
+# Environment variables (all optional):
+#   TERRAFORM_LOCK_TABLE    DynamoDB lock table (default: terraform-locks)
+#   TERRAFORM_STATE_BUCKET  S3 state bucket (default: ccsoft-terraform-state)
+#   MAX_LOCK_AGE_MINUTES    Age above which a lock is stale (default: 30)
+#   AWS_DEFAULT_REGION      AWS region (default: mx-central-1)
+#
+# Requirements: AWS CLI, python3 (JSON parsing), GNU date (`date -d`).
+#
+# Exit status:
+#   0  no lock, stale lock handled, or lock deliberately left in place
+#   1  invalid configuration, missing AWS CLI or invalid AWS credentials
+#
+# Author: Área de Tecnología y Desarrollo - CCsoft
+# ==============================================================================
+
+set -euo pipefail
+
+# ANSI colors for output
+readonly RED='\033[0;31m'
+readonly GREEN='\033[0;32m'
+readonly YELLOW='\033[1;33m'
+readonly NC='\033[0m' # No Color
+
+# Configuration (positional arguments and environment overrides)
+ENVIRONMENT="${1:-dev}"
+STATE_KEY="${2:-proyectosacc/terraform.tfstate}"
+LOCK_TABLE="${TERRAFORM_LOCK_TABLE:-terraform-locks}"
+STATE_BUCKET="${TERRAFORM_STATE_BUCKET:-ccsoft-terraform-state}"
+MAX_LOCK_AGE_MINUTES="${MAX_LOCK_AGE_MINUTES:-30}"
+AWS_REGION="${AWS_DEFAULT_REGION:-mx-central-1}"
+readonly RECHECK_DELAY_SECONDS=30
+
+# The S3 backend keeps the state lock under "<bucket>/<key>". The sibling item
+# "<bucket>/<key>-md5" holds the state checksum; it is not a lock and must not
+# be deleted.
+LOCK_ID="${STATE_BUCKET}/${STATE_KEY}"
+LOCK_KEY_JSON="{\"LockID\": {\"S\": \"${LOCK_ID}\"}}"
+
+# Logging helpers. Errors go to stderr so they stand out in pipeline logs.
+log_ok() { printf '%b✓ %s%b\n' "${GREEN}" "$*" "${NC}"; }
+log_warn() { printf '%b⚠ %s%b\n' "${YELLOW}" "$*" "${NC}"; }
+log_error() { printf '%bERROR: %s%b\n' "${RED}" "$*" "${NC}" >&2; }
+
+# Prints the lock item as JSON (empty or "{}" when there is no lock).
+# Returns non-zero when the DynamoDB query itself fails.
+fetch_lock_item() {
+  aws dynamodb get-item \
+    --table-name "${LOCK_TABLE}" \
+    --key "${LOCK_KEY_JSON}" \
+    --consistent-read \
+    --region "${AWS_REGION}" \
+    --output json
+}
+
+# Returns 0 when the get-item response in $1 contains no item.
+lock_is_absent() {
+  [[ -z "$1" || "$1" == '{}' ]]
+}
+
+# Pretty-prints the JSON text in $1, falling back to the raw text.
+print_json() {
+  printf '%s\n' "$1" | python3 -m json.tool 2>/dev/null || printf '%s\n' "$1"
+}
+
+# Prints the Info attribute (a JSON string written by Terraform) of the
+# get-item response in $1; prints nothing when it is missing or unparsable.
+extract_lock_info() {
+  printf '%s' "$1" | python3 -c '
+import json, sys
+try:
+    item = json.load(sys.stdin).get("Item", {})
+    print(item.get("Info", {}).get("S", ""))
+except Exception:
+    pass
+' 2>/dev/null || true
+}
+
+# Prints the Created timestamp from the lock Info JSON in $1 (may be empty).
+extract_created() {
+  printf '%s' "$1" | python3 -c '
+import json, sys
+try:
+    print(json.load(sys.stdin).get("Created", ""))
+except Exception:
+    pass
+' 2>/dev/null || true
+}
+
+# Prints the age in whole minutes of the timestamp in $1.
+# Returns non-zero when the timestamp cannot be parsed.
+lock_age_minutes() {
+  local created_epoch now_epoch
+  created_epoch=$(date -d "$1" +%s 2>/dev/null) || return 1
+  now_epoch=$(date +%s)
+  printf '%s\n' "$(( (now_epoch - created_epoch) / 60 ))"
+}
+
+# Deletes the lock only if its Info attribute still equals $1, so a lock that
+# was released and re-acquired by another run since it was read is left alone.
+delete_lock() {
+  local values
+  values=$(printf '%s' "$1" | python3 -c '
+import json, sys
+print(json.dumps({":info": {"S": sys.stdin.read()}}))
+') || return 1
+  aws dynamodb delete-item \
+    --table-name "${LOCK_TABLE}" \
+    --key "${LOCK_KEY_JSON}" \
+    --region "${AWS_REGION}" \
+    --condition-expression "#info = :info" \
+    --expression-attribute-names '{"#info": "Info"}' \
+    --expression-attribute-values "${values}"
+}
+
+# Entry point: validates the environment, inspects the lock and removes it
+# only when it is stale.
+main() {
+  local lock_item lock_info created age_minutes
+
+  echo "=== Terraform Lock Cleanup ==="
+  echo "Environment: ${ENVIRONMENT}"
+  echo "State Key: ${STATE_KEY}"
+  echo "Lock Table: ${LOCK_TABLE}"
+  echo "Lock ID: ${LOCK_ID}"
+  echo "Max Lock Age: ${MAX_LOCK_AGE_MINUTES} minutes"
+  echo "AWS Region: ${AWS_REGION}"
+  echo ""
+
+  # A non-numeric threshold would make the age comparison meaningless.
+  if [[ ! "${MAX_LOCK_AGE_MINUTES}" =~ ^[0-9]+$ ]]; then
+    log_error "MAX_LOCK_AGE_MINUTES debe ser un entero no negativo"
+    exit 1
+  fi
+
+  if ! command -v aws &> /dev/null; then
+    log_error "AWS CLI no está instalado"
+    exit 1
+  fi
+
+  echo "Verificando credenciales AWS..."
+  if ! aws sts get-caller-identity &> /dev/null; then
+    log_error "No se pueden validar credenciales AWS"
+    exit 1
+  fi
+  log_ok "Credenciales AWS válidas"
+  echo ""
+
+  echo "Verificando tabla DynamoDB ${LOCK_TABLE}..."
+  if ! aws dynamodb describe-table --table-name "${LOCK_TABLE}" \
+    --region "${AWS_REGION}" &> /dev/null; then
+    log_warn "Tabla ${LOCK_TABLE} no encontrada. No hay locks que limpiar."
+    exit 0
+  fi
+  log_ok "Tabla DynamoDB encontrada"
+  echo ""
+
+  echo "Buscando lock en DynamoDB..."
+  if ! lock_item=$(fetch_lock_item); then
+    # Best effort: a failed query must not be reported as a clean state.
+    log_warn "No se pudo consultar el lock en DynamoDB. Se omite la limpieza."
+    exit 0
+  fi
+  if lock_is_absent "${lock_item}"; then
+    log_ok "No se encontró lock activo. Estado limpio."
+    exit 0
+  fi
+
+  log_warn "Lock encontrado en DynamoDB"
+  echo "Detalles del lock:"
+  print_json "${lock_item}"
+  echo ""
+
+  lock_info=$(extract_lock_info "${lock_item}")
+  created=""
+  if [[ -n "${lock_info}" ]]; then
+    echo "Información del lock:"
+    print_json "${lock_info}"
+    created=$(extract_created "${lock_info}")
+  fi
+
+  # Without a trustworthy age the lock may belong to a running operation.
+  if [[ -z "${created}" ]] || ! age_minutes=$(lock_age_minutes "${created}"); then
+    log_warn "No se pudo determinar la antigüedad del lock. No se eliminará."
+    log_warn "Si está bloqueado, libérelo manualmente con 'terraform force-unlock'."
+    exit 0
+  fi
+
+  echo ""
+  echo "Lock creado: ${created}"
+  echo "Antigüedad del lock: ${age_minutes} minutos"
+
+  if (( age_minutes <= 10#${MAX_LOCK_AGE_MINUTES} )); then
+    log_warn "Lock tiene menos de ${MAX_LOCK_AGE_MINUTES} minutos. Podría ser una operación en curso."
+    log_warn "Esperando ${RECHECK_DELAY_SECONDS} segundos antes de verificar nuevamente..."
+    sleep "${RECHECK_DELAY_SECONDS}"
+
+    if lock_item=$(fetch_lock_item) && lock_is_absent "${lock_item}"; then
+      log_ok "Lock liberado durante la espera. Estado limpio."
+      exit 0
+    fi
+
+    # Never delete a recent lock: its owner may still be writing the state.
+    log_warn "Lock todavía presente. No se elimina; Terraform esperará con -lock-timeout."
+    exit 0
+  fi
+
+  log_warn "Lock tiene más de ${MAX_LOCK_AGE_MINUTES} minutos. Considerado bloqueado."
+  echo ""
+  log_warn "Eliminando lock bloqueado de DynamoDB..."
+  if delete_lock "${lock_info}"; then
+    log_ok "Lock eliminado exitosamente"
+  else
+    log_warn "No se pudo eliminar el lock (puede que ya no exista o haya cambiado)"
+  fi
+
+  echo ""
+  echo "=== Limpieza de locks completada ==="
+}
+
+main "$@"