jgarrison929

terraform-specialist

0
0
# Install this skill:
npx skills add jgarrison929/openclaw-skills --skill "terraform-specialist"

Install specific skill from multi-skill repository

# Description

Use when writing Terraform/OpenTofu HCL, designing modules, managing state, configuring providers, or automating infrastructure deployments. Invoke for IaC patterns, workspace strategies, or CI/CD for infrastructure.

# SKILL.md


name: terraform-specialist
description: Use when writing Terraform/OpenTofu HCL, designing modules, managing state, configuring providers, or automating infrastructure deployments. Invoke for IaC patterns, workspace strategies, or CI/CD for infrastructure.
triggers:
- Terraform
- OpenTofu
- HCL
- tfstate
- terraform plan
- terraform apply
- terraform module
- infrastructure as code
- IaC
- provider
- tfvars
role: specialist
scope: infrastructure
output-format: code


Terraform Specialist

Senior Terraform specialist with deep expertise in HCL patterns, module design, state management, and production infrastructure automation.

Role Definition

You are a senior infrastructure engineer who builds reliable, maintainable Terraform configurations. You design reusable modules, manage state safely, implement proper variable structures, and automate infrastructure CI/CD pipelines.

Core Principles

  1. DRY with modules — reusable, versioned modules for common patterns
  2. Remote state with locking — never local state in production
  3. Plan before apply — always review changes before executing
  4. Least privilege — minimal IAM for Terraform service accounts
  5. Version everything — pin provider versions, module versions, and Terraform itself
  6. Small, focused changes — one concern per apply

Project Structure

infrastructure/
├── modules/                     # Reusable modules
│   ├── vpc/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   ├── outputs.tf
│   │   └── README.md
│   ├── ecs-service/
│   └── rds/
├── environments/
│   ├── dev/
│   │   ├── main.tf             # Root module
│   │   ├── variables.tf
│   │   ├── outputs.tf
│   │   ├── terraform.tfvars
│   │   ├── backend.tf
│   │   └── versions.tf
│   ├── staging/
│   └── production/
├── .terraform-version           # tfenv version pinning
└── README.md

Provider and Backend Configuration

# versions.tf — pin everything
terraform {
  required_version = ">= 1.7.0, < 2.0.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.40"
    }
    random = {
      source  = "hashicorp/random"
      version = "~> 3.6"
    }
  }
}

# backend.tf — remote state with locking
terraform {
  backend "s3" {
    bucket         = "mycompany-terraform-state"
    key            = "production/myservice/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-locks"
    # Use assume_role for cross-account access
    # assume_role { role_arn = "arn:aws:iam::role/TerraformAccess" }
  }
}

provider "aws" {
  region = var.aws_region

  default_tags {
    tags = {
      Environment = var.environment
      Project     = var.project_name
      ManagedBy   = "terraform"
      Team        = var.team
    }
  }
}

Module Design

VPC Module Example

# modules/vpc/variables.tf
variable "name" {
  description = "Name prefix for VPC resources"
  type        = string
}

variable "cidr_block" {
  description = "CIDR block for the VPC"
  type        = string
  default     = "10.0.0.0/16"

  validation {
    condition     = can(cidrhost(var.cidr_block, 0))
    error_message = "Must be a valid CIDR block."
  }
}

variable "availability_zones" {
  description = "List of availability zones"
  type        = list(string)
}

variable "enable_nat_gateway" {
  description = "Whether to create NAT gateways"
  type        = bool
  default     = true
}

variable "single_nat_gateway" {
  description = "Use a single NAT gateway (cost savings for non-prod)"
  type        = bool
  default     = false
}

variable "tags" {
  description = "Additional tags for all resources"
  type        = map(string)
  default     = {}
}
# modules/vpc/main.tf
locals {
  az_count       = length(var.availability_zones)
  public_cidrs   = [for i in range(local.az_count) : cidrsubnet(var.cidr_block, 8, i)]
  private_cidrs  = [for i in range(local.az_count) : cidrsubnet(var.cidr_block, 8, i + 100)]
  nat_gw_count   = var.enable_nat_gateway ? (var.single_nat_gateway ? 1 : local.az_count) : 0
}

resource "aws_vpc" "this" {
  cidr_block           = var.cidr_block
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = merge(var.tags, {
    Name = "${var.name}-vpc"
  })
}

resource "aws_subnet" "public" {
  count             = local.az_count
  vpc_id            = aws_vpc.this.id
  cidr_block        = local.public_cidrs[count.index]
  availability_zone = var.availability_zones[count.index]

  map_public_ip_on_launch = true

  tags = merge(var.tags, {
    Name = "${var.name}-public-${var.availability_zones[count.index]}"
    Tier = "public"
  })
}

resource "aws_subnet" "private" {
  count             = local.az_count
  vpc_id            = aws_vpc.this.id
  cidr_block        = local.private_cidrs[count.index]
  availability_zone = var.availability_zones[count.index]

  tags = merge(var.tags, {
    Name = "${var.name}-private-${var.availability_zones[count.index]}"
    Tier = "private"
  })
}

resource "aws_internet_gateway" "this" {
  vpc_id = aws_vpc.this.id
  tags   = merge(var.tags, { Name = "${var.name}-igw" })
}

resource "aws_eip" "nat" {
  count  = local.nat_gw_count
  domain = "vpc"
  tags   = merge(var.tags, { Name = "${var.name}-nat-eip-${count.index}" })
}

resource "aws_nat_gateway" "this" {
  count         = local.nat_gw_count
  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id
  tags          = merge(var.tags, { Name = "${var.name}-nat-${count.index}" })

  depends_on = [aws_internet_gateway.this]
}

# Route tables
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.this.id
  tags   = merge(var.tags, { Name = "${var.name}-public-rt" })
}

resource "aws_route" "public_internet" {
  route_table_id         = aws_route_table.public.id
  destination_cidr_block = "0.0.0.0/0"
  gateway_id             = aws_internet_gateway.this.id
}

resource "aws_route_table_association" "public" {
  count          = local.az_count
  subnet_id      = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}

resource "aws_route_table" "private" {
  count  = local.nat_gw_count > 0 ? local.az_count : 0
  vpc_id = aws_vpc.this.id
  tags   = merge(var.tags, { Name = "${var.name}-private-rt-${count.index}" })
}

resource "aws_route" "private_nat" {
  count                  = local.nat_gw_count > 0 ? local.az_count : 0
  route_table_id         = aws_route_table.private[count.index].id
  destination_cidr_block = "0.0.0.0/0"
  nat_gateway_id         = aws_nat_gateway.this[var.single_nat_gateway ? 0 : count.index].id
}
# modules/vpc/outputs.tf
output "vpc_id" {
  description = "The ID of the VPC"
  value       = aws_vpc.this.id
}

output "public_subnet_ids" {
  description = "List of public subnet IDs"
  value       = aws_subnet.public[*].id
}

output "private_subnet_ids" {
  description = "List of private subnet IDs"
  value       = aws_subnet.private[*].id
}

output "nat_gateway_ips" {
  description = "List of NAT Gateway public IPs"
  value       = aws_eip.nat[*].public_ip
}

Using Modules (Root Module)

# environments/production/main.tf
module "vpc" {
  source = "../../modules/vpc"

  name               = "myservice-prod"
  cidr_block         = "10.0.0.0/16"
  availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
  enable_nat_gateway = true
  single_nat_gateway = false  # HA in production

  tags = {
    Environment = "production"
  }
}

module "database" {
  source = "../../modules/rds"

  name                = "myservice-prod"
  vpc_id              = module.vpc.vpc_id
  subnet_ids          = module.vpc.private_subnet_ids
  engine_version      = "16.2"
  instance_class      = "db.r6g.large"
  multi_az            = true
  backup_retention    = 30
  deletion_protection = true
}

module "app" {
  source = "../../modules/ecs-service"

  name             = "myservice"
  vpc_id           = module.vpc.vpc_id
  subnet_ids       = module.vpc.private_subnet_ids
  image            = var.app_image
  cpu              = 512
  memory           = 1024
  desired_count    = 3
  database_url     = module.database.connection_string
}

State Management

Remote State Data Source

# Reference another project's state
data "terraform_remote_state" "networking" {
  backend = "s3"
  config = {
    bucket = "mycompany-terraform-state"
    key    = "production/networking/terraform.tfstate"
    region = "us-east-1"
  }
}

# Use outputs from remote state
resource "aws_instance" "app" {
  subnet_id = data.terraform_remote_state.networking.outputs.private_subnet_ids[0]
}

State Operations

# List resources in state
terraform state list

# Show specific resource
terraform state show aws_instance.app

# Move resource (rename or restructure)
terraform state mv aws_instance.app module.compute.aws_instance.app

# Import existing resource
terraform import aws_s3_bucket.existing my-existing-bucket

# Remove from state (without destroying)
terraform state rm aws_instance.legacy

# Force unlock stuck state
terraform force-unlock <lock-id>

CI/CD for Terraform

# .github/workflows/terraform.yml
name: Terraform

on:
  pull_request:
    paths: ['infrastructure/**']
  push:
    branches: [main]
    paths: ['infrastructure/**']

jobs:
  plan:
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
    steps:
      - uses: actions/checkout@v4

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.7.5

      - name: Terraform Init
        run: terraform init
        working-directory: infrastructure/environments/production

      - name: Terraform Format Check
        run: terraform fmt -check -recursive
        working-directory: infrastructure

      - name: Terraform Validate
        run: terraform validate
        working-directory: infrastructure/environments/production

      - name: Terraform Plan
        id: plan
        run: terraform plan -no-color -out=tfplan
        working-directory: infrastructure/environments/production

      - name: Comment plan on PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const plan = `${{ steps.plan.outputs.stdout }}`;
            github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: `### Terraform Plan\n\`\`\`\n${plan.substring(0, 65000)}\n\`\`\``
            });

  apply:
    needs: plan
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment: production
    steps:
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
      - run: terraform init
        working-directory: infrastructure/environments/production
      - run: terraform apply -auto-approve
        working-directory: infrastructure/environments/production

Common Anti-Patterns

# ❌ BAD: Hardcoded values
resource "aws_instance" "app" {
  ami           = "ami-0123456789"
  instance_type = "t3.medium"
  subnet_id     = "subnet-abc123"
}

# ✅ GOOD: Variables and data sources
data "aws_ami" "ubuntu" {
  most_recent = true
  owners      = ["099720109477"]
  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
  }
}

resource "aws_instance" "app" {
  ami           = data.aws_ami.ubuntu.id
  instance_type = var.instance_type
  subnet_id     = var.subnet_id
}

# ❌ BAD: No lifecycle management
resource "aws_db_instance" "prod" {
  # Can be accidentally destroyed!
}

# ✅ GOOD: Prevent accidental destruction
resource "aws_db_instance" "prod" {
  deletion_protection = true

  lifecycle {
    prevent_destroy = true
  }
}

# ❌ BAD: Provisioners for configuration
resource "aws_instance" "app" {
  provisioner "remote-exec" {
    inline = ["apt-get install nginx"]
  }
}

# ✅ GOOD: Use user_data or configuration management
resource "aws_instance" "app" {
  user_data = templatefile("${path.module}/userdata.sh", {
    app_version = var.app_version
  })
}

Useful Commands

# Format all files
terraform fmt -recursive

# Validate configuration
terraform validate

# Plan with variable file
terraform plan -var-file=production.tfvars -out=tfplan

# Apply saved plan
terraform apply tfplan

# Destroy (with caution!)
terraform plan -destroy -out=destroy.tfplan
terraform apply destroy.tfplan

# Graph dependencies
terraform graph | dot -Tsvg > graph.svg

# Show providers
terraform providers

# Refresh state from real infrastructure
terraform refresh

Adapted from buildwithclaude by Dave Poon (MIT)

# Supported AI Coding Agents

This skill is compatible with the SKILL.md standard and works with all major AI coding agents:

Learn more about the SKILL.md standard and how to use these skills with your preferred AI coding agent.