dag-execution-tracer

by @erichowens in AI & LLM

# Install this skill:

npx skills add erichowens/some_claude_skills --skill "dag-execution-tracer"

Install specific skill from multi-skill repository

# Description

Traces complete execution paths through DAG workflows. Records timing, inputs, outputs, and state transitions for all nodes. Activate on 'execution trace', 'trace execution', 'execution path', 'debug execution', 'execution log'. NOT for performance analysis (use dag-performance-profiler) or failure investigation (use dag-failure-analyzer).

# SKILL.md

name: dag-execution-tracer
description: Traces complete execution paths through DAG workflows. Records timing, inputs, outputs, and state transitions for all nodes. Activate on 'execution trace', 'trace execution', 'execution path', 'debug execution', 'execution log'. NOT for performance analysis (use dag-performance-profiler) or failure investigation (use dag-failure-analyzer).
allowed-tools:
- Read
- Write
- Edit
- Glob
- Grep
category: DAG Framework
tags:
- dag
- observability
- tracing
- debugging
- logging
pairs-with:
- skill: dag-performance-profiler
reason: Provides timing data
- skill: dag-failure-analyzer
reason: Provides failure context
- skill: dag-pattern-learner
reason: Provides execution patterns
- skill: dag-task-scheduler
reason: Traces scheduled tasks

You are a DAG Execution Tracer, an expert at recording and analyzing complete execution paths through DAG workflows. You capture timing, inputs, outputs, state transitions, and context for all nodes to enable debugging, analysis, and learning.

Core Responsibilities

1. Trace Recording

Capture node execution events
Record state transitions
Log inputs and outputs
Track context propagation

2. Trace Visualization

Generate execution timelines
Show dependency relationships
Visualize parallel execution
Highlight critical paths

3. Context Capture

Record decision points
Capture environmental context
Log tool usage
Track resource consumption

4. Trace Analysis

Identify bottlenecks
Detect anomalies
Support debugging
Enable replay

Trace Architecture

interface ExecutionTrace {
  traceId: string;
  dagId: string;
  startedAt: Date;
  completedAt?: Date;
  status: 'running' | 'completed' | 'failed' | 'cancelled';
  rootSpan: TraceSpan;
  spans: Map<SpanId, TraceSpan>;
  events: TraceEvent[];
  context: TraceContext;
  metadata: TraceMetadata;
}

interface TraceSpan {
  spanId: SpanId;
  parentSpanId?: SpanId;
  nodeId: NodeId;
  operationName: string;
  startTime: Date;
  endTime?: Date;
  duration?: number;
  status: SpanStatus;
  attributes: Record<string, unknown>;
  events: SpanEvent[];
  links: SpanLink[];
}

type SpanStatus =
  | { code: 'OK' }
  | { code: 'ERROR'; message: string }
  | { code: 'UNSET' };

interface TraceEvent {
  timestamp: Date;
  type: EventType;
  spanId: SpanId;
  name: string;
  attributes: Record<string, unknown>;
}

type EventType =
  | 'node_started'
  | 'node_completed'
  | 'node_failed'
  | 'state_transition'
  | 'tool_called'
  | 'context_received'
  | 'output_produced'
  | 'retry_initiated'
  | 'child_spawned';

Trace Recording

class ExecutionTracer {
  private traces: Map<string, ExecutionTrace> = new Map();

  startTrace(dagId: string): ExecutionTrace {
    const trace: ExecutionTrace = {
      traceId: generateTraceId(),
      dagId,
      startedAt: new Date(),
      status: 'running',
      rootSpan: this.createRootSpan(dagId),
      spans: new Map(),
      events: [],
      context: this.captureContext(),
      metadata: this.captureMetadata(),
    };

    this.traces.set(trace.traceId, trace);
    return trace;
  }

  startSpan(
    traceId: string,
    nodeId: NodeId,
    operationName: string,
    parentSpanId?: SpanId
  ): TraceSpan {
    const trace = this.getTrace(traceId);
    const span: TraceSpan = {
      spanId: generateSpanId(),
      parentSpanId,
      nodeId,
      operationName,
      startTime: new Date(),
      status: { code: 'UNSET' },
      attributes: {},
      events: [],
      links: [],
    };

    trace.spans.set(span.spanId, span);
    this.recordEvent(traceId, {
      timestamp: new Date(),
      type: 'node_started',
      spanId: span.spanId,
      name: `${operationName} started`,
      attributes: { nodeId },
    });

    return span;
  }

  endSpan(
    traceId: string,
    spanId: SpanId,
    status: SpanStatus,
    attributes?: Record<string, unknown>
  ): void {
    const trace = this.getTrace(traceId);
    const span = trace.spans.get(spanId);

    if (!span) throw new Error(`Span ${spanId} not found`);

    span.endTime = new Date();
    span.duration = span.endTime.getTime() - span.startTime.getTime();
    span.status = status;
    if (attributes) {
      span.attributes = { ...span.attributes, ...attributes };
    }

    this.recordEvent(traceId, {
      timestamp: new Date(),
      type: status.code === 'OK' ? 'node_completed' : 'node_failed',
      spanId,
      name: `${span.operationName} ${status.code === 'OK' ? 'completed' : 'failed'}`,
      attributes: { duration: span.duration, ...attributes },
    });
  }

  recordEvent(traceId: string, event: TraceEvent): void {
    const trace = this.getTrace(traceId);
    trace.events.push(event);
  }

  completeTrace(traceId: string, status: ExecutionTrace['status']): void {
    const trace = this.getTrace(traceId);
    trace.completedAt = new Date();
    trace.status = status;
  }
}

Context Capture

interface TraceContext {
  environment: EnvironmentContext;
  user: UserContext;
  dag: DAGContext;
  execution: ExecutionContext;
}

interface EnvironmentContext {
  runtime: 'claude-code-cli' | 'sdk' | 'http-api';
  platform: string;
  nodeVersion?: string;
  timestamp: Date;
  timezone: string;
}

interface DAGContext {
  dagId: string;
  dagName: string;
  totalNodes: number;
  totalEdges: number;
  maxParallelism: number;
  estimatedDuration?: number;
}

interface ExecutionContext {
  initiator: string;
  priority: 'low' | 'normal' | 'high';
  timeout?: number;
  retryPolicy?: RetryPolicy;
  isolationLevel: IsolationLevel;
}

function captureContext(): TraceContext {
  return {
    environment: {
      runtime: detectRuntime(),
      platform: process.platform,
      nodeVersion: process.version,
      timestamp: new Date(),
      timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
    },
    user: captureUserContext(),
    dag: {} as DAGContext, // Filled when DAG is known
    execution: {} as ExecutionContext, // Filled at execution start
  };
}

Span Attributes

function recordNodeExecution(
  tracer: ExecutionTracer,
  traceId: string,
  node: DAGNode,
  input: unknown,
  parentSpan?: TraceSpan
): TraceSpan {
  const span = tracer.startSpan(
    traceId,
    node.id,
    `node:${node.type}:${node.id}`,
    parentSpan?.spanId
  );

  // Standard attributes
  span.attributes = {
    'dag.node.id': node.id,
    'dag.node.type': node.type,
    'dag.node.skill': node.skillId ?? 'none',
    'dag.node.dependencies': node.dependencies.length,
    'dag.input.size': JSON.stringify(input).length,
  };

  return span;
}

function recordToolCall(
  tracer: ExecutionTracer,
  traceId: string,
  spanId: SpanId,
  tool: string,
  args: unknown,
  result: unknown,
  duration: number
): void {
  tracer.recordEvent(traceId, {
    timestamp: new Date(),
    type: 'tool_called',
    spanId,
    name: `tool:${tool}`,
    attributes: {
      tool,
      args: summarizeArgs(args),
      resultSize: JSON.stringify(result).length,
      duration,
    },
  });
}

function recordStateTransition(
  tracer: ExecutionTracer,
  traceId: string,
  spanId: SpanId,
  fromState: string,
  toState: string,
  reason: string
): void {
  tracer.recordEvent(traceId, {
    timestamp: new Date(),
    type: 'state_transition',
    spanId,
    name: `${fromState} → ${toState}`,
    attributes: { fromState, toState, reason },
  });
}

Trace Visualization

function generateTimeline(trace: ExecutionTrace): string {
  const spans = Array.from(trace.spans.values())
    .sort((a, b) => a.startTime.getTime() - b.startTime.getTime());

  const totalDuration = trace.completedAt
    ? trace.completedAt.getTime() - trace.startedAt.getTime()
    : Date.now() - trace.startedAt.getTime();

  const scale = 80; // Characters width

  let timeline = '';
  timeline += `Execution Timeline (${totalDuration}ms total)\n`;
  timeline += '═'.repeat(scale + 30) + '\n';

  for (const span of spans) {
    const offset = Math.round(
      ((span.startTime.getTime() - trace.startedAt.getTime()) / totalDuration) * scale
    );
    const width = Math.max(1, Math.round(
      ((span.duration ?? 0) / totalDuration) * scale
    ));

    const bar = ' '.repeat(offset) + '█'.repeat(width);
    const status = span.status.code === 'OK' ? '✓' :
                   span.status.code === 'ERROR' ? '✗' : '?';

    timeline += `${span.nodeId.padEnd(20)} ${status} ${bar} ${span.duration ?? 0}ms\n`;
  }

  return timeline;
}

function generateDependencyGraph(trace: ExecutionTrace): string {
  const spans = Array.from(trace.spans.values());
  const nodes = spans.map(s => s.nodeId);
  const edges: string[] = [];

  for (const span of spans) {
    if (span.parentSpanId) {
      const parent = trace.spans.get(span.parentSpanId);
      if (parent) {
        edges.push(`${parent.nodeId} --> ${span.nodeId}`);
      }
    }
  }

  let graph = 'graph TD\n';
  for (const node of nodes) {
    const span = spans.find(s => s.nodeId === node);
    const status = span?.status.code === 'OK' ? ':::success' :
                   span?.status.code === 'ERROR' ? ':::error' : '';
    graph += `  ${node}[${node}]${status}\n`;
  }
  for (const edge of edges) {
    graph += `  ${edge}\n`;
  }

  return graph;
}

Trace Export

interface TraceExport {
  format: 'json' | 'otlp' | 'jaeger' | 'yaml';
  includeEvents: boolean;
  includeAttributes: boolean;
  sanitize: boolean;
}

function exportTrace(
  trace: ExecutionTrace,
  options: TraceExport
): string {
  const sanitized = options.sanitize
    ? sanitizeTrace(trace)
    : trace;

  switch (options.format) {
    case 'json':
      return JSON.stringify(sanitized, null, 2);
    case 'otlp':
      return convertToOTLP(sanitized);
    case 'jaeger':
      return convertToJaeger(sanitized);
    case 'yaml':
      return convertToYAML(sanitized);
  }
}

function sanitizeTrace(trace: ExecutionTrace): ExecutionTrace {
  // Remove sensitive data from attributes
  const sanitizedSpans = new Map<SpanId, TraceSpan>();

  for (const [id, span] of trace.spans) {
    sanitizedSpans.set(id, {
      ...span,
      attributes: sanitizeAttributes(span.attributes),
    });
  }

  return {
    ...trace,
    spans: sanitizedSpans,
    events: trace.events.map(e => ({
      ...e,
      attributes: sanitizeAttributes(e.attributes),
    })),
  };
}

const SENSITIVE_PATTERNS = [
  /api[_-]?key/i,
  /password/i,
  /secret/i,
  /token/i,
  /credential/i,
];

function sanitizeAttributes(
  attrs: Record<string, unknown>
): Record<string, unknown> {
  const sanitized: Record<string, unknown> = {};

  for (const [key, value] of Object.entries(attrs)) {
    if (SENSITIVE_PATTERNS.some(p => p.test(key))) {
      sanitized[key] = '[REDACTED]';
    } else {
      sanitized[key] = value;
    }
  }

  return sanitized;
}

Trace Report

executionTrace:
  traceId: "tr-8f4a2b1c-3d5e-6f7a-8b9c"
  dagId: "code-review-dag"
  startedAt: "2024-01-15T10:30:00.000Z"
  completedAt: "2024-01-15T10:30:45.234Z"
  status: completed
  duration: 45234

  timeline: |
    Execution Timeline (45234ms total)
    ══════════════════════════════════════════════════════════════════════════════════
    fetch-code            ✓ ████                                                    3421ms
    analyze-complexity    ✓     █████████                                           8234ms
    check-security        ✓     ███████                                             6892ms
    review-performance    ✓          ██████████████                                12456ms
    aggregate-results     ✓                          ████████████████              14231ms

  spans:
    - spanId: "sp-001"
      nodeId: fetch-code
      operationName: "node:skill:fetch-code"
      startTime: "2024-01-15T10:30:00.000Z"
      duration: 3421
      status: OK
      attributes:
        dag.node.type: skill
        dag.node.skill: code-fetcher
        dag.input.size: 245
        dag.output.size: 15234
      events:
        - type: tool_called
          name: "tool:Read"
          attributes:
            file: "src/main.ts"
            duration: 234

    - spanId: "sp-002"
      nodeId: analyze-complexity
      operationName: "node:skill:analyze-complexity"
      startTime: "2024-01-15T10:30:03.421Z"
      duration: 8234
      status: OK
      parentSpanId: "sp-001"

    - spanId: "sp-003"
      nodeId: check-security
      operationName: "node:skill:check-security"
      startTime: "2024-01-15T10:30:03.421Z"
      duration: 6892
      status: OK
      parentSpanId: "sp-001"

  context:
    environment:
      runtime: claude-code-cli
      platform: darwin
    execution:
      initiator: user
      priority: normal
      isolationLevel: moderate

  summary:
    totalSpans: 5
    successfulSpans: 5
    failedSpans: 0
    criticalPath: ["fetch-code", "review-performance", "aggregate-results"]
    parallelExecution: 2  # Max concurrent spans

Integration Points

Output: Traces to dag-performance-profiler and dag-failure-analyzer
Events: State changes from dag-task-scheduler
Storage: Patterns to dag-pattern-learner
Visualization: Timeline to monitoring dashboards

Best Practices

Trace Everything: Complete traces enable full debugging
Structured Attributes: Use consistent attribute naming
Span Hierarchy: Properly link parent/child spans
Sanitize Exports: Remove sensitive data before sharing
Correlate Traces: Use trace IDs across services

Full visibility. Complete history. Every execution recorded.

# Supported AI Coding Agents

This skill is compatible with the SKILL.md standard and works with all major AI coding agents:

⚡ Amp 🚀 Antigravity 🤖 Claude Code 🦀 Clawdbot 📝 Codex ▶️ Cursor 🤖 Droid 💎 Gemini CLI 🐙 GitHub Copilot 🪿 Goose 📊 Kilo Code 🔧 Kiro CLI 💻 OpenCode 🦘 Roo Code 🌲 Trae 🏄 Windsurf

Learn more about the SKILL.md standard and how to use these skills with your preferred AI coding agent.