Questions
A single Jenkins server is overloaded. Design a distributed build architecture with dynamic agents.
The Scenario
Your Jenkins master server is struggling:
Jenkins System Metrics:
- CPU: 95% (constant)
- Memory: 14GB / 16GB
- Build Queue: 47 builds waiting
- Average Wait Time: 25 minutes
- Executor Count: 8 (all busy)
Teams are complaining that builds take forever to start. The single server approach worked when you had 10 developers, but now you have 200. Management wants a solution that scales.
The Challenge
Design a distributed Jenkins architecture with dynamic agents that can handle 500+ concurrent builds while keeping the master server stable and maintaining cost efficiency.
A junior engineer might just add more executors to the master, spin up a few static VMs as permanent agents, or buy a bigger server. These approaches don't scale: more executors on the master increase instability, static VMs waste money when idle, and vertical scaling has hard limits.
A senior engineer designs a distributed architecture with the master handling only coordination, implements dynamic agent provisioning using cloud APIs or Kubernetes, sets up agent templates for different workload types, and implements auto-scaling based on queue depth.
Step 1: Architect the Distributed System
┌─────────────────────────────────────────────────────────────────────┐
│ Jenkins Distributed Architecture │
├─────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ Jenkins │ - No builds on master (executors = 0) │
│ │ Master │ - Manages jobs, credentials, plugins │
│ │ │ - HA with standby or Kubernetes deployment │
│ └──────┬──────┘ │
│ │ │
│ │ JNLP/SSH │
│ │ │
│ ┌──────┴──────────────────────────────────────────────────────┐ │
│ │ Agent Provisioners │ │
│ ├──────────────┬──────────────┬──────────────┬────────────────┤ │
│ │ Kubernetes │ EC2/GCP │ Docker │ Static Agents │ │
│ │ Plugin │ Plugin │ Plugin │ (Permanent) │ │
│ └──────┬───────┴──────┬───────┴──────┬───────┴────────┬───────┘ │
│ │ │ │ │ │
│ ┌──────┴──────┐ ┌─────┴─────┐ ┌──────┴──────┐ ┌───────┴───────┐ │
│ │ K8s Pods │ │ EC2 │ │ Docker │ │ Bare Metal │ │
│ │ (ephemeral) │ │ Instances │ │ Containers │ │ Servers │ │
│ └─────────────┘ └───────────┘ └─────────────┘ └───────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────┘
Step 2: Configure Master for Coordination Only
# Configure via JCasC (jenkins.yaml)
# The controller runs no builds itself; work goes to one permanent SSH agent
# and to pods provisioned through the Kubernetes cloud defined below.
jenkins:
  numExecutors: 0 # No builds on master
  mode: EXCLUSIVE
  nodes:
    # Permanent agent reached over SSH, with a Maven tool location override.
    - permanent:
        name: "static-agent-01"
        remoteFS: "/var/jenkins"
        launcher:
          ssh:
            host: "agent-01.company.com"
            credentialsId: "jenkins-ssh-key"
            sshHostKeyVerificationStrategy: "knownHostsFileKeyVerificationStrategy"
        nodeProperties:
          - toolLocation:
              locations:
                - key: "maven"
                  home: "/opt/maven"
  clouds:
    # Kubernetes cloud used for dynamically provisioned agent pods.
    - kubernetes:
        name: "kubernetes"
        serverUrl: "https://kubernetes.default"
        jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080"
        jenkinsTunnel: "jenkins-agent.jenkins.svc.cluster.local:50000"
        # NOTE(review): presumably the cap on concurrently running agent pods;
        # confirm against the Kubernetes plugin documentation.
        containerCapStr: "100"
        maxRequestsPerHostStr: "32"
        templates:
          # Default pod template: a single inbound-agent container.
          - name: "default"
            label: "kubernetes"
            nodeUsageMode: NORMAL
            containers:
              - name: "jnlp"
                image: "jenkins/inbound-agent:latest"
                resourceRequestCpu: "500m"
                resourceRequestMemory: "512Mi"
                resourceLimitCpu: "1"
                resourceLimitMemory: "1Gi"
Step 3: Implement Kubernetes Dynamic Agents
// Pipeline with Kubernetes pod template
// Each run gets its own pod with three containers: jnlp (the Jenkins agent),
// maven (kept alive with `sleep infinity` so steps can exec into it) and
// docker (a docker:24-dind container, declared privileged in the manifest).
pipeline {
    agent {
        kubernetes {
            yaml '''
apiVersion: v1
kind: Pod
metadata:
  labels:
    jenkins: agent
spec:
  serviceAccountName: jenkins-agent
  containers:
    - name: jnlp
      image: jenkins/inbound-agent:latest
      resources:
        requests:
          memory: "256Mi"
          cpu: "200m"
        limits:
          memory: "512Mi"
          cpu: "500m"
    - name: maven
      image: maven:3.8-openjdk-17
      command: ['sleep', 'infinity']
      resources:
        requests:
          memory: "1Gi"
          cpu: "500m"
        limits:
          memory: "2Gi"
          cpu: "1"
      volumeMounts:
        - name: maven-cache
          mountPath: /root/.m2
    - name: docker
      image: docker:24-dind
      securityContext:
        privileged: true
      volumeMounts:
        - name: docker-graph
          mountPath: /var/lib/docker
  volumes:
    - name: maven-cache
      persistentVolumeClaim:
        claimName: maven-cache-pvc
    - name: docker-graph
      emptyDir: {}
'''
        }
    }
    stages {
        stage('Build') {
            steps {
                container('maven') {
                    sh 'mvn clean package -DskipTests'
                }
            }
        }
        stage('Docker Build') {
            steps {
                container('docker') {
                    // Build under the same fully qualified name that is pushed:
                    // `docker push` only uploads a tag that exists locally, so
                    // tagging the image as plain `myapp:N` would make the push fail.
                    sh '''
                        docker build -t registry.company.com/myapp:${BUILD_NUMBER} .
                        docker push registry.company.com/myapp:${BUILD_NUMBER}
                    '''
                }
            }
        }
    }
}
Step 4: Implement EC2 Dynamic Agents
# JCasC configuration for EC2 plugin
# Defines one EC2 cloud with two agent templates: a Linux template that
# requests spot capacity and a Windows template.
# NOTE(review): attribute names (e.g. labels, maximumNumberOfInstances) should
# be confirmed against the JCasC schema of the installed EC2 plugin version.
jenkins:
  clouds:
    - amazonEC2:
        name: "aws-ec2"
        region: "us-east-1"
        credentialsId: "aws-credentials"
        sshKeysCredentialsId: "ec2-ssh-key"
        templates:
          - description: "Linux Build Agent"
            ami: "ami-0123456789abcdef0"
            type: M5Large
            remoteFS: "/var/jenkins"
            labels: "linux docker"
            mode: NORMAL
            numExecutors: 2
            securityGroups: "jenkins-agent-sg"
            subnetId: "subnet-abc123"
            associatePublicIp: false
            # Spot request with on-demand fallback.
            spotConfig:
              spotMaxBidPrice: "0.10"
              fallbackToOndemand: true
            idleTerminationMinutes: 15
            minimumNumberOfInstances: 0
            maximumNumberOfInstances: 50
            # Installs Docker and Java 17, starts Docker, and adds ec2-user
            # to the docker group.
            initScript: |
              #!/bin/bash
              yum update -y
              yum install -y docker java-17-amazon-corretto
              systemctl start docker
              usermod -aG docker ec2-user
          - description: "Windows Build Agent"
            ami: "ami-windows-server-2022"
            type: M5Large
            labels: "windows dotnet"
            mode: EXCLUSIVE
            numExecutors: 1
            minimumNumberOfInstances: 0
            maximumNumberOfInstances: 20
Step 5: Implement Auto-Scaling Based on Queue
// Shared library function for smart agent selection
// vars/smartAgent.groovy

/**
 * Picks an agent definition from the requested workload type, the requested
 * memory and the current build-queue depth.
 *
 * Selection order:
 *   1. type 'heavy', or requested memory above 4Gi -> large Kubernetes pod
 *   2. more than 20 items queued                    -> 'spot-agents' label
 *   3. otherwise                                    -> default Kubernetes pod
 *
 * NOTE(review): `kubernetes { ... }` and `label(...)` are not defined in this
 * snippet; they are assumed to be provided by the calling context.
 *
 * @param config optional settings: type, memory, cpu, memoryLimit, cpuLimit
 * @return whatever the selected `kubernetes { ... }` / `label(...)` call returns
 */
def call(Map config = [:]) {
    def queueDepth = Jenkins.instance.queue.items.length
    def agentType = config.type ?: 'default'
    // Compare sizes numerically. Comparing the raw strings is lexicographic,
    // which ranks '16Gi' below '4Gi' and '512Mi' above it.
    def heavyThreshold = memoryToBytes('4Gi')
    // Select agent based on queue depth and requirements
    if (agentType == 'heavy' || memoryToBytes(config.memory) > heavyThreshold) {
        return kubernetes {
            yaml getLargeAgentPod(config)
        }
    } else if (queueDepth > 20) {
        // Use spot instances for cost savings during high load
        return label('spot-agents')
    } else {
        return kubernetes {
            yaml getDefaultAgentPod(config)
        }
    }
}

/**
 * Converts a memory quantity such as '512Mi', '4Gi' or '2G' to a byte count.
 *
 * @param memory quantity as text or number; null or blank counts as 0
 * @return the size in bytes as a BigDecimal
 * @throws IllegalArgumentException when the value is not a number followed by
 *         one of the suffixes Ki, Mi, Gi, Ti, k, M, G, T (or no suffix)
 */
def memoryToBytes(memory) {
    if (!memory) {
        return BigDecimal.ZERO
    }
    String text = memory.toString().trim()
    if (!text) {
        return BigDecimal.ZERO
    }
    def suffixes = ['Ki', 'Mi', 'Gi', 'Ti', 'k', 'M', 'G', 'T']
    def factors = [1024L, 1048576L, 1073741824L, 1099511627776L,
                   1000L, 1000000L, 1000000000L, 1000000000000L]
    String digits = text
    long factor = 1L
    // Indexed loop (no closures or regex matchers) keeps this safe to run
    // inside a CPS-transformed pipeline.
    for (int i = 0; i < suffixes.size(); i++) {
        String suffix = suffixes[i]
        if (text.endsWith(suffix)) {
            digits = text.substring(0, text.length() - suffix.length())
            factor = factors[i]
            break
        }
    }
    try {
        return new BigDecimal(digits.trim()) * factor
    } catch (NumberFormatException e) {
        throw new IllegalArgumentException("Unrecognised memory quantity: '${text}'", e)
    }
}
/**
 * Builds the pod manifest for a standard agent: a single `jnlp` container
 * with resource requests only.
 *
 * The manifest text starts at column 0 and carries its own YAML nesting, so
 * `containers` sits under `spec` and `resources` under the container entry.
 *
 * @param config optional overrides: memory (default '512Mi'), cpu (default '500m')
 * @return the pod manifest as a YAML string
 */
def getDefaultAgentPod(Map config) {
    return """
apiVersion: v1
kind: Pod
spec:
  containers:
    - name: jnlp
      image: jenkins/inbound-agent:latest
      resources:
        requests:
          memory: "${config.memory ?: '512Mi'}"
          cpu: "${config.cpu ?: '500m'}"
"""
}
/**
 * Builds the pod manifest for a large agent: a single `jnlp` container with
 * resource requests and limits, pinned by node selector to nodes labelled
 * `node-type: high-memory`.
 *
 * The manifest text starts at column 0 and carries its own YAML nesting, so
 * `nodeSelector` and `containers` sit under `spec`.
 *
 * @param config optional overrides: memory (default '4Gi'), cpu (default '2'),
 *               memoryLimit (default '8Gi'), cpuLimit (default '4')
 * @return the pod manifest as a YAML string
 */
def getLargeAgentPod(Map config) {
    return """
apiVersion: v1
kind: Pod
spec:
  nodeSelector:
    node-type: high-memory
  containers:
    - name: jnlp
      image: jenkins/inbound-agent:latest
      resources:
        requests:
          memory: "${config.memory ?: '4Gi'}"
          cpu: "${config.cpu ?: '2'}"
        limits:
          memory: "${config.memoryLimit ?: '8Gi'}"
          cpu: "${config.cpuLimit ?: '4'}"
"""
}
Step 6: Monitor and Auto-Scale
// Monitoring pipeline for agent scaling
// Runs every five minutes on a 'monitoring' agent, reads queue and agent
// metrics, requests more agents when the queue is deep, and alerts Slack
// when the average queue wait exceeds ten minutes.
// NOTE(review): scaleAgents(...) is not defined in this snippet; it is
// assumed to be supplied elsewhere (e.g. by a shared library).
pipeline {
    agent { label 'monitoring' }
    triggers {
        cron('*/5 * * * *') // Every 5 minutes
    }
    stages {
        stage('Check Queue Metrics') {
            steps {
                script {
                    def metrics = getJenkinsMetrics()
                    if (metrics.queueDepth > 30 && metrics.idleAgents < 5) {
                        echo "High queue depth (${metrics.queueDepth}), scaling up..."
                        // Half the queue, rounded up. Groovy's `/` on two
                        // integers is not integer division (31 / 2 == 15.5),
                        // so use intdiv to keep the target a whole number.
                        scaleAgents(targetCount: (metrics.queueDepth + 1).intdiv(2))
                    }
                    if (metrics.queueDepth < 5 && metrics.idleAgents > 10) {
                        echo "Low queue depth, allowing scale down..."
                        // EC2 plugin handles termination based on idle time
                    }
                    // Alert if wait time is too high
                    if (metrics.avgWaitTime > 600) { // 10 minutes
                        slackSend(
                            channel: '#jenkins-alerts',
                            message: "Build queue wait time is ${metrics.avgWaitTime}s"
                        )
                    }
                }
            }
        }
    }
}
/**
 * Collects a snapshot of queue and agent metrics from the Jenkins instance.
 *
 * Marked @NonCPS because it calls no pipeline steps and handles Jenkins model
 * objects (queue items, computers) that are not meant to be serialized with
 * pipeline state.
 *
 * @return a map with:
 *         queueDepth  - number of items currently in the build queue
 *         idleAgents  - agents that are online and have no running build
 *         busyAgents  - agents with at least one running build
 *         avgWaitTime - mean time the queued items have waited, in whole
 *                       seconds (0 when the queue is empty)
 */
@NonCPS
def getJenkinsMetrics() {
    def jenkins = Jenkins.instance
    def queued = jenkins.queue.items
    long now = System.currentTimeMillis()

    // Node exposes its Computer through toComputer(), which returns null for
    // a node that has no computer; skip those.
    def computers = jenkins.nodes.collect { it.toComputer() }.findAll { it != null }

    // getInQueueSince() is a millisecond timestamp, so sum in milliseconds
    // and convert once. The caller's threshold and message are in seconds.
    long totalWaitMillis = 0L
    for (item in queued) {
        totalWaitMillis += now - item.getInQueueSince()
    }
    long avgWaitSeconds = queued.length > 0 ? totalWaitMillis.intdiv(queued.length).intdiv(1000) : 0L

    return [
        queueDepth: queued.length,
        // An offline agent cannot take work, so it is not counted as idle capacity.
        idleAgents: computers.findAll { it.isOnline() && it.isIdle() }.size(),
        busyAgents: computers.findAll { !it.isIdle() }.size(),
        avgWaitTime: avgWaitSeconds
    ]
}
Distributed Architecture Benefits
| Metric | Before | After | Improvement |
|---|---|---|---|
| Queue Wait Time | 25 min | 2 min | 92% reduction |
| Concurrent Builds | 8 | 100+ | 12x increase |
| Master CPU | 95% | 15% | Stable |
| Cost per Build | Fixed | Pay-per-use | 40% savings |
| Recovery Time | Hours | Minutes | HA enabled |
Agent Selection Guide
| Workload | Agent Type | Why |
|---|---|---|
| Quick tests | Kubernetes pods | Fast startup, ephemeral |
| Docker builds | EC2 with Docker | DinD complexity in K8s |
| Windows builds | EC2 Windows AMI | Native environment |
| GPU workloads | EC2 GPU instances | Specialized hardware |
| Sensitive builds | Static agents | Network isolation |
Practice Question
Why should you set numExecutors to 0 on the Jenkins master in a distributed architecture?