Questions
Jenkins agents keep disconnecting and builds are stuck in queue. Diagnose and fix the issue.
The Scenario
Your Jenkins build queue is growing rapidly:
Build Queue (47 items):
- deploy-production #234 (waiting for agent 'linux-docker')
- unit-tests #1892 (waiting for 25 minutes)
- integration-tests #445 (pending - No matching agents)
...
Agent Status:
- linux-docker-01: Offline (connection reset)
- linux-docker-02: Offline (ping timeout)
- linux-docker-03: Connected (launching...)
- kubernetes-pod-xyz: Offline (terminated)
Developers are escalating. The agents were working fine yesterday. You need to diagnose and fix this quickly.
The Challenge
Systematically debug the agent connectivity issues, identify root causes, and implement fixes along with monitoring to prevent recurrence.
A junior engineer might randomly restart all agents, delete and recreate the agent configurations, or just wait hoping it fixes itself. These approaches don't identify the root cause, may cause data loss, and waste valuable time during an outage.
A senior engineer methodically checks agent logs, network connectivity, resource utilization, and configuration. They identify the specific failure mode, implement targeted fixes, and add monitoring to catch issues before they become outages.
Step 1: Quick Diagnostic Commands
# Check Jenkins master logs for agent errors (follows the log; press Ctrl-C to stop)
tail -f /var/log/jenkins/jenkins.log | grep -i "agent\|slave\|connect"
# Check agent status via API
# The URL is quoted so the shell never treats the '?' as a glob character,
# and the credentials are quoted so an unusual token cannot be word-split.
curl -s "http://jenkins:8080/computer/api/json?pretty=true" \
  -u "admin:$TOKEN" | jq '.computer[] | {name: .displayName, offline: .offline, offlineCauseReason: .offlineCauseReason}'
# Check network connectivity to agents
for agent in agent-01 agent-02 agent-03; do
  echo "=== $agent ==="
  ping -c 2 "$agent"
  nc -zv "$agent" 22 # SSH
  # Port 50000 (inbound agent port) is opened on the Jenkins controller, not on
  # the agents, so it is tested from the agent side below rather than here.
done
# Check if agents can reach master
ssh agent-01 "curl -s http://jenkins:8080/login | head -5"
ssh agent-01 "nc -zv jenkins 8080 && nc -zv jenkins 50000"
Step 2: Check Agent Logs
# On SSH-connected agents: print the last 100 lines of the agent log
ssh agent-01 'tail -n 100 /var/log/jenkins/agent.log'
# Error signatures worth searching for, and what each one points to:
# - "java.net.SocketTimeoutException" - Network issue
# - "java.lang.OutOfMemoryError" - Agent JVM memory
# - "hudson.remoting.ChannelClosedException" - Connection dropped
# - "java.security.cert.CertificateException" - SSL issues
// Check agent status programmatically in Jenkins Script Console
import hudson.model.*
import hudson.slaves.*
// Print a connection and executor summary for every named computer on this controller.
Jenkins.instance.computers.each { node ->
    // Entries with an empty name are not reported.
    if (node.name == "") {
        return
    }
    println "=== ${node.name} ==="
    println " Online: ${node.isOnline()}"
    println " Connecting: ${node.isConnecting()}"
    // The offline cause and its timestamp are only printed for offline computers.
    if (node.isOffline()) {
        println " Offline Cause: ${node.getOfflineCauseReason()}"
        println " Offline Since: ${node.getOfflineCause()?.getTimestamp()}"
    }
    println " Executors: ${node.numExecutors}"
    println " Busy Executors: ${node.countBusy()}"
    println ""
}
Step 3: Common Issues and Fixes
// Fix 1: Agent JVM memory issues
// Raise the heap limits in the agent launch command.
// Old command: java -jar agent.jar
// New command (512 MB initial heap, 2 GB maximum heap, G1 collector with a 200 ms pause target):
java -Xms512m -Xmx2g \
  -XX:+UseG1GC -XX:MaxGCPauseMillis=200 \
  -jar agent.jar \
  -jnlpUrl http://jenkins:8080/computer/agent-01/slave-agent.jnlp \
  -secret @secret-file
// Fix 2: Network timeout issues - increase timeouts
// In Jenkins > Configure Global Security
// Agent -> Controller Security:
// - TCP port for inbound agents: Fixed (50000)
// - Agent protocols: Enable "Inbound TCP Agent Protocol/4 (TLS encryption)"
// Fix 3: Reconnection settings for SSH agents
# Configure via JCasC
# Declares one permanent SSH-launched agent with connection retries,
# disk-space thresholds and an on-demand retention strategy.
jenkins:
  nodes:
    - permanent:
        name: "linux-agent-01"
        remoteFS: "/var/jenkins"
        launcher:
          ssh:
            host: "agent-01.company.com"
            credentialsId: "jenkins-ssh"
            # Retry a failed launch up to 5 times, waiting 30 between attempts.
            maxNumRetries: 5
            retryWaitTime: 30
            sshHostKeyVerificationStrategy:
              manuallyTrustedKeyVerificationStrategy:
                requireInitialManualTrust: false
            javaPath: "/usr/bin/java"
            jvmOptions: "-Xmx1g"
        nodeProperties:
          - diskSpaceMonitor:
              freeDiskSpaceThreshold: "5GB"
              freeDiskSpaceWarningThreshold: "10GB"
        retentionStrategy:
          demand:
            idleDelay: 10
            inDemandDelay: 0
Step 4: Kubernetes Agent Issues
// Debug Kubernetes agent issues
// Runs one stage on a pod-template agent and prints DNS and controller
// connectivity details from inside the pod.
pipeline {
    agent {
        kubernetes {
            // The pod spec is YAML: nesting is expressed through indentation,
            // so the keys below must keep their relative indent.
            yaml '''
                apiVersion: v1
                kind: Pod
                metadata:
                  labels:
                    jenkins: agent
                spec:
                  containers:
                    - name: jnlp
                      image: jenkins/inbound-agent:latest
                      resources:
                        requests:
                          memory: "512Mi"
                          cpu: "500m"
                        limits:
                          memory: "1Gi"
                          cpu: "1"
                      # No HTTP liveness probe on this container: port 50000 and
                      # /tcpSlaveAgentListener/ belong to the Jenkins controller,
                      # not to the agent, so probing them here would always fail
                      # and the agent container would be restarted in a loop.
            '''
        }
    }
    stages {
        stage('Debug') {
            steps {
                // Every check ends in "|| true" so one failing probe does not abort the rest.
                sh '''
                    echo "=== Pod Info ==="
                    hostname
                    cat /etc/resolv.conf
                    echo "=== Jenkins Master Connectivity ==="
                    # JENKINS_URL is a full URL (scheme://host:port/path); nc needs a bare host name.
                    jenkins_host="${JENKINS_URL#*://}"
                    jenkins_host="${jenkins_host%%[:/]*}"
                    nc -zv "$jenkins_host" 8080 || true
                    # NOTE(review): JENKINS_TUNNEL_HOST is not defined anywhere in this snippet - confirm it is set for the pod.
                    nc -zv "$JENKINS_TUNNEL_HOST" 50000 || true
                    echo "=== DNS Resolution ==="
                    nslookup jenkins.jenkins.svc.cluster.local || true
                '''
            }
        }
    }
}
# Kubernetes debugging commands
# Check pod status
kubectl get pods -n jenkins -l jenkins=agent
# Check pod events
kubectl describe pod <pod-name> -n jenkins | grep -A 20 "Events:"
# Check if service is accessible
kubectl run debug --rm -it --image=busybox -n jenkins -- sh
nslookup jenkins.jenkins.svc.cluster.local
nc -zv jenkins 8080
nc -zv jenkins-agent 50000
Step 5: Implement Health Monitoring
// Jenkinsfile for agent health monitoring
// Scheduled job: evaluates every agent with checkAgentHealth, tries to reconnect
// the ones flagged recoverable, and posts a Slack summary of the unhealthy ones.
pipeline {
    agent { label 'monitoring' }
    triggers {
        cron('*/5 * * * *') // Every 5 minutes
    }
    stages {
        stage('Check Agent Health') {
            steps {
                script {
                    // One [name, reason] map per agent that failed its check.
                    def failing = []
                    Jenkins.instance.computers.each { node ->
                        // Entries named "" or "master" are not checked.
                        if (node.name == "" || node.name == "master") {
                            return
                        }
                        def status = checkAgentHealth(node)
                        if (status.healthy) {
                            return
                        }
                        failing.add([name: node.name, reason: status.reason])
                        // Attempt auto-recovery
                        if (status.recoverable) {
                            echo "Attempting to reconnect ${node.name}..."
                            node.connect(true)
                        }
                    }
                    // Alert only when at least one agent failed.
                    if (failing) {
                        def lines = failing.collect { entry -> "- ${entry.name}: ${entry.reason}\n" }
                        def message = "Unhealthy Jenkins Agents:\n" + lines.join('')
                        slackSend(
                            channel: '#jenkins-alerts',
                            color: 'warning',
                            message: message
                        )
                    }
                }
            }
        }
    }
}
/**
 * Evaluate a single agent and report whether it is healthy.
 *
 * Checks run in order and the first failure wins: offline state, then free
 * disk space, then average response time.
 *
 * @param computer the agent to inspect; must provide isOffline(),
 *        getOfflineCauseReason(), isTemporarilyOffline() and getMonitorData()
 * @return a map with keys healthy (boolean), reason (text, empty when healthy)
 *         and recoverable (boolean, only ever true for an offline agent that
 *         is not marked temporarily offline)
 */
def checkAgentHealth(computer) {
    // 5 GB expressed as a long. With plain int literals the product
    // 5 * 1024 * 1024 * 1024 overflows 32 bits and silently becomes 1 GiB,
    // so agents with 1-5 GB free were never reported.
    final long minFreeBytes = 5L * 1024 * 1024 * 1024
    def result = [healthy: true, reason: '', recoverable: false]
    // Check if offline
    if (computer.isOffline()) {
        result.healthy = false
        result.reason = computer.getOfflineCauseReason() ?: 'Unknown'
        // An agent that is not marked temporarily offline is treated as reconnectable.
        result.recoverable = !computer.isTemporarilyOffline()
        return result
    }
    // Monitor data is looked up once and shared by both checks below.
    def monitorData = computer.getMonitorData()
    // Check disk space
    def diskSpace = monitorData?.get('hudson.node_monitors.DiskSpaceMonitor')
    if (diskSpace && diskSpace.size < minFreeBytes) { // Less than 5GB
        result.healthy = false
        result.reason = "Low disk space: ${diskSpace.size / 1024 / 1024 / 1024}GB"
        return result
    }
    // Check response time
    def responseTime = monitorData?.get('hudson.node_monitors.ResponseTimeMonitor')
    if (responseTime && responseTime.average > 5000) { // More than 5 seconds
        result.healthy = false
        result.reason = "High response time: ${responseTime.average}ms"
        return result
    }
    return result
}
Step 6: Preventive Configuration
# JCasC configuration with resilient agent setup
# One permanent SSH agent with generous retry/timeout settings, followed by
# controller-wide agent settings and the instance URL.
jenkins:
  nodes:
    - permanent:
        name: "resilient-agent"
        remoteFS: "/var/jenkins"
        numExecutors: 4
        launcher:
          ssh:
            host: "agent.company.com"
            credentialsId: "jenkins-ssh"
            maxNumRetries: 10
            retryWaitTime: 60
            launchTimeoutSeconds: 300
        # NOTE(review): confirm these property names against the JCasC schema
        # of the installed Jenkins version before applying.
        nodeProperties:
          - diskSpaceMonitor:
              freeDiskSpaceThreshold: "5GiB"
          - responseTimeMonitor:
              averageResponseTimeThreshold: 5000
          - clockMonitor:
              clockTimeDiffThreshold: 60000
  # Global agent settings
  remotingSecurity:
    enabled: true
  # Agent controller connectivity
  # (these are attributes of the jenkins root element, so they are nested here
  # rather than under unclassified)
  slaveAgentPort: 50000
  agentProtocols:
    - "JNLP4-connect"
    - "Ping"
unclassified:
  location:
    url: "https://jenkins.company.com/"
Agent Troubleshooting Reference
| Symptom | Likely Cause | Diagnostic Command | Fix |
|---|---|---|---|
| "Connection reset" | Network firewall | nc -zv agent 50000 | Open port 50000 |
| "Ping timeout" | Agent process crashed | ssh agent "ps aux \| grep java" | Restart agent |
| "Channel closed" | JVM OOM | Check /var/log/jenkins/agent.log | Increase heap |
| "Certificate error" | SSL mismatch | openssl s_client -connect jenkins:443 | Update certs |
| "Permission denied" | SSH key issues | ssh -vvv agent | Regenerate keys |
Practice Question
A Jenkins SSH agent shows 'Offline - Connection refused' but you can SSH to the machine manually. What is the most likely cause?