Questions
Azure SQL failover takes too long and connections drop. Implement proper HA.
The Scenario
Your mission-critical application uses Azure SQL Database:
Current Setup:
├── Single Azure SQL Database (S3 tier)
├── Location: East US
├── Connection string: hardcoded in app.config
├── No geo-replication configured
├── RTO requirement: < 5 minutes
├── RPO requirement: < 30 seconds
└── Recent outage: 45 minutes (regional issue)
Business is demanding zero-downtime database access across regions.
The Challenge
Implement auto-failover groups with proper connection handling, understand the failover process, and design for multi-region high availability.
A junior engineer might configure active geo-replication without failover groups, use the primary server endpoint in connection strings, skip connection retry logic, or manually switch DNS during outages. These approaches require manual intervention, break connections, or don't meet RTO requirements.
A senior engineer implements auto-failover groups with listener endpoints, configures proper connection retry logic, understands the read-write vs read-only routing, and implements health monitoring with automated alerting.
Step 1: Understand Failover Options
Azure SQL HA Options:
┌─────────────────────────────────────────────────────────────────┐
│ Zone Redundancy (Same Region) │
│ • Automatic within availability zones │
│ • No configuration needed (just enable) │
│ • RTO: ~0 (automatic) │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ Active Geo-Replication (Cross-Region) │
│ • Up to 4 readable secondaries │
│ • Manual failover required │
│ • Different connection strings per replica │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ Auto-Failover Groups (Cross-Region + Automatic) │
│ • Automatic failover with grace period │
│ • Single listener endpoint (DNS-based) │
│ • Read-write and read-only endpoints │
│ • Recommended for production │
└─────────────────────────────────────────────────────────────────┘Step 2: Create Failover Group with Bicep
// Parameters
param primaryLocation string = 'eastus'
param secondaryLocation string = 'westus'
param sqlAdminLogin string
@secure()
param sqlAdminPassword string
// Primary SQL Server
resource primaryServer 'Microsoft.Sql/servers@2023-05-01-preview' = {
name: 'sql-primary-${uniqueString(resourceGroup().id)}'
location: primaryLocation
properties: {
administratorLogin: sqlAdminLogin
administratorLoginPassword: sqlAdminPassword
minimalTlsVersion: '1.2'
}
}
// Secondary SQL Server (in different region)
resource secondaryServer 'Microsoft.Sql/servers@2023-05-01-preview' = {
name: 'sql-secondary-${uniqueString(resourceGroup().id)}'
location: secondaryLocation
properties: {
administratorLogin: sqlAdminLogin
administratorLoginPassword: sqlAdminPassword
minimalTlsVersion: '1.2'
}
}
// Database on primary server
resource database 'Microsoft.Sql/servers/databases@2023-05-01-preview' = {
parent: primaryServer
name: 'appdb'
location: primaryLocation
sku: {
name: 'S3'
tier: 'Standard'
}
properties: {
zoneRedundant: true // Enable zone redundancy
readScale: 'Enabled' // Enable read scale-out
}
}
// Auto-failover group
resource failoverGroup 'Microsoft.Sql/servers/failoverGroups@2023-05-01-preview' = {
parent: primaryServer
name: 'fog-appdb'
properties: {
readWriteEndpoint: {
failoverPolicy: 'Automatic'
failoverWithDataLossGracePeriodMinutes: 1 // Wait 1 min before auto-failover
}
readOnlyEndpoint: {
failoverPolicy: 'Enabled' // Failover read endpoint too
}
partnerServers: [
{
id: secondaryServer.id
}
]
databases: [
database.id
]
}
}
// Outputs - Use these endpoints in your application
output readWriteEndpoint string = 'fog-appdb.database.windows.net'
output readOnlyEndpoint string = 'fog-appdb.secondary.database.windows.net'Step 3: Configure Connection Strings
// appsettings.json - Use failover group endpoints
{
"ConnectionStrings": {
// Read-write traffic (points to current primary)
"DefaultConnection": "Server=fog-appdb.database.windows.net;Database=appdb;Authentication=Active Directory Default;Encrypt=True;",
// Read-only traffic (can use secondary for read scale)
"ReadOnlyConnection": "Server=fog-appdb.secondary.database.windows.net;Database=appdb;Authentication=Active Directory Default;Encrypt=True;ApplicationIntent=ReadOnly;"
}
}
// Connection with retry logic
public class ResilientDbContext : DbContext
{
protected override void OnConfiguring(DbContextOptionsBuilder options)
{
options.UseSqlServer(connectionString, sqlOptions =>
{
// Enable connection resiliency
sqlOptions.EnableRetryOnFailure(
maxRetryCount: 5,
maxRetryDelay: TimeSpan.FromSeconds(30),
errorNumbersToAdd: new List<int> {
4060, // Cannot open database
40197, // Error processing request
40501, // Service busy
40613, // Database unavailable
49918, // Not enough resources
49919, // Cannot process request
49920 // Too many requests
});
sqlOptions.CommandTimeout(60);
});
}
}Step 4: Implement Connection Retry Logic
// Polly-based retry policy for transient failures
public static class SqlRetryPolicy
{
public static IAsyncPolicy GetRetryPolicy()
{
return Policy
.Handle<SqlException>(ex => IsTransientError(ex))
.Or<TimeoutException>()
.WaitAndRetryAsync(
retryCount: 5,
sleepDurationProvider: attempt =>
TimeSpan.FromSeconds(Math.Pow(2, attempt)), // Exponential backoff
onRetry: (exception, timeSpan, retryCount, context) =>
{
Log.Warning(
"SQL transient error. Retry {RetryCount} after {Delay}ms. Error: {Error}",
retryCount, timeSpan.TotalMilliseconds, exception.Message);
});
}
private static bool IsTransientError(SqlException ex)
{
// Transient error numbers during failover
int[] transientErrors = {
4060, 40197, 40501, 40613, 40143, 40166,
49918, 49919, 49920, 10929, 10928, 10060,
10054, 10053, 233, -1, -2
};
return transientErrors.Contains(ex.Number);
}
}
// Usage
public async Task<Order> GetOrderAsync(int orderId)
{
return await SqlRetryPolicy.GetRetryPolicy().ExecuteAsync(async () =>
{
using var connection = new SqlConnection(_connectionString);
await connection.OpenAsync();
return await connection.QuerySingleOrDefaultAsync<Order>(
"SELECT * FROM Orders WHERE Id = @Id",
new { Id = orderId });
});
}Step 5: Monitor Failover Group Health
// Alert on failover events
resource failoverAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: 'sql-failover-alert'
location: 'global'
properties: {
description: 'Alert when failover group role changes'
severity: 1
enabled: true
scopes: [primaryServer.id]
evaluationFrequency: 'PT1M'
windowSize: 'PT5M'
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria'
allOf: [
{
name: 'FailoverGroupRole'
metricName: 'sql_server_failover_group_role'
dimensions: []
operator: 'GreaterThan'
threshold: 0 // 0 = Primary, 1 = Secondary
timeAggregation: 'Maximum'
}
]
}
actions: [
{
actionGroupId: actionGroup.id
}
]
}
}# Check failover group status
az sql failover-group show \
--name fog-appdb \
--resource-group myResourceGroup \
--server sql-primary
# Manual failover (for testing/planned maintenance)
az sql failover-group set-primary \
--name fog-appdb \
--resource-group myResourceGroup \
--server sql-secondary
# Forced failover (data loss possible)
az sql failover-group set-primary \
--name fog-appdb \
--resource-group myResourceGroup \
--server sql-secondary \
--allow-data-lossStep 6: Read Scale-Out for Performance
// Route read queries to secondary replicas
public class ReadScaleDbContext : DbContext
{
private readonly bool _useReadReplica;
public ReadScaleDbContext(bool useReadReplica = false)
{
_useReadReplica = useReadReplica;
}
protected override void OnConfiguring(DbContextOptionsBuilder options)
{
var connectionString = _useReadReplica
? "Server=fog-appdb.secondary.database.windows.net;Database=appdb;ApplicationIntent=ReadOnly;..."
: "Server=fog-appdb.database.windows.net;Database=appdb;...";
options.UseSqlServer(connectionString);
}
}
// Service using read replicas for reports
public class ReportService
{
public async Task<SalesReport> GetMonthlySalesAsync()
{
// Use read replica for heavy read queries
using var context = new ReadScaleDbContext(useReadReplica: true);
return await context.Orders
.Where(o => o.OrderDate >= startOfMonth)
.GroupBy(o => o.ProductCategory)
.Select(g => new { Category = g.Key, Total = g.Sum(o => o.Amount) })
.ToListAsync();
}
}Failover Behavior Summary
Failover Timeline:
┌────────────────────────────────────────────────────────────────┐
│ T+0:00 Primary becomes unavailable │
│ T+0:00 Azure detects health check failures │
│ T+0:30 Grace period begins (configured: 1 minute) │
│ T+1:00 Grace period expires │
│ T+1:00 Automatic failover initiated │
│ T+1:05 Secondary promoted to primary │
│ T+1:10 DNS updated (fog-appdb.database.windows.net) │
│ T+1:15 Apps reconnect to new primary (with retry logic) │
│ T+1:30 Full service restored │
└────────────────────────────────────────────────────────────────┘
Total RTO: ~90 seconds (within 5 minute requirement)
RPO: Near-zero with sync replication (within 30 second requirement) Failover Group Checklist
| Component | Configuration | Purpose |
|---|---|---|
| Grace period | 1 hour (default), 0-1 hour | Avoid false failovers |
| Read-only endpoint | Enabled | Route read traffic to secondary |
| Listener endpoints | Use .database.windows.net | Automatic DNS failover |
| Connection retry | Exponential backoff | Handle transient errors |
| Zone redundancy | Enable | HA within region |
Connection String Endpoints
| Endpoint | Format | Routes To |
|---|---|---|
| Read-write | fog-name.database.windows.net | Current primary |
| Read-only | fog-name.secondary.database.windows.net | Current secondary |
Practice Question
Why should you use the failover group listener endpoint instead of the primary server endpoint in connection strings?