Questions
Build a monitoring solution with Azure Monitor that catches issues before users notice.
The Scenario
Your production environment lacks proper observability:
Current State:
├── Azure resources spread across 3 subscriptions
├── No centralized logging
├── Alerts: only CPU > 90% email (often ignored)
├── MTTD (Mean Time to Detect): 2-4 hours
├── MTTR (Mean Time to Recover): 4-8 hours
├── Last week: customers reported issues before team noticed
└── No correlation between logs, metrics, and traces
Business requirement: Detect production issues within 5 minutes.
The Challenge
Implement a comprehensive monitoring strategy using Azure Monitor, Log Analytics, Application Insights, and proper alerting to achieve proactive incident detection.
A junior engineer might set up basic alerts on individual resources, use separate Log Analytics workspaces per resource, create noisy alerts that get ignored, or skip distributed tracing. These approaches cause alert fatigue, make correlation impossible, and don't provide actionable insights.
A senior engineer designs a centralized monitoring architecture with a shared Log Analytics workspace, implements Application Insights with distributed tracing, creates multi-signal alerts with proper severity levels, and builds actionable dashboards with workbooks.
Step 1: Design Monitoring Architecture
Centralized Monitoring Architecture:
┌────────────────────────────────────────────────────────────────────────┐
│ Azure Monitor │
│ ┌──────────────────────────────────────────────────────────────────┐ │
│ │ Log Analytics Workspace (Central) │ │
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌────────────┐ │ │
│ │ │ App Insights│ │Platform Logs│ │ Custom Logs │ │ Security │ │ │
│ │ │ (APM) │ │(Diagnostic) │ │(Application)│ │ Logs │ │ │
│ │ └─────────────┘ └─────────────┘ └─────────────┘ └────────────┘ │ │
│ └──────────────────────────────────────────────────────────────────┘ │
│ │ │
│ ┌────────────────────────────┼────────────────────────────────────┐ │
│ │ Alert Rules │ │
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
│ │ │ Metric │ │ Log │ │ Activity │ │ │
│ │ │ Alerts │ │ Alerts │ │ Alerts │ │ │
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
│ └──────────────────────────────────────────────────────────────────┘ │
│ │ │
│ ┌────────────────────────────┼────────────────────────────────────┐ │
│ │ Action Groups │ │
│ │ Email │ SMS │ PagerDuty │ Logic Apps │ Azure Functions │ │
│ └──────────────────────────────────────────────────────────────────┘ │
└────────────────────────────────────────────────────────────────────────┘
Step 2: Create Centralized Log Analytics Workspace
// Centralized Log Analytics workspace
resource logAnalyticsWorkspace 'Microsoft.OperationalInsights/workspaces@2022-10-01' = {
name: 'law-central-${environment}'
location: location
properties: {
sku: {
name: 'PerGB2018'
}
retentionInDays: 90
features: {
enableLogAccessUsingOnlyResourcePermissions: true
}
workspaceCapping: {
dailyQuotaGb: 10 // Cost control
}
}
}
// Application Insights connected to workspace
resource appInsights 'Microsoft.Insights/components@2020-02-02' = {
name: 'appi-${appName}-${environment}'
location: location
kind: 'web'
properties: {
Application_Type: 'web'
WorkspaceResourceId: logAnalyticsWorkspace.id
IngestionMode: 'LogAnalytics'
publicNetworkAccessForIngestion: 'Enabled'
publicNetworkAccessForQuery: 'Enabled'
RetentionInDays: 90
}
}
// Enable diagnostic settings for all resources
resource appServiceDiagnostics 'Microsoft.Insights/diagnosticSettings@2021-05-01-preview' = {
name: 'diag-appservice'
scope: appService
properties: {
workspaceId: logAnalyticsWorkspace.id
logs: [
{
category: 'AppServiceHTTPLogs'
enabled: true
}
{
category: 'AppServiceConsoleLogs'
enabled: true
}
{
category: 'AppServiceAppLogs'
enabled: true
}
{
category: 'AppServicePlatformLogs'
enabled: true
}
]
metrics: [
{
category: 'AllMetrics'
enabled: true
}
]
}
}
Step 3: Implement Application Insights
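With the diagnostic settings deployed, it is worth confirming that platform logs are actually landing in the central workspace before adding application-level instrumentation. A minimal verification query (a sketch; it assumes the AppServiceHTTPLogs category enabled above and that table's standard schema):

AppServiceHTTPLogs
| where TimeGenerated > ago(1h)
| summarize Requests = count(), ServerErrors = countif(ScStatus >= 500) by bin(TimeGenerated, 5m)
| order by TimeGenerated desc

If this returns no rows, check that the App Service is receiving traffic and that the diagnostic setting targets the correct workspace.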
// Program.cs - Configure Application Insights
using System.Diagnostics;
using System.Reflection;
using Microsoft.ApplicationInsights;
using Microsoft.ApplicationInsights.Channel;
using Microsoft.ApplicationInsights.DataContracts;
using Microsoft.ApplicationInsights.Extensibility;

var builder = WebApplication.CreateBuilder(args);
// Add Application Insights telemetry
builder.Services.AddApplicationInsightsTelemetry(options =>
{
options.ConnectionString = builder.Configuration["ApplicationInsights:ConnectionString"];
options.EnableAdaptiveSampling = true;
options.EnableQuickPulseMetricStream = true; // Live Metrics
});
// Add custom telemetry initializer
builder.Services.AddSingleton<ITelemetryInitializer, CustomTelemetryInitializer>();

var app = builder.Build();
app.Run();
// Custom telemetry initializer for enriching data
public class CustomTelemetryInitializer : ITelemetryInitializer
{
public void Initialize(ITelemetry telemetry)
{
// Add custom properties to all telemetry
telemetry.Context.Cloud.RoleName = "OrderService";
telemetry.Context.GlobalProperties["Environment"] = Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT");
telemetry.Context.GlobalProperties["Version"] = Assembly.GetExecutingAssembly().GetName().Version?.ToString();
}
}

// Custom telemetry for business metrics
public class OrderService
{
private readonly TelemetryClient _telemetry;
private readonly ILogger<OrderService> _logger;
public OrderService(TelemetryClient telemetry, ILogger<OrderService> logger)
{
_telemetry = telemetry;
_logger = logger;
}
public async Task<Order> ProcessOrderAsync(CreateOrderRequest request)
{
using var operation = _telemetry.StartOperation<RequestTelemetry>("ProcessOrder");
try
{
// Track custom event
_telemetry.TrackEvent("OrderReceived", new Dictionary<string, string>
{
["OrderId"] = request.OrderId,
["CustomerId"] = request.CustomerId,
["Region"] = request.Region
});
var stopwatch = Stopwatch.StartNew();
var order = await CreateOrderInternalAsync(request);
stopwatch.Stop();
// Track custom metric
_telemetry.TrackMetric("OrderProcessingTime", stopwatch.ElapsedMilliseconds);
_telemetry.TrackMetric("OrderValue", (double)order.TotalAmount);
// Track dependency call
using (_telemetry.StartOperation<DependencyTelemetry>("PaymentService"))
{
await ProcessPaymentAsync(order);
}
operation.Telemetry.Success = true;
return order;
}
catch (Exception ex)
{
operation.Telemetry.Success = false;
_telemetry.TrackException(ex, new Dictionary<string, string>
{
["OrderId"] = request.OrderId,
["ErrorType"] = ex.GetType().Name
});
// Structured logging with correlation
_logger.LogError(ex, "Failed to process order {OrderId} for customer {CustomerId}",
request.OrderId, request.CustomerId);
throw;
}
}
}
Step 4: Create Intelligent Alerts
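The custom events and metrics emitted by OrderService land in the customEvents and customMetrics tables of Application Insights. Before building alert rules on top of them, a quick sanity check that the telemetry is flowing (a sketch; names assume the OrderReceived event and OrderProcessingTime metric used in Step 3):

customEvents
| where timestamp > ago(1h)
| where name == "OrderReceived"
| summarize Orders = count() by bin(timestamp, 5m)

customMetrics
| where timestamp > ago(1h)
| where name == "OrderProcessingTime"
| summarize P95ProcessingMs = percentile(value, 95) by bin(timestamp, 5m)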
// Action group for notifications
resource actionGroup 'Microsoft.Insights/actionGroups@2023-01-01' = {
name: 'ag-production-critical'
location: 'global'
properties: {
groupShortName: 'ProdCrit'
enabled: true
emailReceivers: [
{
name: 'OnCallTeam'
emailAddress: 'oncall@contoso.com'
useCommonAlertSchema: true
}
]
smsReceivers: [
{
name: 'OnCallSMS'
countryCode: '1'
phoneNumber: '5551234567'
}
]
webhookReceivers: [
{
name: 'PagerDuty'
serviceUri: 'https://events.pagerduty.com/integration/xxx/enqueue'
useCommonAlertSchema: true
}
]
}
}
// Metric alert: failed request spike (requests/failed is a count, not a percentage)
resource errorRateAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: 'alert-high-error-rate'
location: 'global'
properties: {
description: 'Alert when more than 5 requests fail within the 5-minute window'
severity: 1 // Sev 1 = Error (see severity guidelines below)
enabled: true
scopes: [appInsights.id]
evaluationFrequency: 'PT1M'
windowSize: 'PT5M'
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria'
allOf: [
{
name: 'FailedRequests'
metricName: 'requests/failed'
operator: 'GreaterThan'
threshold: 5
timeAggregation: 'Count'
criterionType: 'StaticThresholdCriterion'
}
]
}
actions: [
{
actionGroupId: actionGroup.id
}
]
}
}
// Dynamic threshold alert: Anomaly detection
resource anomalyAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: 'alert-response-time-anomaly'
location: 'global'
properties: {
description: 'Alert when response time deviates from normal'
severity: 2
enabled: true
scopes: [appInsights.id]
evaluationFrequency: 'PT5M'
windowSize: 'PT15M'
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: 'ResponseTimeAnomaly'
metricName: 'requests/duration'
operator: 'GreaterOrLessThan'
alertSensitivity: 'Medium' // Low, Medium, High
failingPeriods: {
numberOfEvaluationPeriods: 4
minFailingPeriodsToAlert: 3
}
timeAggregation: 'Average'
criterionType: 'DynamicThresholdCriterion'
}
]
}
actions: [
{
actionGroupId: actionGroup.id
}
]
}
}
// Log alert: Exception spike
resource exceptionAlert 'Microsoft.Insights/scheduledQueryRules@2022-06-15' = {
name: 'alert-exception-spike'
location: location
properties: {
displayName: 'Exception Spike Detected'
description: 'Alert when exception count exceeds normal levels'
severity: 2
enabled: true
scopes: [logAnalyticsWorkspace.id]
evaluationFrequency: 'PT5M'
windowSize: 'PT15M'
criteria: {
allOf: [
{
query: '''
exceptions
| where timestamp > ago(15m)
| summarize ExceptionCount = count() by type, bin(timestamp, 5m)
| where ExceptionCount > 10
'''
timeAggregation: 'Count'
operator: 'GreaterThan'
threshold: 0
failingPeriods: {
numberOfEvaluationPeriods: 1
minFailingPeriodsToAlert: 1
}
}
]
}
actions: {
actionGroups: [actionGroup.id]
customProperties: {
Severity: 'High'
Runbook: 'https://wiki.contoso.com/runbooks/exception-investigation'
}
}
}
}
Step 5: Create Azure Workbook for Dashboard
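The metric alert in Step 4 fires on the count of failed requests rather than a percentage, since a single metric criterion cannot divide one metric by another. To reason about the business requirement in percentage terms, a log query gives the failure rate over comparable windows (a sketch using the standard requests schema), and it also makes a natural panel for the dashboard built below:

requests
| where timestamp > ago(24h)
| summarize FailedPercent = 100.0 * countif(success == false) / count() by bin(timestamp, 5m)
| render timechart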
// Workbook template for production dashboard
{
"version": "Notebook/1.0",
"items": [
{
"type": 1,
"content": {
"json": "# Production Health Dashboard\n---"
}
},
{
"type": 10,
"content": {
"chartId": "workbook-health-overview",
"version": "MetricsItem/2.0",
"size": 1,
"chartType": 2,
"resourceType": "microsoft.insights/components",
"metricScope": 0,
"resourceParameter": "AppInsights",
"timeContext": {
"durationMs": 3600000
},
"metrics": [
{
"namespace": "microsoft.insights/components",
"metric": "requests/count",
"aggregation": 1
},
{
"namespace": "microsoft.insights/components",
"metric": "requests/failed",
"aggregation": 1
}
]
}
},
{
"type": 3,
"content": {
"version": "KqlItem/1.0",
"query": "requests\n| where timestamp > ago(1h)\n| summarize \n Requests = count(),\n AvgDuration = avg(duration),\n P95Duration = percentile(duration, 95),\n FailedPercent = 100.0 * countif(success == false) / count()\nby bin(timestamp, 5m)\n| render timechart",
"size": 0,
"title": "Request Performance (Last Hour)"
}
}
]
}
Step 6: KQL Queries for Investigation
// Find slow requests
requests
| where timestamp > ago(1h)
| where duration > 5000 // > 5 seconds
| project timestamp, name, duration, resultCode, operation_Id
| order by duration desc
| take 100
// Exception breakdown by type
exceptions
| where timestamp > ago(24h)
| summarize Count = count() by type, outerMessage
| order by Count desc
| take 20
// Dependency failures
dependencies
| where timestamp > ago(1h)
| where success == false
| summarize FailureCount = count() by target, name, resultCode
| order by FailureCount desc
// End-to-end transaction trace
union requests, dependencies, exceptions, traces
| where operation_Id == "abc123"
| project timestamp, itemType, name, duration, success, message
| order by timestamp asc
// Availability by region
requests
| where timestamp > ago(24h)
| summarize
TotalRequests = count(),
FailedRequests = countif(success == false),
AvailabilityPercent = 100.0 * countif(success == true) / count()
by client_CountryOrRegion
| order by TotalRequests desc
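// Failed requests grouped by operation (uses the same 'requests' schema as the queries above)
requests
| where timestamp > ago(1h)
| where success == false
| summarize Failures = count() by name, resultCode
| top 10 by Failures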
// Performance degradation detection
requests
| where timestamp > ago(7d)
| summarize AvgDuration = avg(duration) by bin(timestamp, 1h)
| render timechart
Step 7: Implement Smart Detection
// Enable Smart Detection for Application Insights
resource smartDetection 'Microsoft.Insights/components/ProactiveDetectionConfigs@2018-05-01-preview' = {
parent: appInsights
name: 'slowpageloadtime'
properties: {
RuleDefinitions: {
Name: 'slowpageloadtime'
DisplayName: 'Slow page load time'
Description: 'Smart Detection rules notify you of performance anomalies'
HelpUrl: 'https://docs.microsoft.com/azure/application-insights/app-insights-proactive-performance-diagnostics'
IsHidden: false
IsEnabledByDefault: true
IsInPreview: false
SupportsEmailNotifications: true
}
Enabled: true
SendEmailsToSubscriptionOwners: true
CustomEmails: ['platform-team@contoso.com']
}
}
Monitoring Best Practices
| Area | Recommendation | Purpose |
|---|---|---|
| Workspace | Single centralized workspace | Correlation across resources |
| Retention | 90 days (export to Storage for longer) | Cost vs compliance balance |
| Alerts | Dynamic thresholds | Reduce false positives |
| Sampling | Adaptive sampling | Cost control for high-volume telemetry (see the ingestion query after this table) |
| Dashboards | Azure Workbooks | Interactive investigation |
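To stay under the 10 GB/day cap configured in Step 2 and to see where sampling would help most, the Usage table shows billable ingestion volume by data type (a sketch; Quantity is reported in MB):

Usage
| where TimeGenerated > ago(24h)
| where IsBillable == true
| summarize IngestedGB = sum(Quantity) / 1000.0 by DataType
| order by IngestedGB desc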
Alert Severity Guidelines
| Severity | Response Time | Examples |
|---|---|---|
| Sev 0 - Critical | Immediate (page) | Service down, data loss |
| Sev 1 - Error | 15 minutes | High error rate, degraded functionality |
| Sev 2 - Warning | 1 hour | Elevated latency, approaching capacity limits |
| Sev 3 - Info | Next business day | Anomalies, recommendations |
Practice Question
Why should you use dynamic thresholds instead of static thresholds for latency alerts?