Azure Monitor — Custom Metrics and Business KPI Alerting
The Problem
You're tracking technical metrics (CPU, memory, requests) but:
- Can't measure business outcomes (orders per minute, revenue)
- Don't know when business processes fail silently
- Alerts fire for trivial issues but miss critical problems
- No correlation between technical issues and business impact
You need to measure what matters to the business.
Solution Architecture
┌─────────────────────────────────────────────────────────────────┐
│                      Business Metrics Flow                      │
└─────────────────────────────────────────────────────────────────┘
Application         Metrics API         Azure Monitor       Alerts
    │                    │                    │               │
    │── Order created ──▶│                    │               │
    │                    │─── Send Metric ───▶│               │
    │── Payment failed ─▶│                    │               │
    │                    │─── Send Metric ───▶│               │
    │── User signed up ─▶│                    │               │
    │                    │─── Send Metric ───▶│               │
    │                    │                    │               │
    │                    │                    │─── Check ────▶│
    │                    │                    │               │
    │                    │                    │◀──── Alert ───│
    │                    │                    │               │
    ▼                    ▼                    ▼               ▼
Your Code         Custom Metrics       Time Series DB    Teams/Slack
Solution Implementation
Step 1: Track Business Events as Custom Metrics
using System;
using System.Collections.Concurrent;
using System.Diagnostics;
using System.Diagnostics.Metrics;
using Azure.Monitor.OpenTelemetry.Exporter;

/// <summary>
/// Emits business KPIs (orders, order value, signups, support tickets, active users)
/// through a <see cref="Meter"/> named "BusinessMetrics".
/// </summary>
/// <remarks>
/// Register as a singleton: the instruments are created once in the constructor and
/// the meter is disposed together with the service.
/// </remarks>
public class BusinessMetricsService : IDisposable
{
    private readonly Meter _meter;
    private readonly Counter<long> _ordersPlaced;
    private readonly Counter<long> _ordersFailed;
    private readonly Histogram<double> _orderValue;
    private readonly Counter<long> _usersSignedUp;
    private readonly Counter<long> _supportTicketsCreated;

    // CreateObservableGauge returns ObservableGauge<T>; it is pulled by the exporter
    // through its callback rather than being set by application code.
    private readonly ObservableGauge<int> _activeUsers;

    // Backing store for the active_users gauge. Keys are user identifiers; the value is unused.
    private readonly ConcurrentDictionary<string, byte> _activeUserCache = new();

    public BusinessMetricsService()
    {
        _meter = new Meter("BusinessMetrics", "1.0");

        // Counters for discrete events
        _ordersPlaced = _meter.CreateCounter<long>("orders_placed",
            description: "Total orders placed");
        _ordersFailed = _meter.CreateCounter<long>("orders_failed",
            description: "Failed orders");
        _usersSignedUp = _meter.CreateCounter<long>("users_signed_up",
            description: "New user signups");
        _supportTicketsCreated = _meter.CreateCounter<long>("support_tickets_created",
            description: "Support tickets");

        // Histogram for value distributions
        _orderValue = _meter.CreateHistogram<double>("order_value",
            description: "Order value in USD",
            unit: "USD");

        // Gauge for current state; the callback runs each time metrics are collected
        _activeUsers = _meter.CreateObservableGauge<int>("active_users",
            () => GetCurrentActiveUserCount(),
            description: "Currently active users");
    }

    /// <summary>Counts one placed order and records its value, tagged by region and tier.</summary>
    public void RecordOrderPlaced(Order order)
    {
        _ordersPlaced.Add(1, new TagList
        {
            { "region", order.Region },
            { "tier", order.Tier },
            { "channel", order.Channel }
        });

        _orderValue.Record(order.TotalAmount, new TagList
        {
            { "region", order.Region },
            { "tier", order.Tier }
        });
    }

    /// <summary>Counts one failed order.</summary>
    /// <param name="reason">
    /// Failure category used as a metric dimension. Pass a small, fixed set of values
    /// (for example an error code), not free-form text, to keep dimension cardinality bounded.
    /// </param>
    public void RecordOrderFailed(string reason)
    {
        _ordersFailed.Add(1, new TagList
        {
            { "reason", reason }
        });
    }

    /// <summary>Counts one signup, tagged by acquisition source and plan.</summary>
    public void RecordUserSignup(User user)
    {
        _usersSignedUp.Add(1, new TagList
        {
            { "source", user.SignupSource },
            { "plan", user.Plan }
        });
    }

    /// <summary>Counts one support ticket so the support_tickets_created metric is actually emitted.</summary>
    /// <param name="category">Ticket category used as a metric dimension.</param>
    public void RecordSupportTicketCreated(string category = "unknown")
    {
        _supportTicketsCreated.Add(1, new TagList
        {
            { "category", category }
        });
    }

    /// <summary>Marks a user as active so the active_users gauge includes them.</summary>
    public void MarkUserActive(string userId) => _activeUserCache[userId] = 0;

    /// <summary>Removes a user from the active set (logout or session expiry).</summary>
    public void MarkUserInactive(string userId) => _activeUserCache.TryRemove(userId, out _);

    private int GetCurrentActiveUserCount()
    {
        return _activeUserCache.Count;
    }

    /// <summary>Disposes the meter, which stops all instruments created from it.</summary>
    public void Dispose()
    {
        _meter.Dispose();
        GC.SuppressFinalize(this);
    }
}
// Register in DI as a singleton: one BusinessMetricsService instance is shared by all consumers
services.AddSingleton<BusinessMetricsService>();
Step 2: Configure OpenTelemetry Export
// Program.cs
// Wires the "BusinessMetrics" meter plus runtime/HTTP/ASP.NET Core instrumentation
// into the OpenTelemetry metrics pipeline and exports everything to Azure Monitor.
builder.Services.AddOpenTelemetry()
    .ConfigureResource(resource => resource
        .AddService("order-service"))
    .WithMetrics(metrics => metrics
        // Must match the name passed to `new Meter(...)`; unlisted meters are not collected.
        .AddMeter("BusinessMetrics")
        .AddRuntimeInstrumentation()
        .AddHttpClientInstrumentation()
        .AddAspNetCoreInstrumentation()
        // The extension method is AddAzureMonitorMetricExporter (singular "Metric").
        .AddAzureMonitorMetricExporter(options =>
        {
            options.ConnectionString = builder.Configuration["APPLICATIONINSIGHTS_CONNECTION_STRING"];
        }));
Step 3: Create Business KPI Alerts
# Resource ID of the Application Insights component that receives the custom metrics.
# Both alert commands require --scopes; replace the placeholders with your own values.
APP_INSIGHTS_ID="/subscriptions/<subscription-id>/resourceGroups/monitor-rg/providers/microsoft.insights/components/<app-insights-name>"

# Alert when order success rate drops below 95%
# A metric alert evaluates a single metric and cannot subtract one metric from another,
# so the ratio is computed in a log (scheduled query) alert instead.
az monitor scheduled-query create \
  --name "OrderSuccessRateLow" \
  --resource-group monitor-rg \
  --scopes "$APP_INSIGHTS_ID" \
  --condition "avg 'success_rate' from 'SuccessRate' < 95" \
  --condition-query SuccessRate="customMetrics | where name in ('orders_placed', 'orders_failed') | summarize placed = sumif(valueSum, name == 'orders_placed'), failed = sumif(valueSum, name == 'orders_failed') | where placed > 0 | extend success_rate = (placed - failed) * 100.0 / placed | project success_rate" \
  --description "Order success rate below 95%" \
  --evaluation-frequency 5m \
  --window-size 15m \
  --severity 2

# Alert when average order value drops significantly
# Build the condition with the helper command so the CLI produces the correct grammar.
# NOTE(review): "azure.applicationinsights" is the usual namespace for Application Insights
# custom metrics; confirm it in the portal's metric picker for your resource.
ORDER_VALUE_CONDITION=$(az monitor metrics alert condition create \
  --type static \
  --aggregation Average \
  --namespace "azure.applicationinsights" \
  --metric "order_value" \
  --op LessThan \
  --threshold 50 \
  --output tsv)

# Single quotes around the description keep the shell from expanding "$5" in "$50".
az monitor metrics alert create \
  --name "OrderValueDropped" \
  --resource-group monitor-rg \
  --scopes "$APP_INSIGHTS_ID" \
  --condition "$ORDER_VALUE_CONDITION" \
  --description 'Average order value dropped below $50'
Step 4: Complex Business Alert Rules
// KQL-based alert: Business health score
// Reads the counters emitted in Step 1 from the Application Insights customMetrics table
// and returns a single row only when the last hour looks unhealthy.
let lookback = 1h;
customMetrics
| where timestamp > ago(lookback)
| where name in ("orders_placed", "orders_failed", "users_signed_up", "support_tickets_created")
| summarize
    orders_placed = sumif(valueSum, name == "orders_placed"),
    orders_failed = sumif(valueSum, name == "orders_failed"),
    signups = sumif(valueSum, name == "users_signed_up"),
    support_tickets = sumif(valueSum, name == "support_tickets_created")
| extend
    // With no orders in the window the rate is undefined; leave it null instead of dividing by zero
    success_rate = iff(orders_placed > 0, (orders_placed - orders_failed) * 100.0 / orders_placed, real(null)),
    health_score = (signups * 2) - (support_tickets * 1.5)
| where success_rate < 90 or health_score < 0
| project TimeGenerated = now(), success_rate, signups, support_tickets, health_score
Step 5: Custom Metrics in Node.js
import { MeterProvider, PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
import { AzureMonitorMetricExporter } from "@azure/monitor-opentelemetry-exporter";

// Export to Azure Monitor
// The provider and its reader must exist before any meter is requested; instruments
// created from the provider's meter are exported once per interval.
const reader = new PeriodicExportingMetricReader({
  exportIntervalMillis: 60000,
  exporter: new AzureMonitorMetricExporter({
    connectionString: process.env["APPLICATIONINSIGHTS_CONNECTION_STRING"],
  }),
});

// `readers` is the constructor option in current @opentelemetry/sdk-metrics releases;
// on older 1.x versions call `meterProvider.addMetricReader(reader)` instead.
const meterProvider = new MeterProvider({ readers: [reader] });

// Create meter (meters are obtained from a provider, not constructed directly)
const meter = meterProvider.getMeter("business-metrics", "1.0.0");

// Create instruments
const ordersPlacedCounter = meter.createCounter("orders_placed", {
  description: "Total orders placed",
});

const orderValueHistogram = meter.createHistogram("order_value", {
  description: "Order value distribution",
  unit: "USD",
});

const activeUsersGauge = meter.createObservableGauge("active_users", {
  description: "Currently active users",
});

// Observable gauges are pull-based: the SDK invokes this callback on every collection,
// so no setInterval timer is needed to keep the value fresh.
activeUsersGauge.addCallback((result) => {
  result.observe(getCurrentActiveUsers());
});

/**
 * Records one placed order: increments the order counter and adds the order total
 * to the value histogram.
 *
 * @param order - The order that was placed; `region`, `tier` and `total` are read.
 */
export function recordOrder(order: Order): void {
  ordersPlacedCounter.add(1, {
    region: order.region,
    tier: order.tier,
  });

  orderValueHistogram.record(order.total, {
    region: order.region,
  });
}
Step 6: Service Bus Business Metrics
// Track business events from Service Bus messages
// NOTE(review): _metrics and _orderService are never assigned in this snippet;
// they are presumably injected through a constructor that is not shown — confirm.
public class OrderMessageProcessor
{
    private readonly BusinessMetricsService _metrics;

    /// <summary>
    /// Processes one order message and records the outcome as a business metric.
    /// </summary>
    /// <param name="message">Service Bus message whose body is a JSON-serialized order.</param>
    /// <remarks>
    /// Any exception is counted as a failed order and then rethrown so Service Bus can
    /// apply its normal retry / dead-letter handling.
    /// </remarks>
    public async Task ProcessMessageAsync(ServiceBusReceivedMessage message)
    {
        try
        {
            // BinaryData exposes ToObjectFromJson<T>(); ToObject<T>() requires a serializer argument.
            // Deserializing inside the try block means malformed payloads are counted as failures too.
            var order = message.Body.ToObjectFromJson<Order>()
                ?? throw new InvalidOperationException(
                    $"Message {message.MessageId} does not contain an order payload.");

            await _orderService.ProcessOrder(order);

            // Track by region for regional alerts: RecordOrderPlaced already tags the metric
            // with order.Region, so fall back to the message property only when the payload
            // carries no region. Assumes Order.Region has a setter — TODO confirm.
            if (string.IsNullOrEmpty(order.Region)
                && message.ApplicationProperties.TryGetValue("Region", out var region)
                && region?.ToString() is { Length: > 0 } regionName)
            {
                order.Region = regionName;
            }

            _metrics.RecordOrderPlaced(order);
        }
        catch (Exception ex)
        {
            // Use the exception type, not ex.Message: free-form messages create an
            // unbounded number of distinct "reason" dimension values.
            _metrics.RecordOrderFailed(ex.GetType().Name);
            throw;
        }
    }
}
// Create alerts on business metrics
// NOTE(review): this class is a placeholder. ConfigureAlerts has an empty body; the
// comments inside it only list the alert rules that still have to be implemented.
public class BusinessAlertConfig
{
// Currently a no-op: calling it creates no alert rules.
public void ConfigureAlerts()
{
// Alert if orders drop below threshold
// Alert if support tickets spike
// Alert if payment failures increase
// Alert if user signups stop
}
}
Step 7: Alert Severity and Escalation
{
"alertRules": [
{
"name": "Critical Business Impact",
"condition": "orders_failed > 10 AND success_rate < 0.8",
"severity": "critical",
"actions": [
{ "type": "email", "to": "oncall@company.com" },
{ "type": "sms", "to": "+1234567890" },
{ "type": "webhook", "url": "https://events.pagerduty.com/v2/enqueue" }
]
},
{
"name": "Warning - Business Impact",
"condition": "success_rate < 0.95 OR order_value < 30",
"severity": "warning",
"actions": [
{ "type": "email", "to": "team@company.com" },
{ "type": "teams", "channel": "#operations" }
]
},
{
"name": "Info - Business Trend",
"condition": "active_users < 100 AND time > 6h",
"severity": "info",
"actions": [
{ "type": "email", "to": "product@company.com" }
]
}
]
}
Step 8: Business Dashboard
// Business Overview Dashboard Query
// Hourly business KPIs for the last 24 hours, built from the custom metrics emitted in Step 1.
let startTime = ago(24h);
let endTime = now();
customMetrics
// Apply the time range; without this filter the query scans the full retention period
| where timestamp between (startTime .. endTime)
| where name in ("orders_placed", "orders_failed", "order_value", "users_signed_up", "support_tickets_created")
// One pass with sumif() per metric replaces three sub-queries and two outer joins,
// so every hour yields exactly one row with a single timestamp column
| summarize
    // Orders metrics
    total = sumif(valueSum, name == "orders_placed"),
    failed = sumif(valueSum, name == "orders_failed"),
    revenue = sumif(valueSum, name == "order_value"),
    // Signups
    signups = sumif(valueSum, name == "users_signed_up"),
    // Support
    tickets = sumif(valueSum, name == "support_tickets_created")
    by bin(timestamp, 1h)
| extend
    // Hours without orders have no meaningful rate or average; leave them null
    success_rate = iff(total > 0, round((total - failed) * 100.0 / total, 2), real(null)),
    avg_order_value = iff(total > 0, round(revenue / total, 2), real(null))
| project TimeGenerated = timestamp, total, failed, success_rate, revenue, avg_order_value, signups, tickets
| order by TimeGenerated desc
Best Practices
| Practice | Benefit |
|---|---|
| Track business events, not just technical | Measure real outcomes |
| Use proper metric types (counter, gauge, histogram) | Correct aggregation and analysis |
| Add business context as dimensions | Filter and segment |
| Set thresholds based on business impact | Actionable alerts |
| Create severity levels | Proper escalation |
Summary
- Emit custom metrics for business events
- Use Application Insights/OpenTelemetry
- Create alerts on business KPIs, not just technical metrics
- Build dashboards showing business health
- Implement proper alert severity and escalation