# Know Foolery - Observability Strategy
|
|
|
|
## Overview
|
|
|
|
Comprehensive observability is essential for maintaining the Know Foolery quiz game's reliability, performance, and user experience. This document outlines the strategy for metrics collection, monitoring, alerting, and distributed tracing across all system components.
|
|
|
|
## Observability Architecture
|
|
|
|
### Three Pillars of Observability
|
|
|
|
```
┌─────────────────────────────────────────────────────────────────────────┐
│                           Observability Stack                           │
│                                                                         │
│         ┌─────────────┐     ┌─────────────┐     ┌─────────────┐         │
│         │   METRICS   │     │    LOGS     │     │   TRACES    │         │
│         │             │     │             │     │             │         │
│         │ Prometheus  │     │    Loki     │     │   Jaeger    │         │
│         │      +      │     │      +      │     │      +      │         │
│         │   Grafana   │     │   Grafana   │     │ OpenTeleme- │         │
│         │             │     │             │     │     try     │         │
│         └─────────────┘     └─────────────┘     └─────────────┘         │
│                │                   │                   │                │
│                └───────────────────┼───────────────────┘                │
│                                    │                                    │
│                             ┌─────────────┐                             │
│                             │   Grafana   │                             │
│                             │   Unified   │                             │
│                             │  Dashboard  │                             │
│                             └─────────────┘                             │
└─────────────────────────────────────────────────────────────────────────┘
                                     │
                          Alerts & Notifications
                                     │
┌─────────────────────────────────────────────────────────────────────────┐
│                            Alert Management                             │
│         ┌─────────────┐     ┌─────────────┐     ┌─────────────┐         │
│         │    Email    │     │    Slack    │     │  PagerDuty  │         │
│         │   Alerts    │     │  Channels   │     │ (Critical)  │         │
│         └─────────────┘     └─────────────┘     └─────────────┘         │
└─────────────────────────────────────────────────────────────────────────┘
```
|
|
|
|
## Metrics Strategy
|
|
|
|
### Application Metrics Collection
|
|
|
|
#### 1. Business Metrics (Game-Specific)
|
|
```go
|
|
// Business metrics for game insights
|
|
package metrics
|
|
|
|
import (
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)
|
|
|
|
var (
|
|
// Game session metrics
|
|
gamesStarted = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "games_started_total",
|
|
Help: "Total number of games started",
|
|
},
|
|
[]string{"player_type", "platform"},
|
|
)
|
|
|
|
gamesCompleted = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "games_completed_total",
|
|
Help: "Total number of games completed",
|
|
},
|
|
[]string{"completion_type", "platform"}, // normal, timeout, abandoned
|
|
)
|
|
|
|
sessionDuration = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "game_session_duration_seconds",
|
|
Help: "Duration of game sessions",
|
|
Buckets: []float64{60, 300, 600, 900, 1200, 1500, 1800}, // 1min to 30min
|
|
},
|
|
[]string{"completion_type"},
|
|
)
|
|
|
|
// Question and answer metrics
|
|
questionsAsked = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "questions_asked_total",
|
|
Help: "Total number of questions asked",
|
|
},
|
|
[]string{"theme", "difficulty"},
|
|
)
|
|
|
|
answersSubmitted = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "answers_submitted_total",
|
|
Help: "Total number of answers submitted",
|
|
},
|
|
[]string{"theme", "is_correct", "attempt_number", "used_hint"},
|
|
)
|
|
|
|
hintsRequested = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "hints_requested_total",
|
|
Help: "Total number of hints requested",
|
|
},
|
|
[]string{"theme", "question_difficulty"},
|
|
)
|
|
|
|
// Score distribution
|
|
scoreDistribution = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "game_scores",
|
|
Help: "Distribution of game scores",
|
|
Buckets: []float64{0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100},
|
|
},
|
|
[]string{"session_duration_bucket"},
|
|
)
|
|
|
|
// Leaderboard metrics
|
|
leaderboardUpdates = promauto.NewCounter(
|
|
prometheus.CounterOpts{
|
|
Name: "leaderboard_updates_total",
|
|
Help: "Total number of leaderboard updates",
|
|
},
|
|
)
|
|
|
|
topScoreChanges = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "top_score_changes_total",
|
|
Help: "Changes in top 10 scores",
|
|
},
|
|
[]string{"position"}, // top_1, top_5, top_10
|
|
)
|
|
)
|
|
|
|
// Business metrics collection service
|
|
type GameMetrics struct {
|
|
registry prometheus.Registerer
|
|
}
|
|
|
|
func NewGameMetrics() *GameMetrics {
|
|
return &GameMetrics{
|
|
registry: prometheus.DefaultRegisterer,
|
|
}
|
|
}
|
|
|
|
func (m *GameMetrics) RecordGameStart(playerType, platform string) {
|
|
gamesStarted.WithLabelValues(playerType, platform).Inc()
|
|
}
|
|
|
|
func (m *GameMetrics) RecordGameCompletion(completionType, platform string, duration time.Duration) {
|
|
gamesCompleted.WithLabelValues(completionType, platform).Inc()
|
|
sessionDuration.WithLabelValues(completionType).Observe(duration.Seconds())
|
|
}
|
|
|
|
func (m *GameMetrics) RecordQuestionAsked(theme, difficulty string) {
|
|
questionsAsked.WithLabelValues(theme, difficulty).Inc()
|
|
}
|
|
|
|
func (m *GameMetrics) RecordAnswerSubmitted(theme string, isCorrect bool, attemptNum int, usedHint bool) {
|
|
answersSubmitted.WithLabelValues(
|
|
theme,
|
|
strconv.FormatBool(isCorrect),
|
|
strconv.Itoa(attemptNum),
|
|
strconv.FormatBool(usedHint),
|
|
).Inc()
|
|
}
|
|
|
|
func (m *GameMetrics) RecordFinalScore(score int, sessionDuration time.Duration) {
|
|
durationBucket := m.getDurationBucket(sessionDuration)
|
|
scoreDistribution.WithLabelValues(durationBucket).Observe(float64(score))
|
|
}
|
|
|
|
func (m *GameMetrics) getDurationBucket(duration time.Duration) string {
|
|
minutes := int(duration.Minutes())
|
|
switch {
|
|
case minutes <= 5:
|
|
return "0-5min"
|
|
case minutes <= 15:
|
|
return "5-15min"
|
|
case minutes <= 25:
|
|
return "15-25min"
|
|
default:
|
|
return "25-30min"
|
|
}
|
|
}
|
|
```
|
|
|
|
#### 2. Technical Metrics (Infrastructure)
|
|
```go
|
|
// Technical metrics for system health
|
|
var (
|
|
// HTTP metrics
|
|
httpRequestsTotal = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "http_requests_total",
|
|
Help: "Total number of HTTP requests",
|
|
},
|
|
[]string{"method", "endpoint", "status_code", "service"},
|
|
)
|
|
|
|
httpRequestDuration = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "http_request_duration_seconds",
|
|
Help: "HTTP request duration",
|
|
Buckets: prometheus.DefBuckets,
|
|
},
|
|
[]string{"method", "endpoint", "service"},
|
|
)
|
|
|
|
// Database metrics
|
|
dbConnectionsActive = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "db_connections_active",
|
|
Help: "Number of active database connections",
|
|
},
|
|
[]string{"database", "service"},
|
|
)
|
|
|
|
dbQueryDuration = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "db_query_duration_seconds",
|
|
Help: "Database query duration",
|
|
Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
|
|
},
|
|
[]string{"query_type", "table", "service"},
|
|
)
|
|
|
|
dbErrors = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "db_errors_total",
|
|
Help: "Total number of database errors",
|
|
},
|
|
[]string{"error_type", "service"},
|
|
)
|
|
|
|
// Cache metrics
|
|
cacheOperations = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "cache_operations_total",
|
|
Help: "Total number of cache operations",
|
|
},
|
|
[]string{"operation", "result", "service"}, // get/set/delete, hit/miss/error
|
|
)
|
|
|
|
cacheKeyCount = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "cache_keys_total",
|
|
Help: "Number of keys in cache",
|
|
},
|
|
[]string{"cache_type", "service"},
|
|
)
|
|
|
|
// Authentication metrics
|
|
authenticationAttempts = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "authentication_attempts_total",
|
|
Help: "Total authentication attempts",
|
|
},
|
|
[]string{"method", "result", "user_type"}, // jwt/oauth, success/failure, player/admin
|
|
)
|
|
|
|
tokenOperations = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "token_operations_total",
|
|
Help: "JWT token operations",
|
|
},
|
|
[]string{"operation", "result"}, // validate/refresh, success/failure
|
|
)
|
|
)
|
|
|
|
// Fiber middleware for HTTP metrics
|
|
func PrometheusMiddleware(serviceName string) fiber.Handler {
|
|
return func(c *fiber.Ctx) error {
|
|
start := time.Now()
|
|
|
|
// Process request
|
|
err := c.Next()
|
|
|
|
// Record metrics
|
|
duration := time.Since(start).Seconds()
|
|
statusCode := strconv.Itoa(c.Response().StatusCode())
|
|
|
|
httpRequestsTotal.WithLabelValues(
|
|
c.Method(),
|
|
c.Route().Path,
|
|
statusCode,
|
|
serviceName,
|
|
).Inc()
|
|
|
|
httpRequestDuration.WithLabelValues(
|
|
c.Method(),
|
|
c.Route().Path,
|
|
serviceName,
|
|
).Observe(duration)
|
|
|
|
return err
|
|
}
|
|
}
|
|
|
|
// MetricsHook is the database metrics middleware for Ent. It carries the
// service name used as the "service" label on the database metrics.
type MetricsHook struct {
	serviceName string
}
|
|
|
|
func NewMetricsHook(serviceName string) *MetricsHook {
|
|
return &MetricsHook{serviceName: serviceName}
|
|
}
|
|
|
|
func (h *MetricsHook) Hook() ent.Hook {
|
|
return hook.On(
|
|
func(next ent.Mutator) ent.Mutator {
|
|
return ent.MutateFunc(func(ctx context.Context, m ent.Mutation) (ent.Value, error) {
|
|
start := time.Now()
|
|
|
|
result, err := next.Mutate(ctx, m)
|
|
|
|
duration := time.Since(start).Seconds()
|
|
queryType := strings.ToLower(m.Op().String())
|
|
table := m.Type()
|
|
|
|
dbQueryDuration.WithLabelValues(queryType, table, h.serviceName).Observe(duration)
|
|
|
|
if err != nil {
|
|
dbErrors.WithLabelValues("query_error", h.serviceName).Inc()
|
|
}
|
|
|
|
return result, err
|
|
})
|
|
},
|
|
ent.OpCreate|ent.OpUpdate|ent.OpUpdateOne|ent.OpDelete|ent.OpDeleteOne,
|
|
)
|
|
}
|
|
```
|
|
|
|
### Frontend Metrics Collection
|
|
|
|
#### Web Application Metrics
|
|
```typescript
|
|
// Frontend metrics collection

/**
 * Buffers client-side metric events and POSTs them to `endpoint` as
 * `{ metrics: [...] }`, either when `batchSize` events are queued or every
 * `flushInterval` milliseconds.
 *
 * Constructing an instance has side effects: it starts the flush timer,
 * registers a `beforeunload` listener, subscribes to performance entries and
 * replaces `window.fetch` with a timing wrapper.
 */
class MetricsCollector {
  private endpoint: string
  private batchSize: number = 50
  private flushInterval: number = 30000 // 30 seconds
  // Upper bound on events kept for retry, so a dead endpoint cannot grow
  // the queue without limit.
  private maxQueuedMetrics: number = 500
  private metrics: MetricEvent[] = []
  // Running layout-shift total across all observer callbacks.
  private layoutShiftTotal: number = 0
  // fetch as it was before interceptFetch() patched it. Used both to forward
  // intercepted calls and to send batches, so that shipping metrics does not
  // itself produce an api_request_duration metric on every flush.
  private transportFetch: typeof window.fetch = window.fetch.bind(window)

  constructor(endpoint: string) {
    this.endpoint = endpoint
    this.startBatchFlush()
    this.setupPerformanceObserver()
  }

  // User interaction metrics
  trackUserAction(action: string, properties: Record<string, any> = {}): void {
    this.addMetric({
      type: 'user_action',
      action,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      user_agent: navigator.userAgent,
      ...properties,
    })
  }

  // Game-specific metrics
  trackGameEvent(event: GameEvent): void {
    this.addMetric({
      type: 'game_event',
      event: event.type,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      game_session_id: event.gameSessionId,
      properties: event.properties,
    })
  }

  // Performance metrics
  trackPerformance(metric: PerformanceMetric): void {
    // Fields beyond name/value (interceptFetch passes `url` and `status`)
    // are forwarded instead of being dropped. A forwarded `url` replaces the
    // page path default below.
    // NOTE(review): assumes PerformanceMetric permits these extra fields,
    // as the existing interceptFetch call sites imply - confirm.
    const { name, value, ...details } = metric
    this.addMetric({
      type: 'performance',
      metric: name,
      value,
      timestamp: Date.now(),
      url: window.location.pathname,
      ...details,
    })
  }

  // Error tracking
  trackError(error: Error, context: string): void {
    this.addMetric({
      type: 'error',
      error_message: error.message,
      error_stack: error.stack,
      context,
      timestamp: Date.now(),
      url: window.location.pathname,
      user_agent: navigator.userAgent,
    })
  }

  private setupPerformanceObserver(): void {
    // Web Vitals tracking
    if ('PerformanceObserver' in window) {
      // Largest Contentful Paint
      this.observeEntries('largest-contentful-paint', (entries) => {
        entries.forEach((entry) => {
          this.trackPerformance({
            name: 'largest_contentful_paint',
            value: entry.startTime,
          })
        })
      })

      // First Input Delay
      this.observeEntries('first-input', (entries) => {
        entries.forEach((entry) => {
          const timing = entry as PerformanceEventTiming
          this.trackPerformance({
            name: 'first_input_delay',
            value: timing.processingStart - timing.startTime,
          })
        })
      })

      // Cumulative Layout Shift: the total is kept on the instance so the
      // reported value keeps accumulating across callbacks instead of
      // restarting from zero with every batch of entries.
      this.observeEntries('layout-shift', (entries) => {
        entries.forEach((entry) => {
          const shift = entry as PerformanceEntry & { value: number; hadRecentInput: boolean }
          if (!shift.hadRecentInput) {
            this.layoutShiftTotal += shift.value
          }
        })
        this.trackPerformance({
          name: 'cumulative_layout_shift',
          value: this.layoutShiftTotal,
        })
      })
    }

    // API response time tracking
    this.interceptFetch()
  }

  // Subscribes `handler` to performance entries of a single entry type.
  private observeEntries(entryType: string, handler: (entries: PerformanceEntry[]) => void): void {
    new PerformanceObserver((list) => handler(list.getEntries())).observe({ entryTypes: [entryType] })
  }

  private interceptFetch(): void {
    const originalFetch = this.transportFetch
    window.fetch = async (...args) => {
      const start = performance.now()
      // fetch accepts a string, a URL or a Request. Stringifying a Request
      // gives "[object Request]", so read its url property instead.
      const target = args[0]
      const url = target instanceof Request ? target.url : String(target)

      try {
        const response = await originalFetch(...args)
        const duration = performance.now() - start

        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          url,
          status: response.status,
        })

        return response
      } catch (error) {
        const duration = performance.now() - start

        this.trackError(error as Error, `API request to ${url}`)
        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          url,
          status: 0,
        })

        throw error
      }
    }
  }

  private addMetric(metric: MetricEvent): void {
    this.metrics.push(metric)

    if (this.metrics.length >= this.batchSize) {
      this.flush()
    }
  }

  private async flush(): Promise<void> {
    if (this.metrics.length === 0) return

    const batch = this.metrics
    this.metrics = []

    try {
      await this.transportFetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ metrics: batch }),
      })
    } catch (error) {
      console.error('Failed to send metrics:', error)
      // Re-queue for retry ahead of anything recorded meanwhile, keeping
      // only the most recent maxQueuedMetrics events.
      this.metrics = [...batch, ...this.metrics].slice(-this.maxQueuedMetrics)
    }
  }

  private startBatchFlush(): void {
    setInterval(() => {
      this.flush()
    }, this.flushInterval)

    // Flush on page unload. sendBeacon is used (as FrontendLogger does)
    // because an ordinary async request may be dropped while the page is
    // going away.
    window.addEventListener('beforeunload', () => {
      if (this.metrics.length === 0) return

      const payload = JSON.stringify({ metrics: this.metrics })
      if (navigator.sendBeacon(this.endpoint, payload)) {
        this.metrics = []
      } else {
        this.flush()
      }
    })
  }

  private getSessionId(): string {
    // Implementation depends on session management
    return sessionStorage.getItem('session_id') || 'anonymous'
  }
}
|
|
|
|
// Game-specific metrics tracking

/**
 * Thin facade over MetricsCollector: each method packages its arguments into
 * a game event and hands it to `collector.trackGameEvent`.
 */
export class GameMetricsTracker {
  constructor(private collector: MetricsCollector) {}

  /** Emits `game_started` with the player name and detected platform. */
  trackGameStart(gameSessionId: string, playerName: string): void {
    const properties = {
      player_name: playerName,
      platform: this.getPlatform(),
    }
    this.emit('game_started', gameSessionId, properties)
  }

  /** Emits `question_displayed`, stamped with the current time. */
  trackQuestionDisplayed(gameSessionId: string, questionId: string, theme: string): void {
    const properties = {
      question_id: questionId,
      theme,
      display_time: Date.now(),
    }
    this.emit('question_displayed', gameSessionId, properties)
  }

  /** Emits `answer_submitted`; `timeTaken` is reported as `time_taken_ms`. */
  trackAnswerSubmitted(
    gameSessionId: string,
    questionId: string,
    isCorrect: boolean,
    attemptNumber: number,
    timeTaken: number,
    usedHint: boolean
  ): void {
    const properties = {
      question_id: questionId,
      is_correct: isCorrect,
      attempt_number: attemptNumber,
      time_taken_ms: timeTaken,
      used_hint: usedHint,
    }
    this.emit('answer_submitted', gameSessionId, properties)
  }

  /** Emits `hint_requested`, stamped with the current time. */
  trackHintRequested(gameSessionId: string, questionId: string): void {
    const properties = {
      question_id: questionId,
      request_time: Date.now(),
    }
    this.emit('hint_requested', gameSessionId, properties)
  }

  /** Emits `game_completed` with the final tallies and detected platform. */
  trackGameCompleted(
    gameSessionId: string,
    finalScore: number,
    questionsAnswered: number,
    completionType: 'normal' | 'timeout' | 'abandoned'
  ): void {
    const properties = {
      final_score: finalScore,
      questions_answered: questionsAnswered,
      completion_type: completionType,
      platform: this.getPlatform(),
    }
    this.emit('game_completed', gameSessionId, properties)
  }

  // Forwards a single game event to the collector.
  private emit(type: GameEvent['type'], gameSessionId: string, properties: GameEvent['properties']): void {
    this.collector.trackGameEvent({ type, gameSessionId, properties })
  }

  // Detect platform: user-agent checks first, then the Wails global, then
  // the 'web' fallback.
  private getPlatform(): string {
    const agent = navigator.userAgent
    if (/Android/i.test(agent)) {
      return 'android'
    }
    if (/iPhone|iPad|iPod/i.test(agent)) {
      return 'ios'
    }
    // For Wails apps
    return window.wails ? 'desktop' : 'web'
  }
}
|
|
|
|
// Usage in React components

// Builds the collector/tracker pair and the bound tracking functions.
const createGameMetricsApi = () => {
  const collector = new MetricsCollector('/api/v1/metrics')
  const tracker = new GameMetricsTracker(collector)

  return {
    trackGameStart: tracker.trackGameStart.bind(tracker),
    trackQuestionDisplayed: tracker.trackQuestionDisplayed.bind(tracker),
    trackAnswerSubmitted: tracker.trackAnswerSubmitted.bind(tracker),
    trackHintRequested: tracker.trackHintRequested.bind(tracker),
    trackGameCompleted: tracker.trackGameCompleted.bind(tracker),
    trackUserAction: collector.trackUserAction.bind(collector),
    trackError: collector.trackError.bind(collector),
  }
}

// Created on first use and then shared by every component.
let gameMetricsApi: ReturnType<typeof createGameMetricsApi> | null = null

/**
 * Returns the game metrics tracking functions.
 *
 * The collector is created once per page rather than inside the hook body:
 * an expression passed to `useRef(...)` is evaluated on every render, and
 * constructing a MetricsCollector starts a timer, adds an unload listener
 * and wraps `window.fetch` again each time. The returned object and its
 * functions keep the same identity across renders, so they are safe to list
 * in dependency arrays.
 */
export const useGameMetrics = () => {
  if (gameMetricsApi === null) {
    gameMetricsApi = createGameMetricsApi()
  }
  return gameMetricsApi
}
|
|
```
|
|
|
|
## Distributed Tracing
|
|
|
|
### OpenTelemetry Integration
|
|
|
|
#### Backend Tracing Setup
|
|
```go
|
|
// OpenTelemetry tracing setup
|
|
package observability
|
|
|
|
import (
|
|
"context"
|
|
"go.opentelemetry.io/otel"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/exporters/jaeger"
|
|
"go.opentelemetry.io/otel/sdk/resource"
|
|
"go.opentelemetry.io/otel/sdk/trace"
|
|
"go.opentelemetry.io/otel/semconv/v1.12.0/httpconv"
|
|
"go.opentelemetry.io/otel/semconv/v1.12.0/netconv"
|
|
)
|
|
|
|
// TracingConfig holds the settings InitTracing needs to set up tracing.
type TracingConfig struct {
	// Reported on the resource as service.name, service.version and
	// environment respectively.
	ServiceName, ServiceVersion, Environment string

	// JaegerEndpoint is the Jaeger collector endpoint spans are exported to.
	JaegerEndpoint string

	// SampleRate is the ratio handed to the trace-ID ratio sampler.
	SampleRate float64
}
|
|
|
|
func InitTracing(config TracingConfig) (*trace.TracerProvider, error) {
|
|
// Create Jaeger exporter
|
|
jaegerExporter, err := jaeger.New(
|
|
jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(config.JaegerEndpoint)),
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Create resource with service information
|
|
res, err := resource.New(
|
|
context.Background(),
|
|
resource.WithAttributes(
|
|
attribute.String("service.name", config.ServiceName),
|
|
attribute.String("service.version", config.ServiceVersion),
|
|
attribute.String("environment", config.Environment),
|
|
),
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Create tracer provider
|
|
tp := trace.NewTracerProvider(
|
|
trace.WithBatcher(jaegerExporter),
|
|
trace.WithResource(res),
|
|
trace.WithSampler(trace.TraceIDRatioBased(config.SampleRate)),
|
|
)
|
|
|
|
// Set global tracer provider
|
|
otel.SetTracerProvider(tp)
|
|
|
|
return tp, nil
|
|
}
|
|
|
|
// Fiber middleware for distributed tracing
|
|
func TracingMiddleware(serviceName string) fiber.Handler {
|
|
tracer := otel.Tracer(serviceName)
|
|
|
|
return func(c *fiber.Ctx) error {
|
|
// Start span
|
|
ctx, span := tracer.Start(c.Context(), fmt.Sprintf("%s %s", c.Method(), c.Route().Path))
|
|
defer span.End()
|
|
|
|
// Set span attributes
|
|
span.SetAttributes(
|
|
httpconv.HTTPMethodKey.String(c.Method()),
|
|
httpconv.HTTPURLKey.String(string(c.Request().URI().FullURI())),
|
|
httpconv.HTTPUserAgentKey.String(string(c.Request().Header.UserAgent())),
|
|
netconv.NetPeerIPKey.String(c.IP()),
|
|
)
|
|
|
|
// Add to context
|
|
c.SetUserContext(ctx)
|
|
|
|
// Process request
|
|
err := c.Next()
|
|
|
|
// Set response attributes
|
|
span.SetAttributes(
|
|
httpconv.HTTPStatusCodeKey.Int(c.Response().StatusCode()),
|
|
)
|
|
|
|
if err != nil {
|
|
span.RecordError(err)
|
|
}
|
|
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Service-level tracing helpers
|
|
func TraceServiceOperation(ctx context.Context, serviceName, operation string, fn func(context.Context) error) error {
|
|
tracer := otel.Tracer(serviceName)
|
|
ctx, span := tracer.Start(ctx, operation)
|
|
defer span.End()
|
|
|
|
err := fn(ctx)
|
|
if err != nil {
|
|
span.RecordError(err)
|
|
span.SetAttributes(attribute.Bool("error", true))
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
// Database tracing for Ent
|
|
func TraceDatabaseOperation(ctx context.Context, operation, table string, fn func(context.Context) error) error {
|
|
tracer := otel.Tracer("database")
|
|
ctx, span := tracer.Start(ctx, fmt.Sprintf("db.%s.%s", operation, table))
|
|
defer span.End()
|
|
|
|
span.SetAttributes(
|
|
attribute.String("db.operation", operation),
|
|
attribute.String("db.table", table),
|
|
attribute.String("db.system", "postgresql"),
|
|
)
|
|
|
|
err := fn(ctx)
|
|
if err != nil {
|
|
span.RecordError(err)
|
|
}
|
|
|
|
return err
|
|
}
|
|
```
|
|
|
|
#### Frontend Tracing Integration
|
|
```typescript
|
|
// Frontend tracing with OpenTelemetry
|
|
import { WebTracerProvider } from '@opentelemetry/sdk-trace-web'
|
|
import { getWebAutoInstrumentations } from '@opentelemetry/auto-instrumentations-web'
|
|
import { JaegerExporter } from '@opentelemetry/exporter-jaeger'
|
|
import { registerInstrumentations } from '@opentelemetry/instrumentation'
|
|
|
|
/**
 * Sets up browser tracing: a WebTracerProvider that batches spans to the
 * configured Jaeger endpoint, plus auto-instrumentation for document load,
 * user interaction and fetch.
 */
export class FrontendTracing {
  private provider: WebTracerProvider

  constructor(config: TracingConfig) {
    this.provider = new WebTracerProvider({
      resource: new Resource({
        'service.name': config.serviceName,
        'service.version': config.serviceVersion,
      }),
    })

    // Configure Jaeger exporter
    const jaegerExporter = new JaegerExporter({
      endpoint: config.jaegerEndpoint,
    })

    this.provider.addSpanProcessor(
      new BatchSpanProcessor(jaegerExporter)
    )

    // Register provider
    this.provider.register()

    // Auto-instrument browser APIs
    registerInstrumentations({
      instrumentations: [
        getWebAutoInstrumentations({
          '@opentelemetry/instrumentation-document-load': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-user-interaction': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-fetch': {
            enabled: true,
            // The base URL is matched literally. Unescaped, characters such
            // as "." and "?" are regex metacharacters and would let trace
            // headers be attached to requests for look-alike hosts.
            propagateTraceHeaderCorsUrls: [
              new RegExp(FrontendTracing.escapeRegExp(config.apiBaseUrl)),
            ],
          },
        }),
      ],
    })
  }

  // Escapes every regular-expression metacharacter in `text`.
  private static escapeRegExp(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  }

  /**
   * Runs `fn` inside an active span named `action` from the 'game-frontend'
   * tracer, with `properties` as span attributes. If `fn` rejects, the
   * exception is recorded, the span is marked ERROR and the error is
   * rethrown. The span is always ended.
   */
  traceGameAction(action: string, properties: Record<string, any>, fn: () => Promise<void>): Promise<void> {
    const tracer = trace.getTracer('game-frontend')

    return tracer.startActiveSpan(action, async (span) => {
      try {
        // Set span attributes
        span.setAttributes(properties)

        await fn()
      } catch (error) {
        span.recordException(error as Error)
        span.setStatus({ code: SpanStatusCode.ERROR })
        throw error
      } finally {
        span.end()
      }
    })
  }
}
|
|
|
|
// React hook for tracing

/**
 * Returns `traceUserAction`, which runs `fn` inside an active span named
 * `user.<action>` with `properties` as span attributes. If `fn` rejects, the
 * exception is recorded, the span is marked ERROR (as traceGameAction does)
 * and the error is rethrown. The span is always ended.
 */
export const useTracing = () => {
  const traceUserAction = useCallback(
    async (action: string, properties: Record<string, any>, fn: () => Promise<void>) => {
      // The tracer is looked up per call instead of per render, so the
      // callback needs no dependencies and keeps a stable identity.
      const tracer = trace.getTracer('react-components')

      return tracer.startActiveSpan(`user.${action}`, async (span) => {
        try {
          span.setAttributes(properties)
          await fn()
        } catch (error) {
          span.recordException(error as Error)
          span.setStatus({ code: SpanStatusCode.ERROR })
          throw error
        } finally {
          span.end()
        }
      })
    },
    []
  )

  return { traceUserAction }
}
|
|
```
|
|
|
|
## Logging Strategy
|
|
|
|
### Structured Logging Implementation
|
|
|
|
#### Backend Logging
|
|
```go
|
|
// Structured logging with zerolog
|
|
package logging
|
|
|
|
import (
|
|
"os"
|
|
"time"
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
type Logger struct {
|
|
logger zerolog.Logger
|
|
}
|
|
|
|
// LogConfig holds the settings NewLogger reads.
type LogConfig struct {
	// Level is parsed with zerolog.ParseLevel.
	Level string

	// Environment selects the output format ("development" gets console
	// output) and is attached to every entry as "environment".
	Environment string

	// ServiceName and Version are attached to every entry as "service" and
	// "version".
	ServiceName, Version string
}
|
|
|
|
func NewLogger(config LogConfig) *Logger {
|
|
// Parse log level
|
|
level, err := zerolog.ParseLevel(config.Level)
|
|
if err != nil {
|
|
level = zerolog.InfoLevel
|
|
}
|
|
|
|
// Configure zerolog
|
|
zerolog.SetGlobalLevel(level)
|
|
zerolog.TimeFieldFormat = time.RFC3339Nano
|
|
|
|
var logger zerolog.Logger
|
|
|
|
if config.Environment == "development" {
|
|
// Human-readable console output for development
|
|
logger = zerolog.New(zerolog.ConsoleWriter{
|
|
Out: os.Stdout,
|
|
TimeFormat: "15:04:05",
|
|
}).With().Timestamp().Logger()
|
|
} else {
|
|
// JSON output for production
|
|
logger = zerolog.New(os.Stdout).With().Timestamp().Logger()
|
|
}
|
|
|
|
// Add service metadata
|
|
logger = logger.With().
|
|
Str("service", config.ServiceName).
|
|
Str("version", config.Version).
|
|
Str("environment", config.Environment).
|
|
Logger()
|
|
|
|
return &Logger{logger: logger}
|
|
}
|
|
|
|
// Structured logging methods
|
|
func (l *Logger) GameEvent(event string, gameSessionID, userID string, properties map[string]interface{}) {
|
|
l.logger.Info().
|
|
Str("event_type", "game").
|
|
Str("event", event).
|
|
Str("game_session_id", gameSessionID).
|
|
Str("user_id", userID).
|
|
Fields(properties).
|
|
Msg("Game event occurred")
|
|
}
|
|
|
|
func (l *Logger) APIRequest(method, path string, statusCode int, duration time.Duration, userID string) {
|
|
l.logger.Info().
|
|
Str("event_type", "api_request").
|
|
Str("method", method).
|
|
Str("path", path).
|
|
Int("status_code", statusCode).
|
|
Dur("duration_ms", duration).
|
|
Str("user_id", userID).
|
|
Msg("API request processed")
|
|
}
|
|
|
|
func (l *Logger) DatabaseOperation(operation, table string, duration time.Duration, rowsAffected int64) {
|
|
l.logger.Debug().
|
|
Str("event_type", "database").
|
|
Str("operation", operation).
|
|
Str("table", table).
|
|
Dur("duration_ms", duration).
|
|
Int64("rows_affected", rowsAffected).
|
|
Msg("Database operation completed")
|
|
}
|
|
|
|
func (l *Logger) AuthenticationEvent(event, userID, userType string, success bool, details map[string]string) {
|
|
level := l.logger.Info()
|
|
if !success {
|
|
level = l.logger.Warn()
|
|
}
|
|
|
|
level.
|
|
Str("event_type", "authentication").
|
|
Str("event", event).
|
|
Str("user_id", userID).
|
|
Str("user_type", userType).
|
|
Bool("success", success).
|
|
Fields(details).
|
|
Msg("Authentication event")
|
|
}
|
|
|
|
func (l *Logger) SecurityEvent(event, userID, ipAddress string, severity string, details map[string]interface{}) {
|
|
l.logger.Warn().
|
|
Str("event_type", "security").
|
|
Str("event", event).
|
|
Str("user_id", userID).
|
|
Str("ip_address", ipAddress).
|
|
Str("severity", severity).
|
|
Fields(details).
|
|
Msg("Security event detected")
|
|
}
|
|
|
|
func (l *Logger) Error(err error, context string, fields map[string]interface{}) {
|
|
l.logger.Error().
|
|
Err(err).
|
|
Str("context", context).
|
|
Fields(fields).
|
|
Msg("Error occurred")
|
|
}
|
|
|
|
// Fiber middleware for request logging
|
|
func RequestLoggingMiddleware(logger *Logger) fiber.Handler {
|
|
return func(c *fiber.Ctx) error {
|
|
start := time.Now()
|
|
|
|
// Process request
|
|
err := c.Next()
|
|
|
|
// Log request
|
|
duration := time.Since(start)
|
|
userID := c.Locals("user_id")
|
|
if userID == nil {
|
|
userID = "anonymous"
|
|
}
|
|
|
|
logger.APIRequest(
|
|
c.Method(),
|
|
c.Route().Path,
|
|
c.Response().StatusCode(),
|
|
duration,
|
|
userID.(string),
|
|
)
|
|
|
|
return err
|
|
}
|
|
}
|
|
```
|
|
|
|
#### Frontend Logging
|
|
```typescript
|
|
// Frontend structured logging

/** Severity levels a log entry can carry. */
type LogLevel = 'debug' | 'info' | 'warn' | 'error'

/** Serializable summary of an Error attached to a log entry. */
interface LogErrorDetails {
  name: string
  message: string
  stack?: string
}

/** One structured log record as buffered and sent by FrontendLogger. */
interface LogEntry {
  timestamp: string
  level: LogLevel
  message: string
  context?: string
  // Identifiers read from sessionStorage when the entry is created
  userId?: string
  sessionId?: string
  gameSessionId?: string
  error?: LogErrorDetails
  properties?: Record<string, any>
}
|
|
|
|
export class FrontendLogger {
|
|
private buffer: LogEntry[] = []
|
|
private endpoint: string
|
|
private maxBufferSize: number = 100
|
|
private flushInterval: number = 30000
|
|
|
|
constructor(endpoint: string) {
|
|
this.endpoint = endpoint
|
|
this.startPeriodicFlush()
|
|
}
|
|
|
|
debug(message: string, context?: string, properties?: Record<string, any>): void {
|
|
this.log('debug', message, context, properties)
|
|
}
|
|
|
|
info(message: string, context?: string, properties?: Record<string, any>): void {
|
|
this.log('info', message, context, properties)
|
|
}
|
|
|
|
warn(message: string, context?: string, properties?: Record<string, any>): void {
|
|
this.log('warn', message, context, properties)
|
|
}
|
|
|
|
error(message: string, error?: Error, context?: string, properties?: Record<string, any>): void {
|
|
const entry: LogEntry = {
|
|
timestamp: new Date().toISOString(),
|
|
level: 'error',
|
|
message,
|
|
context,
|
|
userId: this.getUserId(),
|
|
sessionId: this.getSessionId(),
|
|
gameSessionId: this.getGameSessionId(),
|
|
properties,
|
|
}
|
|
|
|
if (error) {
|
|
entry.error = {
|
|
name: error.name,
|
|
message: error.message,
|
|
stack: error.stack,
|
|
}
|
|
}
|
|
|
|
this.buffer.push(entry)
|
|
this.checkFlushConditions()
|
|
}
|
|
|
|
// Game-specific logging methods
|
|
logGameEvent(event: string, gameSessionId: string, properties?: Record<string, any>): void {
|
|
this.info(`Game event: ${event}`, 'game', {
|
|
gameSessionId,
|
|
...properties,
|
|
})
|
|
}
|
|
|
|
logUserAction(action: string, properties?: Record<string, any>): void {
|
|
this.info(`User action: ${action}`, 'user', properties)
|
|
}
|
|
|
|
logPerformanceMetric(metric: string, value: number, unit: string): void {
|
|
this.debug(`Performance metric: ${metric}`, 'performance', {
|
|
metric,
|
|
value,
|
|
unit,
|
|
url: window.location.pathname,
|
|
})
|
|
}
|
|
|
|
private log(level: LogEntry['level'], message: string, context?: string, properties?: Record<string, any>): void {
|
|
const entry: LogEntry = {
|
|
timestamp: new Date().toISOString(),
|
|
level,
|
|
message,
|
|
context,
|
|
userId: this.getUserId(),
|
|
sessionId: this.getSessionId(),
|
|
gameSessionId: this.getGameSessionId(),
|
|
properties,
|
|
}
|
|
|
|
this.buffer.push(entry)
|
|
this.checkFlushConditions()
|
|
}
|
|
|
|
private checkFlushConditions(): void {
|
|
if (this.buffer.length >= this.maxBufferSize) {
|
|
this.flush()
|
|
}
|
|
}
|
|
|
|
private async flush(): Promise<void> {
|
|
if (this.buffer.length === 0) return
|
|
|
|
const logs = [...this.buffer]
|
|
this.buffer = []
|
|
|
|
try {
|
|
await fetch(this.endpoint, {
|
|
method: 'POST',
|
|
headers: {
|
|
'Content-Type': 'application/json',
|
|
},
|
|
body: JSON.stringify({ logs }),
|
|
})
|
|
} catch (error) {
|
|
console.error('Failed to send logs:', error)
|
|
// Re-queue logs for retry (keep only most recent to avoid memory issues)
|
|
this.buffer.unshift(...logs.slice(-50))
|
|
}
|
|
}
|
|
|
|
private startPeriodicFlush(): void {
|
|
setInterval(() => {
|
|
this.flush()
|
|
}, this.flushInterval)
|
|
|
|
// Flush on page unload
|
|
window.addEventListener('beforeunload', () => {
|
|
// Use sendBeacon for reliable delivery during page unload
|
|
if (this.buffer.length > 0) {
|
|
navigator.sendBeacon(this.endpoint, JSON.stringify({ logs: this.buffer }))
|
|
}
|
|
})
|
|
}
|
|
|
|
private getUserId(): string | undefined {
|
|
// Implementation depends on auth system
|
|
return sessionStorage.getItem('user_id') || undefined
|
|
}
|
|
|
|
private getSessionId(): string | undefined {
|
|
return sessionStorage.getItem('session_id') || undefined
|
|
}
|
|
|
|
/**
 * Reads the active game session id from sessionStorage under the key
 * 'game_session_id'.
 *
 * @returns The stored id, or undefined when the key is missing or empty.
 */
private getGameSessionId(): string | undefined {
  const storedGameSessionId = sessionStorage.getItem('game_session_id')
  return storedGameSessionId ? storedGameSessionId : undefined
}
|
|
}
|
|
```
|
|
|
|
## Monitoring Dashboards
|
|
|
|
### Grafana Dashboard Configuration
|
|
|
|
#### 1. Business Intelligence Dashboard
|
|
```json
|
|
{
|
|
"dashboard": {
|
|
"title": "Know Foolery - Business Intelligence",
|
|
"panels": [
|
|
{
|
|
"title": "Active Games",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(games_started_total[5m])) * 300",
|
|
"legendFormat": "Games per 5min"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Game Completion Rate",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(games_completed_total{completion_type=\"normal\"}[1h])) / sum(rate(games_started_total[1h])) * 100",
|
|
"legendFormat": "Completion %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Average Session Duration",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.5, rate(game_session_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "Median"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(game_session_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "95th percentile"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Question Accuracy by Theme",
|
|
"type": "heatmap",
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (theme) (rate(answers_submitted_total{is_correct=\"true\"}[1h])) / sum by (theme) (rate(answers_submitted_total[1h]))",
|
|
"legendFormat": "{{theme}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Hint Usage Rate",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(hints_requested_total[5m])) / sum(rate(questions_asked_total[5m])) * 100",
|
|
"legendFormat": "Hint Usage %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Score Distribution",
|
|
"type": "histogram",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.25, rate(game_scores_bucket[1h]))",
|
|
"legendFormat": "25th percentile"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.5, rate(game_scores_bucket[1h]))",
|
|
"legendFormat": "Median"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.75, rate(game_scores_bucket[1h]))",
|
|
"legendFormat": "75th percentile"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|
|
```
|
|
|
|
#### 2. Technical Performance Dashboard
|
|
```json
|
|
{
|
|
"dashboard": {
|
|
"title": "Know Foolery - Technical Performance",
|
|
"panels": [
|
|
{
|
|
"title": "API Response Times",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum by (le, service) (rate(http_request_duration_seconds_bucket[5m])))",
|
|
"legendFormat": "{{service}} - 95th percentile"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Error Rate",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m])) * 100",
|
|
"legendFormat": "{{service}} - Error %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Database Performance",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum by (le, service) (rate(db_query_duration_seconds_bucket[5m])))",
|
|
"legendFormat": "{{service}} - Query Time"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Cache Hit Rate",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(cache_operations_total{operation=\"get\", result=\"hit\"}[5m])) / sum(rate(cache_operations_total{operation=\"get\"}[5m])) * 100",
|
|
"legendFormat": "Hit Rate %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Authentication Success Rate",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (method) (rate(authentication_attempts_total{result=\"success\"}[5m])) / sum by (method) (rate(authentication_attempts_total[5m])) * 100",
|
|
"legendFormat": "{{method}} - Success %"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|
|
```
|
|
|
|
## Alerting Strategy
|
|
|
|
### Alert Rules Configuration
|
|
|
|
#### Critical Alerts
|
|
```yaml
|
|
# prometheus-alerts.yml
|
|
groups:
|
|
- name: know-foolery-critical
|
|
rules:
|
|
- alert: HighErrorRate
|
|
expr: sum by (service) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by (service) (rate(http_requests_total[5m])) > 0.05
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
team: backend
|
|
annotations:
|
|
summary: "High error rate detected in {{ $labels.service }}"
|
|
description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}"
|
|
|
|
- alert: DatabaseConnectionFailure
|
|
expr: db_connections_active == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
team: backend
|
|
annotations:
|
|
summary: "Database connections dropped to zero"
|
|
description: "Service {{ $labels.service }} has no active database connections"
|
|
|
|
- alert: AuthenticationSystemDown
|
|
expr: up{service="zitadel"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
team: security
|
|
annotations:
|
|
summary: "Authentication system is down"
|
|
description: "Zitadel authentication service is unreachable"
|
|
|
|
- alert: GameSessionsStuck
|
|
expr: sum(increase(games_started_total[5m])) > sum(increase(games_completed_total[5m])) * 2
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
team: backend
|
|
annotations:
|
|
summary: "Game sessions not completing"
|
|
description: "Many games are starting but not completing normally"
|
|
|
|
- name: know-foolery-warning
|
|
rules:
|
|
- alert: HighLatency
|
|
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1.0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
team: backend
|
|
annotations:
|
|
summary: "High API latency detected"
|
|
description: "95th percentile latency is {{ $value }}s for {{ $labels.service }}"
|
|
|
|
- alert: LowGameCompletionRate
|
|
expr: sum(rate(games_completed_total{completion_type="normal"}[1h])) / sum(rate(games_started_total[1h])) < 0.7
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
team: product
|
|
annotations:
|
|
summary: "Low game completion rate"
|
|
description: "Only {{ $value | humanizePercentage }} of games are being completed normally"
|
|
|
|
- alert: HighHintUsage
|
|
expr: sum(rate(hints_requested_total[1h])) / sum(rate(questions_asked_total[1h])) > 0.8
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
team: product
|
|
annotations:
|
|
summary: "Unusually high hint usage"
|
|
description: "{{ $value | humanizePercentage }} of questions are requesting hints"
|
|
|
|
- name: know-foolery-security
|
|
rules:
|
|
- alert: HighAuthenticationFailures
|
|
expr: rate(authentication_attempts_total{result="failure"}[5m]) > 10
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
team: security
|
|
annotations:
|
|
summary: "High authentication failure rate"
|
|
description: "{{ $value }} authentication failures per second"
|
|
|
|
- alert: SuspiciousUserActivity
|
|
expr: sum by (user_id) (rate(answers_submitted_total[1m])) > 5
|
|
for: 1m
|
|
labels:
|
|
severity: warning
|
|
team: security
|
|
annotations:
|
|
summary: "Suspicious user activity detected"
|
|
description: "User {{ $labels.user_id }} is submitting answers at {{ $value }}/second"
|
|
```
|
|
|
|
### Alert Routing and Escalation
|
|
```yaml
|
|
# alertmanager.yml
|
|
global:
|
|
slack_api_url: 'https://hooks.slack.com/services/...'
|
|
|
|
route:
|
|
group_by: ['alertname', 'service']
|
|
group_wait: 10s
|
|
group_interval: 10s
|
|
repeat_interval: 1h
|
|
receiver: 'default'
|
|
routes:
|
|
- match:
|
|
severity: critical
|
|
receiver: 'critical-alerts'
|
|
group_wait: 0s
|
|
- match:
|
|
team: security
|
|
receiver: 'security-team'
|
|
- match:
|
|
team: product
|
|
receiver: 'product-team'
|
|
|
|
receivers:
|
|
- name: 'default'
|
|
slack_configs:
|
|
- channel: '#alerts'
|
|
title: 'Know Foolery Alert'
|
|
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
|
|
|
- name: 'critical-alerts'
|
|
slack_configs:
|
|
- channel: '#critical-alerts'
|
|
title: 'CRITICAL: Know Foolery'
|
|
text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"
|
|
pagerduty_configs:
|
|
- service_key: 'your-pagerduty-key'
|
|
description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
|
|
|
- name: 'security-team'
|
|
slack_configs:
|
|
- channel: '#security-alerts'
|
|
title: 'Security Alert: Know Foolery'
|
|
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
|
|
|
- name: 'product-team'
|
|
slack_configs:
|
|
- channel: '#product-alerts'
|
|
title: 'Product Alert: Know Foolery'
|
|
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
|
```
|
|
|
|
This comprehensive observability strategy ensures that Know Foolery has full visibility into its performance, user behavior, and system health, enabling proactive issue resolution and data-driven product improvements. |