# Know Foolery - Detailed Observability Implementation Guidelines

## Metrics Strategy

### Application Metrics Collection

#### 1. Business Metrics (Game-Specific)

```go
// Business metrics for game insights
package metrics

import (
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var (
	// Game session metrics
	gamesStarted = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "games_started_total",
			Help: "Total number of games started",
		},
		[]string{"player_type", "platform"},
	)

	gamesCompleted = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "games_completed_total",
			Help: "Total number of games completed",
		},
		[]string{"completion_type", "platform"}, // normal, timeout, abandoned
	)

	sessionDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "game_session_duration_seconds",
			Help:    "Duration of game sessions",
			Buckets: []float64{60, 300, 600, 900, 1200, 1500, 1800}, // 1min to 30min
		},
		[]string{"completion_type"},
	)

	// Question and answer metrics
	questionsAsked = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "questions_asked_total",
			Help: "Total number of questions asked",
		},
		[]string{"theme", "difficulty"},
	)

	answersSubmitted = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "answers_submitted_total",
			Help: "Total number of answers submitted",
		},
		[]string{"theme", "is_correct", "attempt_number", "used_hint"},
	)

	hintsRequested = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "hints_requested_total",
			Help: "Total number of hints requested",
		},
		[]string{"theme", "question_difficulty"},
	)

	// Score distribution
	scoreDistribution = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "game_scores",
			Help:    "Distribution of game scores",
			Buckets: []float64{0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100},
		},
		[]string{"session_duration_bucket"},
	)

	// Leaderboard metrics
	leaderboardUpdates = promauto.NewCounter(
		prometheus.CounterOpts{
			Name: "leaderboard_updates_total",
			Help: "Total number of leaderboard updates",
		},
	)

	topScoreChanges = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "top_score_changes_total",
			Help: "Changes in top 10 scores",
		},
		[]string{"position"}, // top_1, top_5, top_10
	)
)

// GameMetrics is the business metrics collection service. Its methods record
// onto the package-level collectors declared above.
type GameMetrics struct {
	registry prometheus.Registerer
}

// NewGameMetrics returns a GameMetrics bound to the default Prometheus registerer.
func NewGameMetrics() *GameMetrics {
	return &GameMetrics{
		registry: prometheus.DefaultRegisterer,
	}
}

// RecordGameStart counts one started game for the given player type and platform.
func (m *GameMetrics) RecordGameStart(playerType, platform string) {
	gamesStarted.WithLabelValues(playerType, platform).Inc()
}

// RecordGameCompletion counts one finished game and observes its duration in seconds.
func (m *GameMetrics) RecordGameCompletion(completionType, platform string, duration time.Duration) {
	gamesCompleted.WithLabelValues(completionType, platform).Inc()
	sessionDuration.WithLabelValues(completionType).Observe(duration.Seconds())
}

// RecordQuestionAsked counts one question shown to a player.
func (m *GameMetrics) RecordQuestionAsked(theme, difficulty string) {
	questionsAsked.WithLabelValues(theme, difficulty).Inc()
}

// RecordHintRequested counts one hint request for a question of the given
// theme and difficulty.
func (m *GameMetrics) RecordHintRequested(theme, difficulty string) {
	hintsRequested.WithLabelValues(theme, difficulty).Inc()
}

// RecordAnswerSubmitted counts one submitted answer. The boolean and integer
// arguments are rendered as label values ("true"/"false", "1", "2", ...).
func (m *GameMetrics) RecordAnswerSubmitted(theme string, isCorrect bool, attemptNum int, usedHint bool) {
	answersSubmitted.WithLabelValues(
		theme,
		strconv.FormatBool(isCorrect),
		strconv.Itoa(attemptNum),
		strconv.FormatBool(usedHint),
	).Inc()
}

// RecordFinalScore observes a final score, labelled with the coarse duration
// bucket of the session it was achieved in.
func (m *GameMetrics) RecordFinalScore(score int, duration time.Duration) {
	durationBucket := m.getDurationBucket(duration)
	scoreDistribution.WithLabelValues(durationBucket).Observe(float64(score))
}

// getDurationBucket maps a session duration onto a coarse label value.
// Durations are compared directly instead of being truncated to whole minutes,
// so 5m30s falls into "5-15min" rather than "0-5min".
func (m *GameMetrics) getDurationBucket(duration time.Duration) string {
	switch {
	case duration <= 5*time.Minute:
		return "0-5min"
	case duration <= 15*time.Minute:
		return "5-15min"
	case duration <= 25*time.Minute:
		return "15-25min"
	default:
		// Catch-all: also receives sessions that run longer than 30 minutes.
		return "25-30min"
	}
}
```

#### 2.
Technical Metrics (Infrastructure)

```go
// Technical metrics for system health.
// In addition to the Prometheus packages, this snippet needs: context, errors,
// strconv, strings, time, github.com/gofiber/fiber/v2, entgo.io/ent and the
// project's generated ent "hook" package.
var (
	// HTTP metrics
	httpRequestsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "http_requests_total",
			Help: "Total number of HTTP requests",
		},
		[]string{"method", "endpoint", "status_code", "service"},
	)

	httpRequestDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "http_request_duration_seconds",
			Help:    "HTTP request duration",
			Buckets: prometheus.DefBuckets,
		},
		[]string{"method", "endpoint", "service"},
	)

	// Database metrics
	dbConnectionsActive = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "db_connections_active",
			Help: "Number of active database connections",
		},
		[]string{"database", "service"},
	)

	dbQueryDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "db_query_duration_seconds",
			Help:    "Database query duration",
			Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
		},
		[]string{"query_type", "table", "service"},
	)

	dbErrors = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "db_errors_total",
			Help: "Total number of database errors",
		},
		[]string{"error_type", "service"},
	)

	// Cache metrics
	cacheOperations = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "cache_operations_total",
			Help: "Total number of cache operations",
		},
		[]string{"operation", "result", "service"}, // get/set/delete, hit/miss/error
	)

	cacheKeyCount = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "cache_keys_total",
			Help: "Number of keys in cache",
		},
		[]string{"cache_type", "service"},
	)

	// Authentication metrics
	authenticationAttempts = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "authentication_attempts_total",
			Help: "Total authentication attempts",
		},
		[]string{"method", "result", "user_type"}, // jwt/oauth, success/failure, player/admin
	)

	tokenOperations = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "token_operations_total",
			Help: "JWT token operations",
		},
		[]string{"operation", "result"}, // validate/refresh, success/failure
	)
)

// PrometheusMiddleware is a Fiber middleware that records the request count
// and latency of every HTTP request handled by serviceName.
func PrometheusMiddleware(serviceName string) fiber.Handler {
	return func(c *fiber.Ctx) error {
		start := time.Now()

		// Process request
		err := c.Next()

		// Record metrics
		duration := time.Since(start).Seconds()
		statusCode := strconv.Itoa(responseStatus(c, err))

		httpRequestsTotal.WithLabelValues(
			c.Method(),
			c.Route().Path,
			statusCode,
			serviceName,
		).Inc()

		httpRequestDuration.WithLabelValues(
			c.Method(),
			c.Route().Path,
			serviceName,
		).Observe(duration)

		return err
	}
}

// responseStatus returns the status code to report for a request. When a
// handler returns an error, Fiber's error handler has not run yet at the point
// the middleware resumes, so the response still carries its default status;
// the code is derived from the error instead (assuming the default mapping of
// *fiber.Error to its Code and of any other error to 500).
func responseStatus(c *fiber.Ctx, err error) int {
	if err == nil {
		return c.Response().StatusCode()
	}
	var fiberErr *fiber.Error
	if errors.As(err, &fiberErr) {
		return fiberErr.Code
	}
	return fiber.StatusInternalServerError
}

// MetricsHook is the database metrics middleware for Ent.
type MetricsHook struct {
	serviceName string
}

// NewMetricsHook returns a MetricsHook that labels its metrics with serviceName.
func NewMetricsHook(serviceName string) *MetricsHook {
	return &MetricsHook{serviceName: serviceName}
}

// Hook returns an ent.Hook that times every create/update/delete mutation and
// counts the ones that fail.
func (h *MetricsHook) Hook() ent.Hook {
	return hook.On(
		func(next ent.Mutator) ent.Mutator {
			return ent.MutateFunc(func(ctx context.Context, m ent.Mutation) (ent.Value, error) {
				start := time.Now()

				result, err := next.Mutate(ctx, m)

				duration := time.Since(start).Seconds()
				// ent.Op stringifies with an "Op" prefix (e.g. "OpCreate");
				// strip it so the label reads "create", "updateone", ...
				queryType := strings.TrimPrefix(strings.ToLower(m.Op().String()), "op")
				table := m.Type()

				dbQueryDuration.WithLabelValues(queryType, table, h.serviceName).Observe(duration)

				if err != nil {
					dbErrors.WithLabelValues("query_error", h.serviceName).Inc()
				}

				return result, err
			})
		},
		ent.OpCreate|ent.OpUpdate|ent.OpUpdateOne|ent.OpDelete|ent.OpDeleteOne,
	)
}
```

### Frontend Metrics Collection

#### Web Application Metrics

```typescript
// Frontend metrics collection
class MetricsCollector {
  private endpoint: string
  private batchSize: number = 50
  private flushInterval: number = 30000 // 30 seconds
  private maxQueueSize: number = 500 // upper bound on queued events while the endpoint is unreachable
  private metrics: MetricEvent[] = []
  // Un-instrumented fetch, captured before interceptFetch() patches window.fetch.
  private rawFetch: typeof window.fetch = window.fetch.bind(window)

  constructor(endpoint: string) {
    this.endpoint = endpoint
    this.startBatchFlush()
    this.setupPerformanceObserver()
  }

  // User interaction metrics
  trackUserAction(action: string, properties: Record<string, unknown> = {}): void {
    this.addMetric({
      type: 'user_action',
      action,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      user_agent: navigator.userAgent,
      ...properties,
    })
  }

  // Game-specific metrics
  trackGameEvent(event: GameEvent): void {
    this.addMetric({
      type: 'game_event',
      event: event.type,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      game_session_id: event.gameSessionId,
      properties: event.properties,
    })
  }

  // Performance metrics
  trackPerformance(metric: PerformanceMetric): void {
    this.addMetric({
      type: 'performance',
      metric: metric.name,
      value: metric.value,
      timestamp: Date.now(),
      url: window.location.pathname,
    })
  }

  // Error tracking
  trackError(error: Error, context: string): void {
    this.addMetric({
      type: 'error',
      error_message: error.message,
      error_stack: error.stack,
      context,
      timestamp: Date.now(),
      url: window.location.pathname,
      user_agent: navigator.userAgent,
    })
  }

  private setupPerformanceObserver(): void {
    // Web Vitals tracking
    if ('PerformanceObserver' in window) {
      // Largest Contentful Paint
      new PerformanceObserver((list) => {
        list.getEntries().forEach((entry) => {
          this.trackPerformance({
            name: 'largest_contentful_paint',
            value: entry.startTime,
          })
        })
      }).observe({ entryTypes: ['largest-contentful-paint'] })

      // First Input Delay
      new PerformanceObserver((list) => {
        list.getEntries().forEach((entry) => {
          this.trackPerformance({
            name: 'first_input_delay',
            value: entry.processingStart - entry.startTime,
          })
        })
      }).observe({ entryTypes: ['first-input'] })

      // Cumulative Layout Shift
      new PerformanceObserver((list) => {
        let cumulativeScore = 0
        list.getEntries().forEach((entry) => {
          if (!entry.hadRecentInput) {
            cumulativeScore += entry.value
          }
        })
        this.trackPerformance({
          name: 'cumulative_layout_shift',
          value: cumulativeScore,
        })
      }).observe({ entryTypes: ['layout-shift'] })
    }

    // API response time tracking
    this.interceptFetch()
  }

  private interceptFetch(): void {
    const originalFetch = this.rawFetch

    window.fetch = async (...args) => {
      const start = performance.now()
      const input = args[0]
      // A Request object stringifies to "[object Request]", so read its url explicitly.
      const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url

      try {
        const response = await originalFetch(...args)
        const duration = performance.now() - start

        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          url,
          status: response.status,
        })

        return response
      } catch (error) {
        const duration = performance.now() - start

        this.trackError(error as Error, `API request to ${url}`)
        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          url,
          status: 0,
        })

        throw error
      }
    }
  }

  private addMetric(metric: MetricEvent): void {
    this.metrics.push(metric)

    if (this.metrics.length >= this.batchSize) {
      this.flush()
    }
  }

  private async flush(): Promise<void> {
    if (this.metrics.length === 0) return

    const batch = [...this.metrics]
    this.metrics = []

    try {
      // Use the un-instrumented fetch: if the collector's own uploads were
      // tracked, every flush would enqueue a new metric and the queue would
      // never drain.
      await this.rawFetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ metrics: batch }),
      })
    } catch (error) {
      console.error('Failed to send metrics:', error)
      // Re-queue metrics for retry, keeping only the most recent events so the
      // queue stays bounded during a long outage
      this.metrics = [...batch, ...this.metrics].slice(-this.maxQueueSize)
    }
  }

  private startBatchFlush(): void {
    setInterval(() => {
      this.flush()
    }, this.flushInterval)

    // Flush on page unload
    window.addEventListener('beforeunload', () => {
      // Use sendBeacon for reliable delivery during page unload; an in-flight
      // fetch may be cancelled when the page goes away
      if (this.metrics.length > 0) {
        navigator.sendBeacon(
          this.endpoint,
          new Blob([JSON.stringify({ metrics: this.metrics })], { type: 'application/json' })
        )
        this.metrics = []
      }
    })
  }

  private getSessionId(): string {
    // Implementation depends on session management
    return sessionStorage.getItem('session_id') || 'anonymous'
  }
}

// Game-specific metrics tracking
export class GameMetricsTracker {
  private collector: MetricsCollector

  constructor(collector: MetricsCollector) {
    this.collector = collector
  }

  trackGameStart(gameSessionId: string, playerName: string): void {
    this.collector.trackGameEvent({
      type: 'game_started',
      gameSessionId,
      properties: {
        player_name: playerName,
        platform: this.getPlatform(),
      },
    })
  }

  trackQuestionDisplayed(gameSessionId: string, questionId: string, theme: string): void {
    this.collector.trackGameEvent({
      type: 'question_displayed',
      gameSessionId,
      properties: {
        question_id: questionId,
        theme,
        display_time: Date.now(),
      },
    })
  }

  trackAnswerSubmitted(
    gameSessionId: string,
    questionId: string,
    isCorrect: boolean,
    attemptNumber: number,
    timeTaken: number,
    usedHint: boolean
  ): void {
    this.collector.trackGameEvent({
      type: 'answer_submitted',
      gameSessionId,
      properties: {
        question_id: questionId,
        is_correct: isCorrect,
        attempt_number: attemptNumber,
        time_taken_ms: timeTaken,
        used_hint: usedHint,
      },
    })
  }

  trackHintRequested(gameSessionId: string, questionId: string): void {
    this.collector.trackGameEvent({
      type: 'hint_requested',
      gameSessionId,
      properties: {
        question_id: questionId,
        request_time: Date.now(),
      },
    })
  }

  trackGameCompleted(
    gameSessionId: string,
    finalScore: number,
    questionsAnswered: number,
    completionType: 'normal' | 'timeout' | 'abandoned'
  ): void {
    this.collector.trackGameEvent({
      type: 'game_completed',
      gameSessionId,
      properties: {
        final_score: finalScore,
        questions_answered: questionsAnswered,
        completion_type: completionType,
        platform: this.getPlatform(),
      },
    })
  }

  private getPlatform(): string {
    // Detect platform
    if (/Android/i.test(navigator.userAgent)) return 'android'
    if (/iPhone|iPad|iPod/i.test(navigator.userAgent)) return 'ios'
    if (window.wails) return 'desktop' // For Wails apps
    return 'web'
  }
}

// Usage in React components
// One collector is shared by every component. Constructing a MetricsCollector
// patches window.fetch, starts a flush timer and registers an unload listener,
// so it must not be created on each render (which passing
// `new MetricsCollector(...)` directly to useRef would do).
let sharedCollector: MetricsCollector | undefined
let sharedGameTracker: GameMetricsTracker | undefined

const getMetricsClients = () => {
  if (!sharedCollector || !sharedGameTracker) {
    sharedCollector = new MetricsCollector('/api/v1/metrics')
    sharedGameTracker = new GameMetricsTracker(sharedCollector)
  }
  return { collector: sharedCollector, gameTracker: sharedGameTracker }
}

export const useGameMetrics = () => {
  // useMemo keeps the returned callbacks referentially stable across renders
  return useMemo(() => {
    const { collector, gameTracker } = getMetricsClients()

    return {
      trackGameStart: gameTracker.trackGameStart.bind(gameTracker),
      trackQuestionDisplayed: gameTracker.trackQuestionDisplayed.bind(gameTracker),
      trackAnswerSubmitted: gameTracker.trackAnswerSubmitted.bind(gameTracker),
      trackHintRequested: gameTracker.trackHintRequested.bind(gameTracker),
      trackGameCompleted: gameTracker.trackGameCompleted.bind(gameTracker),
      trackUserAction: collector.trackUserAction.bind(collector),
      trackError: collector.trackError.bind(collector),
    }
  }, [])
}
```

## Distributed Tracing

### OpenTelemetry Integration

#### Backend Tracing Setup

```go
// OpenTelemetry tracing setup
package observability

import (
	"context"
	"errors"

	"github.com/gofiber/fiber/v2"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/exporters/jaeger"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/sdk/resource"
	"go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.12.0"
)

// TracingConfig holds the settings needed to initialise tracing for one service.
type TracingConfig struct {
	ServiceName    string
	ServiceVersion string
	Environment    string
	JaegerEndpoint string
	SampleRate     float64
}

// InitTracing builds a tracer provider that exports to Jaeger, installs it as
// the global provider together with the W3C trace-context propagator, and
// returns it so the caller can shut it down on exit.
//
// NOTE(review): the go.opentelemetry.io/otel/exporters/jaeger module is
// deprecated upstream; consider moving to an OTLP exporter (Jaeger accepts
// OTLP) when upgrading OpenTelemetry.
func InitTracing(config TracingConfig) (*trace.TracerProvider, error) {
	// Create Jaeger exporter
	jaegerExporter, err := jaeger.New(
		jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(config.JaegerEndpoint)),
	)
	if err != nil {
		return nil, err
	}

	// Create resource with service information
	res, err := resource.New(
		context.Background(),
		resource.WithAttributes(
			attribute.String("service.name", config.ServiceName),
			attribute.String("service.version", config.ServiceVersion),
			attribute.String("environment", config.Environment),
		),
	)
	if err != nil {
		return nil, err
	}

	// Create tracer provider
	tp := trace.NewTracerProvider(
		trace.WithBatcher(jaegerExporter),
		trace.WithResource(res),
		// ParentBased honours the sampling decision of an incoming trace, so a
		// distributed trace is either recorded by every service or by none;
		// the ratio only applies to traces that start in this service.
		trace.WithSampler(trace.ParentBased(trace.TraceIDRatioBased(config.SampleRate))),
	)

	// Set global tracer provider
	otel.SetTracerProvider(tp)
	// Without a propagator, trace context is never read from or written to
	// request headers and spans of different services cannot be linked.
	otel.SetTextMapPropagator(propagation.TraceContext{})

	return tp, nil
}

// TracingMiddleware is a Fiber middleware that wraps every request in a
// server-side span and stores the span context as the request's user context.
func TracingMiddleware(serviceName string) fiber.Handler {
	tracer := otel.Tracer(serviceName)

	return func(c *fiber.Ctx) error {
		// Continue the caller's trace if it sent trace-context headers
		propagator := otel.GetTextMapPropagator()
		carrier := propagation.MapCarrier{}
		for _, key := range propagator.Fields() {
			if value := c.Get(key); value != "" {
				carrier[key] = value
			}
		}
		parentCtx := propagator.Extract(c.UserContext(), carrier)

		// Start span. Only the method is known to be stable here; the span is
		// renamed below once routing has picked the final route.
		ctx, span := tracer.Start(parentCtx, c.Method())
		defer span.End()

		// Set span attributes
		span.SetAttributes(
			semconv.HTTPMethodKey.String(c.Method()),
			semconv.HTTPURLKey.String(string(c.Request().URI().FullURI())),
			semconv.HTTPUserAgentKey.String(string(c.Request().Header.UserAgent())),
			semconv.NetPeerIPKey.String(c.IP()),
		)

		// Add to context
		c.SetUserContext(ctx)

		// Process request
		err := c.Next()

		// Name the span after the matched route pattern (low cardinality)
		span.SetName(c.Method() + " " + c.Route().Path)

		// Set response attributes. A returned error has not been turned into a
		// response yet, so derive the status from the error in that case.
		statusCode := c.Response().StatusCode()
		if err != nil {
			statusCode = fiber.StatusInternalServerError
			var fiberErr *fiber.Error
			if errors.As(err, &fiberErr) {
				statusCode = fiberErr.Code
			}
			span.RecordError(err)
			span.SetStatus(codes.Error, err.Error())
		}
		span.SetAttributes(semconv.HTTPStatusCodeKey.Int(statusCode))

		return err
	}
}

// TraceServiceOperation runs fn inside a span named operation and marks the
// span as failed when fn returns an error. The error is returned unchanged.
func TraceServiceOperation(ctx context.Context, serviceName, operation string, fn func(context.Context) error) error {
	tracer := otel.Tracer(serviceName)
	ctx, span := tracer.Start(ctx, operation)
	defer span.End()

	err := fn(ctx)
	if err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		span.SetAttributes(attribute.Bool("error", true))
	}

	return err
}

// TraceDatabaseOperation runs fn inside a "db.<operation>.<table>" span and
// marks the span as failed when fn returns an error (database tracing for Ent).
func TraceDatabaseOperation(ctx context.Context, operation, table string, fn func(context.Context) error) error {
	tracer := otel.Tracer("database")
	ctx, span := tracer.Start(ctx, "db."+operation+"."+table)
	defer span.End()

	span.SetAttributes(
		attribute.String("db.operation", operation),
		attribute.String("db.table", table),
		attribute.String("db.system", "postgresql"),
	)

	err := fn(ctx)
	if err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
	}

	return err
}
```

#### Frontend Tracing Integration

```typescript
// Frontend tracing with OpenTelemetry
import { useCallback } from 'react'
import { SpanStatusCode, trace } from '@opentelemetry/api'
import { Resource } from '@opentelemetry/resources'
import { BatchSpanProcessor } from '@opentelemetry/sdk-trace-base'
import { WebTracerProvider } from '@opentelemetry/sdk-trace-web'
import { getWebAutoInstrumentations } from '@opentelemetry/auto-instrumentations-web'
import { JaegerExporter } from '@opentelemetry/exporter-jaeger'
import { registerInstrumentations } from '@opentelemetry/instrumentation'

export class FrontendTracing {
  private provider: WebTracerProvider

  constructor(config: TracingConfig) {
    this.provider = new WebTracerProvider({
      resource: new Resource({
        'service.name': config.serviceName,
        'service.version': config.serviceVersion,
      }),
    })

    // Configure Jaeger exporter
    // NOTE(review): @opentelemetry/exporter-jaeger appears to target Node.js
    // and is deprecated; verify it works in the browser bundle or switch to an
    // OTLP/HTTP exporter pointed at the collector.
    const jaegerExporter = new JaegerExporter({
      endpoint: config.jaegerEndpoint,
    })

    this.provider.addSpanProcessor(
      new BatchSpanProcessor(jaegerExporter)
    )

    // Register provider
    this.provider.register()

    // Auto-instrument browser APIs
    registerInstrumentations({
      instrumentations: [
        getWebAutoInstrumentations({
          '@opentelemetry/instrumentation-document-load': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-user-interaction': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-fetch': {
            enabled: true,
            propagateTraceHeaderCorsUrls: [
              new RegExp(config.apiBaseUrl),
            ],
          },
        }),
      ],
    })
  }

  // Game-specific tracing: runs fn inside a span named after the action and
  // resolves with whatever fn resolves with
  traceGameAction<T>(
    action: string,
    properties: Record<string, string | number | boolean>,
    fn: () => Promise<T>
  ): Promise<T> {
    const tracer = trace.getTracer('game-frontend')

    return tracer.startActiveSpan(action, async (span) => {
      try {
        // Set span attributes
        span.setAttributes(properties)

        return await fn()
      } catch (error) {
        span.recordException(error as Error)
        span.setStatus({ code: SpanStatusCode.ERROR })
        throw error
      } finally {
        span.end()
      }
    })
  }
}

// React hook for tracing
export const useTracing = () => {
  const tracer = trace.getTracer('react-components')

  const traceUserAction = useCallback(
    async <T>(
      action: string,
      properties: Record<string, string | number | boolean>,
      fn: () => Promise<T>
    ): Promise<T> => {
      return tracer.startActiveSpan(`user.${action}`, async (span) => {
        try {
          span.setAttributes(properties)
          return await fn()
        } catch (error) {
          span.recordException(error as Error)
          span.setStatus({ code: SpanStatusCode.ERROR })
          throw error
        } finally {
          span.end()
        }
      })
    },
    [tracer]
  )

  return { traceUserAction }
}
```

## Logging Strategy

### Structured Logging Implementation

#### Backend Logging

```go
// Structured logging with zerolog
package logging

import (
	"errors"
	"os"
	"time"

	"github.com/gofiber/fiber/v2"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

// Logger wraps a zerolog.Logger that is pre-populated with service metadata.
type Logger struct {
	logger zerolog.Logger
}

// LogConfig holds the settings used to build a Logger.
type LogConfig struct {
	Level       string
	Environment string
	ServiceName string
	Version     string
}

// NewLogger builds a Logger from config. An unparsable level falls back to
// info. Note that the level is applied through zerolog's global level.
func NewLogger(config LogConfig) *Logger {
	// Parse log level
	level, err := zerolog.ParseLevel(config.Level)
	if err != nil {
		level = zerolog.InfoLevel
	}

	// Configure zerolog
	zerolog.SetGlobalLevel(level)
	zerolog.TimeFieldFormat = time.RFC3339Nano

	var logger zerolog.Logger

	if config.Environment == "development" {
		// Human-readable console output for development
		logger = zerolog.New(zerolog.ConsoleWriter{
			Out:        os.Stdout,
			TimeFormat: "15:04:05",
		}).With().Timestamp().Logger()
	} else {
		// JSON output for production
		logger = zerolog.New(os.Stdout).With().Timestamp().Logger()
	}

	// Add service metadata
	logger = logger.With().
		Str("service", config.ServiceName).
		Str("version", config.Version).
		Str("environment", config.Environment).
		Logger()

	return &Logger{logger: logger}
}

// Structured logging methods

// GameEvent logs a game event at info level.
func (l *Logger) GameEvent(event string, gameSessionID, userID string, properties map[string]interface{}) {
	l.logger.Info().
		Str("event_type", "game").
		Str("event", event).
		Str("game_session_id", gameSessionID).
		Str("user_id", userID).
		Fields(properties).
		Msg("Game event occurred")
}

// APIRequest logs a processed API request at info level.
func (l *Logger) APIRequest(method, path string, statusCode int, duration time.Duration, userID string) {
	l.logger.Info().
		Str("event_type", "api_request").
		Str("method", method).
		Str("path", path).
		Int("status_code", statusCode).
		Dur("duration_ms", duration).
		Str("user_id", userID).
		Msg("API request processed")
}

// DatabaseOperation logs a completed database operation at debug level.
func (l *Logger) DatabaseOperation(operation, table string, duration time.Duration, rowsAffected int64) {
	l.logger.Debug().
		Str("event_type", "database").
		Str("operation", operation).
		Str("table", table).
		Dur("duration_ms", duration).
		Int64("rows_affected", rowsAffected).
		Msg("Database operation completed")
}

// AuthenticationEvent logs an authentication event: at info level when it
// succeeded, at warn level when it failed.
func (l *Logger) AuthenticationEvent(event, userID, userType string, success bool, details map[string]string) {
	evt := l.logger.Info()
	if !success {
		evt = l.logger.Warn()
	}

	// zerolog's Fields only accepts a map[string]interface{} (or a key/value
	// slice), so the string map is converted first; otherwise the details
	// would not be logged.
	fields := make(map[string]interface{}, len(details))
	for key, value := range details {
		fields[key] = value
	}

	evt.
		Str("event_type", "authentication").
		Str("event", event).
		Str("user_id", userID).
		Str("user_type", userType).
		Bool("success", success).
		Fields(fields).
		Msg("Authentication event")
}

// SecurityEvent logs a security event at warn level; severity is recorded as a
// field and does not change the log level.
func (l *Logger) SecurityEvent(event, userID, ipAddress string, severity string, details map[string]interface{}) {
	l.logger.Warn().
		Str("event_type", "security").
		Str("event", event).
		Str("user_id", userID).
		Str("ip_address", ipAddress).
		Str("severity", severity).
		Fields(details).
		Msg("Security event detected")
}

// Error logs err at error level together with a context string and extra fields.
func (l *Logger) Error(err error, context string, fields map[string]interface{}) {
	l.logger.Error().
		Err(err).
		Str("context", context).
		Fields(fields).
		Msg("Error occurred")
}

// RequestLoggingMiddleware is a Fiber middleware that logs every request after
// it has been processed.
func RequestLoggingMiddleware(logger *Logger) fiber.Handler {
	return func(c *fiber.Ctx) error {
		start := time.Now()

		// Process request
		err := c.Next()

		// Log request
		duration := time.Since(start)

		// Checked assertion: a missing or non-string "user_id" local must not
		// panic the request, it is simply logged as anonymous.
		userID, ok := c.Locals("user_id").(string)
		if !ok || userID == "" {
			userID = "anonymous"
		}

		// A returned error has not been turned into a response yet, so derive
		// the status from the error instead of logging the default status.
		statusCode := c.Response().StatusCode()
		if err != nil {
			statusCode = fiber.StatusInternalServerError
			var fiberErr *fiber.Error
			if errors.As(err, &fiberErr) {
				statusCode = fiberErr.Code
			}
		}

		logger.APIRequest(
			c.Method(),
			c.Route().Path,
			statusCode,
			duration,
			userID,
		)

		return err
	}
}
```

#### Frontend Logging

```typescript
// Frontend structured logging
interface LogEntry {
  timestamp: string
  level: 'debug' | 'info' | 'warn' | 'error'
  message: string
  context?: string
  userId?: string
  sessionId?: string
  gameSessionId?: string
  error?: {
    name: string
    message: string
    stack?: string
  }
  properties?: Record<string, unknown>
}

export class FrontendLogger {
  private buffer: LogEntry[] = []
  private endpoint: string
  private maxBufferSize: number = 100
  private flushInterval: number = 30000

  constructor(endpoint: string) {
    this.endpoint = endpoint
    this.startPeriodicFlush()
  }

  debug(message: string, context?: string, properties?: Record<string, unknown>): void {
    this.log('debug', message, context, properties)
  }

  info(message: string, context?: string, properties?: Record<string, unknown>): void {
    this.log('info', message, context, properties)
  }

  warn(message: string, context?: string, properties?: Record<string, unknown>): void {
    this.log('warn', message, context, properties)
  }

  error(message: string, error?: Error, context?: string, properties?: Record<string, unknown>): void {
    const entry: LogEntry = {
      timestamp: new Date().toISOString(),
      level: 'error',
      message,
      context,
      userId: this.getUserId(),
      sessionId: this.getSessionId(),
      gameSessionId: this.getGameSessionId(),
      properties,
    }

    if (error) {
      entry.error = {
        name: error.name,
        message: error.message,
        stack: error.stack,
      }
    }

    this.buffer.push(entry)
    this.checkFlushConditions()
  }

  // Game-specific logging methods
  logGameEvent(event: string, gameSessionId: string, properties?: Record<string, unknown>): void {
    this.info(`Game event: ${event}`, 'game', {
      gameSessionId,
      ...properties,
    })
  }

  logUserAction(action: string, properties?: Record<string, unknown>): void {
    this.info(`User action: ${action}`, 'user', properties)
  }

  logPerformanceMetric(metric: string, value: number, unit: string): void {
    this.debug(`Performance metric: ${metric}`, 'performance', {
      metric,
      value,
      unit,
      url: window.location.pathname,
    })
  }

  private log(
    level: LogEntry['level'],
    message: string,
    context?: string,
    properties?: Record<string, unknown>
  ): void {
    const entry: LogEntry = {
      timestamp: new Date().toISOString(),
      level,
      message,
      context,
      userId: this.getUserId(),
      sessionId: this.getSessionId(),
      gameSessionId: this.getGameSessionId(),
      properties,
    }

    this.buffer.push(entry)
    this.checkFlushConditions()
  }

  private checkFlushConditions(): void {
    if (this.buffer.length >= this.maxBufferSize) {
      this.flush()
    }
  }

  private async flush(): Promise<void> {
    if (this.buffer.length === 0) return

    const logs = [...this.buffer]
    this.buffer = []

    try {
      await fetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ logs }),
      })
    } catch (error) {
      console.error('Failed to send logs:', error)
      // Re-queue logs for retry (keep only most recent to avoid memory issues)
      this.buffer.unshift(...logs.slice(-50))
    }
  }

  private startPeriodicFlush(): void {
    setInterval(() => {
      this.flush()
    }, this.flushInterval)

    // Flush on page unload
    window.addEventListener('beforeunload', () => {
      // Use sendBeacon for reliable delivery during page unload
      if (this.buffer.length > 0) {
        // A Blob carries the JSON content type; a plain string body would be
        // sent as text/plain
        navigator.sendBeacon(
          this.endpoint,
          new Blob([JSON.stringify({ logs: this.buffer })], { type: 'application/json' })
        )
      }
    })
  }

  private getUserId(): string | undefined {
    // Implementation depends on auth system
    return sessionStorage.getItem('user_id') || undefined
  }

  private getSessionId(): string | undefined {
    return sessionStorage.getItem('session_id') || undefined
  }

  private getGameSessionId(): string | undefined {
    return sessionStorage.getItem('game_session_id') || undefined
  }
}
```

## Monitoring Dashboards

### Grafana
Dashboard Configuration

#### 1. Business Intelligence Dashboard

The ratio panels aggregate both sides with `sum` before dividing: the metrics being compared carry different label sets, and PromQL only divides series whose labels match exactly. Quantile panels aggregate the histogram buckets by `le`.

```json
{
  "dashboard": {
    "title": "Know Foolery - Business Intelligence",
    "panels": [
      {
        "title": "Active Games",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(games_started_total[5m])) * 300",
            "legendFormat": "Games per 5min"
          }
        ]
      },
      {
        "title": "Game Completion Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(games_completed_total{completion_type=\"normal\"}[1h])) / sum(rate(games_started_total[1h])) * 100",
            "legendFormat": "Completion %"
          }
        ]
      },
      {
        "title": "Average Session Duration",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.5, sum by (le) (rate(game_session_duration_seconds_bucket[5m])))",
            "legendFormat": "Median"
          },
          {
            "expr": "histogram_quantile(0.95, sum by (le) (rate(game_session_duration_seconds_bucket[5m])))",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Question Accuracy by Theme",
        "type": "heatmap",
        "targets": [
          {
            "expr": "sum by (theme) (rate(answers_submitted_total{is_correct=\"true\"}[1h])) / sum by (theme) (rate(answers_submitted_total[1h]))",
            "legendFormat": "{{theme}}"
          }
        ]
      },
      {
        "title": "Hint Usage Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(hints_requested_total[5m])) / sum(rate(questions_asked_total[5m])) * 100",
            "legendFormat": "Hint Usage %"
          }
        ]
      },
      {
        "title": "Score Distribution",
        "type": "histogram",
        "targets": [
          {
            "expr": "histogram_quantile(0.25, sum by (le) (rate(game_scores_bucket[1h])))",
            "legendFormat": "25th percentile"
          },
          {
            "expr": "histogram_quantile(0.5, sum by (le) (rate(game_scores_bucket[1h])))",
            "legendFormat": "Median"
          },
          {
            "expr": "histogram_quantile(0.75, sum by (le) (rate(game_scores_bucket[1h])))",
            "legendFormat": "75th percentile"
          }
        ]
      }
    ]
  }
}
```

#### 2.
Technical Performance Dashboard

```json
{
  "dashboard": {
    "title": "Know Foolery - Technical Performance",
    "panels": [
      {
        "title": "API Response Times",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, service) (rate(http_request_duration_seconds_bucket[5m])))",
            "legendFormat": "{{service}} - 95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m])) * 100",
            "legendFormat": "{{service}} - Error %"
          }
        ]
      },
      {
        "title": "Database Performance",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, service) (rate(db_query_duration_seconds_bucket[5m])))",
            "legendFormat": "{{service}} - Query Time"
          }
        ]
      },
      {
        "title": "Cache Hit Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(cache_operations_total{operation=\"get\", result=\"hit\"}[5m])) / sum(rate(cache_operations_total{operation=\"get\"}[5m])) * 100",
            "legendFormat": "Hit Rate %"
          }
        ]
      },
      {
        "title": "Authentication Success Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum by (method) (rate(authentication_attempts_total{result=\"success\"}[5m])) / sum by (method) (rate(authentication_attempts_total[5m])) * 100",
            "legendFormat": "{{method}} - Success %"
          }
        ]
      }
    ]
  }
}
```

## Alerting Strategy

### Alert Rules Configuration

#### Critical Alerts

```yaml
# prometheus-alerts.yml
groups:
  - name: know-foolery-critical
    rules:
      # Both sides are summed per service before dividing; dividing the raw
      # series only matches 5xx series with themselves and always yields 1.
      - alert: HighErrorRate
        expr: sum by (service) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by (service) (rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "High error rate detected in {{ $labels.service }}"
          description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}"

      - alert: DatabaseConnectionFailure
        expr: db_connections_active == 0
        for: 1m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "Database connections dropped to zero"
          description: "Service {{ $labels.service }} has no active database connections"

      - alert: AuthenticationSystemDown
        expr: up{service="zitadel"} == 0
        for: 1m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "Authentication system is down"
          description: "Zitadel authentication service is unreachable"

      # The two counters have different labels, so they are summed before
      # being compared.
      - alert: GameSessionsStuck
        expr: sum(increase(games_started_total[5m])) > sum(increase(games_completed_total[5m])) * 2
        for: 5m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "Game sessions not completing"
          description: "Many games are starting but not completing normally"

  - name: know-foolery-warning
    rules:
      - alert: HighLatency
        expr: histogram_quantile(0.95, sum by (le, service) (rate(http_request_duration_seconds_bucket[5m]))) > 1.0
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High API latency detected"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.service }}"

      - alert: LowGameCompletionRate
        expr: sum(rate(games_completed_total{completion_type="normal"}[1h])) / sum(rate(games_started_total[1h])) < 0.7
        for: 10m
        labels:
          severity: warning
          team: product
        annotations:
          summary: "Low game completion rate"
          description: "Only {{ $value | humanizePercentage }} of games are being completed normally"

      - alert: HighHintUsage
        expr: sum(rate(hints_requested_total[1h])) / sum(rate(questions_asked_total[1h])) > 0.8
        for: 15m
        labels:
          severity: warning
          team: product
        annotations:
          summary: "Unusually high hint usage"
          description: "{{ $value | humanizePercentage }} of questions are requesting hints"

  - name: know-foolery-security
    rules:
      - alert: HighAuthenticationFailures
        expr: sum(rate(authentication_attempts_total{result="failure"}[5m])) > 10
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "High authentication failure rate"
          description: "{{ $value }} authentication failures per second"

      # NOTE: answers_submitted_total as defined in the metrics section has no
      # user_id label, so this rule cannot single out a user until a per-user
      # signal exists. Per-user Prometheus labels are high-cardinality; a
      # log-based detection may be the better fit for this check.
      - alert: SuspiciousUserActivity
        expr: sum by (user_id) (rate(answers_submitted_total[1m])) > 5
        for: 1m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Suspicious user activity detected"
          description: "User {{ $labels.user_id }} is submitting answers at {{ $value }}/second"
```

### Alert Routing and Escalation

```yaml
# alertmanager.yml
global:
  slack_api_url: 'https://hooks.slack.com/services/...'

route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 0s
    - match:
        team: security
      receiver: 'security-team'
    - match:
        team: product
      receiver: 'product-team'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: 'Know Foolery Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

  - name: 'critical-alerts'
    slack_configs:
      - channel: '#critical-alerts'
        title: 'CRITICAL: Know Foolery'
        # Double quotes so that \n is a real line break; inside single quotes
        # YAML keeps it as a literal backslash followed by "n".
        text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ .Annotations.description }}{{ end }}"
    pagerduty_configs:
      - service_key: 'your-pagerduty-key'
        description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

  - name: 'security-team'
    slack_configs:
      - channel: '#security-alerts'
        title: 'Security Alert: Know Foolery'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

  - name: 'product-team'
    slack_configs:
      - channel: '#product-alerts'
        title: 'Product Alert: Know Foolery'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
```

This comprehensive observability strategy ensures that Know Foolery has full visibility into its performance, user behavior, and system health, enabling proactive issue resolution and data-driven product improvements.