You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
knowfoolery/docs/observability-strategy.md

43 KiB

Know Foolery - Observability Strategy

Overview

Comprehensive observability is essential for maintaining the Know Foolery quiz game's reliability, performance, and user experience. This document outlines the strategy for metrics collection, structured logging, distributed tracing, monitoring, and alerting across all system components, both backend services and frontend clients.

Observability Architecture

Three Pillars of Observability

┌─────────────────────────────────────────────────────────────────────────┐
│                           Observability Stack                            │
│                                                                         │
│  ┌─────────────┐    ┌─────────────┐    ┌─────────────┐                 │
│  │   METRICS   │    │    LOGS     │    │   TRACES    │                 │
│  │             │    │             │    │             │                 │
│  │ Prometheus  │    │    Loki     │    │   Jaeger    │                 │
│  │   +         │    │   +         │    │   +         │                 │
│  │  Grafana    │    │  Grafana    │    │ OpenTeleme- │                 │
│  │             │    │             │    │    try      │                 │
│  └─────────────┘    └─────────────┘    └─────────────┘                 │
│         │                   │                   │                      │
│         └───────────────────┼───────────────────┘                      │
│                             │                                          │
│                    ┌─────────────┐                                     │
│                    │  Grafana    │                                     │
│                    │ Unified     │                                     │
│                    │ Dashboard   │                                     │
│                    └─────────────┘                                     │
└─────────────────────────────────────────────────────────────────────────┘
                              │
                       Alerts & Notifications
                              │
┌─────────────────────────────────────────────────────────────────────────┐
│                      Alert Management                                    │
│  ┌─────────────┐    ┌─────────────┐    ┌─────────────┐                 │
│  │   Email     │    │   Slack     │    │  PagerDuty  │                 │
│  │  Alerts     │    │  Channels   │    │ (Critical)  │                 │
│  └─────────────┘    └─────────────┘    └─────────────┘                 │
└─────────────────────────────────────────────────────────────────────────┘

Metrics Strategy

Application Metrics Collection

1. Business Metrics (Game-Specific)

// Business metrics for game insights
package metrics

import (
    "errors"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

// Package-level business metrics. Each collector is created through promauto,
// which (per the client_golang promauto documentation) registers it with the
// default Prometheus registry as a side effect of package initialization.
var (
    // Game session metrics

    // gamesStarted counts started games, partitioned by player type and platform.
    gamesStarted = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "games_started_total",
            Help: "Total number of games started",
        },
        []string{"player_type", "platform"},
    )
    
    // gamesCompleted counts finished games, partitioned by how they ended and by platform.
    gamesCompleted = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "games_completed_total",
            Help: "Total number of games completed",
        },
        []string{"completion_type", "platform"}, // normal, timeout, abandoned
    )
    
    // sessionDuration observes session length in seconds; bucket bounds run
    // from 60s to 1800s, so longer sessions land in the implicit +Inf bucket.
    sessionDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "game_session_duration_seconds",
            Help:    "Duration of game sessions",
            Buckets: []float64{60, 300, 600, 900, 1200, 1500, 1800}, // 1min to 30min
        },
        []string{"completion_type"},
    )
    
    // Question and answer metrics

    // questionsAsked counts questions shown, by theme and difficulty.
    questionsAsked = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "questions_asked_total",
            Help: "Total number of questions asked",
        },
        []string{"theme", "difficulty"},
    )
    
    // answersSubmitted counts submitted answers. The attempt number is a label
    // value, so every distinct attempt count creates a separate time series.
    // NOTE(review): confirm attempts per question are bounded to keep cardinality low.
    answersSubmitted = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "answers_submitted_total",
            Help: "Total number of answers submitted",
        },
        []string{"theme", "is_correct", "attempt_number", "used_hint"},
    )
    
    // hintsRequested counts hint requests. No recorder for it is visible in
    // this section; its label is "question_difficulty" whereas questionsAsked
    // uses "difficulty".
    hintsRequested = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "hints_requested_total",
            Help: "Total number of hints requested",
        },
        []string{"theme", "question_difficulty"},
    )
    
    // Score distribution

    // scoreDistribution observes final scores, labelled with a coarse
    // session-duration bucket string (see getDurationBucket).
    scoreDistribution = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "game_scores",
            Help:    "Distribution of game scores",
            Buckets: []float64{0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100},
        },
        []string{"session_duration_bucket"},
    )
    
    // Leaderboard metrics

    // leaderboardUpdates is an unlabelled counter of leaderboard updates.
    leaderboardUpdates = promauto.NewCounter(
        prometheus.CounterOpts{
            Name: "leaderboard_updates_total",
            Help: "Total number of leaderboard updates",
        },
    )
    
    // topScoreChanges counts changes within the top of the leaderboard,
    // labelled by the position band that changed.
    topScoreChanges = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "top_score_changes_total",
            Help: "Changes in top 10 scores",
        },
        []string{"position"}, // top_1, top_5, top_10
    )
)

// Business metrics collection service
// GameMetrics is the entry point services use to record business metrics.
// Its methods update the package-level collectors declared above.
type GameMetrics struct {
    // registry is set by NewGameMetrics; none of the methods visible in this
    // section read it.
    registry prometheus.Registerer
}

// NewGameMetrics returns a GameMetrics whose registry field points at the
// default Prometheus registerer.
func NewGameMetrics() *GameMetrics {
    m := new(GameMetrics)
    m.registry = prometheus.DefaultRegisterer
    return m
}

// RecordGameStart increments games_started_total for the given player type
// and platform.
func (m *GameMetrics) RecordGameStart(playerType, platform string) {
    started := gamesStarted.WithLabelValues(playerType, platform)
    started.Inc()
}

// RecordGameCompletion increments games_completed_total for the completion
// type and platform, and observes the session length (in seconds) in
// game_session_duration_seconds.
func (m *GameMetrics) RecordGameCompletion(completionType, platform string, duration time.Duration) {
    completed := gamesCompleted.WithLabelValues(completionType, platform)
    completed.Inc()

    elapsed := sessionDuration.WithLabelValues(completionType)
    elapsed.Observe(duration.Seconds())
}

// RecordQuestionAsked increments questions_asked_total for the given theme
// and difficulty.
func (m *GameMetrics) RecordQuestionAsked(theme, difficulty string) {
    asked := questionsAsked.WithLabelValues(theme, difficulty)
    asked.Inc()
}

// RecordAnswerSubmitted increments answers_submitted_total. The boolean and
// integer arguments are rendered as strings because Prometheus label values
// are strings; label order is theme, is_correct, attempt_number, used_hint.
func (m *GameMetrics) RecordAnswerSubmitted(theme string, isCorrect bool, attemptNum int, usedHint bool) {
    correctLabel := strconv.FormatBool(isCorrect)
    attemptLabel := strconv.Itoa(attemptNum)
    hintLabel := strconv.FormatBool(usedHint)

    submitted := answersSubmitted.WithLabelValues(theme, correctLabel, attemptLabel, hintLabel)
    submitted.Inc()
}

// RecordFinalScore observes the final score in the game_scores histogram,
// labelled with the coarse duration bucket derived from the session length.
// The sessionDuration parameter shadows the package-level histogram of the
// same name inside this method.
func (m *GameMetrics) RecordFinalScore(score int, sessionDuration time.Duration) {
    scores := scoreDistribution.WithLabelValues(m.getDurationBucket(sessionDuration))
    scores.Observe(float64(score))
}

// getDurationBucket maps a session length onto the label used for the
// session_duration_bucket dimension of the score histogram.
//
// Boundaries are compared on the full duration rather than on whole minutes:
// truncating to minutes first would label a 5m59s session "0-5min" and a
// 15m30s session "5-15min". Upper bounds are inclusive, so exactly 5 minutes
// is still "0-5min". Anything longer than 25 minutes, including sessions
// that exceed 30 minutes, is reported as "25-30min".
func (m *GameMetrics) getDurationBucket(duration time.Duration) string {
    switch {
    case duration <= 5*time.Minute:
        return "0-5min"
    case duration <= 15*time.Minute:
        return "5-15min"
    case duration <= 25*time.Minute:
        return "15-25min"
    default:
        return "25-30min"
    }
}

2. Technical Metrics (Infrastructure)

// Technical metrics for system health
// Package-level infrastructure metrics shared by all services. Most carry a
// "service" label so several services can report the same metric names.
var (
    // HTTP metrics

    // httpRequestsTotal counts handled requests by method, route, status code and service.
    httpRequestsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "http_requests_total",
            Help: "Total number of HTTP requests",
        },
        []string{"method", "endpoint", "status_code", "service"},
    )
    
    // httpRequestDuration observes request latency in seconds using the
    // client library's default buckets.
    httpRequestDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "http_request_duration_seconds",
            Help:    "HTTP request duration",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "endpoint", "service"},
    )
    
    // Database metrics

    // dbConnectionsActive is a gauge of open connections per database and service.
    dbConnectionsActive = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "db_connections_active",
            Help: "Number of active database connections",
        },
        []string{"database", "service"},
    )
    
    // dbQueryDuration observes query latency in seconds (buckets from 1ms to 5s).
    dbQueryDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "db_query_duration_seconds",
            Help:    "Database query duration",
            Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
        },
        []string{"query_type", "table", "service"},
    )
    
    // dbErrors counts failed database operations by error type and service.
    dbErrors = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "db_errors_total",
            Help: "Total number of database errors",
        },
        []string{"error_type", "service"},
    )
    
    // Cache metrics

    // cacheOperations counts cache calls by operation, outcome and service.
    cacheOperations = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "cache_operations_total",
            Help: "Total number of cache operations",
        },
        []string{"operation", "result", "service"}, // get/set/delete, hit/miss/error
    )
    
    // cacheKeyCount is a gauge of the number of keys currently cached.
    // NOTE(review): Prometheus naming guidance reserves the "_total" suffix for
    // counters; renaming this gauge would affect existing dashboards, so confirm
    // before changing the exported name.
    cacheKeyCount = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "cache_keys_total",
            Help: "Number of keys in cache",
        },
        []string{"cache_type", "service"},
    )
    
    // Authentication metrics

    // authenticationAttempts counts login attempts by method, outcome and user type.
    authenticationAttempts = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "authentication_attempts_total",
            Help: "Total authentication attempts",
        },
        []string{"method", "result", "user_type"}, // jwt/oauth, success/failure, player/admin
    )
    
    // tokenOperations counts JWT validations and refreshes by outcome.
    tokenOperations = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "token_operations_total",
            Help: "JWT token operations",
        },
        []string{"operation", "result"}, // validate/refresh, success/failure
    )
)

// Fiber middleware for HTTP metrics
// PrometheusMiddleware returns a Fiber handler that records
// http_requests_total and http_request_duration_seconds for every request,
// labelled with serviceName.
//
// The route pattern (c.Route().Path) is used for the endpoint label instead
// of the raw URL so path parameters do not create one series per value. It
// is read after c.Next() so it reflects the route that actually handled the
// request.
//
// When the downstream chain returns an error, Fiber's error handler has not
// yet written the response, so the status on the response object is still
// the pre-error value (normally 200). In that case the status is derived
// from the error the same way Fiber's default error handler does: the code
// of a *fiber.Error, otherwise 500. A custom ErrorHandler that maps errors
// differently will not be reflected here.
//
// The error from the chain is returned unchanged.
func PrometheusMiddleware(serviceName string) fiber.Handler {
    return func(c *fiber.Ctx) error {
        start := time.Now()

        // Process request
        err := c.Next()

        duration := time.Since(start).Seconds()
        method := c.Method()
        endpoint := c.Route().Path

        status := c.Response().StatusCode()
        if err != nil {
            status = fiber.StatusInternalServerError
            var fiberErr *fiber.Error
            if errors.As(err, &fiberErr) {
                status = fiberErr.Code
            }
        }

        httpRequestsTotal.WithLabelValues(
            method,
            endpoint,
            strconv.Itoa(status),
            serviceName,
        ).Inc()

        httpRequestDuration.WithLabelValues(
            method,
            endpoint,
            serviceName,
        ).Observe(duration)

        return err
    }
}

// Database metrics middleware for Ent
// MetricsHook builds an Ent mutation hook that reports database metrics.
type MetricsHook struct {
    // serviceName is used as the "service" label on the recorded metrics.
    serviceName string
}

// NewMetricsHook returns a MetricsHook that labels its metrics with
// serviceName.
func NewMetricsHook(serviceName string) *MetricsHook {
    h := MetricsHook{serviceName: serviceName}
    return &h
}

// Hook returns an Ent hook that times create, update and delete mutations.
//
// For each mutation it observes the elapsed time in
// db_query_duration_seconds (labelled with the lower-cased operation name,
// the mutated type and the service name) and, when the mutation fails,
// increments db_errors_total with error_type "query_error". The mutation's
// result and error are passed through unchanged.
func (h *MetricsHook) Hook() ent.Hook {
    // Operations the hook applies to; all other operations bypass it.
    const mutationOps = ent.OpCreate | ent.OpUpdate | ent.OpUpdateOne | ent.OpDelete | ent.OpDeleteOne

    timed := func(next ent.Mutator) ent.Mutator {
        return ent.MutateFunc(func(ctx context.Context, m ent.Mutation) (ent.Value, error) {
            startedAt := time.Now()
            value, err := next.Mutate(ctx, m)
            elapsed := time.Since(startedAt).Seconds()

            opLabel := strings.ToLower(m.Op().String())
            dbQueryDuration.WithLabelValues(opLabel, m.Type(), h.serviceName).Observe(elapsed)

            if err != nil {
                dbErrors.WithLabelValues("query_error", h.serviceName).Inc()
            }

            return value, err
        })
    }

    return hook.On(timed, mutationOps)
}

Frontend Metrics Collection

Web Application Metrics

// Frontend metrics collection
/**
 * Buffers frontend metric events and ships them in batches to a backend
 * endpoint.
 *
 * Constructing a collector has global side effects: it starts a periodic
 * flush timer, registers page-unload listeners, installs PerformanceObservers
 * and replaces window.fetch with an instrumented wrapper. Create it once per
 * page, not once per render.
 */
class MetricsCollector {
  private endpoint: string
  private batchSize: number = 50
  private flushInterval: number = 30000 // 30 seconds
  // Upper bound on buffered events while the endpoint keeps failing; the
  // oldest events are dropped first once it is exceeded.
  private maxQueueSize: number = 1000
  private metrics: MetricEvent[] = []
  // True while a regular (non-unload) flush request is outstanding.
  private flushInFlight: boolean = false
  // Running layout-shift total across all observer callbacks.
  private layoutShiftTotal: number = 0
  // Un-instrumented fetch, captured before interceptFetch() patches
  // window.fetch. flush() uses it so the collector's own requests are not
  // measured and fed back into the queue.
  private nativeFetch: typeof window.fetch = window.fetch.bind(window)

  constructor(endpoint: string) {
    this.endpoint = endpoint
    this.startBatchFlush()
    this.setupPerformanceObserver()
  }

  // User interaction metrics. Caller-supplied properties are spread last and
  // therefore override the default fields on key collisions.
  trackUserAction(action: string, properties: Record<string, any> = {}): void {
    this.addMetric({
      type: 'user_action',
      action,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      user_agent: navigator.userAgent,
      ...properties,
    })
  }

  // Game-specific metrics
  trackGameEvent(event: GameEvent): void {
    this.addMetric({
      type: 'game_event',
      event: event.type,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      game_session_id: event.gameSessionId,
      properties: event.properties,
    })
  }

  // Performance metrics. Any fields beyond name/value (for example the `url`
  // and `status` supplied by the fetch interceptor) are forwarded instead of
  // being dropped; a supplied `url` replaces the default page path.
  trackPerformance(metric: PerformanceMetric): void {
    const { name, value, ...details } = metric
    this.addMetric({
      type: 'performance',
      metric: name,
      value,
      timestamp: Date.now(),
      url: window.location.pathname,
      ...details,
    })
  }

  // Error tracking
  trackError(error: Error, context: string): void {
    this.addMetric({
      type: 'error',
      error_message: error.message,
      error_stack: error.stack,
      context,
      timestamp: Date.now(),
      url: window.location.pathname,
      user_agent: navigator.userAgent,
    })
  }

  private setupPerformanceObserver(): void {
    // Web Vitals tracking
    if ('PerformanceObserver' in window) {
      // Largest Contentful Paint
      this.observeEntries('largest-contentful-paint', (entries) => {
        entries.forEach((entry) => {
          this.trackPerformance({
            name: 'largest_contentful_paint',
            value: entry.startTime,
          })
        })
      })

      // First Input Delay
      this.observeEntries('first-input', (entries) => {
        entries.forEach((entry) => {
          const input = entry as PerformanceEventTiming
          this.trackPerformance({
            name: 'first_input_delay',
            value: input.processingStart - input.startTime,
          })
        })
      })

      // Cumulative Layout Shift: accumulate across callbacks so the reported
      // value is a running total rather than the sum of a single batch.
      this.observeEntries('layout-shift', (entries) => {
        entries.forEach((entry) => {
          const shift = entry as PerformanceEntry & { hadRecentInput: boolean; value: number }
          if (!shift.hadRecentInput) {
            this.layoutShiftTotal += shift.value
          }
        })
        this.trackPerformance({
          name: 'cumulative_layout_shift',
          value: this.layoutShiftTotal,
        })
      })
    }

    // API response time tracking
    this.interceptFetch()
  }

  // Registers a PerformanceObserver for one entry type. A failure to observe
  // (e.g. an entry type the browser rejects) is swallowed so it cannot abort
  // construction of the collector.
  private observeEntries(entryType: string, handle: (entries: PerformanceEntry[]) => void): void {
    try {
      new PerformanceObserver((list) => handle(list.getEntries())).observe({ entryTypes: [entryType] })
    } catch (error) {
      // Best effort: this vital is simply not collected.
    }
  }

  // Replaces window.fetch with a wrapper that records the duration and status
  // of every request. Failed requests are recorded with status 0 and the
  // original error is rethrown.
  private interceptFetch(): void {
    const nativeFetch = this.nativeFetch
    window.fetch = async (...args) => {
      const start = performance.now()
      const url = this.requestUrl(args[0])

      try {
        const response = await nativeFetch(...args)
        const duration = performance.now() - start

        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          url,
          status: response.status,
        })

        return response
      } catch (error) {
        const duration = performance.now() - start

        this.trackError(error as Error, `API request to ${url}`)
        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          url,
          status: 0,
        })

        throw error
      }
    }
  }

  // Extracts the URL from any fetch() input. Calling toString() on a Request
  // object yields "[object Request]", so Request inputs need their .url.
  private requestUrl(input: RequestInfo | URL): string {
    if (typeof input === 'string') return input
    if (input instanceof URL) return input.href
    return input.url
  }

  private addMetric(metric: MetricEvent): void {
    this.metrics.push(metric)

    if (this.metrics.length >= this.batchSize) {
      this.flush()
    }
  }

  // Sends the buffered events. With `keepalive` set (page unload) the request
  // is marked keepalive so the browser may complete it after the page is gone,
  // and it is sent even if a regular flush is still outstanding.
  private async flush(keepalive: boolean = false): Promise<void> {
    if (this.metrics.length === 0) return
    if (this.flushInFlight && !keepalive) return

    const batch = this.metrics
    this.metrics = []
    this.flushInFlight = true

    try {
      await this.nativeFetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ metrics: batch }),
        keepalive,
      })
    } catch (error) {
      console.error('Failed to send metrics:', error)
      // Re-queue the failed batch ahead of newer events, then trim the oldest
      // entries so a long outage cannot grow the queue without bound.
      this.metrics = [...batch, ...this.metrics]
      const overflow = this.metrics.length - this.maxQueueSize
      if (overflow > 0) {
        this.metrics.splice(0, overflow)
      }
    } finally {
      this.flushInFlight = false
    }
  }

  private startBatchFlush(): void {
    setInterval(() => {
      this.flush()
    }, this.flushInterval)

    // Flush on page unload. pagehide is registered as well as beforeunload;
    // whichever fires second finds an empty queue and returns immediately.
    const flushOnExit = () => {
      this.flush(true)
    }
    window.addEventListener('beforeunload', flushOnExit)
    window.addEventListener('pagehide', flushOnExit)
  }

  private getSessionId(): string {
    // Implementation depends on session management
    return sessionStorage.getItem('session_id') || 'anonymous'
  }
}

// Game-specific metrics tracking
/**
 * Thin, game-aware facade over MetricsCollector: each method turns a gameplay
 * moment into a `trackGameEvent` call with a fixed event type and a
 * snake_case property bag.
 */
export class GameMetricsTracker {
  constructor(private collector: MetricsCollector) {}

  /** Emits `game_started` with the player name and detected platform. */
  trackGameStart(gameSessionId: string, playerName: string): void {
    this.emit('game_started', gameSessionId, {
      player_name: playerName,
      platform: this.getPlatform(),
    })
  }

  /** Emits `question_displayed`, stamping the current time as display_time. */
  trackQuestionDisplayed(gameSessionId: string, questionId: string, theme: string): void {
    this.emit('question_displayed', gameSessionId, {
      question_id: questionId,
      theme,
      display_time: Date.now(),
    })
  }

  /** Emits `answer_submitted`; timeTaken is reported as time_taken_ms. */
  trackAnswerSubmitted(
    gameSessionId: string,
    questionId: string,
    isCorrect: boolean,
    attemptNumber: number,
    timeTaken: number,
    usedHint: boolean
  ): void {
    this.emit('answer_submitted', gameSessionId, {
      question_id: questionId,
      is_correct: isCorrect,
      attempt_number: attemptNumber,
      time_taken_ms: timeTaken,
      used_hint: usedHint,
    })
  }

  /** Emits `hint_requested`, stamping the current time as request_time. */
  trackHintRequested(gameSessionId: string, questionId: string): void {
    this.emit('hint_requested', gameSessionId, {
      question_id: questionId,
      request_time: Date.now(),
    })
  }

  /** Emits `game_completed` with the final tally and detected platform. */
  trackGameCompleted(
    gameSessionId: string,
    finalScore: number,
    questionsAnswered: number,
    completionType: 'normal' | 'timeout' | 'abandoned'
  ): void {
    this.emit('game_completed', gameSessionId, {
      final_score: finalScore,
      questions_answered: questionsAnswered,
      completion_type: completionType,
      platform: this.getPlatform(),
    })
  }

  // Single funnel into the collector so every event has the same shape.
  private emit(
    type: GameEvent['type'],
    gameSessionId: string,
    properties: GameEvent['properties']
  ): void {
    this.collector.trackGameEvent({ type, gameSessionId, properties })
  }

  // Platform detection: the user-agent checks run first, so a mobile user
  // agent wins over the Wails desktop marker.
  private getPlatform(): string {
    const userAgent = navigator.userAgent
    if (/Android/i.test(userAgent)) {
      return 'android'
    }
    if (/iPhone|iPad|iPod/i.test(userAgent)) {
      return 'ios'
    }
    return window.wails ? 'desktop' : 'web' // window.wails marks Wails apps
  }
}

// Usage in React components
/**
 * React hook exposing the metrics tracking functions to components.
 *
 * The collector and tracker are created lazily, once per component instance.
 * Writing `useRef(new MetricsCollector(...))` would evaluate the constructor
 * on every render (React only keeps the first value, but the argument is
 * still evaluated), and each construction starts a flush timer, adds unload
 * listeners and wraps window.fetch again.
 *
 * The returned functions are freshly bound on each call, so their identities
 * change between renders.
 */
export const useGameMetrics = () => {
  const collectorRef = useRef<MetricsCollector | null>(null)
  const trackerRef = useRef<GameMetricsTracker | null>(null)

  if (collectorRef.current === null) {
    collectorRef.current = new MetricsCollector('/api/v1/metrics')
  }
  if (trackerRef.current === null) {
    trackerRef.current = new GameMetricsTracker(collectorRef.current)
  }

  const collector = collectorRef.current
  const gameTracker = trackerRef.current

  return {
    trackGameStart: gameTracker.trackGameStart.bind(gameTracker),
    trackQuestionDisplayed: gameTracker.trackQuestionDisplayed.bind(gameTracker),
    trackAnswerSubmitted: gameTracker.trackAnswerSubmitted.bind(gameTracker),
    trackHintRequested: gameTracker.trackHintRequested.bind(gameTracker),
    trackGameCompleted: gameTracker.trackGameCompleted.bind(gameTracker),
    trackUserAction: collector.trackUserAction.bind(collector),
    trackError: collector.trackError.bind(collector),
  }
}

Distributed Tracing

OpenTelemetry Integration

Backend Tracing Setup

// OpenTelemetry tracing setup
package observability

import (
    "context"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/exporters/jaeger"
    "go.opentelemetry.io/otel/sdk/resource"
    "go.opentelemetry.io/otel/sdk/trace"
    "go.opentelemetry.io/otel/semconv/v1.12.0/httpconv"
    "go.opentelemetry.io/otel/semconv/v1.12.0/netconv"
)

// TracingConfig carries the settings InitTracing needs to build a tracer
// provider.
type TracingConfig struct {
    // ServiceName and ServiceVersion become the service.name and
    // service.version resource attributes.
    ServiceName, ServiceVersion string
    // Environment becomes the "environment" resource attribute.
    Environment string
    // JaegerEndpoint is the collector endpoint spans are exported to.
    JaegerEndpoint string
    // SampleRate is handed to the trace-ID ratio sampler.
    // NOTE(review): the zero value presumably samples nothing - confirm
    // callers always set it.
    SampleRate float64
}

// InitTracing builds a tracer provider that batches spans to the Jaeger
// collector at config.JaegerEndpoint, tags them with the service name,
// version and environment, and samples by trace-ID ratio using
// config.SampleRate. The provider is installed as the global OpenTelemetry
// tracer provider before being returned.
//
// An error from creating the exporter or the resource is returned as-is with
// a nil provider. NOTE(review): callers presumably need to shut the returned
// provider down on exit so batched spans are flushed - confirm at call sites.
func InitTracing(config TracingConfig) (*trace.TracerProvider, error) {
    // Exporter that ships spans to the Jaeger collector.
    collector := jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(config.JaegerEndpoint))
    exporter, err := jaeger.New(collector)
    if err != nil {
        return nil, err
    }

    // Resource describing the emitting service.
    serviceAttrs := resource.WithAttributes(
        attribute.String("service.name", config.ServiceName),
        attribute.String("service.version", config.ServiceVersion),
        attribute.String("environment", config.Environment),
    )
    serviceResource, err := resource.New(context.Background(), serviceAttrs)
    if err != nil {
        return nil, err
    }

    provider := trace.NewTracerProvider(
        trace.WithBatcher(exporter),
        trace.WithResource(serviceResource),
        trace.WithSampler(trace.TraceIDRatioBased(config.SampleRate)),
    )
    otel.SetTracerProvider(provider)

    return provider, nil
}

// Fiber middleware for distributed tracing
// TracingMiddleware returns a Fiber handler that wraps every request in a
// span created by the tracer named serviceName.
//
// The span is started from c.UserContext() rather than c.Context(): the
// latter is the fasthttp request context, which does not carry anything an
// earlier middleware stored via SetUserContext. The span's context is stored
// back with SetUserContext so handlers can create child spans from
// c.UserContext().
//
// The span is named "<METHOD> <route>" and renamed after c.Next(), because
// only then does c.Route() describe the route that handled the request
// rather than the middleware's own mount point.
//
// Errors returned by the chain are recorded on the span and flagged with
// error=true (matching TraceServiceOperation), then returned unchanged.
// NOTE(review): when the chain returns an error, the status code read here
// is presumably the pre-error value because Fiber's error handler runs
// later - confirm before alerting on http.status_code from these spans.
func TracingMiddleware(serviceName string) fiber.Handler {
    tracer := otel.Tracer(serviceName)

    return func(c *fiber.Ctx) error {
        // Start span
        ctx, span := tracer.Start(c.UserContext(), fmt.Sprintf("%s %s", c.Method(), c.Route().Path))
        defer span.End()

        // Set span attributes
        span.SetAttributes(
            httpconv.HTTPMethodKey.String(c.Method()),
            httpconv.HTTPURLKey.String(string(c.Request().URI().FullURI())),
            httpconv.HTTPUserAgentKey.String(string(c.Request().Header.UserAgent())),
            netconv.NetPeerIPKey.String(c.IP()),
        )

        // Add to context
        c.SetUserContext(ctx)

        // Process request
        err := c.Next()

        // The matched route is only known once the chain has run.
        span.SetName(fmt.Sprintf("%s %s", c.Method(), c.Route().Path))

        // Set response attributes
        span.SetAttributes(
            httpconv.HTTPStatusCodeKey.Int(c.Response().StatusCode()),
        )

        if err != nil {
            span.RecordError(err)
            span.SetAttributes(attribute.Bool("error", true))
        }

        return err
    }
}

// Service-level tracing helpers
// TraceServiceOperation runs fn inside a span named operation, created by
// the tracer for serviceName as a child of ctx. fn receives the span's
// context. If fn fails, the error is recorded on the span, the span is
// flagged with error=true, and the error is returned to the caller.
func TraceServiceOperation(ctx context.Context, serviceName, operation string, fn func(context.Context) error) error {
    spanCtx, span := otel.Tracer(serviceName).Start(ctx, operation)
    defer span.End()

    if err := fn(spanCtx); err != nil {
        span.RecordError(err)
        span.SetAttributes(attribute.Bool("error", true))
        return err
    }

    return nil
}

// Database tracing for Ent
// TraceDatabaseOperation runs fn inside a span named
// "db.<operation>.<table>", created by the "database" tracer as a child of
// ctx. The span carries db.operation, db.table and db.system attributes;
// db.system is hard-coded to "postgresql".
//
// If fn fails, the error is recorded on the span and the span is flagged
// with error=true, the same convention TraceServiceOperation uses, so failed
// spans can be found with one query regardless of which helper created them.
// The error is returned unchanged.
func TraceDatabaseOperation(ctx context.Context, operation, table string, fn func(context.Context) error) error {
    tracer := otel.Tracer("database")
    ctx, span := tracer.Start(ctx, fmt.Sprintf("db.%s.%s", operation, table))
    defer span.End()

    span.SetAttributes(
        attribute.String("db.operation", operation),
        attribute.String("db.table", table),
        attribute.String("db.system", "postgresql"),
    )

    err := fn(ctx)
    if err != nil {
        span.RecordError(err)
        span.SetAttributes(attribute.Bool("error", true))
    }

    return err
}

Frontend Tracing Integration

// Frontend tracing with OpenTelemetry
import { WebTracerProvider } from '@opentelemetry/sdk-trace-web'
import { getWebAutoInstrumentations } from '@opentelemetry/auto-instrumentations-web'
import { JaegerExporter } from '@opentelemetry/exporter-jaeger'
import { registerInstrumentations } from '@opentelemetry/instrumentation'

/**
 * Sets up OpenTelemetry tracing in the browser: creates and registers a
 * WebTracerProvider, exports spans in batches, and enables the document-load,
 * user-interaction and fetch auto-instrumentations.
 *
 * NOTE(review): Resource, BatchSpanProcessor, trace and SpanStatusCode are
 * used below but are not among the imports shown with this snippet, and the
 * Jaeger exporter package is presumably Node-oriented - confirm it works in
 * the browser bundle.
 */
export class FrontendTracing {
  private provider: WebTracerProvider

  constructor(config: TracingConfig) {
    this.provider = new WebTracerProvider({
      resource: new Resource({
        'service.name': config.serviceName,
        'service.version': config.serviceVersion,
      }),
    })

    // Configure Jaeger exporter
    const jaegerExporter = new JaegerExporter({
      endpoint: config.jaegerEndpoint,
    })

    this.provider.addSpanProcessor(
      new BatchSpanProcessor(jaegerExporter)
    )

    // Register provider
    this.provider.register()

    // The API base URL is matched literally. Passing it to RegExp unescaped
    // would let "." match any character, so trace headers could be attached
    // to requests for look-alike hosts.
    const apiBaseUrlPattern = new RegExp(
      config.apiBaseUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
    )

    // Auto-instrument browser APIs
    registerInstrumentations({
      instrumentations: [
        getWebAutoInstrumentations({
          '@opentelemetry/instrumentation-document-load': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-user-interaction': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-fetch': {
            enabled: true,
            propagateTraceHeaderCorsUrls: [
              apiBaseUrlPattern,
            ],
          },
        }),
      ],
    })
  }

  /**
   * Runs `fn` inside an active span named `action` on the 'game-frontend'
   * tracer, attaching `properties` as span attributes. If `fn` rejects, the
   * exception is recorded, the span status is set to ERROR and the error is
   * rethrown. The span is always ended.
   */
  traceGameAction(action: string, properties: Record<string, any>, fn: () => Promise<void>): Promise<void> {
    const tracer = trace.getTracer('game-frontend')

    return tracer.startActiveSpan(action, async (span) => {
      try {
        // Set span attributes in one call, as useTracing does
        span.setAttributes(properties)

        await fn()
      } catch (error) {
        span.recordException(error as Error)
        span.setStatus({ code: SpanStatusCode.ERROR })
        throw error
      } finally {
        span.end()
      }
    })
  }
}

// React hook for tracing
/**
 * React hook returning `traceUserAction`, which runs an async function inside
 * an active span named `user.<action>` on the 'react-components' tracer.
 *
 * On failure the exception is recorded, the span status is set to ERROR (the
 * same handling as FrontendTracing.traceGameAction) and the error is
 * rethrown. The span is always ended.
 *
 * The tracer is looked up inside the callback instead of being a hook
 * dependency: getTracer may hand back a different object on each call, which
 * would give the callback a new identity on every render.
 */
export const useTracing = () => {
  const traceUserAction = useCallback(
    async (action: string, properties: Record<string, any>, fn: () => Promise<void>) => {
      const tracer = trace.getTracer('react-components')

      return tracer.startActiveSpan(`user.${action}`, async (span) => {
        try {
          span.setAttributes(properties)
          await fn()
        } catch (error) {
          span.recordException(error as Error)
          span.setStatus({ code: SpanStatusCode.ERROR })
          throw error
        } finally {
          span.end()
        }
      })
    },
    []
  )

  return { traceUserAction }
}

Logging Strategy

Structured Logging Implementation

Backend Logging

// Structured logging with zerolog
package logging

import (
    "os"
    "time"
    "github.com/rs/zerolog"
    "github.com/rs/zerolog/log"
)

// Logger wraps a zerolog.Logger and exposes event-specific helpers (game,
// API, database, authentication, security, error) that emit a consistent set
// of fields.
type Logger struct {
    // logger is the configured zerolog instance built by NewLogger.
    logger zerolog.Logger
}

// LogConfig holds the settings NewLogger uses to build a Logger.
type LogConfig struct {
    // Level is parsed as a zerolog level name; NewLogger falls back to info
    // when it cannot be parsed.
    Level string
    // Environment selects the output format ("development" gets console
    // output, anything else JSON) and is attached to every entry.
    Environment string
    // ServiceName and Version are attached to every entry as "service" and
    // "version".
    ServiceName, Version string
}

// NewLogger builds a Logger from config.
//
// It mutates zerolog's package-level settings: the global level is set to
// the parsed config.Level (info if parsing fails) and the time field format
// to RFC3339Nano. Output goes to stdout, as human-readable console lines
// when config.Environment is "development" and as JSON otherwise. Every
// entry carries a timestamp plus the service, version and environment
// fields.
func NewLogger(config LogConfig) *Logger {
    level, parseErr := zerolog.ParseLevel(config.Level)
    if parseErr != nil {
        // Unknown or empty level names fall back to info.
        level = zerolog.InfoLevel
    }
    zerolog.SetGlobalLevel(level)
    zerolog.TimeFieldFormat = time.RFC3339Nano

    // JSON on stdout is the default; development swaps in the console writer.
    sink := zerolog.New(os.Stdout)
    if config.Environment == "development" {
        sink = zerolog.New(zerolog.ConsoleWriter{
            Out:        os.Stdout,
            TimeFormat: "15:04:05",
        })
    }
    timestamped := sink.With().Timestamp().Logger()

    // Service metadata attached to every entry.
    withMetadata := timestamped.With().
        Str("service", config.ServiceName).
        Str("version", config.Version).
        Str("environment", config.Environment).
        Logger()

    return &Logger{logger: withMetadata}
}

// Structured logging methods
// GameEvent logs an info-level entry with event_type "game", the event name,
// the game session and user IDs, and the caller-supplied properties as
// additional fields.
func (l *Logger) GameEvent(event string, gameSessionID, userID string, properties map[string]interface{}) {
    entry := l.logger.Info().
        Str("event_type", "game").
        Str("event", event)

    entry = entry.
        Str("game_session_id", gameSessionID).
        Str("user_id", userID)

    entry.Fields(properties).Msg("Game event occurred")
}

// APIRequest logs an info-level entry with event_type "api_request"
// describing one handled request: method, path, status code, duration
// (written under "duration_ms") and user ID.
func (l *Logger) APIRequest(method, path string, statusCode int, duration time.Duration, userID string) {
    entry := l.logger.Info().Str("event_type", "api_request")

    entry = entry.
        Str("method", method).
        Str("path", path).
        Int("status_code", statusCode).
        Dur("duration_ms", duration)

    entry.Str("user_id", userID).Msg("API request processed")
}

// DatabaseOperation logs a debug-level entry with event_type "database":
// the operation, table, duration (written under "duration_ms") and number of
// rows affected.
func (l *Logger) DatabaseOperation(operation, table string, duration time.Duration, rowsAffected int64) {
    entry := l.logger.Debug().Str("event_type", "database")

    entry = entry.
        Str("operation", operation).
        Str("table", table)

    entry.
        Dur("duration_ms", duration).
        Int64("rows_affected", rowsAffected).
        Msg("Database operation completed")
}

// AuthenticationEvent logs an authentication attempt with event_type
// "authentication". Successful attempts are logged at info level, failures
// at warn level. The entry carries the event name, user ID, user type, the
// success flag and every key/value pair from details.
//
// details is copied into a map[string]interface{} before being handed to
// zerolog: Event.Fields only understands map[string]interface{} and
// []interface{}, so passing the map[string]string directly would drop the
// details (or fail to compile on zerolog versions with the narrower
// signature).
//
// Exactly one event is created per call, so no unused event is allocated and
// discarded on the failure path.
func (l *Logger) AuthenticationEvent(event, userID, userType string, success bool, details map[string]string) {
    var entry *zerolog.Event
    if success {
        entry = l.logger.Info()
    } else {
        entry = l.logger.Warn()
    }

    fields := make(map[string]interface{}, len(details))
    for key, value := range details {
        fields[key] = value
    }

    entry.
        Str("event_type", "authentication").
        Str("event", event).
        Str("user_id", userID).
        Str("user_type", userType).
        Bool("success", success).
        Fields(fields).
        Msg("Authentication event")
}

// SecurityEvent logs a warn-level entry with event_type "security": the
// event name, user ID, source IP address, severity and caller-supplied
// details. The log level is always warn; severity is recorded only as a
// field.
func (l *Logger) SecurityEvent(event, userID, ipAddress string, severity string, details map[string]interface{}) {
    entry := l.logger.Warn().
        Str("event_type", "security").
        Str("event", event)

    entry = entry.
        Str("user_id", userID).
        Str("ip_address", ipAddress).
        Str("severity", severity)

    entry.Fields(details).Msg("Security event detected")
}

// Error logs err at error level together with a free-form context string
// and caller-supplied fields.
func (l *Logger) Error(err error, context string, fields map[string]interface{}) {
    entry := l.logger.Error().Err(err)
    entry = entry.Str("context", context)
    entry.Fields(fields).Msg("Error occurred")
}

// Fiber middleware for request logging
// RequestLoggingMiddleware returns a Fiber handler that logs one APIRequest
// entry per request after the rest of the chain has run: method, route
// pattern, response status code, elapsed time and user ID.
//
// The user ID is read from the "user_id" local. When that local is missing
// or is not a string, "anonymous" is logged instead; an unchecked type
// assertion here would panic inside the request path whenever an auth
// middleware stores a non-string ID.
//
// The error from the chain is returned unchanged.
func RequestLoggingMiddleware(logger *Logger) fiber.Handler {
    return func(c *fiber.Ctx) error {
        start := time.Now()

        // Process request
        err := c.Next()

        // Log request
        duration := time.Since(start)
        userID, ok := c.Locals("user_id").(string)
        if !ok {
            userID = "anonymous"
        }

        logger.APIRequest(
            c.Method(),
            c.Route().Path,
            c.Response().StatusCode(),
            duration,
            userID,
        )

        return err
    }
}

Frontend Logging

// Frontend structured logging
/** Shape of a single buffered frontend log record. */
interface LogEntry {
  // When the entry was created, as a string
  timestamp: string
  // Severity of the entry
  level: 'debug' | 'info' | 'warn' | 'error'
  // Human-readable message
  message: string
  // Optional free-form context tag supplied by the caller
  context?: string
  // Optional identifiers attached to the entry
  userId?: string
  sessionId?: string
  gameSessionId?: string
  // Serializable summary of an Error object, when one is attached
  error?: {
    name: string
    message: string
    stack?: string
  }
  // Arbitrary additional structured data
  properties?: Record<string, any>
}

/**
 * Buffers structured log entries in memory and ships them in batches to an
 * HTTP endpoint.
 *
 * A batch is sent when the buffer reaches `maxBufferSize`, every
 * `flushInterval` milliseconds, and (via `navigator.sendBeacon`) when the page
 * fires `beforeunload`. Each entry is stamped with the user, session and game
 * session identifiers currently stored in `sessionStorage`.
 */
export class FrontendLogger {
  private buffer: LogEntry[] = []
  private endpoint: string
  private maxBufferSize: number = 100
  private flushInterval: number = 30000
  // Upper bound on entries put back into the buffer after a failed flush
  private maxRequeuedEntries: number = 50

  /**
   * @param endpoint URL that receives `{ logs: LogEntry[] }` as a JSON POST body
   */
  constructor(endpoint: string) {
    this.endpoint = endpoint
    this.startPeriodicFlush()
  }

  debug(message: string, context?: string, properties?: Record<string, any>): void {
    this.log('debug', message, context, properties)
  }

  info(message: string, context?: string, properties?: Record<string, any>): void {
    this.log('info', message, context, properties)
  }

  warn(message: string, context?: string, properties?: Record<string, any>): void {
    this.log('warn', message, context, properties)
  }

  /**
   * Logs at error level. When an Error object is supplied, its name, message
   * and stack are copied into the entry so they survive JSON serialization.
   */
  error(message: string, error?: Error, context?: string, properties?: Record<string, any>): void {
    const entry = this.createEntry('error', message, context, properties)

    if (error) {
      entry.error = {
        name: error.name,
        message: error.message,
        stack: error.stack,
      }
    }

    this.enqueue(entry)
  }

  // Game-specific logging methods
  logGameEvent(event: string, gameSessionId: string, properties?: Record<string, any>): void {
    this.info(`Game event: ${event}`, 'game', {
      gameSessionId,
      ...properties,
    })
  }

  logUserAction(action: string, properties?: Record<string, any>): void {
    this.info(`User action: ${action}`, 'user', properties)
  }

  logPerformanceMetric(metric: string, value: number, unit: string): void {
    this.debug(`Performance metric: ${metric}`, 'performance', {
      metric,
      value,
      unit,
      url: window.location.pathname,
    })
  }

  private log(level: LogEntry['level'], message: string, context?: string, properties?: Record<string, any>): void {
    this.enqueue(this.createEntry(level, message, context, properties))
  }

  // Builds an entry stamped with the current time and the stored identifiers.
  private createEntry(
    level: LogEntry['level'],
    message: string,
    context?: string,
    properties?: Record<string, any>,
  ): LogEntry {
    return {
      timestamp: new Date().toISOString(),
      level,
      message,
      context,
      userId: this.getUserId(),
      sessionId: this.getSessionId(),
      gameSessionId: this.getGameSessionId(),
      properties,
    }
  }

  // Appends an entry to the buffer and flushes if the buffer is now full.
  private enqueue(entry: LogEntry): void {
    this.buffer.push(entry)
    this.checkFlushConditions()
  }

  private checkFlushConditions(): void {
    if (this.buffer.length >= this.maxBufferSize) {
      // flush() handles its own failures, so the promise is intentionally not awaited
      void this.flush()
    }
  }

  /**
   * Sends the buffered entries in one POST request. If the request fails,
   * or the server answers with a non-2xx status, the most recent
   * `maxRequeuedEntries` entries of the batch are put back at the front of
   * the buffer for the next attempt.
   */
  private async flush(): Promise<void> {
    if (this.buffer.length === 0) return

    // Swap the buffer out first so entries logged during the request are kept separately
    const logs = this.buffer
    this.buffer = []

    try {
      const response = await fetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ logs }),
      })

      // fetch only rejects on network errors; treat HTTP error statuses as failures too
      if (!response.ok) {
        throw new Error(`Log endpoint responded with HTTP ${response.status}`)
      }
    } catch (error) {
      console.error('Failed to send logs:', error)
      // Re-queue logs for retry (keep only most recent to avoid memory issues)
      this.buffer.unshift(...logs.slice(-this.maxRequeuedEntries))
    }
  }

  private startPeriodicFlush(): void {
    setInterval(() => {
      void this.flush()
    }, this.flushInterval)

    // Flush on page unload
    window.addEventListener('beforeunload', () => {
      // Use sendBeacon for reliable delivery during page unload
      if (this.buffer.length > 0) {
        const queued = navigator.sendBeacon(this.endpoint, JSON.stringify({ logs: this.buffer }))
        // Drop the entries once the browser has accepted them, so they are not
        // sent a second time if the unload is cancelled and the page lives on
        if (queued) {
          this.buffer = []
        }
      }
    })
  }

  private getUserId(): string | undefined {
    // Implementation depends on auth system
    return sessionStorage.getItem('user_id') || undefined
  }

  private getSessionId(): string | undefined {
    return sessionStorage.getItem('session_id') || undefined
  }

  private getGameSessionId(): string | undefined {
    return sessionStorage.getItem('game_session_id') || undefined
  }
}

Monitoring Dashboards

Grafana Dashboard Configuration

1. Business Intelligence Dashboard

{
  "dashboard": {
    "title": "Know Foolery - Business Intelligence",
    "panels": [
      {
        "title": "Active Games",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(games_started_total[5m])) * 300",
            "legendFormat": "Games per 5min"
          }
        ]
      },
      {
        "title": "Game Completion Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(games_completed_total{completion_type=\"normal\"}[1h])) / sum(rate(games_started_total[1h])) * 100",
            "legendFormat": "Completion %"
          }
        ]
      },
      {
        "title": "Average Session Duration",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.5, rate(game_session_duration_seconds_bucket[5m]))",
            "legendFormat": "Median"
          },
          {
            "expr": "histogram_quantile(0.95, rate(game_session_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Question Accuracy by Theme",
        "type": "heatmap",
        "targets": [
          {
            "expr": "sum by (theme) (rate(answers_submitted_total{is_correct=\"true\"}[1h])) / sum by (theme) (rate(answers_submitted_total[1h]))",
            "legendFormat": "{{theme}}"
          }
        ]
      },
      {
        "title": "Hint Usage Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(hints_requested_total[5m])) / sum(rate(questions_asked_total[5m])) * 100",
            "legendFormat": "Hint Usage %"
          }
        ]
      },
      {
        "title": "Score Distribution",
        "type": "histogram",
        "targets": [
          {
            "expr": "histogram_quantile(0.25, rate(game_scores_bucket[1h]))",
            "legendFormat": "25th percentile"
          },
          {
            "expr": "histogram_quantile(0.5, rate(game_scores_bucket[1h]))",
            "legendFormat": "Median"
          },
          {
            "expr": "histogram_quantile(0.75, rate(game_scores_bucket[1h]))",
            "legendFormat": "75th percentile"
          }
        ]
      }
    ]
  }
}

2. Technical Performance Dashboard

{
  "dashboard": {
    "title": "Know Foolery - Technical Performance",
    "panels": [
      {
        "title": "API Response Times",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, service) (rate(http_request_duration_seconds_bucket[5m])))",
            "legendFormat": "{{service}} - 95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m])) * 100",
            "legendFormat": "{{service}} - Error %"
          }
        ]
      },
      {
        "title": "Database Performance",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, service) (rate(db_query_duration_seconds_bucket[5m])))",
            "legendFormat": "{{service}} - Query Time"
          }
        ]
      },
      {
        "title": "Cache Hit Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(cache_operations_total{result=\"hit\"}[5m])) / sum(rate(cache_operations_total{operation=\"get\"}[5m])) * 100",
            "legendFormat": "Hit Rate %"
          }
        ]
      },
      {
        "title": "Authentication Success Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum by (method) (rate(authentication_attempts_total{result=\"success\"}[5m])) / sum by (method) (rate(authentication_attempts_total[5m])) * 100",
            "legendFormat": "{{method}} - Success %"
          }
        ]
      }
    ]
  }
}

Alerting Strategy

Alert Rules Configuration

Critical Alerts

# prometheus-alerts.yml
groups:
  # Critical rules: conditions that need immediate attention.
  - name: know-foolery-critical
    rules:
      # Share of requests answered with a 5xx status, per service.
      # Both sides are aggregated by service: dividing the raw series would
      # pair every 5xx series with itself and always yield a ratio of 1.
      - alert: HighErrorRate
        expr: sum by (service) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by (service) (rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "High error rate detected in {{ $labels.service }}"
          description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}"

      - alert: DatabaseConnectionFailure
        expr: db_connections_active == 0
        for: 1m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "Database connections dropped to zero"
          description: "Service {{ $labels.service }} has no active database connections"

      - alert: AuthenticationSystemDown
        expr: up{service="zitadel"} == 0
        for: 1m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "Authentication system is down"
          description: "Zitadel authentication service is unreachable"

      # Fires when more than twice as many games start as complete.
      # Each counter is summed first so the comparison does not depend on the
      # two metrics carrying identical label sets.
      - alert: GameSessionsStuck
        expr: sum(increase(games_started_total[5m])) > sum(increase(games_completed_total[5m])) * 2
        for: 5m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "Game sessions not completing"
          description: "Many games are starting but not completing normally"

  # Warning rules: degradations worth investigating.
  - name: know-foolery-warning
    rules:
      # 95th percentile request latency per service. Buckets are summed by
      # (le, service) so the quantile covers all series of a service and the
      # service label used in the description is always present.
      - alert: HighLatency
        expr: histogram_quantile(0.95, sum by (le, service) (rate(http_request_duration_seconds_bucket[5m]))) > 1.0
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High API latency detected"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.service }}"

      # Normally completed games as a share of started games. Both sides are
      # summed because the numerator carries a completion_type label, so the
      # raw series would not match the denominator's label set.
      - alert: LowGameCompletionRate
        expr: sum(rate(games_completed_total{completion_type="normal"}[1h])) / sum(rate(games_started_total[1h])) < 0.7
        for: 10m
        labels:
          severity: warning
          team: product
        annotations:
          summary: "Low game completion rate"
          description: "Only {{ $value | humanizePercentage }} of games are being completed normally"

      # Hint requests per question asked, aggregated across all series.
      - alert: HighHintUsage
        expr: sum(rate(hints_requested_total[1h])) / sum(rate(questions_asked_total[1h])) > 0.8
        for: 15m
        labels:
          severity: warning
          team: product
        annotations:
          summary: "Unusually high hint usage"
          description: "{{ $value | humanizePercentage }} of questions are requesting hints"

  # Security rules: signs of credential attacks or automated play.
  - name: know-foolery-security
    rules:
      - alert: HighAuthenticationFailures
        expr: rate(authentication_attempts_total{result="failure"}[5m]) > 10
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "High authentication failure rate"
          description: "{{ $value }} authentication failures per second"

      # Per-user answer submission rate. "by (...)" is only valid on an
      # aggregation operator, so the rate is wrapped in sum by (user_id).
      - alert: SuspiciousUserActivity
        expr: sum by (user_id) (rate(answers_submitted_total[1m])) > 5
        for: 1m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Suspicious user activity detected"
          description: "User {{ $labels.user_id }} is submitting answers at {{ $value }}/second"

Alert Routing and Escalation

# alertmanager.yml
global:
  # Placeholder webhook URL; replace with the real Slack webhook.
  slack_api_url: 'https://hooks.slack.com/services/...'

# Routing tree: alerts are grouped by alert name and service and fall back to
# the 'default' receiver when no child route matches.
route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
  - match:
      severity: critical
    receiver: 'critical-alerts'
    group_wait: 0s
    # Keep evaluating the sibling routes below: without this, a critical alert
    # owned by the security or product team would stop here and never reach
    # that team's channel.
    continue: true
  - match:
      team: security
    receiver: 'security-team'
  - match:
      team: product
    receiver: 'product-team'

# Notification targets referenced by the routing tree.
# Message templates are double-quoted so that \n is a real line break (inside
# single quotes YAML keeps it as a literal backslash and "n"), and each alert
# in a group ends with a newline so grouped alerts do not run together.
receivers:
- name: 'default'
  slack_configs:
  - channel: '#alerts'
    title: 'Know Foolery Alert'
    text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"

- name: 'critical-alerts'
  slack_configs:
  - channel: '#critical-alerts'
    title: 'CRITICAL: Know Foolery'
    text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"
  pagerduty_configs:
  # Placeholder integration key; replace with the real PagerDuty key.
  - service_key: 'your-pagerduty-key'
    description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

- name: 'security-team'
  slack_configs:
  - channel: '#security-alerts'
    title: 'Security Alert: Know Foolery'
    text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"

- name: 'product-team'
  slack_configs:
  - channel: '#product-alerts'
    title: 'Product Alert: Know Foolery'
    text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"

This comprehensive observability strategy ensures that Know Foolery has full visibility into its performance, user behavior, and system health, enabling proactive issue resolution and data-driven product improvements.