You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

40 KiB

Raw Blame History

Know Foolery - Detailed Observability Implementation Guidelines

Metrics Strategy

Application Metrics Collection

1. Business Metrics (Game-Specific)

// Business metrics for game insights
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

// Package-level business collectors. Each one is created through promauto,
// and is updated by the GameMetrics Record* methods further down.
var (
    // Game session metrics

    // gamesStarted counts started games, labelled by player_type and platform.
    gamesStarted = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "games_started_total",
            Help: "Total number of games started",
        },
        []string{"player_type", "platform"},
    )
    
    // gamesCompleted counts finished games, labelled by completion_type and platform.
    gamesCompleted = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "games_completed_total",
            Help: "Total number of games completed",
        },
        []string{"completion_type", "platform"}, // normal, timeout, abandoned
    )
    
    // sessionDuration is a histogram of session length in seconds, labelled by
    // completion_type. Bucket upper bounds run from 60s to 1800s.
    sessionDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "game_session_duration_seconds",
            Help:    "Duration of game sessions",
            Buckets: []float64{60, 300, 600, 900, 1200, 1500, 1800}, // 1min to 30min
        },
        []string{"completion_type"},
    )
    
    // Question and answer metrics

    // questionsAsked counts questions shown, labelled by theme and difficulty.
    questionsAsked = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "questions_asked_total",
            Help: "Total number of questions asked",
        },
        []string{"theme", "difficulty"},
    )
    
    // answersSubmitted counts submitted answers. RecordAnswerSubmitted fills
    // is_correct and used_hint with "true"/"false" and attempt_number with the
    // decimal attempt index.
    // NOTE(review): attempt_number takes one label value per distinct attempt
    // count; confirm callers keep that number small.
    answersSubmitted = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "answers_submitted_total",
            Help: "Total number of answers submitted",
        },
        []string{"theme", "is_correct", "attempt_number", "used_hint"},
    )
    
    // hintsRequested counts hint requests, labelled by theme and question_difficulty.
    // NOTE(review): no Record* method in this file increments it.
    hintsRequested = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "hints_requested_total",
            Help: "Total number of hints requested",
        },
        []string{"theme", "question_difficulty"},
    )
    
    // Score distribution

    // scoreDistribution is a histogram of final scores. Its single label is the
    // string returned by GameMetrics.getDurationBucket.
    scoreDistribution = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "game_scores",
            Help:    "Distribution of game scores",
            Buckets: []float64{0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100},
        },
        []string{"session_duration_bucket"},
    )
    
    // Leaderboard metrics

    // leaderboardUpdates is an unlabelled counter of leaderboard updates.
    leaderboardUpdates = promauto.NewCounter(
        prometheus.CounterOpts{
            Name: "leaderboard_updates_total",
            Help: "Total number of leaderboard updates",
        },
    )
    
    // topScoreChanges counts changes in the top scores, labelled by position.
    topScoreChanges = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "top_score_changes_total",
            Help: "Changes in top 10 scores",
        },
        []string{"position"}, // top_1, top_5, top_10
    )
)

// GameMetrics is the business metrics collection service. Its Record* methods
// update the package-level collectors of this package.
type GameMetrics struct {
    // registry is set to prometheus.DefaultRegisterer by NewGameMetrics.
    // NOTE(review): none of the methods visible in this file read it.
    registry prometheus.Registerer
}

// NewGameMetrics returns a GameMetrics whose registry is
// prometheus.DefaultRegisterer.
func NewGameMetrics() *GameMetrics {
    m := new(GameMetrics)
    m.registry = prometheus.DefaultRegisterer
    return m
}

// RecordGameStart increments games_started_total for the given player type
// and platform.
func (m *GameMetrics) RecordGameStart(playerType, platform string) {
    started := gamesStarted.WithLabelValues(playerType, platform)
    started.Inc()
}

// RecordGameCompletion increments games_completed_total for the completion
// type and platform, and observes the session length (in seconds) in
// game_session_duration_seconds under the same completion type.
func (m *GameMetrics) RecordGameCompletion(completionType, platform string, duration time.Duration) {
    completed := gamesCompleted.WithLabelValues(completionType, platform)
    completed.Inc()

    lengths := sessionDuration.WithLabelValues(completionType)
    lengths.Observe(duration.Seconds())
}

// RecordQuestionAsked increments questions_asked_total for the given theme
// and difficulty.
func (m *GameMetrics) RecordQuestionAsked(theme, difficulty string) {
    asked := questionsAsked.WithLabelValues(theme, difficulty)
    asked.Inc()
}

// RecordAnswerSubmitted increments answers_submitted_total. The booleans are
// rendered as "true"/"false" and the attempt number as a decimal string, in
// the label order theme, is_correct, attempt_number, used_hint.
func (m *GameMetrics) RecordAnswerSubmitted(theme string, isCorrect bool, attemptNum int, usedHint bool) {
    labelValues := []string{
        theme,
        strconv.FormatBool(isCorrect),
        strconv.Itoa(attemptNum),
        strconv.FormatBool(usedHint),
    }
    answersSubmitted.WithLabelValues(labelValues...).Inc()
}

// RecordFinalScore observes score in the game_scores histogram, labelled with
// the duration bucket that getDurationBucket returns for sessionDuration.
// The sessionDuration parameter shadows the package-level histogram of the
// same name inside this method.
func (m *GameMetrics) RecordFinalScore(score int, sessionDuration time.Duration) {
    observer := scoreDistribution.WithLabelValues(m.getDurationBucket(sessionDuration))
    observer.Observe(float64(score))
}

// getDurationBucket maps a session duration onto the value used for the
// session_duration_bucket label of the score histogram.
//
// The duration is compared directly against the bucket limits. Truncating to
// whole minutes first would file a 5m59s session under "0-5min" and a 15m30s
// session under "5-15min".
//
// Upper bounds are inclusive: exactly 5m is "0-5min". Anything longer than
// 25m, including sessions beyond 30m, is reported as "25-30min".
func (m *GameMetrics) getDurationBucket(duration time.Duration) string {
    switch {
    case duration <= 5*time.Minute:
        return "0-5min"
    case duration <= 15*time.Minute:
        return "5-15min"
    case duration <= 25*time.Minute:
        return "15-25min"
    default:
        return "25-30min"
    }
}

2. Technical Metrics (Infrastructure)

// Technical metrics for system health. All collectors are created through
// promauto at package scope.
var (
    // HTTP metrics

    // httpRequestsTotal counts HTTP requests per method, endpoint, status_code
    // and service. PrometheusMiddleware increments it once per request.
    httpRequestsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "http_requests_total",
            Help: "Total number of HTTP requests",
        },
        []string{"method", "endpoint", "status_code", "service"},
    )
    
    // httpRequestDuration is a histogram of request duration in seconds using
    // the Prometheus default buckets.
    httpRequestDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "http_request_duration_seconds",
            Help:    "HTTP request duration",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "endpoint", "service"},
    )
    
    // Database metrics

    // dbConnectionsActive is a gauge of active connections per database and service.
    dbConnectionsActive = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "db_connections_active",
            Help: "Number of active database connections",
        },
        []string{"database", "service"},
    )
    
    // dbQueryDuration is a histogram of query duration in seconds, with bucket
    // upper bounds from 1ms to 5s. MetricsHook observes it for Ent mutations.
    dbQueryDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "db_query_duration_seconds",
            Help:    "Database query duration",
            Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
        },
        []string{"query_type", "table", "service"},
    )
    
    // dbErrors counts database errors per error_type and service.
    dbErrors = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "db_errors_total",
            Help: "Total number of database errors",
        },
        []string{"error_type", "service"},
    )
    
    // Cache metrics

    // cacheOperations counts cache operations per operation, result and service.
    cacheOperations = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "cache_operations_total",
            Help: "Total number of cache operations",
        },
        []string{"operation", "result", "service"}, // get/set/delete, hit/miss/error
    )
    
    // cacheKeyCount is a gauge of the number of keys per cache_type and service.
    // NOTE(review): the exported name ends in "_total" although this is a
    // gauge; Prometheus naming guidance reserves that suffix for counters.
    // Renaming would affect existing dashboards, so it is only flagged here.
    cacheKeyCount = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "cache_keys_total",
            Help: "Number of keys in cache",
        },
        []string{"cache_type", "service"},
    )
    
    // Authentication metrics

    // authenticationAttempts counts authentication attempts per method, result
    // and user_type.
    authenticationAttempts = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "authentication_attempts_total",
            Help: "Total authentication attempts",
        },
        []string{"method", "result", "user_type"}, // jwt/oauth, success/failure, player/admin
    )
    
    // tokenOperations counts JWT token operations per operation and result.
    tokenOperations = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "token_operations_total",
            Help: "JWT token operations",
        },
        []string{"operation", "result"}, // validate/refresh, success/failure
    )
)

// PrometheusMiddleware returns a Fiber handler that records one
// http_requests_total increment and one http_request_duration_seconds
// observation per request, labelled with serviceName.
//
// The endpoint label is the matched route pattern (c.Route().Path), read
// after the downstream handlers have run, so path parameters do not create
// one series per concrete URL.
//
// When the downstream chain returns an error, Fiber has not yet run its
// ErrorHandler, so the response still carries whatever status was set before
// the failure (200 by default). In that case the status label is derived from
// the error instead: the Code of a *fiber.Error, otherwise 500. This mirrors
// Fiber's default ErrorHandler; a custom ErrorHandler that maps errors
// differently is not reflected here.
//
// The error from the chain is returned unchanged.
func PrometheusMiddleware(serviceName string) fiber.Handler {
    return func(c *fiber.Ctx) error {
        start := time.Now()

        // Process request
        err := c.Next()

        elapsed := time.Since(start).Seconds()
        method := c.Method()
        route := c.Route().Path

        status := c.Response().StatusCode()
        if err != nil {
            status = fiber.StatusInternalServerError
            if fiberErr, ok := err.(*fiber.Error); ok {
                status = fiberErr.Code
            }
        }

        httpRequestsTotal.WithLabelValues(method, route, strconv.Itoa(status), serviceName).Inc()
        httpRequestDuration.WithLabelValues(method, route, serviceName).Observe(elapsed)

        return err
    }
}

// MetricsHook is the database metrics middleware for Ent. Its Hook method
// builds an ent.Hook that reports mutation timings and errors.
type MetricsHook struct {
    // serviceName is used as the "service" label value on the recorded metrics.
    serviceName string
}

// NewMetricsHook returns a MetricsHook that labels its metrics with
// serviceName.
func NewMetricsHook(serviceName string) *MetricsHook {
    h := new(MetricsHook)
    h.serviceName = serviceName
    return h
}

// Hook returns an ent.Hook that wraps create, update and delete mutations
// (single and bulk variants). For each mutation it observes the elapsed time
// in db_query_duration_seconds, labelled with the lower-cased operation name,
// the mutation type and the service name, and increments db_errors_total with
// error_type "query_error" when the mutation fails. The mutation's result and
// error are passed through unchanged.
func (h *MetricsHook) Hook() ent.Hook {
    instrument := func(next ent.Mutator) ent.Mutator {
        timed := func(ctx context.Context, m ent.Mutation) (ent.Value, error) {
            begin := time.Now()
            value, err := next.Mutate(ctx, m)
            elapsed := time.Since(begin).Seconds()

            op := strings.ToLower(m.Op().String())
            dbQueryDuration.WithLabelValues(op, m.Type(), h.serviceName).Observe(elapsed)

            if err != nil {
                dbErrors.WithLabelValues("query_error", h.serviceName).Inc()
            }
            return value, err
        }
        return ent.MutateFunc(timed)
    }

    mutationOps := ent.OpCreate | ent.OpUpdate | ent.OpUpdateOne | ent.OpDelete | ent.OpDeleteOne
    return hook.On(instrument, mutationOps)
}

Frontend Metrics Collection

Web Application Metrics

// Frontend metrics collection
//
// MetricsCollector buffers metric events in memory and POSTs them to
// `endpoint` as JSON (`{ metrics: [...] }`) when the buffer reaches
// `batchSize`, every `flushInterval` milliseconds, and when the page unloads.
// Constructing one starts the flush timer, registers performance observers
// and wraps `window.fetch`, so create a single instance per page.
class MetricsCollector {
  private endpoint: string
  private batchSize: number = 50
  private flushInterval: number = 30000 // 30 seconds
  // Upper bound on buffered events, so an unreachable endpoint cannot grow the
  // queue without limit. The oldest events are dropped first.
  private maxQueueSize: number = 1000
  private metrics: MetricEvent[] = []
  // Running layout-shift total. It lives on the instance because the observer
  // callback fires once per batch of entries, not once per page.
  private layoutShiftScore: number = 0
  // True while an upload is in flight, so overlapping flushes are not started.
  private flushInProgress: boolean = false
  // The fetch that was installed before interceptFetch() wrapped window.fetch.
  // The collector uploads through it so that sending metrics does not itself
  // produce api_request_duration metrics (which would never let the queue drain).
  private nativeFetch: typeof fetch = window.fetch.bind(window)

  constructor(endpoint: string) {
    this.endpoint = endpoint
    this.startBatchFlush()
    this.setupPerformanceObserver()
  }

  // User interaction metrics
  trackUserAction(action: string, properties: Record<string, any> = {}): void {
    this.addMetric({
      type: 'user_action',
      action,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      user_agent: navigator.userAgent,
      ...properties,
    })
  }

  // Game-specific metrics
  trackGameEvent(event: GameEvent): void {
    this.addMetric({
      type: 'game_event',
      event: event.type,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      game_session_id: event.gameSessionId,
      properties: event.properties,
    })
  }

  // Performance metrics. `name` and `value` are required; any further fields
  // on `metric` (for example request_url and status from the fetch wrapper)
  // are forwarded as-is. `url` is always the current page path.
  trackPerformance(metric: PerformanceMetric): void {
    const { name, value, ...extra } = metric as PerformanceMetric & Record<string, any>
    this.addMetric({
      ...extra,
      type: 'performance',
      metric: name,
      value,
      timestamp: Date.now(),
      url: window.location.pathname,
    })
  }

  // Error tracking
  trackError(error: Error, context: string): void {
    this.addMetric({
      type: 'error',
      error_message: error.message,
      error_stack: error.stack,
      context,
      timestamp: Date.now(),
      url: window.location.pathname,
      user_agent: navigator.userAgent,
    })
  }

  private setupPerformanceObserver(): void {
    // Web Vitals tracking
    if ('PerformanceObserver' in window) {
      // Largest Contentful Paint
      this.observeEntries('largest-contentful-paint', (entries) => {
        entries.forEach((entry) => {
          this.trackPerformance({
            name: 'largest_contentful_paint',
            value: entry.startTime,
          })
        })
      })

      // First Input Delay
      this.observeEntries('first-input', (entries) => {
        entries.forEach((entry) => {
          const timing = entry as PerformanceEventTiming
          this.trackPerformance({
            name: 'first_input_delay',
            value: timing.processingStart - timing.startTime,
          })
        })
      })

      // Cumulative Layout Shift: add each batch to the running total so the
      // reported value is cumulative rather than the score of the last batch.
      this.observeEntries('layout-shift', (entries) => {
        entries.forEach((entry) => {
          const shift = entry as PerformanceEntry & { hadRecentInput: boolean; value: number }
          if (!shift.hadRecentInput) {
            this.layoutShiftScore += shift.value
          }
        })
        this.trackPerformance({
          name: 'cumulative_layout_shift',
          value: this.layoutShiftScore,
        })
      })
    }

    // API response time tracking
    this.interceptFetch()
  }

  // Registers a PerformanceObserver for one entry type. A browser that does
  // not implement the type may reject observe(); in that case the metric is
  // skipped instead of aborting construction of the collector.
  private observeEntries(entryType: string, onEntries: (entries: PerformanceEntry[]) => void): void {
    try {
      new PerformanceObserver((list) => onEntries(list.getEntries())).observe({
        entryTypes: [entryType],
      })
    } catch (error) {
      console.warn(`Performance entry type not observable: ${entryType}`, error)
    }
  }

  // Wraps window.fetch to time every request made by the page. The response
  // (or the thrown error) is passed through to the caller unchanged.
  private interceptFetch(): void {
    const originalFetch = this.nativeFetch
    window.fetch = async (...args: Parameters<typeof fetch>) => {
      const start = performance.now()
      const target = args[0]
      // A Request object stringifies to "[object Request]", so read its url.
      const url = target instanceof Request ? target.url : String(target)

      try {
        const response = await originalFetch(...args)
        const duration = performance.now() - start

        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          request_url: url,
          status: response.status,
        })

        return response
      } catch (error) {
        const duration = performance.now() - start

        this.trackError(error as Error, `API request to ${url}`)
        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          request_url: url,
          status: 0,
        })

        throw error
      }
    }
  }

  private addMetric(metric: MetricEvent): void {
    this.metrics.push(metric)

    if (this.metrics.length >= this.batchSize) {
      this.flush()
    }
  }

  // Uploads the buffered events. On a network failure or a non-2xx response
  // the batch is put back at the front of the queue for the next attempt.
  private async flush(): Promise<void> {
    if (this.flushInProgress || this.metrics.length === 0) return

    const batch = this.metrics
    this.metrics = []
    this.flushInProgress = true

    try {
      const response = await this.nativeFetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ metrics: batch }),
      })
      // fetch only rejects on network errors; treat HTTP errors as failures too.
      if (!response.ok) {
        throw new Error(`Metrics endpoint responded with HTTP ${response.status}`)
      }
    } catch (error) {
      console.error('Failed to send metrics:', error)
      // Re-queue metrics for retry
      this.requeue(batch)
    } finally {
      this.flushInProgress = false
    }
  }

  // Puts a failed batch back ahead of newer events, keeping at most
  // maxQueueSize events (the newest ones).
  private requeue(batch: MetricEvent[]): void {
    const merged = batch.concat(this.metrics)
    const overflow = merged.length - this.maxQueueSize
    this.metrics = overflow > 0 ? merged.slice(overflow) : merged
  }

  // Hands the remaining events to the browser during unload. sendBeacon is
  // used when available because a regular fetch started during unload may be
  // cancelled; otherwise this falls back to flush().
  private flushOnUnload(): void {
    if (this.metrics.length === 0) return

    if (typeof navigator.sendBeacon === 'function') {
      const payload = new Blob([JSON.stringify({ metrics: this.metrics })], {
        type: 'application/json',
      })
      if (navigator.sendBeacon(this.endpoint, payload)) {
        this.metrics = []
        return
      }
    }
    this.flush()
  }

  private startBatchFlush(): void {
    setInterval(() => {
      this.flush()
    }, this.flushInterval)

    // Flush on page unload
    window.addEventListener('beforeunload', () => {
      this.flushOnUnload()
    })
  }

  private getSessionId(): string {
    // Implementation depends on session management
    return sessionStorage.getItem('session_id') || 'anonymous'
  }
}

// Game-specific metrics tracking
//
// GameMetricsTracker turns game lifecycle calls into game events on a
// MetricsCollector. Every method forwards exactly one event through
// collector.trackGameEvent.
export class GameMetricsTracker {
  private collector: MetricsCollector

  constructor(collector: MetricsCollector) {
    this.collector = collector
  }

  // Emits 'game_started' with the player name and detected platform.
  trackGameStart(gameSessionId: string, playerName: string): void {
    this.emit('game_started', gameSessionId, {
      player_name: playerName,
      platform: this.getPlatform(),
    })
  }

  // Emits 'question_displayed'; display_time is the current epoch time in ms.
  trackQuestionDisplayed(gameSessionId: string, questionId: string, theme: string): void {
    this.emit('question_displayed', gameSessionId, {
      question_id: questionId,
      theme,
      display_time: Date.now(),
    })
  }

  // Emits 'answer_submitted'; timeTaken is forwarded as time_taken_ms.
  trackAnswerSubmitted(
    gameSessionId: string,
    questionId: string,
    isCorrect: boolean,
    attemptNumber: number,
    timeTaken: number,
    usedHint: boolean
  ): void {
    this.emit('answer_submitted', gameSessionId, {
      question_id: questionId,
      is_correct: isCorrect,
      attempt_number: attemptNumber,
      time_taken_ms: timeTaken,
      used_hint: usedHint,
    })
  }

  // Emits 'hint_requested'; request_time is the current epoch time in ms.
  trackHintRequested(gameSessionId: string, questionId: string): void {
    this.emit('hint_requested', gameSessionId, {
      question_id: questionId,
      request_time: Date.now(),
    })
  }

  // Emits 'game_completed' with the final score, answer count, completion
  // type and detected platform.
  trackGameCompleted(
    gameSessionId: string,
    finalScore: number,
    questionsAnswered: number,
    completionType: 'normal' | 'timeout' | 'abandoned'
  ): void {
    this.emit('game_completed', gameSessionId, {
      final_score: finalScore,
      questions_answered: questionsAnswered,
      completion_type: completionType,
      platform: this.getPlatform(),
    })
  }

  // Single hand-off point to the collector shared by all track* methods.
  private emit(
    type: GameEvent['type'],
    gameSessionId: string,
    properties: GameEvent['properties']
  ): void {
    this.collector.trackGameEvent({ type, gameSessionId, properties })
  }

  // Detect platform: user-agent checks first, then the Wails global.
  private getPlatform(): string {
    const userAgent = navigator.userAgent
    if (/Android/i.test(userAgent)) return 'android'
    if (/iPhone|iPad|iPod/i.test(userAgent)) return 'ios'
    return window.wails ? 'desktop' : 'web' // window.wails is set in Wails apps
  }
}

// Usage in React components

// Builds the collector, the game tracker and the bound tracking functions.
const createGameMetricsApi = () => {
  const collector = new MetricsCollector('/api/v1/metrics')
  const gameTracker = new GameMetricsTracker(collector)

  return {
    trackGameStart: gameTracker.trackGameStart.bind(gameTracker),
    trackQuestionDisplayed: gameTracker.trackQuestionDisplayed.bind(gameTracker),
    trackAnswerSubmitted: gameTracker.trackAnswerSubmitted.bind(gameTracker),
    trackHintRequested: gameTracker.trackHintRequested.bind(gameTracker),
    trackGameCompleted: gameTracker.trackGameCompleted.bind(gameTracker),
    trackUserAction: collector.trackUserAction.bind(collector),
    trackError: collector.trackError.bind(collector),
  }
}

// Page-wide instance, created on first use. Constructing a MetricsCollector
// starts an interval timer, adds a window listener and wraps window.fetch, so
// it must not happen per render or per component. (Passing
// `new MetricsCollector(...)` as the useRef initial value would run the
// constructor on every render, even though only the first result is kept.)
let sharedGameMetricsApi: ReturnType<typeof createGameMetricsApi> | null = null

// Returns the tracking functions. The returned object and its functions keep
// the same identity across renders and components, so they are safe to list
// in hook dependency arrays.
export const useGameMetrics = () => {
  if (sharedGameMetricsApi === null) {
    sharedGameMetricsApi = createGameMetricsApi()
  }
  return sharedGameMetricsApi
}

Distributed Tracing

OpenTelemetry Integration

Backend Tracing Setup

// OpenTelemetry tracing setup
package observability

import (
    "context"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/exporters/jaeger"
    "go.opentelemetry.io/otel/sdk/resource"
    "go.opentelemetry.io/otel/sdk/trace"
    "go.opentelemetry.io/otel/semconv/v1.12.0/httpconv"
    "go.opentelemetry.io/otel/semconv/v1.12.0/netconv"
)

// TracingConfig carries the settings InitTracing needs: the service identity
// attached to the trace resource, the Jaeger collector endpoint, and the
// ratio passed to the trace-ID based sampler.
type TracingConfig struct {
    ServiceName, ServiceVersion, Environment, JaegerEndpoint string

    SampleRate float64
}

// InitTracing builds a tracer provider that exports spans to the Jaeger
// collector at config.JaegerEndpoint, installs it as the global provider via
// otel.SetTracerProvider and returns it. The caller owns the provider and
// should shut it down on exit so batched spans are exported.
//
// Sampling: root spans are sampled by trace ID at config.SampleRate. Spans
// with a parent follow the parent's sampled flag (trace.ParentBased), so a
// trace that an upstream service decided to record is not cut short here.
// A SampleRate of 0 therefore records only traces that were already sampled
// upstream.
//
// An error from creating the exporter or the resource is returned as-is.
func InitTracing(config TracingConfig) (*trace.TracerProvider, error) {
    // Create Jaeger exporter
    jaegerExporter, err := jaeger.New(
        jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(config.JaegerEndpoint)),
    )
    if err != nil {
        return nil, err
    }

    // Create resource with service information
    res, err := resource.New(
        context.Background(),
        resource.WithAttributes(
            attribute.String("service.name", config.ServiceName),
            attribute.String("service.version", config.ServiceVersion),
            attribute.String("environment", config.Environment),
        ),
    )
    if err != nil {
        // The exporter was never handed to a provider, so release it here.
        // Its shutdown error is dropped in favour of the resource error.
        _ = jaegerExporter.Shutdown(context.Background())
        return nil, err
    }

    // Create tracer provider
    sampler := trace.ParentBased(trace.TraceIDRatioBased(config.SampleRate))
    tp := trace.NewTracerProvider(
        trace.WithBatcher(jaegerExporter),
        trace.WithResource(res),
        trace.WithSampler(sampler),
    )

    // Set global tracer provider
    otel.SetTracerProvider(tp)

    return tp, nil
}

// TracingMiddleware returns a Fiber handler that wraps each request in a span
// from the tracer named serviceName and stores the span context with
// c.SetUserContext, so downstream handlers can create child spans from
// c.UserContext().
//
// The span is started from c.UserContext() so that values placed there by
// earlier middleware are kept.
//
// The span is first named after the HTTP method only and renamed to
// "<METHOD> <route pattern>" once the chain has run: before c.Next(),
// c.Route() still refers to the route this middleware is mounted on, not the
// handler that will serve the request.
//
// When the chain returns an error, Fiber's ErrorHandler has not yet written
// the response status, so the recorded status code is taken from the error
// (the Code of a *fiber.Error, otherwise 500) and the error is recorded on
// the span. The error is returned unchanged.
func TracingMiddleware(serviceName string) fiber.Handler {
    tracer := otel.Tracer(serviceName)

    return func(c *fiber.Ctx) error {
        // Start span
        ctx, span := tracer.Start(c.UserContext(), c.Method())
        defer span.End()

        // Set span attributes
        span.SetAttributes(
            httpconv.HTTPMethodKey.String(c.Method()),
            httpconv.HTTPURLKey.String(string(c.Request().URI().FullURI())),
            httpconv.HTTPUserAgentKey.String(string(c.Request().Header.UserAgent())),
            netconv.NetPeerIPKey.String(c.IP()),
        )

        // Add to context
        c.SetUserContext(ctx)

        // Process request
        err := c.Next()

        // The matched route is only known now.
        span.SetName(fmt.Sprintf("%s %s", c.Method(), c.Route().Path))

        // Set response attributes
        status := c.Response().StatusCode()
        if err != nil {
            status = fiber.StatusInternalServerError
            if fiberErr, ok := err.(*fiber.Error); ok {
                status = fiberErr.Code
            }
            span.RecordError(err)
        }
        span.SetAttributes(
            httpconv.HTTPStatusCodeKey.Int(status),
        )

        return err
    }
}

// TraceServiceOperation runs fn inside a span named operation, created from
// the tracer named serviceName. fn receives the context that carries the new
// span. If fn fails, the error is recorded on the span, the span gets the
// attribute error=true, and the same error is returned.
func TraceServiceOperation(ctx context.Context, serviceName, operation string, fn func(context.Context) error) error {
    spanCtx, span := otel.Tracer(serviceName).Start(ctx, operation)
    defer span.End()

    if err := fn(spanCtx); err != nil {
        span.RecordError(err)
        span.SetAttributes(attribute.Bool("error", true))
        return err
    }
    return nil
}

// TraceDatabaseOperation runs fn inside a span named "db.<operation>.<table>"
// from the "database" tracer. The span carries db.operation, db.table and
// db.system ("postgresql") attributes. fn receives the context that carries
// the span; an error from fn is recorded on the span and returned.
func TraceDatabaseOperation(ctx context.Context, operation, table string, fn func(context.Context) error) error {
    spanName := fmt.Sprintf("db.%s.%s", operation, table)
    spanCtx, span := otel.Tracer("database").Start(ctx, spanName)
    defer span.End()

    dbAttrs := []attribute.KeyValue{
        attribute.String("db.operation", operation),
        attribute.String("db.table", table),
        attribute.String("db.system", "postgresql"),
    }
    span.SetAttributes(dbAttrs...)

    if err := fn(spanCtx); err != nil {
        span.RecordError(err)
        return err
    }
    return nil
}

Frontend Tracing Integration

// Frontend tracing with OpenTelemetry
import { WebTracerProvider } from '@opentelemetry/sdk-trace-web'
import { getWebAutoInstrumentations } from '@opentelemetry/auto-instrumentations-web'
import { JaegerExporter } from '@opentelemetry/exporter-jaeger'
import { registerInstrumentations } from '@opentelemetry/instrumentation'

// FrontendTracing sets up a WebTracerProvider that exports spans in batches
// to config.jaegerEndpoint, registers it globally and enables the document
// load, user interaction and fetch auto-instrumentations.
//
// NOTE(review): Resource, BatchSpanProcessor, trace and SpanStatusCode are
// used below but are not among the imports shown with this snippet.
// NOTE(review): confirm that @opentelemetry/exporter-jaeger can run in a
// browser bundle; it may be intended for Node.js only.
export class FrontendTracing {
  private provider: WebTracerProvider

  constructor(config: TracingConfig) {
    this.provider = new WebTracerProvider({
      resource: new Resource({
        'service.name': config.serviceName,
        'service.version': config.serviceVersion,
      }),
    })

    // Configure Jaeger exporter
    const jaegerExporter = new JaegerExporter({
      endpoint: config.jaegerEndpoint,
    })

    this.provider.addSpanProcessor(
      new BatchSpanProcessor(jaegerExporter)
    )

    // Register provider
    this.provider.register()

    // apiBaseUrl is a literal URL, not a pattern: escape regex metacharacters
    // so that, for example, the dots in a host name only match dots. Without
    // this, trace headers could be attached to requests for look-alike hosts.
    const apiUrlPattern = new RegExp(
      config.apiBaseUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
    )

    // Auto-instrument browser APIs
    registerInstrumentations({
      instrumentations: [
        getWebAutoInstrumentations({
          '@opentelemetry/instrumentation-document-load': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-user-interaction': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-fetch': {
            enabled: true,
            propagateTraceHeaderCorsUrls: [apiUrlPattern],
          },
        }),
      ],
    })
  }

  // Game-specific tracing: runs fn inside an active span named `action` from
  // the 'game-frontend' tracer, with `properties` as span attributes. If fn
  // rejects, the exception is recorded, the span status is set to ERROR and
  // the error is re-thrown. The span is always ended.
  traceGameAction(action: string, properties: Record<string, any>, fn: () => Promise<void>): Promise<void> {
    const tracer = trace.getTracer('game-frontend')

    return tracer.startActiveSpan(action, async (span) => {
      try {
        // Set span attributes
        span.setAttributes(properties)

        await fn()
      } catch (error) {
        span.recordException(error as Error)
        span.setStatus({ code: SpanStatusCode.ERROR })
        throw error
      } finally {
        span.end()
      }
    })
  }
}

// React hook for tracing
//
// useTracing returns traceUserAction, which runs fn inside an active span
// named `user.<action>` from the 'react-components' tracer, with `properties`
// as span attributes. If fn rejects, the exception is recorded, the span
// status is set to ERROR (matching FrontendTracing.traceGameAction) and the
// error is re-thrown. The span is always ended.
export const useTracing = () => {
  const tracer = trace.getTracer('react-components')

  const traceUserAction = useCallback(
    async (action: string, properties: Record<string, any>, fn: () => Promise<void>) => {
      return tracer.startActiveSpan(`user.${action}`, async (span) => {
        try {
          span.setAttributes(properties)
          await fn()
        } catch (error) {
          span.recordException(error as Error)
          // Without an explicit status the span would not be marked as failed.
          span.setStatus({ code: SpanStatusCode.ERROR })
          throw error
        } finally {
          span.end()
        }
      })
    },
    [tracer]
  )

  return { traceUserAction }
}

Logging Strategy

Structured Logging Implementation

Backend Logging

// Structured logging with zerolog
package logging

import (
    "os"
    "time"
    "github.com/rs/zerolog"
    "github.com/rs/zerolog/log"
)

// Logger wraps a zerolog.Logger and exposes one method per kind of
// structured event (game, API request, database, authentication, security,
// error). Create it with NewLogger.
type Logger struct {
    // logger is the configured zerolog logger every method writes to.
    logger zerolog.Logger
}

// LogConfig holds the settings NewLogger reads: the minimum level name, the
// environment (the value "development" selects console output), and the
// service name and version stamped on every log line.
type LogConfig struct {
    Level, Environment, ServiceName, Version string
}

// NewLogger builds a Logger from config.
//
// It sets zerolog's global level to config.Level (falling back to info when
// the name does not parse) and the global time field format to RFC3339Nano.
// In the "development" environment output goes to a console writer on stdout
// with an HH:MM:SS time format; in every other environment JSON is written to
// stdout. Each line carries a timestamp plus the service, version and
// environment fields.
func NewLogger(config LogConfig) *Logger {
    level, err := zerolog.ParseLevel(config.Level)
    if err != nil {
        level = zerolog.InfoLevel
    }
    zerolog.SetGlobalLevel(level)
    zerolog.TimeFieldFormat = time.RFC3339Nano

    var base zerolog.Logger
    switch config.Environment {
    case "development":
        // Human-readable console output for development
        console := zerolog.ConsoleWriter{
            Out:        os.Stdout,
            TimeFormat: "15:04:05",
        }
        base = zerolog.New(console)
    default:
        // JSON output for production
        base = zerolog.New(os.Stdout)
    }

    stamped := base.With().Timestamp().Logger()

    // Add service metadata
    withMeta := stamped.With().
        Str("service", config.ServiceName).
        Str("version", config.Version).
        Str("environment", config.Environment).
        Logger()

    return &Logger{logger: withMeta}
}

// Structured logging methods

// GameEvent logs an info-level line with event_type "game", the event name,
// the game session and user IDs, and the entries of properties as extra
// fields.
func (l *Logger) GameEvent(event string, gameSessionID, userID string, properties map[string]interface{}) {
    entry := l.logger.Info().Str("event_type", "game")
    entry = entry.Str("event", event)
    entry = entry.Str("game_session_id", gameSessionID).Str("user_id", userID)
    entry.Fields(properties).Msg("Game event occurred")
}

// APIRequest logs an info-level line with event_type "api_request" and the
// method, path, status code, duration (field duration_ms) and user ID of a
// processed request.
func (l *Logger) APIRequest(method, path string, statusCode int, duration time.Duration, userID string) {
    entry := l.logger.Info().Str("event_type", "api_request")
    entry = entry.Str("method", method).Str("path", path)
    entry = entry.Int("status_code", statusCode).Dur("duration_ms", duration)
    entry.Str("user_id", userID).Msg("API request processed")
}

// DatabaseOperation logs a debug-level line with event_type "database" and
// the operation, table, duration (field duration_ms) and affected row count.
func (l *Logger) DatabaseOperation(operation, table string, duration time.Duration, rowsAffected int64) {
    entry := l.logger.Debug().Str("event_type", "database")
    entry = entry.Str("operation", operation).Str("table", table)
    entry = entry.Dur("duration_ms", duration).Int64("rows_affected", rowsAffected)
    entry.Msg("Database operation completed")
}

// AuthenticationEvent logs an authentication event with event_type
// "authentication": at info level when success is true, at warn level
// otherwise. The line carries the event name, user ID, user type, the success
// flag and every entry of details as a string field.
//
// details is copied into a map[string]interface{} before it is handed to
// Fields: zerolog's Fields only understands map[string]interface{} and
// []interface{}, so a map[string]string passed directly would not be logged.
func (l *Logger) AuthenticationEvent(event, userID, userType string, success bool, details map[string]string) {
    level := zerolog.InfoLevel
    if !success {
        level = zerolog.WarnLevel
    }

    fields := make(map[string]interface{}, len(details))
    for key, value := range details {
        fields[key] = value
    }

    l.logger.WithLevel(level).
        Str("event_type", "authentication").
        Str("event", event).
        Str("user_id", userID).
        Str("user_type", userType).
        Bool("success", success).
        Fields(fields).
        Msg("Authentication event")
}

// SecurityEvent logs a warn-level line with event_type "security" and the
// event name, user ID, IP address, severity and the entries of details.
// severity is written as a field only; the log level is always warn.
func (l *Logger) SecurityEvent(event, userID, ipAddress string, severity string, details map[string]interface{}) {
    entry := l.logger.Warn().Str("event_type", "security")
    entry = entry.Str("event", event).Str("user_id", userID)
    entry = entry.Str("ip_address", ipAddress).Str("severity", severity)
    entry.Fields(details).Msg("Security event detected")
}

// Error logs err at error level together with a free-form context string and
// the entries of fields.
func (l *Logger) Error(err error, context string, fields map[string]interface{}) {
    entry := l.logger.Error().Err(err)
    entry = entry.Str("context", context)
    entry.Fields(fields).Msg("Error occurred")
}

// RequestLoggingMiddleware returns a Fiber handler that logs every request
// through logger.APIRequest after the downstream handlers have run: method,
// matched route pattern, status code, elapsed time and user ID.
//
// The user ID is read from c.Locals("user_id"). A string is used as-is, a
// value with a String() method is rendered through it, and anything else
// (including an unset local) is logged as "anonymous". An unchecked string
// assertion here would panic on a non-string ID.
//
// When the chain returns an error, Fiber's ErrorHandler has not yet written
// the response status, so the logged status is taken from the error (the Code
// of a *fiber.Error, otherwise 500). The error is returned unchanged.
func RequestLoggingMiddleware(logger *Logger) fiber.Handler {
    return func(c *fiber.Ctx) error {
        start := time.Now()

        // Process request
        err := c.Next()

        // Log request
        duration := time.Since(start)

        userID := "anonymous"
        switch id := c.Locals("user_id").(type) {
        case string:
            userID = id
        case interface{ String() string }:
            userID = id.String()
        }

        status := c.Response().StatusCode()
        if err != nil {
            status = fiber.StatusInternalServerError
            if fiberErr, ok := err.(*fiber.Error); ok {
                status = fiberErr.Code
            }
        }

        logger.APIRequest(
            c.Method(),
            c.Route().Path,
            status,
            duration,
            userID,
        )

        return err
    }
}

Frontend Logging

// Frontend structured logging
//
// LogEntry is the shape of one buffered log record in FrontendLogger.
interface LogEntry {
  // Time the entry was created; FrontendLogger.error fills it with new Date().toISOString().
  timestamp: string
  // Severity of the entry.
  level: 'debug' | 'info' | 'warn' | 'error'
  // Human-readable log message.
  message: string
  // Optional free-form area tag; the logger's helpers pass 'game', 'user' and 'performance'.
  context?: string
  // Optional identifiers attached by the logger (user, browser session, game session).
  userId?: string
  sessionId?: string
  gameSessionId?: string
  // Details copied from an Error object (name, message, stack), set only when one is supplied.
  error?: {
    name: string
    message: string
    stack?: string
  }
  // Optional caller-supplied key/value data.
  properties?: Record<string, any>
}

/**
 * Buffers structured log entries in memory and ships them to a collection
 * endpoint as JSON batches (`{ logs: LogEntry[] }`).
 *
 * A batch is sent when the buffer reaches `maxBufferSize` entries, every
 * `flushInterval` milliseconds, and (via `navigator.sendBeacon`) when the page
 * is being unloaded. Logging itself never throws: storage or network problems
 * are swallowed or reported through `console.error`.
 */
export class FrontendLogger {
  private buffer: LogEntry[] = []
  private endpoint: string
  private maxBufferSize: number = 100
  private flushInterval: number = 30000
  // Upper bound on entries re-queued after a failed delivery, so a dead
  // endpoint cannot make the buffer grow without limit.
  private maxRetryEntries: number = 50
  private flushTimer?: ReturnType<typeof setInterval>
  private unloadHandler?: () => void

  /**
   * @param endpoint URL that receives the JSON batches via POST.
   */
  constructor(endpoint: string) {
    this.endpoint = endpoint
    this.startPeriodicFlush()
  }

  debug(message: string, context?: string, properties?: Record<string, any>): void {
    this.log('debug', message, context, properties)
  }

  info(message: string, context?: string, properties?: Record<string, any>): void {
    this.log('info', message, context, properties)
  }

  warn(message: string, context?: string, properties?: Record<string, any>): void {
    this.log('warn', message, context, properties)
  }

  /**
   * Logs at error level. When an Error is supplied, its name, message and
   * stack are copied onto the entry so they survive JSON serialization.
   */
  error(message: string, error?: Error, context?: string, properties?: Record<string, any>): void {
    const entry = this.createEntry('error', message, context, properties)

    if (error) {
      entry.error = {
        name: error.name,
        message: error.message,
        stack: error.stack,
      }
    }

    this.enqueue(entry)
  }

  // Game-specific logging methods
  logGameEvent(event: string, gameSessionId: string, properties?: Record<string, any>): void {
    this.info(`Game event: ${event}`, 'game', {
      gameSessionId,
      ...properties,
    })
  }

  logUserAction(action: string, properties?: Record<string, any>): void {
    this.info(`User action: ${action}`, 'user', properties)
  }

  logPerformanceMetric(metric: string, value: number, unit: string): void {
    this.debug(`Performance metric: ${metric}`, 'performance', {
      metric,
      value,
      unit,
      // `window` does not exist outside a browser (e.g. server-side rendering).
      url: typeof window !== 'undefined' ? window.location.pathname : undefined,
    })
  }

  /**
   * Stops the periodic flush and removes the page-unload listeners. Entries
   * still in the buffer are NOT sent. Call this when the logger is discarded
   * so the interval does not keep it alive.
   */
  dispose(): void {
    if (this.flushTimer !== undefined) {
      clearInterval(this.flushTimer)
      this.flushTimer = undefined
    }

    if (this.unloadHandler && typeof window !== 'undefined') {
      window.removeEventListener('beforeunload', this.unloadHandler)
      window.removeEventListener('pagehide', this.unloadHandler)
    }
    this.unloadHandler = undefined
  }

  private log(level: LogEntry['level'], message: string, context?: string, properties?: Record<string, any>): void {
    this.enqueue(this.createEntry(level, message, context, properties))
  }

  // Builds an entry stamped with the current time and the session identifiers.
  private createEntry(
    level: LogEntry['level'],
    message: string,
    context?: string,
    properties?: Record<string, any>,
  ): LogEntry {
    return {
      timestamp: new Date().toISOString(),
      level,
      message,
      context,
      userId: this.getUserId(),
      sessionId: this.getSessionId(),
      gameSessionId: this.getGameSessionId(),
      properties,
    }
  }

  // Appends an entry and triggers a size-based flush when the buffer is full.
  private enqueue(entry: LogEntry): void {
    this.buffer.push(entry)
    this.checkFlushConditions()
  }

  private checkFlushConditions(): void {
    if (this.buffer.length >= this.maxBufferSize) {
      // Fire and forget: flush() handles its own errors.
      void this.flush()
    }
  }

  private async flush(): Promise<void> {
    if (this.buffer.length === 0) return

    // Take ownership of the current batch so entries logged while the request
    // is in flight land in a fresh buffer.
    const logs = this.buffer
    this.buffer = []

    try {
      const response = await fetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ logs }),
      })

      // fetch() only rejects on network failure; an HTTP error status still
      // resolves, so it must be checked explicitly or the batch is lost.
      if (!response.ok) {
        throw new Error(`Log endpoint responded with HTTP ${response.status}`)
      }
    } catch (error) {
      console.error('Failed to send logs:', error)
      // Re-queue logs for retry (keep only most recent to avoid memory issues)
      this.buffer.unshift(...logs.slice(-this.maxRetryEntries))
    }
  }

  private startPeriodicFlush(): void {
    this.flushTimer = setInterval(() => {
      void this.flush()
    }, this.flushInterval)

    // No page lifecycle to hook into outside a browser.
    if (typeof window === 'undefined') return

    // Flush on page unload. `pagehide` is registered in addition to
    // `beforeunload` because the latter is not reliably fired on mobile
    // browsers; the handler empties the buffer, so firing twice is harmless.
    this.unloadHandler = () => this.sendBufferedViaBeacon()
    window.addEventListener('beforeunload', this.unloadHandler)
    window.addEventListener('pagehide', this.unloadHandler)
  }

  // Hands the remaining entries to the browser for delivery during unload.
  private sendBufferedViaBeacon(): void {
    if (this.buffer.length === 0) return

    try {
      // Send as application/json so the endpoint sees the same content type
      // as the regular fetch() path (a bare string is sent as text/plain).
      const payload = new Blob([JSON.stringify({ logs: this.buffer })], {
        type: 'application/json',
      })

      // sendBeacon returns true once the browser has queued the request.
      // Clear the buffer then, otherwise a cancelled unload would resend the
      // same entries on the next flush.
      if (navigator.sendBeacon(this.endpoint, payload)) {
        this.buffer = []
      }
    } catch (error) {
      console.error('Failed to send logs:', error)
    }
  }

  // Reads a sessionStorage value, treating missing/empty values and storage
  // access errors (storage disabled, non-browser environment) as undefined.
  private readSessionValue(key: string): string | undefined {
    try {
      return sessionStorage.getItem(key) || undefined
    } catch {
      return undefined
    }
  }

  private getUserId(): string | undefined {
    // Implementation depends on auth system
    return this.readSessionValue('user_id')
  }

  private getSessionId(): string | undefined {
    return this.readSessionValue('session_id')
  }

  private getGameSessionId(): string | undefined {
    return this.readSessionValue('game_session_id')
  }
}

Monitoring Dashboards

Grafana Dashboard Configuration

1. Business Intelligence Dashboard

{
  "dashboard": {
    "title": "Know Foolery - Business Intelligence",
    "panels": [
      {
        "title": "Active Games",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(games_started_total[5m])) * 300",
            "legendFormat": "Games per 5min"
          }
        ]
      },
      {
        "title": "Game Completion Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(games_completed_total{completion_type=\"normal\"}[1h])) / sum(rate(games_started_total[1h])) * 100",
            "legendFormat": "Completion %"
          }
        ]
      },
      {
        "title": "Average Session Duration",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.5, sum by (le) (rate(game_session_duration_seconds_bucket[5m])))",
            "legendFormat": "Median"
          },
          {
            "expr": "histogram_quantile(0.95, sum by (le) (rate(game_session_duration_seconds_bucket[5m])))",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Question Accuracy by Theme",
        "type": "heatmap",
        "targets": [
          {
            "expr": "sum by (theme) (rate(answers_submitted_total{is_correct=\"true\"}[1h])) / sum by (theme) (rate(answers_submitted_total[1h]))",
            "legendFormat": "{{theme}}"
          }
        ]
      },
      {
        "title": "Hint Usage Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(hints_requested_total[5m])) / sum(rate(questions_asked_total[5m])) * 100",
            "legendFormat": "Hint Usage %"
          }
        ]
      },
      {
        "title": "Score Distribution",
        "type": "histogram",
        "targets": [
          {
            "expr": "histogram_quantile(0.25, sum by (le) (rate(game_scores_bucket[1h])))",
            "legendFormat": "25th percentile"
          },
          {
            "expr": "histogram_quantile(0.5, sum by (le) (rate(game_scores_bucket[1h])))",
            "legendFormat": "Median"
          },
          {
            "expr": "histogram_quantile(0.75, sum by (le) (rate(game_scores_bucket[1h])))",
            "legendFormat": "75th percentile"
          }
        ]
      }
    ]
  }
}

2. Technical Performance Dashboard

{
  "dashboard": {
    "title": "Know Foolery - Technical Performance",
    "panels": [
      {
        "title": "API Response Times",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, service) (rate(http_request_duration_seconds_bucket[5m])))",
            "legendFormat": "{{service}} - 95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m])) * 100",
            "legendFormat": "{{service}} - Error %"
          }
        ]
      },
      {
        "title": "Database Performance",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, service) (rate(db_query_duration_seconds_bucket[5m])))",
            "legendFormat": "{{service}} - Query Time"
          }
        ]
      },
      {
        "title": "Cache Hit Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(cache_operations_total{operation=\"get\", result=\"hit\"}[5m])) / sum(rate(cache_operations_total{operation=\"get\"}[5m])) * 100",
            "legendFormat": "Hit Rate %"
          }
        ]
      },
      {
        "title": "Authentication Success Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum by (method) (rate(authentication_attempts_total{result=\"success\"}[5m])) / sum by (method) (rate(authentication_attempts_total[5m])) * 100",
            "legendFormat": "{{method}} - Success %"
          }
        ]
      }
    ]
  }
}

Alerting Strategy

Alert Rules Configuration

Critical Alerts

# prometheus-alerts.yml
groups:
  # Alerts that require immediate attention.
  - name: know-foolery-critical
    rules:
      - alert: HighErrorRate
        # Both sides are aggregated per service before dividing. Without the
        # aggregation each 5xx series is only matched against the series with
        # the identical label set (i.e. itself), so the ratio is always 1.
        expr: sum by (service) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by (service) (rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "High error rate detected in {{ $labels.service }}"
          description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}"

      - alert: DatabaseConnectionFailure
        expr: db_connections_active == 0
        for: 1m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "Database connections dropped to zero"
          description: "Service {{ $labels.service }} has no active database connections"

      - alert: AuthenticationSystemDown
        expr: up{service="zitadel"} == 0
        for: 1m
        labels:
          severity: critical
          team: security
        annotations:
          summary: "Authentication system is down"
          description: "Zitadel authentication service is unreachable"

      - alert: GameSessionsStuck
        # The two counters carry different labels (player_type vs
        # completion_type), so they must be summed to a single value each
        # before they can be compared; otherwise no series ever match.
        expr: sum(increase(games_started_total[5m])) > sum(increase(games_completed_total[5m])) * 2
        for: 5m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "Game sessions not completing"
          description: "Many games are starting but not completing normally"

  # Degradations worth investigating that do not need to page anyone.
  - name: know-foolery-warning
    rules:
      - alert: HighLatency
        # Buckets are aggregated by (le, service) so the quantile is computed
        # per service rather than per individual label combination.
        expr: histogram_quantile(0.95, sum by (le, service) (rate(http_request_duration_seconds_bucket[5m]))) > 1.0
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High API latency detected"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.service }}"

      - alert: LowGameCompletionRate
        # games_completed_total and games_started_total have different label
        # sets; sum each side so the division has matching operands.
        expr: sum(rate(games_completed_total{completion_type="normal"}[1h])) / sum(rate(games_started_total[1h])) < 0.7
        for: 10m
        labels:
          severity: warning
          team: product
        annotations:
          summary: "Low game completion rate"
          description: "Only {{ $value | humanizePercentage }} of games are being completed normally"

      - alert: HighHintUsage
        # Summed on both sides so differing label sets cannot prevent a match.
        expr: sum(rate(hints_requested_total[1h])) / sum(rate(questions_asked_total[1h])) > 0.8
        for: 15m
        labels:
          severity: warning
          team: product
        annotations:
          summary: "Unusually high hint usage"
          description: "{{ $value | humanizePercentage }} of questions are requesting hints"

  # Signals of abuse or attacks, routed to the security team.
  - name: know-foolery-security
    rules:
      - alert: HighAuthenticationFailures
        # Summed across all series so the threshold applies to the total
        # failure rate that the description reports, not to each label
        # combination individually.
        expr: sum(rate(authentication_attempts_total{result="failure"}[5m])) > 10
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "High authentication failure rate"
          description: "{{ $value }} authentication failures per second"

      - alert: SuspiciousUserActivity
        # `by (...)` is only valid on an aggregation operator, so the rate is
        # wrapped in sum by (user_id).
        # NOTE(review): answers_submitted_total as defined in this document is
        # labelled theme/is_correct/attempt_number/used_hint only. This rule
        # yields nothing until a user_id label exists, and a per-user label is
        # high-cardinality - confirm before adding it.
        expr: sum by (user_id) (rate(answers_submitted_total[1m])) > 5
        for: 1m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Suspicious user activity detected"
          description: "User {{ $labels.user_id }} is submitting answers at {{ $value }}/second"

Alert Routing and Escalation

# alertmanager.yml
global:
  slack_api_url: 'https://hooks.slack.com/services/...'

# Routing tree: alerts are grouped by alert name and service, then matched
# against the child routes in order. Anything unmatched goes to 'default'.
route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
  - match:
      severity: critical
    receiver: 'critical-alerts'
    group_wait: 0s
    # Keep evaluating the sibling routes after this one matches. Without it,
    # routing stops here and a critical alert owned by the security or product
    # team would never reach that team's channel.
    continue: true
  - match:
      team: security
    receiver: 'security-team'
  - match:
      team: product
    receiver: 'product-team'

receivers:
- name: 'default'
  slack_configs:
  - channel: '#alerts'
    title: 'Know Foolery Alert'
    text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

- name: 'critical-alerts'
  slack_configs:
  - channel: '#critical-alerts'
    title: 'CRITICAL: Know Foolery'
    # Double-quoted so YAML turns \n into a real line break; inside single
    # quotes it would be sent as a literal backslash followed by "n".
    text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"
  pagerduty_configs:
  - service_key: 'your-pagerduty-key'
    description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

- name: 'security-team'
  slack_configs:
  - channel: '#security-alerts'
    title: 'Security Alert: Know Foolery'
    text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

- name: 'product-team'
  slack_configs:
  - channel: '#product-alerts'
    title: 'Product Alert: Know Foolery'
    text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

This comprehensive observability strategy ensures that Know Foolery has full visibility into its performance, user behavior, and system health, enabling proactive issue resolution and data-driven product improvements.

40 KiB Raw Blame History