You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1446 lines
40 KiB
Markdown
1446 lines
40 KiB
Markdown
# Know Foolery - Detailed Observability Implementation Guidelines
|
|
|
|
## Metrics Strategy
|
|
|
|
### Application Metrics Collection
|
|
|
|
#### 1. Business Metrics (Game-Specific)
|
|
```go
|
|
// Business metrics for game insights
|
|
package metrics
|
|
|
|
import (
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)
|
|
|
|
var (
|
|
// Game session metrics
|
|
gamesStarted = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "games_started_total",
|
|
Help: "Total number of games started",
|
|
},
|
|
[]string{"player_type", "platform"},
|
|
)
|
|
|
|
gamesCompleted = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "games_completed_total",
|
|
Help: "Total number of games completed",
|
|
},
|
|
[]string{"completion_type", "platform"}, // normal, timeout, abandoned
|
|
)
|
|
|
|
sessionDuration = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "game_session_duration_seconds",
|
|
Help: "Duration of game sessions",
|
|
Buckets: []float64{60, 300, 600, 900, 1200, 1500, 1800}, // 1min to 30min
|
|
},
|
|
[]string{"completion_type"},
|
|
)
|
|
|
|
// Question and answer metrics
|
|
questionsAsked = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "questions_asked_total",
|
|
Help: "Total number of questions asked",
|
|
},
|
|
[]string{"theme", "difficulty"},
|
|
)
|
|
|
|
answersSubmitted = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "answers_submitted_total",
|
|
Help: "Total number of answers submitted",
|
|
},
|
|
[]string{"theme", "is_correct", "attempt_number", "used_hint"},
|
|
)
|
|
|
|
hintsRequested = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "hints_requested_total",
|
|
Help: "Total number of hints requested",
|
|
},
|
|
[]string{"theme", "question_difficulty"},
|
|
)
|
|
|
|
// Score distribution
|
|
scoreDistribution = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "game_scores",
|
|
Help: "Distribution of game scores",
|
|
Buckets: []float64{0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 80, 100},
|
|
},
|
|
[]string{"session_duration_bucket"},
|
|
)
|
|
|
|
// Leaderboard metrics
|
|
leaderboardUpdates = promauto.NewCounter(
|
|
prometheus.CounterOpts{
|
|
Name: "leaderboard_updates_total",
|
|
Help: "Total number of leaderboard updates",
|
|
},
|
|
)
|
|
|
|
topScoreChanges = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "top_score_changes_total",
|
|
Help: "Changes in top 10 scores",
|
|
},
|
|
[]string{"position"}, // top_1, top_5, top_10
|
|
)
|
|
)
|
|
|
|
// Business metrics collection service
|
|
type GameMetrics struct {
|
|
registry prometheus.Registerer
|
|
}
|
|
|
|
func NewGameMetrics() *GameMetrics {
|
|
return &GameMetrics{
|
|
registry: prometheus.DefaultRegisterer,
|
|
}
|
|
}
|
|
|
|
func (m *GameMetrics) RecordGameStart(playerType, platform string) {
|
|
gamesStarted.WithLabelValues(playerType, platform).Inc()
|
|
}
|
|
|
|
func (m *GameMetrics) RecordGameCompletion(completionType, platform string, duration time.Duration) {
|
|
gamesCompleted.WithLabelValues(completionType, platform).Inc()
|
|
sessionDuration.WithLabelValues(completionType).Observe(duration.Seconds())
|
|
}
|
|
|
|
func (m *GameMetrics) RecordQuestionAsked(theme, difficulty string) {
|
|
questionsAsked.WithLabelValues(theme, difficulty).Inc()
|
|
}
|
|
|
|
func (m *GameMetrics) RecordAnswerSubmitted(theme string, isCorrect bool, attemptNum int, usedHint bool) {
|
|
answersSubmitted.WithLabelValues(
|
|
theme,
|
|
strconv.FormatBool(isCorrect),
|
|
strconv.Itoa(attemptNum),
|
|
strconv.FormatBool(usedHint),
|
|
).Inc()
|
|
}
|
|
|
|
func (m *GameMetrics) RecordFinalScore(score int, sessionDuration time.Duration) {
|
|
durationBucket := m.getDurationBucket(sessionDuration)
|
|
scoreDistribution.WithLabelValues(durationBucket).Observe(float64(score))
|
|
}
|
|
|
|
func (m *GameMetrics) getDurationBucket(duration time.Duration) string {
|
|
minutes := int(duration.Minutes())
|
|
switch {
|
|
case minutes <= 5:
|
|
return "0-5min"
|
|
case minutes <= 15:
|
|
return "5-15min"
|
|
case minutes <= 25:
|
|
return "15-25min"
|
|
default:
|
|
return "25-30min"
|
|
}
|
|
}
|
|
```
|
|
|
|
#### 2. Technical Metrics (Infrastructure)
|
|
```go
|
|
// Technical metrics for system health
|
|
var (
|
|
// HTTP metrics
|
|
httpRequestsTotal = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "http_requests_total",
|
|
Help: "Total number of HTTP requests",
|
|
},
|
|
[]string{"method", "endpoint", "status_code", "service"},
|
|
)
|
|
|
|
httpRequestDuration = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "http_request_duration_seconds",
|
|
Help: "HTTP request duration",
|
|
Buckets: prometheus.DefBuckets,
|
|
},
|
|
[]string{"method", "endpoint", "service"},
|
|
)
|
|
|
|
// Database metrics
|
|
dbConnectionsActive = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "db_connections_active",
|
|
Help: "Number of active database connections",
|
|
},
|
|
[]string{"database", "service"},
|
|
)
|
|
|
|
dbQueryDuration = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "db_query_duration_seconds",
|
|
Help: "Database query duration",
|
|
Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
|
|
},
|
|
[]string{"query_type", "table", "service"},
|
|
)
|
|
|
|
dbErrors = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "db_errors_total",
|
|
Help: "Total number of database errors",
|
|
},
|
|
[]string{"error_type", "service"},
|
|
)
|
|
|
|
// Cache metrics
|
|
cacheOperations = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "cache_operations_total",
|
|
Help: "Total number of cache operations",
|
|
},
|
|
[]string{"operation", "result", "service"}, // get/set/delete, hit/miss/error
|
|
)
|
|
|
|
cacheKeyCount = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "cache_keys_total",
|
|
Help: "Number of keys in cache",
|
|
},
|
|
[]string{"cache_type", "service"},
|
|
)
|
|
|
|
// Authentication metrics
|
|
authenticationAttempts = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "authentication_attempts_total",
|
|
Help: "Total authentication attempts",
|
|
},
|
|
[]string{"method", "result", "user_type"}, // jwt/oauth, success/failure, player/admin
|
|
)
|
|
|
|
tokenOperations = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "token_operations_total",
|
|
Help: "JWT token operations",
|
|
},
|
|
[]string{"operation", "result"}, // validate/refresh, success/failure
|
|
)
|
|
)
|
|
|
|
// Fiber middleware for HTTP metrics
|
|
func PrometheusMiddleware(serviceName string) fiber.Handler {
|
|
return func(c *fiber.Ctx) error {
|
|
start := time.Now()
|
|
|
|
// Process request
|
|
err := c.Next()
|
|
|
|
// Record metrics
|
|
duration := time.Since(start).Seconds()
|
|
statusCode := strconv.Itoa(c.Response().StatusCode())
|
|
|
|
httpRequestsTotal.WithLabelValues(
|
|
c.Method(),
|
|
c.Route().Path,
|
|
statusCode,
|
|
serviceName,
|
|
).Inc()
|
|
|
|
httpRequestDuration.WithLabelValues(
|
|
c.Method(),
|
|
c.Route().Path,
|
|
serviceName,
|
|
).Observe(duration)
|
|
|
|
return err
|
|
}
|
|
}
|
|
|
|
// MetricsHook is database metrics middleware for Ent. It carries the service
// name that is attached as a label to every database metric it records.
type MetricsHook struct {
	serviceName string
}
|
|
|
|
func NewMetricsHook(serviceName string) *MetricsHook {
|
|
return &MetricsHook{serviceName: serviceName}
|
|
}
|
|
|
|
func (h *MetricsHook) Hook() ent.Hook {
|
|
return hook.On(
|
|
func(next ent.Mutator) ent.Mutator {
|
|
return ent.MutateFunc(func(ctx context.Context, m ent.Mutation) (ent.Value, error) {
|
|
start := time.Now()
|
|
|
|
result, err := next.Mutate(ctx, m)
|
|
|
|
duration := time.Since(start).Seconds()
|
|
queryType := strings.ToLower(m.Op().String())
|
|
table := m.Type()
|
|
|
|
dbQueryDuration.WithLabelValues(queryType, table, h.serviceName).Observe(duration)
|
|
|
|
if err != nil {
|
|
dbErrors.WithLabelValues("query_error", h.serviceName).Inc()
|
|
}
|
|
|
|
return result, err
|
|
})
|
|
},
|
|
ent.OpCreate|ent.OpUpdate|ent.OpUpdateOne|ent.OpDelete|ent.OpDeleteOne,
|
|
)
|
|
}
|
|
```
|
|
|
|
### Frontend Metrics Collection
|
|
|
|
#### Web Application Metrics
|
|
```typescript
|
|
// Frontend metrics collection
|
|
/**
 * Buffers frontend metric events and ships them to `endpoint` in batches.
 *
 * A batch is sent when `batchSize` events are queued, every `flushInterval`
 * milliseconds, and when the page unloads. Constructing a collector has
 * global side effects: it starts a timer, registers a `beforeunload`
 * listener, installs PerformanceObservers and wraps `window.fetch`.
 */
class MetricsCollector {
  private endpoint: string
  private batchSize: number = 50
  private flushInterval: number = 30000 // 30 seconds
  // Upper bound on queued events so a failing endpoint cannot grow memory without limit.
  private maxQueueSize: number = 1000
  private metrics: MetricEvent[] = []
  // Unwrapped fetch, captured before interceptFetch() replaces window.fetch.
  // The collector's own uploads use it so they are not measured as API calls.
  private nativeFetch: typeof fetch = window.fetch.bind(window)

  constructor(endpoint: string) {
    this.endpoint = endpoint
    this.startBatchFlush()
    this.setupPerformanceObserver()
  }

  /** Records a user interaction; `properties` are merged into the event. */
  trackUserAction(action: string, properties: Record<string, any> = {}): void {
    this.addMetric({
      type: 'user_action',
      action,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      user_agent: navigator.userAgent,
      ...properties,
    })
  }

  /** Records a game-specific event. */
  trackGameEvent(event: GameEvent): void {
    this.addMetric({
      type: 'game_event',
      event: event.type,
      timestamp: Date.now(),
      session_id: this.getSessionId(),
      game_session_id: event.gameSessionId,
      properties: event.properties,
    })
  }

  /** Records a named performance measurement for the current page path. */
  trackPerformance(metric: PerformanceMetric): void {
    this.addMetric({
      type: 'performance',
      metric: metric.name,
      value: metric.value,
      timestamp: Date.now(),
      url: window.location.pathname,
    })
  }

  /** Records an error together with a caller-supplied context string. */
  trackError(error: Error, context: string): void {
    this.addMetric({
      type: 'error',
      error_message: error.message,
      error_stack: error.stack,
      context,
      timestamp: Date.now(),
      url: window.location.pathname,
      user_agent: navigator.userAgent,
    })
  }

  /** Installs Web Vitals observers (when supported) and the fetch wrapper. */
  private setupPerformanceObserver(): void {
    if ('PerformanceObserver' in window) {
      // Largest Contentful Paint
      new PerformanceObserver((list) => {
        list.getEntries().forEach((entry) => {
          this.trackPerformance({
            name: 'largest_contentful_paint',
            value: entry.startTime,
          })
        })
      }).observe({ entryTypes: ['largest-contentful-paint'] })

      // First Input Delay
      new PerformanceObserver((list) => {
        list.getEntries().forEach((entry) => {
          const input = entry as PerformanceEntry & { processingStart: number }
          this.trackPerformance({
            name: 'first_input_delay',
            value: input.processingStart - input.startTime,
          })
        })
      }).observe({ entryTypes: ['first-input'] })

      // Cumulative Layout Shift. The running total lives outside the callback
      // so each report carries the score accumulated so far, not just the
      // shifts delivered in the latest batch.
      let cumulativeScore = 0
      new PerformanceObserver((list) => {
        list.getEntries().forEach((entry) => {
          const shift = entry as PerformanceEntry & { hadRecentInput: boolean; value: number }
          if (!shift.hadRecentInput) {
            cumulativeScore += shift.value
          }
        })
        this.trackPerformance({
          name: 'cumulative_layout_shift',
          value: cumulativeScore,
        })
      }).observe({ entryTypes: ['layout-shift'] })
    }

    // API response time tracking
    this.interceptFetch()
  }

  /**
   * Wraps window.fetch so every call made through it is timed. Failed calls
   * are reported with status 0 and additionally recorded as errors; the
   * original rejection is re-thrown to the caller.
   */
  private interceptFetch(): void {
    const nativeFetch = this.nativeFetch

    window.fetch = async (...args) => {
      const start = performance.now()
      const target = args[0]
      // A Request object stringifies to "[object Request]", so read its url.
      const url = target instanceof Request ? target.url : String(target)

      try {
        const response = await nativeFetch(...args)
        const duration = performance.now() - start

        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          url,
          status: response.status,
        })

        return response
      } catch (error) {
        const duration = performance.now() - start

        this.trackError(error as Error, `API request to ${url}`)
        this.trackPerformance({
          name: 'api_request_duration',
          value: duration,
          url,
          status: 0,
        })

        throw error
      }
    }
  }

  /** Queues an event and flushes once the batch size is reached. */
  private addMetric(metric: MetricEvent): void {
    this.metrics.push(metric)

    if (this.metrics.length >= this.batchSize) {
      this.flush()
    }
  }

  /**
   * Sends all queued events as one JSON POST. On a network failure or a
   * non-2xx response the batch is put back in front of anything queued in the
   * meantime, keeping at most `maxQueueSize` of the most recent events.
   */
  private async flush(): Promise<void> {
    if (this.metrics.length === 0) return

    const batch = this.metrics
    this.metrics = []

    try {
      const response = await this.nativeFetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ metrics: batch }),
      })
      if (!response.ok) {
        throw new Error(`Metrics endpoint responded with status ${response.status}`)
      }
    } catch (error) {
      console.error('Failed to send metrics:', error)
      // Re-queue metrics for retry
      this.metrics = [...batch, ...this.metrics].slice(-this.maxQueueSize)
    }
  }

  /** Starts the periodic flush and registers the unload flush. */
  private startBatchFlush(): void {
    setInterval(() => {
      this.flush()
    }, this.flushInterval)

    // Flush on page unload. An async fetch may be cancelled while the page is
    // being torn down, so hand the payload to sendBeacon when it is available.
    window.addEventListener('beforeunload', () => {
      if (this.metrics.length === 0) return

      if (typeof navigator.sendBeacon === 'function') {
        const payload = new Blob([JSON.stringify({ metrics: this.metrics })], {
          type: 'application/json',
        })
        if (navigator.sendBeacon(this.endpoint, payload)) {
          this.metrics = []
        }
        return
      }

      this.flush()
    })
  }

  /** Returns the session id from sessionStorage, or 'anonymous' if absent. */
  private getSessionId(): string {
    // Implementation depends on session management
    return sessionStorage.getItem('session_id') || 'anonymous'
  }
}
|
|
|
|
/**
 * Game-specific metrics tracking. Each method turns its arguments into a
 * game event and hands it to the underlying MetricsCollector.
 */
export class GameMetricsTracker {
  private collector: MetricsCollector

  constructor(collector: MetricsCollector) {
    this.collector = collector
  }

  /** Emits 'game_started' with the player name and detected platform. */
  trackGameStart(gameSessionId: string, playerName: string): void {
    const properties = {
      player_name: playerName,
      platform: this.getPlatform(),
    }
    this.collector.trackGameEvent({ type: 'game_started', gameSessionId, properties })
  }

  /** Emits 'question_displayed' stamped with the current time. */
  trackQuestionDisplayed(gameSessionId: string, questionId: string, theme: string): void {
    const properties = {
      question_id: questionId,
      theme,
      display_time: Date.now(),
    }
    this.collector.trackGameEvent({ type: 'question_displayed', gameSessionId, properties })
  }

  /** Emits 'answer_submitted'; `timeTaken` is reported as `time_taken_ms`. */
  trackAnswerSubmitted(
    gameSessionId: string,
    questionId: string,
    isCorrect: boolean,
    attemptNumber: number,
    timeTaken: number,
    usedHint: boolean
  ): void {
    const properties = {
      question_id: questionId,
      is_correct: isCorrect,
      attempt_number: attemptNumber,
      time_taken_ms: timeTaken,
      used_hint: usedHint,
    }
    this.collector.trackGameEvent({ type: 'answer_submitted', gameSessionId, properties })
  }

  /** Emits 'hint_requested' stamped with the current time. */
  trackHintRequested(gameSessionId: string, questionId: string): void {
    const properties = {
      question_id: questionId,
      request_time: Date.now(),
    }
    this.collector.trackGameEvent({ type: 'hint_requested', gameSessionId, properties })
  }

  /** Emits 'game_completed' with the score, answer count, completion type and platform. */
  trackGameCompleted(
    gameSessionId: string,
    finalScore: number,
    questionsAnswered: number,
    completionType: 'normal' | 'timeout' | 'abandoned'
  ): void {
    const properties = {
      final_score: finalScore,
      questions_answered: questionsAnswered,
      completion_type: completionType,
      platform: this.getPlatform(),
    }
    this.collector.trackGameEvent({ type: 'game_completed', gameSessionId, properties })
  }

  /**
   * Detects the platform: 'android' or 'ios' from the user agent, 'desktop'
   * when `window.wails` is set (Wails apps), otherwise 'web'.
   */
  private getPlatform(): string {
    const userAgent = navigator.userAgent
    if (/Android/i.test(userAgent)) {
      return 'android'
    }
    if (/iPhone|iPad|iPod/i.test(userAgent)) {
      return 'ios'
    }
    return window.wails ? 'desktop' : 'web'
  }
}
|
|
|
|
// Usage in React components

// Builds the tracking API around one collector/tracker pair.
const createGameMetricsApi = () => {
  const collector = new MetricsCollector('/api/v1/metrics')
  const gameTracker = new GameMetricsTracker(collector)

  return {
    trackGameStart: gameTracker.trackGameStart.bind(gameTracker),
    trackQuestionDisplayed: gameTracker.trackQuestionDisplayed.bind(gameTracker),
    trackAnswerSubmitted: gameTracker.trackAnswerSubmitted.bind(gameTracker),
    trackHintRequested: gameTracker.trackHintRequested.bind(gameTracker),
    trackGameCompleted: gameTracker.trackGameCompleted.bind(gameTracker),
    trackUserAction: collector.trackUserAction.bind(collector),
    trackError: collector.trackError.bind(collector),
  }
}

// Created on first use and then shared by every component.
let sharedGameMetricsApi: ReturnType<typeof createGameMetricsApi> | null = null

/**
 * Returns the game metrics tracking functions.
 *
 * Constructing a MetricsCollector starts a timer, registers an unload
 * listener and wraps window.fetch, so it must happen once per page rather
 * than once per render: `useRef(new MetricsCollector(...))` evaluates its
 * argument on every render even though only the first value is kept. The
 * collector is therefore created lazily on the first call and reused, and the
 * returned functions keep the same identity across renders and components.
 */
export const useGameMetrics = () => {
  if (sharedGameMetricsApi === null) {
    sharedGameMetricsApi = createGameMetricsApi()
  }
  return sharedGameMetricsApi
}
|
|
```
|
|
|
|
## Distributed Tracing
|
|
|
|
### OpenTelemetry Integration
|
|
|
|
#### Backend Tracing Setup
|
|
```go
|
|
// OpenTelemetry tracing setup
|
|
package observability
|
|
|
|
import (
|
|
"context"
|
|
"go.opentelemetry.io/otel"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/exporters/jaeger"
|
|
"go.opentelemetry.io/otel/sdk/resource"
|
|
"go.opentelemetry.io/otel/sdk/trace"
|
|
"go.opentelemetry.io/otel/semconv/v1.12.0/httpconv"
|
|
"go.opentelemetry.io/otel/semconv/v1.12.0/netconv"
|
|
)
|
|
|
|
// TracingConfig holds the settings consumed by InitTracing.
type TracingConfig struct {
	// ServiceName is reported as the "service.name" resource attribute.
	ServiceName string
	// ServiceVersion is reported as the "service.version" resource attribute.
	ServiceVersion string
	// Environment is reported as the "environment" resource attribute.
	Environment string
	// JaegerEndpoint is the collector endpoint passed to the Jaeger exporter.
	JaegerEndpoint string
	// SampleRate is the ratio handed to the trace-ID ratio sampler.
	SampleRate float64
}
|
|
|
|
func InitTracing(config TracingConfig) (*trace.TracerProvider, error) {
|
|
// Create Jaeger exporter
|
|
jaegerExporter, err := jaeger.New(
|
|
jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(config.JaegerEndpoint)),
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Create resource with service information
|
|
res, err := resource.New(
|
|
context.Background(),
|
|
resource.WithAttributes(
|
|
attribute.String("service.name", config.ServiceName),
|
|
attribute.String("service.version", config.ServiceVersion),
|
|
attribute.String("environment", config.Environment),
|
|
),
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Create tracer provider
|
|
tp := trace.NewTracerProvider(
|
|
trace.WithBatcher(jaegerExporter),
|
|
trace.WithResource(res),
|
|
trace.WithSampler(trace.TraceIDRatioBased(config.SampleRate)),
|
|
)
|
|
|
|
// Set global tracer provider
|
|
otel.SetTracerProvider(tp)
|
|
|
|
return tp, nil
|
|
}
|
|
|
|
// Fiber middleware for distributed tracing
|
|
func TracingMiddleware(serviceName string) fiber.Handler {
|
|
tracer := otel.Tracer(serviceName)
|
|
|
|
return func(c *fiber.Ctx) error {
|
|
// Start span
|
|
ctx, span := tracer.Start(c.Context(), fmt.Sprintf("%s %s", c.Method(), c.Route().Path))
|
|
defer span.End()
|
|
|
|
// Set span attributes
|
|
span.SetAttributes(
|
|
httpconv.HTTPMethodKey.String(c.Method()),
|
|
httpconv.HTTPURLKey.String(string(c.Request().URI().FullURI())),
|
|
httpconv.HTTPUserAgentKey.String(string(c.Request().Header.UserAgent())),
|
|
netconv.NetPeerIPKey.String(c.IP()),
|
|
)
|
|
|
|
// Add to context
|
|
c.SetUserContext(ctx)
|
|
|
|
// Process request
|
|
err := c.Next()
|
|
|
|
// Set response attributes
|
|
span.SetAttributes(
|
|
httpconv.HTTPStatusCodeKey.Int(c.Response().StatusCode()),
|
|
)
|
|
|
|
if err != nil {
|
|
span.RecordError(err)
|
|
}
|
|
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Service-level tracing helpers
|
|
func TraceServiceOperation(ctx context.Context, serviceName, operation string, fn func(context.Context) error) error {
|
|
tracer := otel.Tracer(serviceName)
|
|
ctx, span := tracer.Start(ctx, operation)
|
|
defer span.End()
|
|
|
|
err := fn(ctx)
|
|
if err != nil {
|
|
span.RecordError(err)
|
|
span.SetAttributes(attribute.Bool("error", true))
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
// Database tracing for Ent
|
|
func TraceDatabaseOperation(ctx context.Context, operation, table string, fn func(context.Context) error) error {
|
|
tracer := otel.Tracer("database")
|
|
ctx, span := tracer.Start(ctx, fmt.Sprintf("db.%s.%s", operation, table))
|
|
defer span.End()
|
|
|
|
span.SetAttributes(
|
|
attribute.String("db.operation", operation),
|
|
attribute.String("db.table", table),
|
|
attribute.String("db.system", "postgresql"),
|
|
)
|
|
|
|
err := fn(ctx)
|
|
if err != nil {
|
|
span.RecordError(err)
|
|
}
|
|
|
|
return err
|
|
}
|
|
```
|
|
|
|
#### Frontend Tracing Integration
|
|
```typescript
|
|
// Frontend tracing with OpenTelemetry
|
|
import { WebTracerProvider } from '@opentelemetry/sdk-trace-web'
|
|
import { getWebAutoInstrumentations } from '@opentelemetry/auto-instrumentations-web'
|
|
import { JaegerExporter } from '@opentelemetry/exporter-jaeger'
|
|
import { registerInstrumentations } from '@opentelemetry/instrumentation'
|
|
|
|
/**
 * Sets up OpenTelemetry tracing in the browser: creates and registers a
 * WebTracerProvider, exports spans in batches, and enables the document-load,
 * user-interaction and fetch auto-instrumentations.
 *
 * NOTE(review): confirm that the Jaeger exporter package imported by this
 * file supports browser environments; it may need to be replaced by an
 * HTTP-based exporter.
 */
export class FrontendTracing {
  private provider: WebTracerProvider

  constructor(config: TracingConfig) {
    this.provider = new WebTracerProvider({
      resource: new Resource({
        'service.name': config.serviceName,
        'service.version': config.serviceVersion,
      }),
    })

    // Configure Jaeger exporter
    const jaegerExporter = new JaegerExporter({
      endpoint: config.jaegerEndpoint,
    })

    this.provider.addSpanProcessor(
      new BatchSpanProcessor(jaegerExporter)
    )

    // Register provider
    this.provider.register()

    // Auto-instrument browser APIs
    registerInstrumentations({
      instrumentations: [
        getWebAutoInstrumentations({
          '@opentelemetry/instrumentation-document-load': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-user-interaction': {
            enabled: true,
          },
          '@opentelemetry/instrumentation-fetch': {
            enabled: true,
            propagateTraceHeaderCorsUrls: [
              // The base URL is literal text, not a pattern: escape it so
              // characters such as '.' and '?' only match themselves.
              new RegExp(FrontendTracing.escapeRegExp(config.apiBaseUrl)),
            ],
          },
        }),
      ],
    })
  }

  /**
   * Runs `fn` inside an active span named `action` from the 'game-frontend'
   * tracer, with `properties` set as span attributes. If `fn` rejects, the
   * exception is recorded, the span status is set to ERROR and the error is
   * re-thrown. The span is always ended.
   */
  traceGameAction(action: string, properties: Record<string, any>, fn: () => Promise<void>): Promise<void> {
    const tracer = trace.getTracer('game-frontend')

    return tracer.startActiveSpan(action, async (span) => {
      try {
        // Set span attributes
        span.setAttributes(properties)

        await fn()
      } catch (error) {
        span.recordException(error as Error)
        span.setStatus({ code: SpanStatusCode.ERROR })
        throw error
      } finally {
        span.end()
      }
    })
  }

  /** Escapes every regular-expression metacharacter in `text`. */
  private static escapeRegExp(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  }
}
|
|
|
|
/**
 * React hook for tracing. Returns `traceUserAction`, which runs `fn` inside
 * an active span named `user.<action>` from the 'react-components' tracer
 * with `properties` as span attributes.
 *
 * If `fn` rejects, the exception is recorded, the span status is set to
 * ERROR (matching FrontendTracing.traceGameAction) and the error is
 * re-thrown. The span is always ended.
 *
 * The tracer is looked up inside the callback, so the callback has no
 * dependencies and keeps the same identity across renders.
 */
export const useTracing = () => {
  const traceUserAction = useCallback(
    async (action: string, properties: Record<string, any>, fn: () => Promise<void>) => {
      const tracer = trace.getTracer('react-components')

      return tracer.startActiveSpan(`user.${action}`, async (span) => {
        try {
          span.setAttributes(properties)
          await fn()
        } catch (error) {
          span.recordException(error as Error)
          span.setStatus({ code: SpanStatusCode.ERROR })
          throw error
        } finally {
          span.end()
        }
      })
    },
    []
  )

  return { traceUserAction }
}
|
|
```
|
|
|
|
## Logging Strategy
|
|
|
|
### Structured Logging Implementation
|
|
|
|
#### Backend Logging
|
|
```go
|
|
// Structured logging with zerolog
|
|
package logging
|
|
|
|
import (
|
|
"os"
|
|
"time"
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
type Logger struct {
|
|
logger zerolog.Logger
|
|
}
|
|
|
|
// LogConfig holds the settings consumed by NewLogger.
type LogConfig struct {
	// Level is the textual log level, parsed with zerolog.ParseLevel.
	Level string
	// Environment selects the output format ("development" gives console
	// output) and is attached to every entry.
	Environment string
	// ServiceName is attached to every entry as "service".
	ServiceName string
	// Version is attached to every entry as "version".
	Version string
}
|
|
|
|
func NewLogger(config LogConfig) *Logger {
|
|
// Parse log level
|
|
level, err := zerolog.ParseLevel(config.Level)
|
|
if err != nil {
|
|
level = zerolog.InfoLevel
|
|
}
|
|
|
|
// Configure zerolog
|
|
zerolog.SetGlobalLevel(level)
|
|
zerolog.TimeFieldFormat = time.RFC3339Nano
|
|
|
|
var logger zerolog.Logger
|
|
|
|
if config.Environment == "development" {
|
|
// Human-readable console output for development
|
|
logger = zerolog.New(zerolog.ConsoleWriter{
|
|
Out: os.Stdout,
|
|
TimeFormat: "15:04:05",
|
|
}).With().Timestamp().Logger()
|
|
} else {
|
|
// JSON output for production
|
|
logger = zerolog.New(os.Stdout).With().Timestamp().Logger()
|
|
}
|
|
|
|
// Add service metadata
|
|
logger = logger.With().
|
|
Str("service", config.ServiceName).
|
|
Str("version", config.Version).
|
|
Str("environment", config.Environment).
|
|
Logger()
|
|
|
|
return &Logger{logger: logger}
|
|
}
|
|
|
|
// Structured logging methods
|
|
func (l *Logger) GameEvent(event string, gameSessionID, userID string, properties map[string]interface{}) {
|
|
l.logger.Info().
|
|
Str("event_type", "game").
|
|
Str("event", event).
|
|
Str("game_session_id", gameSessionID).
|
|
Str("user_id", userID).
|
|
Fields(properties).
|
|
Msg("Game event occurred")
|
|
}
|
|
|
|
func (l *Logger) APIRequest(method, path string, statusCode int, duration time.Duration, userID string) {
|
|
l.logger.Info().
|
|
Str("event_type", "api_request").
|
|
Str("method", method).
|
|
Str("path", path).
|
|
Int("status_code", statusCode).
|
|
Dur("duration_ms", duration).
|
|
Str("user_id", userID).
|
|
Msg("API request processed")
|
|
}
|
|
|
|
func (l *Logger) DatabaseOperation(operation, table string, duration time.Duration, rowsAffected int64) {
|
|
l.logger.Debug().
|
|
Str("event_type", "database").
|
|
Str("operation", operation).
|
|
Str("table", table).
|
|
Dur("duration_ms", duration).
|
|
Int64("rows_affected", rowsAffected).
|
|
Msg("Database operation completed")
|
|
}
|
|
|
|
func (l *Logger) AuthenticationEvent(event, userID, userType string, success bool, details map[string]string) {
|
|
level := l.logger.Info()
|
|
if !success {
|
|
level = l.logger.Warn()
|
|
}
|
|
|
|
level.
|
|
Str("event_type", "authentication").
|
|
Str("event", event).
|
|
Str("user_id", userID).
|
|
Str("user_type", userType).
|
|
Bool("success", success).
|
|
Fields(details).
|
|
Msg("Authentication event")
|
|
}
|
|
|
|
func (l *Logger) SecurityEvent(event, userID, ipAddress string, severity string, details map[string]interface{}) {
|
|
l.logger.Warn().
|
|
Str("event_type", "security").
|
|
Str("event", event).
|
|
Str("user_id", userID).
|
|
Str("ip_address", ipAddress).
|
|
Str("severity", severity).
|
|
Fields(details).
|
|
Msg("Security event detected")
|
|
}
|
|
|
|
func (l *Logger) Error(err error, context string, fields map[string]interface{}) {
|
|
l.logger.Error().
|
|
Err(err).
|
|
Str("context", context).
|
|
Fields(fields).
|
|
Msg("Error occurred")
|
|
}
|
|
|
|
// Fiber middleware for request logging
|
|
func RequestLoggingMiddleware(logger *Logger) fiber.Handler {
|
|
return func(c *fiber.Ctx) error {
|
|
start := time.Now()
|
|
|
|
// Process request
|
|
err := c.Next()
|
|
|
|
// Log request
|
|
duration := time.Since(start)
|
|
userID := c.Locals("user_id")
|
|
if userID == nil {
|
|
userID = "anonymous"
|
|
}
|
|
|
|
logger.APIRequest(
|
|
c.Method(),
|
|
c.Route().Path,
|
|
c.Response().StatusCode(),
|
|
duration,
|
|
userID.(string),
|
|
)
|
|
|
|
return err
|
|
}
|
|
}
|
|
```
|
|
|
|
#### Frontend Logging
|
|
```typescript
|
|
// Frontend structured logging

/** One structured log record as buffered and sent by FrontendLogger. */
interface LogEntry {
  /** Creation time as an ISO 8601 string. */
  timestamp: string
  /** Severity of the record. */
  level: 'debug' | 'info' | 'warn' | 'error'
  /** Log message. */
  message: string
  /** Optional context tag supplied by the caller. */
  context?: string
  /** Identifiers attached by the logger when available. */
  userId?: string
  sessionId?: string
  gameSessionId?: string
  /** Details of the Error attached to the record, if any. */
  error?: {
    name: string
    message: string
    stack?: string
  }
  /** Free-form additional data. */
  properties?: Record<string, any>
}
|
|
|
|
export class FrontendLogger {
|
|
private buffer: LogEntry[] = []
|
|
private endpoint: string
|
|
private maxBufferSize: number = 100
|
|
private flushInterval: number = 30000
|
|
|
|
constructor(endpoint: string) {
|
|
this.endpoint = endpoint
|
|
this.startPeriodicFlush()
|
|
}
|
|
|
|
debug(message: string, context?: string, properties?: Record<string, any>): void {
|
|
this.log('debug', message, context, properties)
|
|
}
|
|
|
|
info(message: string, context?: string, properties?: Record<string, any>): void {
|
|
this.log('info', message, context, properties)
|
|
}
|
|
|
|
warn(message: string, context?: string, properties?: Record<string, any>): void {
|
|
this.log('warn', message, context, properties)
|
|
}
|
|
|
|
error(message: string, error?: Error, context?: string, properties?: Record<string, any>): void {
|
|
const entry: LogEntry = {
|
|
timestamp: new Date().toISOString(),
|
|
level: 'error',
|
|
message,
|
|
context,
|
|
userId: this.getUserId(),
|
|
sessionId: this.getSessionId(),
|
|
gameSessionId: this.getGameSessionId(),
|
|
properties,
|
|
}
|
|
|
|
if (error) {
|
|
entry.error = {
|
|
name: error.name,
|
|
message: error.message,
|
|
stack: error.stack,
|
|
}
|
|
}
|
|
|
|
this.buffer.push(entry)
|
|
this.checkFlushConditions()
|
|
}
|
|
|
|
// Game-specific logging methods
|
|
logGameEvent(event: string, gameSessionId: string, properties?: Record<string, any>): void {
|
|
this.info(`Game event: ${event}`, 'game', {
|
|
gameSessionId,
|
|
...properties,
|
|
})
|
|
}
|
|
|
|
logUserAction(action: string, properties?: Record<string, any>): void {
|
|
this.info(`User action: ${action}`, 'user', properties)
|
|
}
|
|
|
|
logPerformanceMetric(metric: string, value: number, unit: string): void {
|
|
this.debug(`Performance metric: ${metric}`, 'performance', {
|
|
metric,
|
|
value,
|
|
unit,
|
|
url: window.location.pathname,
|
|
})
|
|
}
|
|
|
|
private log(level: LogEntry['level'], message: string, context?: string, properties?: Record<string, any>): void {
|
|
const entry: LogEntry = {
|
|
timestamp: new Date().toISOString(),
|
|
level,
|
|
message,
|
|
context,
|
|
userId: this.getUserId(),
|
|
sessionId: this.getSessionId(),
|
|
gameSessionId: this.getGameSessionId(),
|
|
properties,
|
|
}
|
|
|
|
this.buffer.push(entry)
|
|
this.checkFlushConditions()
|
|
}
|
|
|
|
private checkFlushConditions(): void {
|
|
if (this.buffer.length >= this.maxBufferSize) {
|
|
this.flush()
|
|
}
|
|
}
|
|
|
|
private async flush(): Promise<void> {
|
|
if (this.buffer.length === 0) return
|
|
|
|
const logs = [...this.buffer]
|
|
this.buffer = []
|
|
|
|
try {
|
|
await fetch(this.endpoint, {
|
|
method: 'POST',
|
|
headers: {
|
|
'Content-Type': 'application/json',
|
|
},
|
|
body: JSON.stringify({ logs }),
|
|
})
|
|
} catch (error) {
|
|
console.error('Failed to send logs:', error)
|
|
// Re-queue logs for retry (keep only most recent to avoid memory issues)
|
|
this.buffer.unshift(...logs.slice(-50))
|
|
}
|
|
}
|
|
|
|
private startPeriodicFlush(): void {
|
|
setInterval(() => {
|
|
this.flush()
|
|
}, this.flushInterval)
|
|
|
|
// Flush on page unload
|
|
window.addEventListener('beforeunload', () => {
|
|
// Use sendBeacon for reliable delivery during page unload
|
|
if (this.buffer.length > 0) {
|
|
navigator.sendBeacon(this.endpoint, JSON.stringify({ logs: this.buffer }))
|
|
}
|
|
})
|
|
}
|
|
|
|
private getUserId(): string | undefined {
|
|
// Implementation depends on auth system
|
|
return sessionStorage.getItem('user_id') || undefined
|
|
}
|
|
|
|
private getSessionId(): string | undefined {
|
|
return sessionStorage.getItem('session_id') || undefined
|
|
}
|
|
|
|
/**
 * Reads the active game session id from sessionStorage ('game_session_id').
 *
 * @returns The stored id, or undefined when the key is missing or empty.
 */
private getGameSessionId(): string | undefined {
  const storedGameSessionId = sessionStorage.getItem('game_session_id')
  return storedGameSessionId ? storedGameSessionId : undefined
}
|
|
}
|
|
```
|
|
|
|
## Monitoring Dashboards
|
|
|
|
### Grafana Dashboard Configuration
|
|
|
|
#### 1. Business Intelligence Dashboard
|
|
```json
|
|
{
|
|
"dashboard": {
|
|
"title": "Know Foolery - Business Intelligence",
|
|
"panels": [
|
|
{
|
|
"title": "Active Games",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(games_started_total[5m])) * 300",
|
|
"legendFormat": "Games per 5min"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Game Completion Rate",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(games_completed_total{completion_type=\"normal\"}[1h])) / sum(rate(games_started_total[1h])) * 100",
|
|
"legendFormat": "Completion %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Average Session Duration",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.5, rate(game_session_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "Median"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(game_session_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "95th percentile"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Question Accuracy by Theme",
|
|
"type": "heatmap",
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (theme) (rate(answers_submitted_total{is_correct=\"true\"}[1h])) / sum by (theme) (rate(answers_submitted_total[1h]))",
|
|
"legendFormat": "{{theme}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Hint Usage Rate",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(hints_requested_total[5m])) / sum(rate(questions_asked_total[5m])) * 100",
|
|
"legendFormat": "Hint Usage %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Score Distribution",
|
|
"type": "histogram",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.25, rate(game_scores_bucket[1h]))",
|
|
"legendFormat": "25th percentile"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.5, rate(game_scores_bucket[1h]))",
|
|
"legendFormat": "Median"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.75, rate(game_scores_bucket[1h]))",
|
|
"legendFormat": "75th percentile"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|
|
```
|
|
|
|
#### 2. Technical Performance Dashboard
|
|
```json
|
|
{
|
|
"dashboard": {
|
|
"title": "Know Foolery - Technical Performance",
|
|
"panels": [
|
|
{
|
|
"title": "API Response Times",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m])))",
|
|
"legendFormat": "{{service}} - 95th percentile"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Error Rate",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m])) * 100",
|
|
"legendFormat": "{{service}} - Error %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Database Performance",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum by (service, le) (rate(db_query_duration_seconds_bucket[5m])))",
|
|
"legendFormat": "{{service}} - Query Time"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Cache Hit Rate",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(cache_operations_total{operation=\"get\", result=\"hit\"}[5m])) / sum(rate(cache_operations_total{operation=\"get\"}[5m])) * 100",
|
|
"legendFormat": "Hit Rate %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"title": "Authentication Success Rate",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (method) (rate(authentication_attempts_total{result=\"success\"}[5m])) / sum by (method) (rate(authentication_attempts_total[5m])) * 100",
|
|
"legendFormat": "{{method}} - Success %"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|
|
```
|
|
|
|
## Alerting Strategy
|
|
|
|
### Alert Rules Configuration
|
|
|
|
#### Critical Alerts
|
|
```yaml
|
|
# prometheus-alerts.yml
|
|
groups:
|
|
- name: know-foolery-critical
|
|
rules:
|
|
- alert: HighErrorRate
|
|
expr: sum by (service) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by (service) (rate(http_requests_total[5m])) > 0.05
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
team: backend
|
|
annotations:
|
|
summary: "High error rate detected in {{ $labels.service }}"
|
|
description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}"
|
|
|
|
- alert: DatabaseConnectionFailure
|
|
expr: db_connections_active == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
team: backend
|
|
annotations:
|
|
summary: "Database connections dropped to zero"
|
|
description: "Service {{ $labels.service }} has no active database connections"
|
|
|
|
- alert: AuthenticationSystemDown
|
|
expr: up{service="zitadel"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
team: security
|
|
annotations:
|
|
summary: "Authentication system is down"
|
|
description: "Zitadel authentication service is unreachable"
|
|
|
|
- alert: GameSessionsStuck
|
|
expr: sum(increase(games_started_total[5m])) > sum(increase(games_completed_total[5m])) * 2
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
team: backend
|
|
annotations:
|
|
summary: "Game sessions not completing"
|
|
description: "Many games are starting but not completing normally"
|
|
|
|
- name: know-foolery-warning
|
|
rules:
|
|
- alert: HighLatency
|
|
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1.0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
team: backend
|
|
annotations:
|
|
summary: "High API latency detected"
|
|
description: "95th percentile latency is {{ $value }}s for {{ $labels.service }}"
|
|
|
|
- alert: LowGameCompletionRate
|
|
expr: sum(rate(games_completed_total{completion_type="normal"}[1h])) / sum(rate(games_started_total[1h])) < 0.7
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
team: product
|
|
annotations:
|
|
summary: "Low game completion rate"
|
|
description: "Only {{ $value | humanizePercentage }} of games are being completed normally"
|
|
|
|
- alert: HighHintUsage
|
|
expr: sum(rate(hints_requested_total[1h])) / sum(rate(questions_asked_total[1h])) > 0.8
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
team: product
|
|
annotations:
|
|
summary: "Unusually high hint usage"
|
|
description: "{{ $value | humanizePercentage }} of questions are requesting hints"
|
|
|
|
- name: know-foolery-security
|
|
rules:
|
|
- alert: HighAuthenticationFailures
|
|
expr: rate(authentication_attempts_total{result="failure"}[5m]) > 10
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
team: security
|
|
annotations:
|
|
summary: "High authentication failure rate"
|
|
description: "{{ $value }} authentication failures per second"
|
|
|
|
- alert: SuspiciousUserActivity
|
|
expr: sum by (user_id) (rate(answers_submitted_total[1m])) > 5
|
|
for: 1m
|
|
labels:
|
|
severity: warning
|
|
team: security
|
|
annotations:
|
|
summary: "Suspicious user activity detected"
|
|
description: "User {{ $labels.user_id }} is submitting answers at {{ $value }}/second"
|
|
```
|
|
|
|
### Alert Routing and Escalation
|
|
```yaml
|
|
# alertmanager.yml
|
|
global:
|
|
slack_api_url: 'https://hooks.slack.com/services/...'
|
|
|
|
route:
|
|
group_by: ['alertname', 'service']
|
|
group_wait: 10s
|
|
group_interval: 10s
|
|
repeat_interval: 1h
|
|
receiver: 'default'
|
|
routes:
|
|
- match:
|
|
severity: critical
|
|
receiver: 'critical-alerts'
|
|
group_wait: 0s
|
|
- match:
|
|
team: security
|
|
receiver: 'security-team'
|
|
- match:
|
|
team: product
|
|
receiver: 'product-team'
|
|
|
|
receivers:
|
|
- name: 'default'
|
|
slack_configs:
|
|
- channel: '#alerts'
|
|
title: 'Know Foolery Alert'
|
|
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
|
|
|
- name: 'critical-alerts'
|
|
slack_configs:
|
|
- channel: '#critical-alerts'
|
|
title: 'CRITICAL: Know Foolery'
|
|
text: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ .Annotations.description }}{{ end }}"
|
|
pagerduty_configs:
|
|
- service_key: 'your-pagerduty-key'
|
|
description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
|
|
|
- name: 'security-team'
|
|
slack_configs:
|
|
- channel: '#security-alerts'
|
|
title: 'Security Alert: Know Foolery'
|
|
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
|
|
|
- name: 'product-team'
|
|
slack_configs:
|
|
- channel: '#product-alerts'
|
|
title: 'Product Alert: Know Foolery'
|
|
text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
|
```
|
|
|
|
This comprehensive observability strategy ensures that Know Foolery has full visibility into its performance, user behavior, and system health, enabling proactive issue resolution and data-driven product improvements. |