|
|
from pptx import Presentation
|
|
|
from pptx.util import Inches, Pt
|
|
|
from pptx.dml.color import RGBColor
|
|
|
from pptx.enum.text import PP_ALIGN
|
|
|
from pptx.enum.shapes import MSO_SHAPE, MSO_CONNECTOR
|
|
|
|
|
|
# Destination path for the rendered deck.
OUTPUT = "/Users/oabrivard/Projects/mdp/databricks_prime_tech.pptx"

# 16:9 widescreen canvas.
prs = Presentation()
prs.slide_width = Inches(13.333)
prs.slide_height = Inches(7.5)

# Corporate palette used by every slide builder below.
COLORS = {
    "navy": RGBColor(11, 33, 58),
    "blue": RGBColor(0, 84, 153),
    "teal": RGBColor(0, 120, 130),
    "green": RGBColor(26, 122, 82),
    "gold": RGBColor(166, 126, 18),
    "bg": RGBColor(243, 247, 251),
    "card": RGBColor(255, 255, 255),
    "text": RGBColor(34, 46, 57),
    "muted": RGBColor(99, 113, 128),
    "line": RGBColor(211, 220, 229),
    "white": RGBColor(255, 255, 255),
    "bronze": RGBColor(166, 109, 75),
    "silver": RGBColor(126, 140, 156),
    "gold_l": RGBColor(208, 173, 71),
}

# Shared content-area geometry (left margin, content top, width, height).
MARGIN = Inches(0.55)
TOP = Inches(1.18)
W = Inches(12.25)
H = Inches(5.95)
|
|
|
|
|
|
|
|
|
def font(run, size=16, bold=False, color=None, name="Segoe UI"):
    """Apply typeface, size, weight, and colour to a pptx text run.

    Args:
        run: python-pptx run object whose ``.font`` is mutated in place.
        size: point size.
        bold: bold flag.
        color: RGBColor to apply; defaults to COLORS["text"] when None.
        name: font family name.
    """
    run.font.name = name
    run.font.size = Pt(size)
    run.font.bold = bold
    # BUG FIX: the original used `color or COLORS["text"]`. RGBColor is an
    # int subclass, so pure black RGBColor(0, 0, 0) is falsy and was being
    # silently replaced by the default. Test identity against None instead.
    run.font.color.rgb = COLORS["text"] if color is None else color
|
|
|
|
|
|
|
|
|
def brand(slide, title, subtitle=None):
    """Paint the standard slide chrome: background, navy header bar with the
    title, an optional muted subtitle line, and the confidentiality footer."""
    slide.background.fill.solid()
    slide.background.fill.fore_color.rgb = COLORS["bg"]

    # Navy band across the top of the slide.
    header = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, 0, prs.slide_width, Inches(0.88))
    header.fill.solid()
    header.fill.fore_color.rgb = COLORS["navy"]
    header.line.fill.background()

    title_box = slide.shapes.add_textbox(MARGIN, Inches(0.12), Inches(10.2), Inches(0.54))
    title_tf = title_box.text_frame
    title_tf.clear()
    title_run = title_tf.paragraphs[0].add_run()
    title_run.text = title
    font(title_run, size=23, bold=True, color=COLORS["white"])

    if subtitle:
        sub_box = slide.shapes.add_textbox(MARGIN, Inches(0.9), Inches(11.5), Inches(0.25))
        sub_run = sub_box.text_frame.paragraphs[0].add_run()
        sub_run.text = subtitle
        font(sub_run, size=11, color=COLORS["muted"])

    # White footer strip carrying the confidentiality notice.
    footer = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, Inches(7.2), prs.slide_width, Inches(0.3))
    footer.fill.solid()
    footer.fill.fore_color.rgb = COLORS["white"]
    footer.line.fill.background()
    foot_box = slide.shapes.add_textbox(MARGIN, Inches(7.22), Inches(9), Inches(0.2))
    foot_run = foot_box.text_frame.paragraphs[0].add_run()
    foot_run.text = "Greenfield | Databricks-Primary Technical Deep Dive | Internal - Confidential"
    font(foot_run, size=9, color=COLORS["muted"])
|
|
|
|
|
|
|
|
|
def section(title, subtitle, accent="blue"):
    """Add a full-bleed navy divider slide with an accent strip and big title."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    slide.background.fill.solid()
    slide.background.fill.fore_color.rgb = COLORS["navy"]

    # Accent-coloured band along the lower part of the slide.
    accent_strip = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, Inches(5.7), prs.slide_width, Inches(1.8))
    accent_strip.fill.solid()
    accent_strip.fill.fore_color.rgb = COLORS[accent]
    accent_strip.line.fill.background()

    text_box = slide.shapes.add_textbox(Inches(0.8), Inches(2.2), Inches(11.8), Inches(2.3))
    frame = text_box.text_frame
    frame.clear()
    title_run = frame.paragraphs[0].add_run()
    title_run.text = title
    font(title_run, size=40, bold=True, color=COLORS["white"])
    sub_run = frame.add_paragraph().add_run()
    sub_run.text = subtitle
    font(sub_run, size=18, color=COLORS["white"])
|
|
|
|
|
|
|
|
|
def bullet(title, items, subtitle=None):
    """Add a branded slide with one white card listing *items* as bullets."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    brand(slide, title, subtitle)

    card = slide.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, MARGIN, TOP, W, H)
    card.fill.solid()
    card.fill.fore_color.rgb = COLORS["card"]
    card.line.color.rgb = COLORS["line"]

    body = slide.shapes.add_textbox(MARGIN + Inches(0.28), TOP + Inches(0.28), W - Inches(0.56), H - Inches(0.56))
    frame = body.text_frame
    frame.clear()
    frame.word_wrap = True
    for idx, text in enumerate(items):
        # Reuse the implicit first paragraph, append thereafter.
        para = frame.paragraphs[0] if idx == 0 else frame.add_paragraph()
        para.space_after = Pt(7)
        run = para.add_run()
        run.text = f"• {text}"
        font(run, size=17)
|
|
|
|
|
|
|
|
|
def two_col(title, left_title, left_items, right_title, right_items, subtitle=None):
    """Add a branded slide with two header-capped bullet cards side by side."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    brand(slide, title, subtitle)

    y = TOP
    col_w = Inches(5.95)
    col_h = H
    columns = [
        (MARGIN, left_title, COLORS["blue"]),
        (Inches(6.75), right_title, COLORS["teal"]),
    ]
    for x, heading, accent in columns:
        card = slide.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, x, y, col_w, col_h)
        card.fill.solid()
        card.fill.fore_color.rgb = COLORS["card"]
        card.line.color.rgb = COLORS["line"]
        # Coloured header band at the top of each card.
        band = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, x, y, col_w, Inches(0.58))
        band.fill.solid()
        band.fill.fore_color.rgb = accent
        band.line.fill.background()
        head_box = slide.shapes.add_textbox(x + Inches(0.2), y + Inches(0.14), col_w - Inches(0.4), Inches(0.3))
        head_run = head_box.text_frame.paragraphs[0].add_run()
        head_run.text = heading
        font(head_run, size=14, bold=True, color=COLORS["white"])

    def write(x, entries):
        # Bullet body positioned below the card's header band.
        body = slide.shapes.add_textbox(x + Inches(0.2), y + Inches(0.72), col_w - Inches(0.4), col_h - Inches(0.9))
        frame = body.text_frame
        frame.clear()
        frame.word_wrap = True
        for idx, entry in enumerate(entries):
            para = frame.paragraphs[0] if idx == 0 else frame.add_paragraph()
            para.space_after = Pt(6)
            run = para.add_run()
            run.text = f"• {entry}"
            font(run, size=14)

    write(MARGIN, left_items)
    write(Inches(6.75), right_items)
|
|
|
|
|
|
|
|
|
def table_slide(title, cols, rows, subtitle=None):
    """Add a branded slide containing a header + zebra-striped data table."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    brand(slide, title, subtitle)
    table = slide.shapes.add_table(len(rows) + 1, len(cols), MARGIN, TOP, W, H).table

    # Header row: blue fill, bold white text.
    for col_idx, heading in enumerate(cols):
        cell = table.cell(0, col_idx)
        cell.text = heading
        cell.fill.solid()
        cell.fill.fore_color.rgb = COLORS["blue"]
        for para in cell.text_frame.paragraphs:
            for run in para.runs:
                font(run, size=11, bold=True, color=COLORS["white"])

    # Body rows: light-blue stripe on every even row index.
    for row_idx, row in enumerate(rows, start=1):
        for col_idx, value in enumerate(row):
            cell = table.cell(row_idx, col_idx)
            cell.text = value
            if row_idx % 2 == 0:
                cell.fill.solid()
                cell.fill.fore_color.rgb = RGBColor(235, 241, 247)
            for para in cell.text_frame.paragraphs:
                for run in para.runs:
                    font(run, size=10)
|
|
|
|
|
|
|
|
|
def flow(title, steps, subtitle=None):
    """Add a branded slide rendering *steps* as a left-to-right chain of
    rounded boxes joined by teal chevrons."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    brand(slide, title, subtitle)

    count = len(steps)
    gap = Inches(0.15)
    box_w = (W - gap * (count - 1)) / count
    y = Inches(2.6)
    for idx, label in enumerate(steps):
        x = MARGIN + idx * (box_w + gap)
        box = slide.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, x, y, box_w, Inches(1.6))
        box.fill.solid()
        box.fill.fore_color.rgb = COLORS["card"]
        box.line.color.rgb = COLORS["blue"]
        frame = box.text_frame
        frame.clear()
        para = frame.paragraphs[0]
        para.alignment = PP_ALIGN.CENTER
        run = para.add_run()
        run.text = label
        font(run, size=13, bold=True)
        if idx < count - 1:
            # Chevron connector between consecutive boxes.
            chevron = slide.shapes.add_shape(
                MSO_SHAPE.CHEVRON, x + box_w - Inches(0.02), y + Inches(0.6), Inches(0.18), Inches(0.38)
            )
            chevron.fill.solid()
            chevron.fill.fore_color.rgb = COLORS["teal"]
            chevron.line.fill.background()
|
|
|
|
|
|
|
|
|
def image_slide(title, path, subtitle=None, caption=None):
    """Add a branded slide showing the picture at *path* full width,
    with an optional caption line near the footer."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    brand(slide, title, subtitle)
    slide.shapes.add_picture(path, MARGIN, TOP, width=W, height=H - Inches(0.15))
    if caption:
        cap_box = slide.shapes.add_textbox(MARGIN, Inches(7.0), Inches(11), Inches(0.2))
        cap_run = cap_box.text_frame.paragraphs[0].add_run()
        cap_run.text = caption
        font(cap_run, size=9, color=COLORS["muted"])
|
|
|
|
|
|
|
|
|
def schema_layers(title, layers, subtitle=None):
    """Add a branded slide of stacked full-width bands, one per
    (name, description, colour) tuple in *layers*."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    brand(slide, title, subtitle)

    top = TOP + Inches(0.15)
    left = MARGIN + Inches(0.25)
    width = W - Inches(0.5)
    band_h = (H - Inches(1.0)) / len(layers)
    for idx, (name, desc, colour) in enumerate(layers):
        band_y = top + idx * band_h
        band = slide.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, left, band_y, width, band_h - Inches(0.08))
        band.fill.solid()
        band.fill.fore_color.rgb = colour
        band.line.fill.background()
        box = slide.shapes.add_textbox(left + Inches(0.2), band_y + Inches(0.12), width - Inches(0.4), band_h - Inches(0.25))
        frame = box.text_frame
        frame.clear()
        name_run = frame.paragraphs[0].add_run()
        name_run.text = name
        font(name_run, size=16, bold=True, color=COLORS["white"])
        desc_run = frame.add_paragraph().add_run()
        desc_run.text = desc
        font(desc_run, size=12, color=COLORS["white"])
|
|
|
|
|
|
|
|
|
def nfr_radar_like(title, items, subtitle=None):
    """Add a branded slide with decorative concentric rings on the left and
    a labelled horizontal bar per (label, value, colour) item on the right."""
    slide = prs.slides.add_slide(prs.slide_layouts[6])
    brand(slide, title, subtitle)

    # Radar-style concentric guide rings (purely decorative).
    centre_x, centre_y = Inches(4.3), Inches(4.1)
    for radius in (Inches(0.8), Inches(1.3), Inches(1.8), Inches(2.3)):
        ring = slide.shapes.add_shape(MSO_SHAPE.OVAL, centre_x - radius, centre_y - radius, 2 * radius, 2 * radius)
        ring.fill.background()
        ring.line.color.rgb = RGBColor(207, 217, 228)

    # One bar per item; bar length scales linearly with the score value.
    for idx, (label, value, colour) in enumerate(items):
        y = Inches(1.6) + idx * Inches(0.68)
        bar = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, Inches(8.0), y, Inches(0.06 * value), Inches(0.38))
        bar.fill.solid()
        bar.fill.fore_color.rgb = colour
        bar.line.fill.background()
        label_box = slide.shapes.add_textbox(Inches(6.3), y, Inches(1.6), Inches(0.38))
        label_run = label_box.text_frame.paragraphs[0].add_run()
        label_run.text = label
        font(label_run, size=12)

    heading_box = slide.shapes.add_textbox(Inches(1.2), Inches(1.3), Inches(4.7), Inches(0.6))
    heading_run = heading_box.text_frame.paragraphs[0].add_run()
    heading_run.text = "NFR Maturity Snapshot"
    font(heading_run, size=20, bold=True, color=COLORS["blue"])
|
|
|
|
|
|
|
|
|
def control_matrix(title, rows, subtitle=None):
    """Shorthand for the recurring four-column control/evidence table slide."""
    headers = ["Control Domain", "Design", "Runtime Enforcement", "Evidence"]
    table_slide(title, headers, rows, subtitle)
|
|
|
|
|
|
# --- Intro + context (slides 1-8) ---

# Title slide: navy background with a blue hero band and three title lines.
s = prs.slides.add_slide(prs.slide_layouts[6])
s.background.fill.solid()
s.background.fill.fore_color.rgb = COLORS["navy"]
hero = s.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, Inches(4.85), prs.slide_width, Inches(2.65))
hero.fill.solid()
hero.fill.fore_color.rgb = COLORS["blue"]
hero.line.fill.background()
b = s.shapes.add_textbox(Inches(0.85), Inches(0.95), Inches(11.8), Inches(2.9))
tf = b.text_frame
tf.clear()
r = tf.paragraphs[0].add_run()
r.text = "Greenfield Modern Data Platform"
font(r, size=44, bold=True, color=COLORS["white"])
p = tf.add_paragraph()
r2 = p.add_run()
r2.text = "Databricks-Primary Technical Deep Dive (v8.0)"
font(r2, size=27, color=COLORS["white"])
p3 = tf.add_paragraph()
r3 = p3.add_run()
r3.text = "Architecture, Infrastructure, Security, and Delivery Mechanics"
font(r3, size=15, color=COLORS["white"])

bullet("Technical Agenda", [
    "Architecture boundaries and workload placement",
    "Medallion implementation details (ingestion to serving)",
    "Governance, lineage, and data quality controls",
    "Security/privacy implementation patterns",
    "Azure infrastructure, network, and operations",
    "Performance, reliability, FinOps, and DevSecOps",
    "Execution roadmap and architecture decision gates",
], "76-slide engineering and architecture walkthrough")

bullet("Scope and Assumptions", [
    "Primary architecture option: Databricks-primary v8.0.",
    "Fabric scope constrained to BI serving via Direct Lake.",
    "SAS Viya retained for regulated actuarial/risk use cases.",
    "Canadian-only residency: Canada Central (prod), Canada East (DR).",
    "Control baseline aligned to AMF, OSFI, Law 25, and PIPEDA.",
])

two_col("Business and Technical Objectives", "Business Outcomes", [
    "Trusted enterprise data products",
    "Faster analytics and AI delivery cycle",
    "Regulatory-grade traceability and evidence",
    "Platform simplification and controlled spend",
], "Technical Outcomes", [
    "Single Delta substrate with governed access",
    "Composable medallion pipelines",
    "Lineage and DQ from source to report",
    "Automated policy and IaC enforcement",
])

bullet("Architecture Principles (Operationalized)", [
    "Data as Product: ownership, contracts, certification, SLAs.",
    "Security by Design: identity-first, zero trust, fine-grained controls.",
    "Federated Execution: centralized guardrails, domain delivery autonomy.",
    "Open Standards: Delta/Parquet, APIs, metadata portability.",
    "Right Workload on Right Platform: explicit anti-overlap boundaries.",
])

flow("Decision Logic: Workload Placement", [
    "Identify\nworkload type",
    "Map to\nplatform capability",
    "Apply security\nconstraints",
    "Validate cost\nprofile",
    "Approve via\nARB gate",
])

table_slide("Key Architecture Decisions (Condensed)", ["AD", "Decision", "Technical Consequence"], [
    ["AD-01", "Delta as canonical format", "Unified storage semantics and replay"],
    ["AD-02", "Databricks primary", "Consolidated ETL/SQL/ML runtime"],
    ["AD-03", "Fabric BI-only", "Direct Lake at enterprise scale"],
    ["AD-04", "SAS Compute Server", "Sequential regulated compute path"],
    ["AD-05", "Purview+UC+Manta", "Multi-plane governance + lineage"],
    ["AD-06", "Canada-only regions", "Policy-enforced residency"],
    ["AD-07", "ADLS shared substrate", "Cross-platform non-duplicative access"],
])

nfr_radar_like("Non-Functional Priorities", [
    ("Security", 9, COLORS["teal"]),
    ("Reliability", 8, COLORS["blue"]),
    ("Performance", 8, COLORS["green"]),
    ("Cost Efficiency", 7, COLORS["gold"]),
    ("Operability", 8, COLORS["silver"]),
], "Relative target maturity by design intent")
|
|
|
|
|
|
# --- Logical architecture (slides 9-20) ---

section("Logical Architecture", "Layer model, boundaries, and interactions", "teal")

schema_layers("Seven-Layer Reference Model", [
    ("1. Ingestion", "Batch, CDC, streaming, APIs; provenance and control metadata", RGBColor(36, 93, 140)),
    ("2. Bronze", "Immutable raw Delta persistence with replay capability", COLORS["bronze"]),
    ("3. Silver", "Conformance, quality, survivorship, and enrichment", COLORS["silver"]),
    ("4. Gold", "Business-ready products, marts, aggregates, feature sets", COLORS["gold_l"]),
    ("5. Semantic", "Business vocabulary and certified KPI definitions", RGBColor(71, 118, 167)),
    ("6. Serving", "BI, SQL, APIs, SAS, and model consumption endpoints", RGBColor(38, 145, 126)),
    ("7. AI/ML", "Experimentation, training, deployment, and monitoring", RGBColor(29, 102, 79)),
], "Layering aligned to DAMA-DMBOK and EDM-DCAM")

table_slide("Cross-Cutting Planes", ["Plane", "Responsibilities", "Primary Systems"], [
    ["Governance", "Catalog, glossary, lineage, quality, stewardship", "Purview, Unity Catalog, Manta"],
    ["Security & Privacy", "Identity, ABAC/RLS/CLS/DDM, DLP, consent", "Entra ID, UC, Purview DLP"],
    ["Operations", "Monitoring, SLAs, incident, runbook automation", "Azure Monitor, Databricks Workflows"],
])

table_slide("Capability Mapping by Platform", ["Capability", "Databricks", "Fabric", "SAS Viya"], [
    ["ETL/ELT", "Primary", "Restricted", "Targeted"],
    ["Warehousing", "Primary", "Serving-only", "-"],
    ["BI", "Secondary", "Primary", "Targeted"],
    ["ML/AI", "Primary", "-", "Specialized"],
    ["Lineage", "Contributing", "Contributing", "Contributing via Manta"],
    ["Semantic Intelligence", "Contributing", "Evaluate H2/H3", "-"],
])

flow("Inter-Platform Data Flow", [
    "Sources",
    "ADF / Auto Loader",
    "ADLS Delta",
    "Databricks Bronze/Silver/Gold",
    "Fabric Direct Lake + SAS JDBC",
], "No duplicated warehouse copies")

image_slide("Medallion Architecture (Reference Diagram)", "/Users/oabrivard/Projects/mdp/.tmp_assets/medallion1.png", "End-to-end governed flow")
image_slide("Medallion Architecture (Detailed Variant)", "/Users/oabrivard/Projects/mdp/.tmp_assets/medallion2.png", "Alternative technical rendering")

schema_layers("Fabric Scope Boundary (Allowed vs Prohibited)", [
    ("Allowed", "Power BI models, reports, Direct Lake, minimal Dataflows Gen2", RGBColor(35, 129, 88)),
    ("Conditionally Allowed", "Eventhouse/KQL for targeted operational dashboards", RGBColor(61, 138, 170)),
    ("Prohibited", "Parallel Fabric warehouse/lakehouse and duplicated ETL", RGBColor(170, 81, 66)),
    ("Prohibited", "Fabric Data Factory as primary orchestrator", RGBColor(170, 81, 66)),
    ("Prohibited", "Fabric notebooks replacing Databricks engineering runtime", RGBColor(170, 81, 66)),
], "Architecture guardrail enforcement required")

bullet("Direct Lake Rationale at Enterprise Scale", [
    "Power BI DirectQuery against Databricks SQL at 55k users is cost-latency intensive.",
    "Direct Lake minimizes per-query round trips while preserving freshness.",
    "Fabric’s role is economic BI serving optimization, not data platform duplication.",
    "Capacity governance prevents expansion into unauthorized engineering use.",
])

flow("Logical Request Path: Analyst to Governed Data", [
    "User Auth via Entra",
    "Semantic Model",
    "Direct Lake / SQL",
    "UC Policy Evaluation",
    "Delta Read + Mask",
], "Security and governance remain in-path")

two_col("Technical Debt Avoidance", "Anti-Patterns to Block", [
    "Shadow ETL in Fabric",
    "Unmanaged extracts to local marts",
    "Direct ADLS sensitive access from unmanaged runtimes",
    "Unversioned security policies",
], "Mitigation Mechanisms", [
    "ARB design gate + exception workflow",
    "Policy-as-code + IaC scanning",
    "Centralized service principal governance",
    "Continuous lineage and DQ evidence",
])
|
|
|
|
|
|
# --- Medallion deep dive (slides 21-34) ---

section("Medallion Implementation", "Ingestion, transformations, and productization mechanics", "gold")

schema_layers("Source and Ingestion Schema", [
    ("Sources", "Core banking, insurance, cards, CRM, digital channels, external feeds", RGBColor(74, 115, 154)),
    ("Ingestion Runtime", "ADF, Auto Loader, Structured Streaming, API pulls", RGBColor(40, 122, 138)),
    ("Staging + Audit", "Landing, provenance metadata, extraction checksums", RGBColor(86, 130, 170)),
    ("DQ Pre-Gate", "Purview sampling, structural checks, anomaly thresholds", RGBColor(170, 120, 57)),
    ("Bronze Commit", "Append-only Delta commit with replay metadata", COLORS["bronze"]),
])

table_slide("Ingestion Patterns and Design", ["Pattern", "Mechanism", "Control Concerns", "Typical Domains"], [
    ["Batch Files", "Auto Loader cloudFiles", "Schema drift, duplicate drops", "Partners, regulatory feeds"],
    ["Batch DB", "ADF Copy + watermark/CDC", "Latency window, idempotency", "Core banking, policy admin"],
    ["Near Real-time", "Debezium/Event Hub/Streaming", "Exactly-once semantics", "Transactions, status updates"],
    ["Streaming", "Event Hub + SS", "Backpressure, ordering", "Fraud and telemetry"],
    ["API", "ADF Web / notebook clients", "Rate limits, retry strategies", "Market and bureau data"],
    ["SAS Native", "JDBC/authorized ADLS", "RLS bypass risk on ADLS path", "Actuarial inputs"],
])

flow("Pre-Bronze Quality Gate Sequence", [
    "Ingestion Complete",
    "Purview DQ Trigger",
    "Sample & Profile",
    "Threshold Decision",
    "Promote or Quarantine",
], "Tier-1 control before immutable storage")

table_slide("Sampling and Threshold Model", ["Domain Type", "Sample Strategy", "Blocking Criteria"], [
    ["Critical Data Elements", "10-20% (or 100% low-volume)", "Null/format/referential failure"],
    ["Non-CDE Attributes", "1-5%", "Structural anomaly warnings"],
    ["Streaming Windows", "Every 1000 records or 5 min", "Volume or schema spikes"],
    ["All Loads", "Row-count baseline checks", "±50% anomaly triggers quarantine"],
])

bullet("Bronze Layer Engineering Details", [
    "Storage layout by source/entity with deterministic partitioning strategy.",
    "Minimal transformation: metadata columns and type normalization only.",
    "Append-only contracts enable full replay and forensic analysis.",
    "Time travel retention baseline 90 days with domain overrides.",
    "Quarantine tables maintain failed payloads for root-cause analysis.",
])

table_slide("Bronze Data Contracts", ["Contract Field", "Purpose", "Validation"], [
    ["source_system", "Trace producer lineage", "Mandatory, controlled vocabulary"],
    ["ingested_at", "Pipeline timing", "UTC timestamp required"],
    ["batch_id", "Reconciliation and idempotency", "Uniqueness within source/entity"],
    ["source_file/hash", "File-level replay evidence", "Checksum comparison"],
    ["schema_version", "Evolution management", "Must map to catalog history"],
])

schema_layers("Silver Transformation Schema", [
    ("Standardize", "Type coercion, timezone normalization, canonical naming", RGBColor(113, 127, 144)),
    ("Conform", "Reference joins, key harmonization, entity resolution", RGBColor(119, 137, 155)),
    ("Validate", "Business rules, expectations, anomaly checks", RGBColor(79, 124, 166)),
    ("Enrich", "Master/reference datasets and derived attributes", RGBColor(60, 134, 109)),
    ("Publish Silver", "Conformed tables for downstream aggregation", COLORS["silver"]),
])

flow("DLT Pipeline Topology", [
    "Bronze Stream",
    "DLT Bronze Views",
    "DLT Silver Tables",
    "Quality Expectations",
    "Metrics + Lineage",
], "Declarative dependency and quality tracking")

table_slide("Silver Quality Rules (Examples)", ["Rule", "Type", "Failure Action"], [
    ["customer_id not null", "Completeness", "Drop + alert"],
    ["account_status in set", "Validity", "Quarantine record"],
    ["txn_date <= current_date", "Reasonableness", "Warn and flag"],
    ["policy_id unique by source", "Uniqueness", "Deduplicate with survivorship"],
    ["reference code exists", "Referential", "Reject for steward remediation"],
])

bullet("Gold Layer Construction Patterns", [
    "Star schemas for reporting domains with conformed dimensions.",
    "Data products combine domain marts, KPI tables, and feature-ready sets.",
    "Certification process includes Tier-3 SLA checks before release.",
    "Semantic naming and KPI definitions synchronized with BI semantic layer.",
    "Gold outputs are controlled sources for APIs, BI, and ML features.",
])

table_slide("Gold Certification Criteria", ["Criterion", "Target", "Owner"], [
    ["CDE completeness", ">=99.5%", "Domain Data Owner"],
    ["Freshness SLA", "Within contractual window", "Data Engineering Lead"],
    ["Lineage completeness", "Source-to-KPI trace available", "Governance Office"],
    ["Policy compliance", "RLS/CLS/DDM mapped", "Security Architect"],
    ["Operational readiness", "Monitoring + runbooks approved", "Platform Operations"],
])

flow("End-to-End Medallion Release Path", [
    "Ingest",
    "Bronze Commit",
    "Silver Conformance",
    "Gold Certification",
    "Consumer Publish",
], "Release gate with evidence at each stage")
|
|
|
|
|
|
# --- Governance & lineage deep dive (slides 35-44) ---

section("Governance and Lineage", "Metadata planes and operational controls", "blue")

schema_layers("Three-Tier Governance Architecture", [
    ("Tier 1: Purview", "Enterprise catalog, glossary, classification, DQ dashboards", RGBColor(71, 120, 171)),
    ("Tier 2: Unity Catalog", "Technical grants, lineage, row/column security controls", RGBColor(46, 133, 119)),
    ("Tier 3: Manta", "Cross-platform code-level lineage and impact analysis", RGBColor(97, 111, 130)),
], "Rationalized model replacing overlapping catalog duplication")

flow("Lineage Federation Pipeline", [
    "Scan Code/Metadata",
    "Parse Dependencies",
    "Build Unified Graph",
    "Publish to Purview",
    "Expose to Engineers",
], "Databricks + ADF + SAS integration")

table_slide("Metadata Responsibilities", ["Function", "Purview", "Unity Catalog", "Manta"], [
    ["Business glossary", "Primary", "Reference", "-"],
    ["Technical object ACLs", "Reference", "Primary", "-"],
    ["Column-level lineage", "Consume", "Primary (Databricks scope)", "Aggregate across tools"],
    ["Cross-platform lineage", "Consume", "Consume", "Primary"],
    ["Steward workflows", "Primary", "Support", "-"],
])

bullet("Governance Operating Model", [
    "Central policy office defines standards, controls, and review criteria.",
    "Domain teams deliver products under mandatory architecture and DQ gates.",
    "Exceptions require formal risk acceptance and remediation timeline.",
    "Stewardship KPIs tracked for issue aging, data quality trend, and ownership.",
])

control_matrix("Data Quality Tier Controls", [
    ["Tier 1 Ingestion", "Purview sampling profiles", "Pipeline-triggered gates", "DQ scorecards + quarantine logs"],
    ["Tier 2 Transformation", "DLT expectations + GE suites", "Job fail/warn policies", "DLT event logs + alerts"],
    ["Tier 3 Certification", "SLA & semantic checks", "Publish block on threshold miss", "Certification artifacts"],
    ["Governance", "Steward review cadence", "Issue workflow SLA", "Meeting minutes + tracker"],
])

bullet("Master and Reference Data Governance", [
    "Enterprise entities managed as reusable certified products.",
    "Reference tables versioned with controlled release workflow.",
    "Survivorship logic implemented in Silver with reconciliation evidence.",
    "Downstream consumers subscribe to contract versions, not ad-hoc extracts.",
])

flow("Change Impact Assessment Workflow", [
    "Proposed Change",
    "Lineage Impact Query",
    "Consumer Risk Scoring",
    "ARB Approval",
    "Controlled Deployment",
], "Manta graph enables deterministic blast-radius analysis")

table_slide("Governance KPIs", ["KPI", "Definition", "Target"], [
    ["Certified data products", "Gold products passing Tier-3 checks", "Quarterly growth with >95% compliance"],
    ["Steward issue closure", "DQ/security issue closure within SLA", ">90% within SLA"],
    ["Lineage completeness", "Products with full source-to-consumption lineage", "100% for regulated domains"],
    ["Policy exception aging", "Open exceptions older than threshold", "<5% over 90 days"],
])
|
|
|
|
|
|
# --- Security & privacy deep dive (slides 45-56) ---

section("Security and Privacy Engineering", "Control implementation details", "gold")

schema_layers("Security Control Stack", [
    ("Identity", "Entra ID, Conditional Access, MFA, SCIM", RGBColor(64, 110, 167)),
    ("Resource Authorization", "Azure RBAC and workspace-level permissions", RGBColor(62, 132, 113)),
    ("Data Authorization", "ABAC, RLS, CLS, DDM in Unity Catalog", RGBColor(107, 121, 139)),
    ("Network", "Private endpoints, NSGs, firewall egress control", RGBColor(48, 125, 155)),
    ("Protection & Privacy", "Encryption, DLP, consent, DSAR workflows", RGBColor(154, 108, 55)),
    ("Audit", "Central logs, SIEM correlation, compliance evidence", RGBColor(82, 103, 125)),
])

table_slide("Identity and Access Patterns", ["Component", "Auth Pattern", "Notes"], [
    ["Databricks", "SAML/OAuth via Entra", "SCIM group sync to principals"],
    ["Fabric", "Native Entra integration", "Capacity and workspace RBAC"],
    ["SAS Viya", "SAML/OIDC federation", "Group claims mapped to SAS roles"],
    ["Purview", "Native Entra", "Collection-level governance roles"],
    ["ADF", "Managed identity runtime", "Authoring RBAC via Entra groups"],
])

control_matrix("Conditional Access Baseline", [
    ["MFA", "All data platform apps", "Token issuance blocked if absent", "Entra sign-in logs"],
    ["Compliant Device", "Managed endpoints only", "Access denied for non-compliant", "Intune compliance reports"],
    ["Trusted Network", "Corp/VPN/Bastion source", "Block external unknown IPs", "Conditional access outcomes"],
    ["Session Lifetime", "12h interactive max", "Forced reauthentication", "Token policy logs"],
])

bullet("ABAC Implementation in Unity Catalog", [
    "Attribute sources include role, business unit, region, and clearance class.",
    "Policies are attached to catalogs/schemas/tables via grant templates.",
    "Attribute propagation follows joiner-mover-leaver lifecycle automation.",
    "Design separates coarse RBAC from fine-grained row/column policies.",
])

table_slide("RLS Policy Examples", ["Dataset", "Filter Logic", "Applied For"], [
    ["Customer Gold", "region in user_regions()", "Regional operations teams"],
    ["Claims Gold", "business_unit = current_bu()", "P&C vs Life segmentation"],
    ["Risk Mart", "risk_role in entitlement_map()", "Second-line risk functions"],
    ["Actuarial Inputs", "project_code in user_projects()", "Model-specific SAS teams"],
])

table_slide("CLS and Masking Profiles", ["Classification", "Mask Strategy", "Illustrative Fields"], [
    ["Public/Internal", "No mask", "Reference dimensions"],
    ["Confidential", "Partial reveal", "Postal code, phone"],
    ["Restricted", "Deterministic hash/null", "SIN, account number"],
    ["Highly Sensitive", "Tokenized/proxy view only", "Health/disability fields"],
])

flow("Sensitive Query Enforcement Path", [
    "User Query",
    "UC AuthZ Check",
    "Row Filter Apply",
    "Column Mask Apply",
    "Result Delivery",
], "Policy decision stays server-side at query runtime")

bullet("Network and Exfiltration Controls", [
    "Public endpoints disabled for ADLS, Key Vault, Purview, ADF, Event Hub.",
    "All spoke egress forced through Azure Firewall Premium with FQDN rules.",
    "NSGs maintain deny-by-default and explicit service flow allowances.",
    "Fabric export and sharing controls restricted by tenant security settings.",
])

bullet("Privacy Engineering: Law 25 and PIPEDA", [
    "Consent flags are propagated as policy attributes in curated layers.",
    "DSAR workflows include discovery, extraction, and response evidence chain.",
    "Erasure requests orchestrate delete/purge with retention rule exceptions.",
    "Pseudonymization and anonymization patterns applied by data product type.",
])

control_matrix("Security Monitoring and Audit", [
    ["Access Logs", "Databricks, Fabric, SAS, ADLS", "SIEM correlation and anomaly rules", "Central log retention"],
    ["Privilege Changes", "Entra + UC grants", "Daily drift detection jobs", "Access review packs"],
    ["Policy Violations", "Azure Policy + config scans", "Automated incident ticketing", "Exception registry"],
    ["Data Exfiltration", "Firewall + DLP telemetry", "High-risk alert workflows", "Forensic packet/log evidence"],
])
|
|
|
|
|
|
# Infrastructure deep dive (57-67)
|
|
|
section("Azure Infrastructure Deep Dive","Landing zone, network, compute, and resilience","teal")
|
|
|
|
|
|
image_slide("Azure Topology Overview","/Users/oabrivard/Projects/mdp/.tmp_assets/azure_topology.png","Physical deployment baseline")
|
|
|
|
|
|
table_slide("Hub-Spoke Network Topology",["VNet","Purpose","Key Assets"],[
|
|
|
["vnet-hub-canadacentral","Shared connectivity and security","Firewall, ER Gateway, Bastion, DNS"],
|
|
|
["vnet-data-prod-cc","Production data workloads","Databricks, PEs, SQL paths"],
|
|
|
["vnet-data-nonprod-cc","Non-prod workloads","Dev/stg/sandbox workspaces"],
|
|
|
["vnet-sas-prod-cc","SAS compute isolation","AKS/VM compute + JDBC paths"],
|
|
|
["vnet-mgmt-cc","Ops and automation","CI runners, monitoring, tooling"],
|
|
|
])
|
|
|
|
|
|
# Slide: production subnet layout — (label, description, fill color) per layer.
prod_subnet_layers = [
    ("snet-dbx-host-prod (/22)", "Databricks host VMs, delegated subnet", RGBColor(66, 121, 166)),
    ("snet-dbx-container-prod (/22)", "Container network for Spark runtime", RGBColor(76, 129, 172)),
    ("snet-private-endpoints (/24)", "PE NICs for data and control services", RGBColor(49, 132, 120)),
    ("snet-sqlwarehouse-prod (/24)", "Serverless SQL connectivity config", RGBColor(90, 137, 177)),
    ("snet-adf-prod (/24)", "Integration runtime agents", RGBColor(111, 124, 141)),
    ("snet-services-prod (/24)", "Utility services and internal components", RGBColor(123, 134, 149)),
]
schema_layers("Production Subnet Schema", prod_subnet_layers)
|
|
|
|
|
|
# Slide: private endpoint / DNS zone mapping per service.
private_endpoint_rows = [
    ["ADLS Gen2", "dfs/blob", "privatelink.dfs/blob.core.windows.net"],
    ["Key Vault", "vault", "privatelink.vaultcore.azure.net"],
    ["Purview", "account/portal", "privatelink.purview*.azure.com"],
    ["Databricks", "workspace UI/API", "privatelink.azuredatabricks.net"],
    ["Event Hub", "namespace", "privatelink.servicebus.windows.net"],
    ["Data Factory", "dataFactory/portal", "privatelink.datafactory/adf.azure.com"],
]
table_slide("Private Endpoint Matrix", ["Service", "Sub-resource", "DNS Zone"], private_endpoint_rows)
|
|
|
|
|
|
# Slide: firewall rule collections (collection, rule intent, scope, evidence).
firewall_rule_rows = [
    ["Databricks Control Plane", "Allow region endpoints", "HTTPS 443 egress only", "Firewall policy logs"],
    ["Azure Core Services", "Allow identity/monitoring FQDNs", "Restricted outbound set", "Policy analytics"],
    ["Package Repositories", "Allow approved package domains", "Curated dependency access", "Change review records"],
    ["Default Deny", "Block all unmatched egress", "No implicit internet access", "Blocked flow evidence"],
]
control_matrix("Firewall Rule Collections", firewall_rule_rows)
|
|
|
|
|
|
# Slide: compute sizing per runtime class.
compute_profile_rows = [
    ["Data Engineering", "DS4/DS5 pools + autoscale", "DLT/ETL and quality workflows"],
    ["Analytics SQL", "Serverless small-medium", "Ad-hoc and dashboard acceleration"],
    ["MLOps", "CPU/GPU mixed pools", "Training, feature generation, serving"],
    ["SAS Compute Server", "E-series memory optimized", "Actuarial and risk batch"],
    ["Fabric Capacity", "F-SKU sized for BI only", "Direct Lake serving"],
]
table_slide("Compute Profiles", ["Runtime", "Sizing Pattern", "Workload"], compute_profile_rows)
|
|
|
|
|
|
# Slide: ordered DR activation steps with region footer.
dr_sequence = [
    "Detect Incident",
    "Activate DR Runbook",
    "Restore Data Paths",
    "Recover Compute",
    "Validate SLAs",
]
flow("Disaster Recovery Sequence", dr_sequence, "Primary: Canada Central | DR: Canada East")
|
|
|
|
|
|
# Slide: resilience objectives (target, definition, mechanism, evidence).
resilience_target_rows = [
    ["RPO", "Data loss tolerance by domain", "Geo-redundant storage strategy", "Backup/replication metrics"],
    ["RTO", "Service restoration target", "Automated infra and workload playbooks", "DR drill outcomes"],
    ["Critical Pipelines", "Priority recovery order", "Dependency-aware restart", "Operational logs"],
    ["Security During DR", "Policies remain enforced", "Failover with private networking", "Control attestations"],
]
control_matrix("Resilience Targets", resilience_target_rows)
|
|
|
|
|
|
# Slide: IaC pipeline stages and their promotion gates.
iac_stage_rows = [
    ["Validate", "Lint, policy checks, static analysis", "Fail-fast non-compliance"],
    ["Plan", "Deterministic diff and approvals", "Peer + architecture review"],
    ["Apply Non-Prod", "Integration and smoke tests", "Automated test pass"],
    ["Apply Prod", "Controlled rollout", "Dual approval + change window"],
]
table_slide("Infrastructure as Code and CI/CD", ["Stage", "Objective", "Gate"], iac_stage_rows)
|
|
|
|
|
|
# Performance, operations, roadmap (68-76)
section(
    "Operations, Performance, and Execution",
    "Run model and delivery plan",
    "blue",
)
|
|
|
|
|
|
# Slide: tuning levers per architectural layer.
perf_lever_rows = [
    ["Ingestion", "Incremental loading and checkpointing", "Reduced latency and retries"],
    ["Bronze/Silver", "Partitioning and file sizing", "Efficient scan and compaction"],
    ["Gold", "Materialized aggregates and caching", "Faster BI and API response"],
    ["SQL Warehouses", "Autosuspend/scaling policies", "Balanced throughput/cost"],
    ["ML Serving", "Adaptive autoscale", "SLA adherence under burst"],
]
table_slide("Performance Engineering Levers", ["Layer", "Optimization Lever", "Expected Benefit"], perf_lever_rows)
|
|
|
|
|
|
# Slide: observability controls (domain, signal, response, evidence).
observability_rows = [
    ["Pipeline Health", "Task-level metrics and SLA timers", "Alert routing by severity", "Ops dashboard + incident logs"],
    ["Data Quality", "Rule breach telemetry", "Quarantine and steward ticket", "DQ trend boards"],
    ["Cost", "Tag-based spend and anomalies", "Budget thresholds and alerts", "FinOps monthly packs"],
    ["Security", "SIEM correlation and threat alerts", "SOC incident response", "Audit evidence"],
]
control_matrix("Observability and SRE Controls", observability_rows)
|
|
|
|
|
|
# Slide: incident lifecycle stages with ops footer.
incident_stages = [
    "Detect",
    "Triage",
    "Contain",
    "Recover",
    "Postmortem",
]
flow("Incident Management Workflow", incident_stages, "Integrated with platform and security operations")
|
|
|
|
|
|
# Slide: two-column FinOps view — cost drivers vs. optimization actions.
finops_cost_drivers = [
    "Compute DBU consumption by workload class",
    "Storage growth and retention profile",
    "Fabric BI capacity utilization",
    "Network egress and private endpoint scale",
]
finops_actions = [
    "Cluster policies and auto-termination",
    "Workload scheduling and right-sizing",
    "Storage lifecycle and compaction",
    "Chargeback transparency by domain",
]
two_col("FinOps Model", "Cost Drivers", finops_cost_drivers, "Optimization Actions", finops_actions)
|
|
|
|
|
|
# Slide: three-horizon delivery roadmap.
roadmap_rows = [
    ["H1 (0-12m)", "Foundation", "Landing zone, core pipelines, governance baseline"],
    ["H2 (12-24m)", "Scale", "Domain product expansion, semantic acceleration"],
    ["H3 (24-36m)", "Optimize", "Advanced AI governance and automation depth"],
]
table_slide("Delivery Roadmap", ["Horizon", "Focus", "Milestones"], roadmap_rows)
|
|
|
|
|
|
# Slide: architecture review board gate checklist.
arb_gate_points = [
    "Gate 1: workload placement and anti-pattern checks.",
    "Gate 2: security/privacy controls and policy mapping.",
    "Gate 3: data quality, lineage, and certification readiness.",
    "Gate 4: operational readiness, SLOs, and runbook completeness.",
    "Gate 5: post-implementation evidence and KPI outcomes.",
]
bullet("Architecture Review Board Gates", arb_gate_points)
|
|
|
|
|
|
# Slide: executive decision summary with steering-committee footer.
exec_decision_rows = [
    ["Primary Platform", "Databricks-primary", "End-to-end control and reduced overlap"],
    ["BI Serving", "Fabric Direct Lake", "Cost/performance at enterprise scale"],
    ["Governance Stack", "Purview + UC + Manta", "Full business + technical lineage"],
    ["SAS Integration", "JDBC-first policy", "Preserve UC row/column enforcement"],
    ["Delivery Strategy", "Phased with ARB gates", "Risk-controlled execution"],
]
table_slide(
    "Executive Technical Decisions",
    ["Decision Point", "Option Recommended", "Technical Rationale"],
    exec_decision_rows,
    "Steering committee inputs required",
)
|
|
|
|
|
|
# Slide: closing priorities with next-action footer.
closing_points = [
    "Confirm guardrails in architecture standards and CI controls.",
    "Sequence high-value domains for Gold product certification.",
    "Harden lineage and DQ evidence for regulated reporting.",
    "Operationalize FinOps and reliability SLO dashboards.",
    "Prepare Fabric IQ evaluation backlog under strict governance.",
]
bullet("Closing: Implementation Priorities", closing_points, "Next technical action baseline")
|
|
|
|
|
|
# Final guard + write-out. Use an explicit raise instead of `assert`: asserts
# are stripped under `python -O`, which would silently disable this check.
slide_count = len(prs.slides)
if not 70 <= slide_count <= 80:
    raise RuntimeError(f"Slide count out of range: {slide_count}")

prs.save(OUTPUT)

print(f"Created {OUTPUT} with {slide_count} slides")
|