"""Build the Databricks-primary technical deep-dive deck (~76 slides).

Generates a 16:9 widescreen PowerPoint with python-pptx using a small set
of layout helpers (branded title bars, bullet cards, two-column cards,
tables, chevron flows, stacked "schema" layers) and saves it to OUTPUT.
"""

from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN
from pptx.enum.shapes import MSO_SHAPE, MSO_CONNECTOR  # MSO_CONNECTOR kept for compatibility

OUTPUT = "/Users/oabrivard/Projects/mdp/databricks_prime_tech.pptx"

prs = Presentation()
prs.slide_width = Inches(13.333)
prs.slide_height = Inches(7.5)

# Deck-wide palette; keys are referenced by name throughout the helpers.
COLORS = {
    "navy": RGBColor(11, 33, 58),
    "blue": RGBColor(0, 84, 153),
    "teal": RGBColor(0, 120, 130),
    "green": RGBColor(26, 122, 82),
    "gold": RGBColor(166, 126, 18),
    "bg": RGBColor(243, 247, 251),
    "card": RGBColor(255, 255, 255),
    "text": RGBColor(34, 46, 57),
    "muted": RGBColor(99, 113, 128),
    "line": RGBColor(211, 220, 229),
    "white": RGBColor(255, 255, 255),
    "bronze": RGBColor(166, 109, 75),
    "silver": RGBColor(126, 140, 156),
    "gold_l": RGBColor(208, 173, 71),
}

# Shared content-area geometry (EMU ints via Inches).
MARGIN = Inches(0.55)
TOP = Inches(1.18)
W = Inches(12.25)
H = Inches(5.95)


def font(run, size=16, bold=False, color=None, name="Segoe UI"):
    """Apply the deck's standard typography to a text run.

    color defaults to COLORS["text"] when None.
    """
    run.font.name = name
    run.font.size = Pt(size)
    run.font.bold = bold
    run.font.color.rgb = color or COLORS["text"]


def brand(slide, title, subtitle=None):
    """Apply the standard chrome to a content slide.

    Paints the background, a navy title bar with *title*, an optional
    muted *subtitle* line, and the confidential footer strip.
    """
    bg = slide.background
    bg.fill.solid()
    bg.fill.fore_color.rgb = COLORS["bg"]
    bar = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, 0, prs.slide_width, Inches(0.88))
    bar.fill.solid()
    bar.fill.fore_color.rgb = COLORS["navy"]
    bar.line.fill.background()
    tb = slide.shapes.add_textbox(MARGIN, Inches(0.12), Inches(10.2), Inches(0.54))
    tf = tb.text_frame
    tf.clear()
    p = tf.paragraphs[0]
    r = p.add_run()
    r.text = title
    font(r, size=23, bold=True, color=COLORS["white"])
    if subtitle:
        sb = slide.shapes.add_textbox(MARGIN, Inches(0.9), Inches(11.5), Inches(0.25))
        stf = sb.text_frame
        p = stf.paragraphs[0]
        r = p.add_run()
        r.text = subtitle
        font(r, size=11, color=COLORS["muted"])
    foot = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, Inches(7.2), prs.slide_width, Inches(0.3))
    foot.fill.solid()
    foot.fill.fore_color.rgb = COLORS["white"]
    foot.line.fill.background()
    fb = slide.shapes.add_textbox(MARGIN, Inches(7.22), Inches(9), Inches(0.2))
    p = fb.text_frame.paragraphs[0]
    r = p.add_run()
    r.text = "Greenfield | Databricks-Primary Technical Deep Dive | Internal - Confidential"
    font(r, size=9, color=COLORS["muted"])


def section(title, subtitle, accent="blue"):
    """Add a full-bleed navy section-divider slide with an accent strip."""
    s = prs.slides.add_slide(prs.slide_layouts[6])
    s.background.fill.solid()
    s.background.fill.fore_color.rgb = COLORS["navy"]
    strip = s.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, Inches(5.7), prs.slide_width, Inches(1.8))
    strip.fill.solid()
    strip.fill.fore_color.rgb = COLORS[accent]
    strip.line.fill.background()
    box = s.shapes.add_textbox(Inches(0.8), Inches(2.2), Inches(11.8), Inches(2.3))
    tf = box.text_frame
    tf.clear()
    p1 = tf.paragraphs[0]
    r1 = p1.add_run()
    r1.text = title
    font(r1, size=40, bold=True, color=COLORS["white"])
    p2 = tf.add_paragraph()
    r2 = p2.add_run()
    r2.text = subtitle
    font(r2, size=18, color=COLORS["white"])


def bullet(title, items, subtitle=None):
    """Add a branded slide with a single card of bullet *items*."""
    s = prs.slides.add_slide(prs.slide_layouts[6])
    brand(s, title, subtitle)
    b = s.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, MARGIN, TOP, W, H)
    b.fill.solid()
    b.fill.fore_color.rgb = COLORS["card"]
    b.line.color.rgb = COLORS["line"]
    t = s.shapes.add_textbox(MARGIN + Inches(0.28), TOP + Inches(0.28), W - Inches(0.56), H - Inches(0.56))
    tf = t.text_frame
    tf.clear()
    tf.word_wrap = True
    for i, it in enumerate(items):
        p = tf.paragraphs[0] if i == 0 else tf.add_paragraph()
        p.space_after = Pt(7)
        r = p.add_run()
        r.text = f"• {it}"
        font(r, size=17)


def two_col(title, left_title, left_items, right_title, right_items, subtitle=None):
    """Add a branded slide with two headed cards of bullet items."""
    s = prs.slides.add_slide(prs.slide_layouts[6])
    brand(s, title, subtitle)
    x1, x2, y, cw, ch = MARGIN, Inches(6.75), TOP, Inches(5.95), H
    # Left card gets the blue header, right card the teal one.
    for x, t, color in [(x1, left_title, COLORS["blue"]), (x2, right_title, COLORS["teal"])]:
        c = s.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, x, y, cw, ch)
        c.fill.solid()
        c.fill.fore_color.rgb = COLORS["card"]
        c.line.color.rgb = COLORS["line"]
        h = s.shapes.add_shape(MSO_SHAPE.RECTANGLE, x, y, cw, Inches(0.58))
        h.fill.solid()
        h.fill.fore_color.rgb = color
        h.line.fill.background()
        hb = s.shapes.add_textbox(x + Inches(0.2), y + Inches(0.14), cw - Inches(0.4), Inches(0.3))
        pr = hb.text_frame.paragraphs[0].add_run()
        pr.text = t
        font(pr, size=14, bold=True, color=COLORS["white"])

    def write(x, arr):
        # Bullet body below the colored card header.
        tx = s.shapes.add_textbox(x + Inches(0.2), y + Inches(0.72), cw - Inches(0.4), ch - Inches(0.9))
        tf = tx.text_frame
        tf.clear()
        tf.word_wrap = True
        for i, it in enumerate(arr):
            p = tf.paragraphs[0] if i == 0 else tf.add_paragraph()
            p.space_after = Pt(6)
            r = p.add_run()
            r.text = f"• {it}"
            font(r, size=14)

    write(x1, left_items)
    write(x2, right_items)


def table_slide(title, cols, rows, subtitle=None):
    """Add a branded slide with a table: blue header row, zebra striping."""
    s = prs.slides.add_slide(prs.slide_layouts[6])
    brand(s, title, subtitle)
    ts = s.shapes.add_table(len(rows) + 1, len(cols), MARGIN, TOP, W, H)
    t = ts.table
    for j, c in enumerate(cols):
        cell = t.cell(0, j)
        cell.text = c
        cell.fill.solid()
        cell.fill.fore_color.rgb = COLORS["blue"]
        for p in cell.text_frame.paragraphs:
            for r in p.runs:
                font(r, size=11, bold=True, color=COLORS["white"])
    for i, row in enumerate(rows, start=1):
        for j, v in enumerate(row):
            cell = t.cell(i, j)
            cell.text = v
            if i % 2 == 0:
                cell.fill.solid()
                cell.fill.fore_color.rgb = RGBColor(235, 241, 247)
            for p in cell.text_frame.paragraphs:
                for r in p.runs:
                    font(r, size=10)


def flow(title, steps, subtitle=None):
    """Add a branded slide with *steps* as left-to-right chevron-linked boxes."""
    s = prs.slides.add_slide(prs.slide_layouts[6])
    brand(s, title, subtitle)
    n = len(steps)
    gx = Inches(0.15)
    # Floor division keeps the width a whole-EMU int (python-pptx Length
    # values are ints); true division would yield a float.
    sw = (W - gx * (n - 1)) // n
    y = Inches(2.6)
    for i, st in enumerate(steps):
        x = MARGIN + i * (sw + gx)
        b = s.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, x, y, sw, Inches(1.6))
        b.fill.solid()
        b.fill.fore_color.rgb = COLORS["card"]
        b.line.color.rgb = COLORS["blue"]
        tf = b.text_frame
        tf.clear()
        p = tf.paragraphs[0]
        p.alignment = PP_ALIGN.CENTER
        r = p.add_run()
        r.text = st
        font(r, size=13, bold=True)
        if i < n - 1:
            a = s.shapes.add_shape(MSO_SHAPE.CHEVRON, x + sw - Inches(0.02), y + Inches(0.6),
                                   Inches(0.18), Inches(0.38))
            a.fill.solid()
            a.fill.fore_color.rgb = COLORS["teal"]
            a.line.fill.background()


def image_slide(title, path, subtitle=None, caption=None):
    """Add a branded slide with a full-width picture and optional caption."""
    s = prs.slides.add_slide(prs.slide_layouts[6])
    brand(s, title, subtitle)
    s.shapes.add_picture(path, MARGIN, TOP, width=W, height=H - Inches(0.15))
    if caption:
        c = s.shapes.add_textbox(MARGIN, Inches(7.0), Inches(11), Inches(0.2))
        r = c.text_frame.paragraphs[0].add_run()
        r.text = caption
        font(r, size=9, color=COLORS["muted"])


def schema_layers(title, layers, subtitle=None):
    """Add a branded slide with stacked (name, desc, color) layer bands."""
    s = prs.slides.add_slide(prs.slide_layouts[6])
    brand(s, title, subtitle)
    y = TOP + Inches(0.15)
    lx = MARGIN + Inches(0.25)
    lw = W - Inches(0.5)
    # Floor division: keep per-layer height a whole-EMU int.
    lh = (H - Inches(1.0)) // len(layers)
    for i, (name, desc, col) in enumerate(layers):
        by = y + i * lh
        b = s.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, lx, by, lw, lh - Inches(0.08))
        b.fill.solid()
        b.fill.fore_color.rgb = col
        b.line.fill.background()
        tb = s.shapes.add_textbox(lx + Inches(0.2), by + Inches(0.12), lw - Inches(0.4), lh - Inches(0.25))
        tf = tb.text_frame
        tf.clear()
        p1 = tf.paragraphs[0]
        r1 = p1.add_run()
        r1.text = name
        font(r1, size=16, bold=True, color=COLORS["white"])
        p2 = tf.add_paragraph()
        r2 = p2.add_run()
        r2.text = desc
        font(r2, size=12, color=COLORS["white"])


def nfr_radar_like(title, items, subtitle=None):
    """Add a radar-style NFR slide: concentric rings plus (label, val, color)
    bars where bar width scales with *val* (0.06 inch per point)."""
    s = prs.slides.add_slide(prs.slide_layouts[6])
    brand(s, title, subtitle)
    cx, cy = Inches(4.3), Inches(4.1)
    radii = [Inches(0.8), Inches(1.3), Inches(1.8), Inches(2.3)]
    for r in radii:
        circ = s.shapes.add_shape(MSO_SHAPE.OVAL, cx - r, cy - r, 2 * r, 2 * r)
        circ.fill.background()
        circ.line.color.rgb = RGBColor(207, 217, 228)
    for i, (label, val, col) in enumerate(items):
        y = Inches(1.6) + i * Inches(0.68)
        bx = Inches(8.0)
        bar = s.shapes.add_shape(MSO_SHAPE.RECTANGLE, bx, y, Inches(0.06 * val), Inches(0.38))
        bar.fill.solid()
        bar.fill.fore_color.rgb = col
        bar.line.fill.background()
        txt = s.shapes.add_textbox(Inches(6.3), y, Inches(1.6), Inches(0.38))
        tr = txt.text_frame.paragraphs[0].add_run()
        tr.text = label
        font(tr, size=12)
    title_b = s.shapes.add_textbox(Inches(1.2), Inches(1.3), Inches(4.7), Inches(0.6))
    rr = title_b.text_frame.paragraphs[0].add_run()
    rr.text = "NFR Maturity Snapshot"
    font(rr, size=20, bold=True, color=COLORS["blue"])


def control_matrix(title, rows, subtitle=None):
    """Shortcut for a control-domain table with the standard four columns."""
    table_slide(title, ["Control Domain", "Design", "Runtime Enforcement", "Evidence"], rows, subtitle)


# Intro + context (1-8)
s = prs.slides.add_slide(prs.slide_layouts[6])
s.background.fill.solid()
s.background.fill.fore_color.rgb = COLORS["navy"]
hero = s.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, Inches(4.85), prs.slide_width, Inches(2.65))
hero.fill.solid()
hero.fill.fore_color.rgb = COLORS["blue"]
hero.line.fill.background()
b = s.shapes.add_textbox(Inches(0.85), Inches(0.95), Inches(11.8), Inches(2.9))
tf = b.text_frame
tf.clear()
r = tf.paragraphs[0].add_run()
r.text = "Greenfield Modern Data Platform"
font(r, size=44, bold=True, color=COLORS["white"])
p = tf.add_paragraph()
r2 = p.add_run()
r2.text = "Databricks-Primary Technical Deep Dive (v8.0)"
font(r2, size=27, color=COLORS["white"])
p3 = tf.add_paragraph()
r3 = p3.add_run()
r3.text = "Architecture, Infrastructure, Security, and Delivery Mechanics"
font(r3, size=15, color=COLORS["white"])

bullet("Technical Agenda", [
    "Architecture boundaries and workload placement",
    "Medallion implementation details (ingestion to serving)",
    "Governance, lineage, and data quality controls",
    "Security/privacy implementation patterns",
    "Azure infrastructure, network, and operations",
    "Performance, reliability, FinOps, and DevSecOps",
    "Execution roadmap and architecture decision gates",
], "76-slide engineering and architecture walkthrough")

bullet("Scope and Assumptions", [
    "Primary architecture option: Databricks-primary v8.0.",
    "Fabric scope constrained to BI serving via Direct Lake.",
    "SAS Viya retained for regulated actuarial/risk use cases.",
    "Canadian-only residency: Canada Central (prod), Canada East (DR).",
    "Control baseline aligned to AMF, OSFI, Law 25, and PIPEDA.",
])

two_col("Business and Technical Objectives", "Business Outcomes", [
    "Trusted enterprise data products",
    "Faster analytics and AI delivery cycle",
    "Regulatory-grade traceability and evidence",
    "Platform simplification and controlled spend",
], "Technical Outcomes", [
    "Single Delta substrate with governed access",
    "Composable medallion pipelines",
    "Lineage and DQ from source to report",
    "Automated policy and IaC enforcement",
])

bullet("Architecture Principles (Operationalized)", [
    "Data as Product: ownership, contracts, certification, SLAs.",
    "Security by Design: identity-first, zero trust, fine-grained controls.",
    "Federated Execution: centralized guardrails, domain delivery autonomy.",
    "Open Standards: Delta/Parquet, APIs, metadata portability.",
    "Right Workload on Right Platform: explicit anti-overlap boundaries.",
])

flow("Decision Logic: Workload Placement", [
    "Identify\nworkload type",
    "Map to\nplatform capability",
    "Apply security\nconstraints",
    "Validate cost\nprofile",
    "Approve via\nARB gate",
])

table_slide("Key Architecture Decisions (Condensed)", ["AD", "Decision", "Technical Consequence"], [
    ["AD-01", "Delta as canonical format", "Unified storage semantics and replay"],
    ["AD-02", "Databricks primary", "Consolidated ETL/SQL/ML runtime"],
    ["AD-03", "Fabric BI-only", "Direct Lake at enterprise scale"],
    ["AD-04", "SAS Compute Server", "Sequential regulated compute path"],
    ["AD-05", "Purview+UC+Manta", "Multi-plane governance + lineage"],
    ["AD-06", "Canada-only regions", "Policy-enforced residency"],
    ["AD-07", "ADLS shared substrate", "Cross-platform non-duplicative access"],
])

nfr_radar_like("Non-Functional Priorities", [
    ("Security", 9, COLORS["teal"]),
    ("Reliability", 8, COLORS["blue"]),
    ("Performance", 8, COLORS["green"]),
    ("Cost Efficiency", 7, COLORS["gold"]),
    ("Operability", 8, COLORS["silver"]),
], "Relative target maturity by design intent")

# Logical architecture (9-20)
section("Logical Architecture", "Layer model, boundaries, and interactions", "teal")

schema_layers("Seven-Layer Reference Model", [
    ("1. Ingestion", "Batch, CDC, streaming, APIs; provenance and control metadata", RGBColor(36, 93, 140)),
    ("2. Bronze", "Immutable raw Delta persistence with replay capability", COLORS["bronze"]),
    ("3. Silver", "Conformance, quality, survivorship, and enrichment", COLORS["silver"]),
    ("4. Gold", "Business-ready products, marts, aggregates, feature sets", COLORS["gold_l"]),
    ("5. Semantic", "Business vocabulary and certified KPI definitions", RGBColor(71, 118, 167)),
    ("6. Serving", "BI, SQL, APIs, SAS, and model consumption endpoints", RGBColor(38, 145, 126)),
    ("7. AI/ML", "Experimentation, training, deployment, and monitoring", RGBColor(29, 102, 79)),
], "Layering aligned to DAMA-DMBOK and EDM-DCAM")

table_slide("Cross-Cutting Planes", ["Plane", "Responsibilities", "Primary Systems"], [
    ["Governance", "Catalog, glossary, lineage, quality, stewardship", "Purview, Unity Catalog, Manta"],
    ["Security & Privacy", "Identity, ABAC/RLS/CLS/DDM, DLP, consent", "Entra ID, UC, Purview DLP"],
    ["Operations", "Monitoring, SLAs, incident, runbook automation", "Azure Monitor, Databricks Workflows"],
])

table_slide("Capability Mapping by Platform", ["Capability", "Databricks", "Fabric", "SAS Viya"], [
    ["ETL/ELT", "Primary", "Restricted", "Targeted"],
    ["Warehousing", "Primary", "Serving-only", "-"],
    ["BI", "Secondary", "Primary", "Targeted"],
    ["ML/AI", "Primary", "-", "Specialized"],
    ["Lineage", "Contributing", "Contributing", "Contributing via Manta"],
    ["Semantic Intelligence", "Contributing", "Evaluate H2/H3", "-"],
])

flow("Inter-Platform Data Flow", [
    "Sources",
    "ADF / Auto Loader",
    "ADLS Delta",
    "Databricks Bronze/Silver/Gold",
    "Fabric Direct Lake + SAS JDBC",
], "No duplicated warehouse copies")

image_slide("Medallion Architecture (Reference Diagram)",
            "/Users/oabrivard/Projects/mdp/.tmp_assets/medallion1.png",
            "End-to-end governed flow")
image_slide("Medallion Architecture (Detailed Variant)",
            "/Users/oabrivard/Projects/mdp/.tmp_assets/medallion2.png",
            "Alternative technical rendering")

schema_layers("Fabric Scope Boundary (Allowed vs Prohibited)", [
    ("Allowed", "Power BI models, reports, Direct Lake, minimal Dataflows Gen2", RGBColor(35, 129, 88)),
    ("Conditionally Allowed", "Eventhouse/KQL for targeted operational dashboards", RGBColor(61, 138, 170)),
    ("Prohibited", "Parallel Fabric warehouse/lakehouse and duplicated ETL", RGBColor(170, 81, 66)),
    ("Prohibited", "Fabric Data Factory as primary orchestrator", RGBColor(170, 81, 66)),
    ("Prohibited", "Fabric notebooks replacing Databricks engineering runtime", RGBColor(170, 81, 66)),
], "Architecture guardrail enforcement required")

bullet("Direct Lake Rationale at Enterprise Scale", [
    "Power BI DirectQuery against Databricks SQL at 55k users is cost-latency intensive.",
    "Direct Lake minimizes per-query round trips while preserving freshness.",
    "Fabric’s role is economic BI serving optimization, not data platform duplication.",
    "Capacity governance prevents expansion into unauthorized engineering use.",
])

flow("Logical Request Path: Analyst to Governed Data", [
    "User Auth via Entra",
    "Semantic Model",
    "Direct Lake / SQL",
    "UC Policy Evaluation",
    "Delta Read + Mask",
], "Security and governance remain in-path")

two_col("Technical Debt Avoidance", "Anti-Patterns to Block", [
    "Shadow ETL in Fabric",
    "Unmanaged extracts to local marts",
    "Direct ADLS sensitive access from unmanaged runtimes",
    "Unversioned security policies",
], "Mitigation Mechanisms", [
    "ARB design gate + exception workflow",
    "Policy-as-code + IaC scanning",
    "Centralized service principal governance",
    "Continuous lineage and DQ evidence",
])

# Medallion deep dive (21-34)
section("Medallion Implementation", "Ingestion, transformations, and productization mechanics", "gold")

schema_layers("Source and Ingestion Schema", [
    ("Sources", "Core banking, insurance, cards, CRM, digital channels, external feeds", RGBColor(74, 115, 154)),
    ("Ingestion Runtime", "ADF, Auto Loader, Structured Streaming, API pulls", RGBColor(40, 122, 138)),
    ("Staging + Audit", "Landing, provenance metadata, extraction checksums", RGBColor(86, 130, 170)),
    ("DQ Pre-Gate", "Purview sampling, structural checks, anomaly thresholds", RGBColor(170, 120, 57)),
    ("Bronze Commit", "Append-only Delta commit with replay metadata", COLORS["bronze"]),
])

table_slide("Ingestion Patterns and Design", ["Pattern", "Mechanism", "Control Concerns", "Typical Domains"], [
    ["Batch Files", "Auto Loader cloudFiles", "Schema drift, duplicate drops", "Partners, regulatory feeds"],
    ["Batch DB", "ADF Copy + watermark/CDC", "Latency window, idempotency", "Core banking, policy admin"],
    ["Near Real-time", "Debezium/Event Hub/Streaming", "Exactly-once semantics", "Transactions, status updates"],
    ["Streaming", "Event Hub + SS", "Backpressure, ordering", "Fraud and telemetry"],
    ["API", "ADF Web / notebook clients", "Rate limits, retry strategies", "Market and bureau data"],
    ["SAS Native", "JDBC/authorized ADLS", "RLS bypass risk on ADLS path", "Actuarial inputs"],
])

flow("Pre-Bronze Quality Gate Sequence", [
    "Ingestion Complete",
    "Purview DQ Trigger",
    "Sample & Profile",
    "Threshold Decision",
    "Promote or Quarantine",
], "Tier-1 control before immutable storage")

table_slide("Sampling and Threshold Model", ["Domain Type", "Sample Strategy", "Blocking Criteria"], [
    ["Critical Data Elements", "10-20% (or 100% low-volume)", "Null/format/referential failure"],
    ["Non-CDE Attributes", "1-5%", "Structural anomaly warnings"],
    ["Streaming Windows", "Every 1000 records or 5 min", "Volume or schema spikes"],
    ["All Loads", "Row-count baseline checks", "±50% anomaly triggers quarantine"],
])

bullet("Bronze Layer Engineering Details", [
    "Storage layout by source/entity with deterministic partitioning strategy.",
    "Minimal transformation: metadata columns and type normalization only.",
    "Append-only contracts enable full replay and forensic analysis.",
    "Time travel retention baseline 90 days with domain overrides.",
    "Quarantine tables maintain failed payloads for root-cause analysis.",
])

table_slide("Bronze Data Contracts", ["Contract Field", "Purpose", "Validation"], [
    ["source_system", "Trace producer lineage", "Mandatory, controlled vocabulary"],
    ["ingested_at", "Pipeline timing", "UTC timestamp required"],
    ["batch_id", "Reconciliation and idempotency", "Uniqueness within source/entity"],
    ["source_file/hash", "File-level replay evidence", "Checksum comparison"],
    ["schema_version", "Evolution management", "Must map to catalog history"],
])

schema_layers("Silver Transformation Schema", [
    ("Standardize", "Type coercion, timezone normalization, canonical naming", RGBColor(113, 127, 144)),
    ("Conform", "Reference joins, key harmonization, entity resolution", RGBColor(119, 137, 155)),
    ("Validate", "Business rules, expectations, anomaly checks", RGBColor(79, 124, 166)),
    ("Enrich", "Master/reference datasets and derived attributes", RGBColor(60, 134, 109)),
    ("Publish Silver", "Conformed tables for downstream aggregation", COLORS["silver"]),
])

flow("DLT Pipeline Topology", [
    "Bronze Stream",
    "DLT Bronze Views",
    "DLT Silver Tables",
    "Quality Expectations",
    "Metrics + Lineage",
], "Declarative dependency and quality tracking")

table_slide("Silver Quality Rules (Examples)", ["Rule", "Type", "Failure Action"], [
    ["customer_id not null", "Completeness", "Drop + alert"],
    ["account_status in set", "Validity", "Quarantine record"],
    ["txn_date <= current_date", "Reasonableness", "Warn and flag"],
    ["policy_id unique by source", "Uniqueness", "Deduplicate with survivorship"],
    ["reference code exists", "Referential", "Reject for steward remediation"],
])

bullet("Gold Layer Construction Patterns", [
    "Star schemas for reporting domains with conformed dimensions.",
    "Data products combine domain marts, KPI tables, and feature-ready sets.",
    "Certification process includes Tier-3 SLA checks before release.",
    "Semantic naming and KPI definitions synchronized with BI semantic layer.",
    "Gold outputs are controlled sources for APIs, BI, and ML features.",
])

table_slide("Gold Certification Criteria", ["Criterion", "Target", "Owner"], [
    ["CDE completeness", ">=99.5%", "Domain Data Owner"],
    ["Freshness SLA", "Within contractual window", "Data Engineering Lead"],
    ["Lineage completeness", "Source-to-KPI trace available", "Governance Office"],
    ["Policy compliance", "RLS/CLS/DDM mapped", "Security Architect"],
    ["Operational readiness", "Monitoring + runbooks approved", "Platform Operations"],
])

flow("End-to-End Medallion Release Path", [
    "Ingest",
    "Bronze Commit",
    "Silver Conformance",
    "Gold Certification",
    "Consumer Publish",
], "Release gate with evidence at each stage")

# Governance & lineage deep dive (35-44)
section("Governance and Lineage", "Metadata planes and operational controls", "blue")

schema_layers("Three-Tier Governance Architecture", [
    ("Tier 1: Purview", "Enterprise catalog, glossary, classification, DQ dashboards", RGBColor(71, 120, 171)),
    ("Tier 2: Unity Catalog", "Technical grants, lineage, row/column security controls", RGBColor(46, 133, 119)),
    ("Tier 3: Manta", "Cross-platform code-level lineage and impact analysis", RGBColor(97, 111, 130)),
], "Rationalized model replacing overlapping catalog duplication")

flow("Lineage Federation Pipeline", [
    "Scan Code/Metadata",
    "Parse Dependencies",
    "Build Unified Graph",
    "Publish to Purview",
    "Expose to Engineers",
], "Databricks + ADF + SAS integration")

table_slide("Metadata Responsibilities", ["Function", "Purview", "Unity Catalog", "Manta"], [
    ["Business glossary", "Primary", "Reference", "-"],
    ["Technical object ACLs", "Reference", "Primary", "-"],
    ["Column-level lineage", "Consume", "Primary (Databricks scope)", "Aggregate across tools"],
    ["Cross-platform lineage", "Consume", "Consume", "Primary"],
    ["Steward workflows", "Primary", "Support", "-"],
])

bullet("Governance Operating Model", [
    "Central policy office defines standards, controls, and review criteria.",
    "Domain teams deliver products under mandatory architecture and DQ gates.",
    "Exceptions require formal risk acceptance and remediation timeline.",
    "Stewardship KPIs tracked for issue aging, data quality trend, and ownership.",
])

control_matrix("Data Quality Tier Controls", [
    ["Tier 1 Ingestion", "Purview sampling profiles", "Pipeline-triggered gates", "DQ scorecards + quarantine logs"],
    ["Tier 2 Transformation", "DLT expectations + GE suites", "Job fail/warn policies", "DLT event logs + alerts"],
    ["Tier 3 Certification", "SLA & semantic checks", "Publish block on threshold miss", "Certification artifacts"],
    ["Governance", "Steward review cadence", "Issue workflow SLA", "Meeting minutes + tracker"],
])

bullet("Master and Reference Data Governance", [
    "Enterprise entities managed as reusable certified products.",
    "Reference tables versioned with controlled release workflow.",
    "Survivorship logic implemented in Silver with reconciliation evidence.",
    "Downstream consumers subscribe to contract versions, not ad-hoc extracts.",
])

flow("Change Impact Assessment Workflow", [
    "Proposed Change",
    "Lineage Impact Query",
    "Consumer Risk Scoring",
    "ARB Approval",
    "Controlled Deployment",
], "Manta graph enables deterministic blast-radius analysis")

table_slide("Governance KPIs", ["KPI", "Definition", "Target"], [
    ["Certified data products", "Gold products passing Tier-3 checks", "Quarterly growth with >95% compliance"],
    ["Steward issue closure", "DQ/security issue closure within SLA", ">90% within SLA"],
    ["Lineage completeness", "Products with full source-to-consumption lineage", "100% for regulated domains"],
    ["Policy exception aging", "Open exceptions older than threshold", "<5% over 90 days"],
])

# Security & privacy deep dive (45-56)
section("Security and Privacy Engineering", "Control implementation details", "gold")

schema_layers("Security Control Stack", [
    ("Identity", "Entra ID, Conditional Access, MFA, SCIM", RGBColor(64, 110, 167)),
    ("Resource Authorization", "Azure RBAC and workspace-level permissions", RGBColor(62, 132, 113)),
    ("Data Authorization", "ABAC, RLS, CLS, DDM in Unity Catalog", RGBColor(107, 121, 139)),
    ("Network", "Private endpoints, NSGs, firewall egress control", RGBColor(48, 125, 155)),
    ("Protection & Privacy", "Encryption, DLP, consent, DSAR workflows", RGBColor(154, 108, 55)),
    ("Audit", "Central logs, SIEM correlation, compliance evidence", RGBColor(82, 103, 125)),
])

table_slide("Identity and Access Patterns", ["Component", "Auth Pattern", "Notes"], [
    ["Databricks", "SAML/OAuth via Entra", "SCIM group sync to principals"],
    ["Fabric", "Native Entra integration", "Capacity and workspace RBAC"],
    ["SAS Viya", "SAML/OIDC federation", "Group claims mapped to SAS roles"],
    ["Purview", "Native Entra", "Collection-level governance roles"],
    ["ADF", "Managed identity runtime", "Authoring RBAC via Entra groups"],
])

control_matrix("Conditional Access Baseline", [
    ["MFA", "All data platform apps", "Token issuance blocked if absent", "Entra sign-in logs"],
    ["Compliant Device", "Managed endpoints only", "Access denied for non-compliant", "Intune compliance reports"],
    ["Trusted Network", "Corp/VPN/Bastion source", "Block external unknown IPs", "Conditional access outcomes"],
    ["Session Lifetime", "12h interactive max", "Forced reauthentication", "Token policy logs"],
])

bullet("ABAC Implementation in Unity Catalog", [
    "Attribute sources include role, business unit, region, and clearance class.",
    "Policies are attached to catalogs/schemas/tables via grant templates.",
    "Attribute propagation follows joiner-mover-leaver lifecycle automation.",
    "Design separates coarse RBAC from fine-grained row/column policies.",
])

table_slide("RLS Policy Examples", ["Dataset", "Filter Logic", "Applied For"], [
    ["Customer Gold", "region in user_regions()", "Regional operations teams"],
    ["Claims Gold", "business_unit = current_bu()", "P&C vs Life segmentation"],
    ["Risk Mart", "risk_role in entitlement_map()", "Second-line risk functions"],
    ["Actuarial Inputs", "project_code in user_projects()", "Model-specific SAS teams"],
])

table_slide("CLS and Masking Profiles", ["Classification", "Mask Strategy", "Illustrative Fields"], [
    ["Public/Internal", "No mask", "Reference dimensions"],
    ["Confidential", "Partial reveal", "Postal code, phone"],
    ["Restricted", "Deterministic hash/null", "SIN, account number"],
    ["Highly Sensitive", "Tokenized/proxy view only", "Health/disability fields"],
])

flow("Sensitive Query Enforcement Path", [
    "User Query",
    "UC AuthZ Check",
    "Row Filter Apply",
    "Column Mask Apply",
    "Result Delivery",
], "Policy decision stays server-side at query runtime")

bullet("Network and Exfiltration Controls", [
    "Public endpoints disabled for ADLS, Key Vault, Purview, ADF, Event Hub.",
    "All spoke egress forced through Azure Firewall Premium with FQDN rules.",
    "NSGs maintain deny-by-default and explicit service flow allowances.",
    "Fabric export and sharing controls restricted by tenant security settings.",
])

bullet("Privacy Engineering: Law 25 and PIPEDA", [
    "Consent flags are propagated as policy attributes in curated layers.",
    "DSAR workflows include discovery, extraction, and response evidence chain.",
    "Erasure requests orchestrate delete/purge with retention rule exceptions.",
    "Pseudonymization and anonymization patterns applied by data product type.",
])

control_matrix("Security Monitoring and Audit", [
    ["Access Logs", "Databricks, Fabric, SAS, ADLS", "SIEM correlation and anomaly rules", "Central log retention"],
    ["Privilege Changes", "Entra + UC grants", "Daily drift detection jobs", "Access review packs"],
    ["Policy Violations", "Azure Policy + config scans", "Automated incident ticketing", "Exception registry"],
    ["Data Exfiltration", "Firewall + DLP telemetry", "High-risk alert workflows", "Forensic packet/log evidence"],
])

# Infrastructure deep dive (57-67)
section("Azure Infrastructure Deep Dive", "Landing zone, network, compute, and resilience", "teal")

image_slide("Azure Topology Overview",
            "/Users/oabrivard/Projects/mdp/.tmp_assets/azure_topology.png",
            "Physical deployment baseline")

table_slide("Hub-Spoke Network Topology", ["VNet", "Purpose", "Key Assets"], [
    ["vnet-hub-canadacentral", "Shared connectivity and security", "Firewall, ER Gateway, Bastion, DNS"],
    ["vnet-data-prod-cc", "Production data workloads", "Databricks, PEs, SQL paths"],
    ["vnet-data-nonprod-cc", "Non-prod workloads", "Dev/stg/sandbox workspaces"],
    ["vnet-sas-prod-cc", "SAS compute isolation", "AKS/VM compute + JDBC paths"],
    ["vnet-mgmt-cc", "Ops and automation", "CI runners, monitoring, tooling"],
])

schema_layers("Production Subnet Schema", [
    ("snet-dbx-host-prod (/22)", "Databricks host VMs, delegated subnet", RGBColor(66, 121, 166)),
    ("snet-dbx-container-prod (/22)", "Container network for Spark runtime", RGBColor(76, 129, 172)),
    ("snet-private-endpoints (/24)", "PE NICs for data and control services", RGBColor(49, 132, 120)),
    ("snet-sqlwarehouse-prod (/24)", "Serverless SQL connectivity config", RGBColor(90, 137, 177)),
    ("snet-adf-prod (/24)", "Integration runtime agents", RGBColor(111, 124, 141)),
    ("snet-services-prod (/24)", "Utility services and internal components", RGBColor(123, 134, 149)),
])

table_slide("Private Endpoint Matrix", ["Service", "Sub-resource", "DNS Zone"], [
    ["ADLS Gen2", "dfs/blob", "privatelink.dfs/blob.core.windows.net"],
    ["Key Vault", "vault", "privatelink.vaultcore.azure.net"],
    ["Purview", "account/portal", "privatelink.purview*.azure.com"],
    ["Databricks", "workspace UI/API", "privatelink.azuredatabricks.net"],
    ["Event Hub", "namespace", "privatelink.servicebus.windows.net"],
    ["Data Factory", "dataFactory/portal", "privatelink.datafactory/adf.azure.com"],
])

control_matrix("Firewall Rule Collections", [
    ["Databricks Control Plane", "Allow region endpoints", "HTTPS 443 egress only", "Firewall policy logs"],
    ["Azure Core Services", "Allow identity/monitoring FQDNs", "Restricted outbound set", "Policy analytics"],
    ["Package Repositories", "Allow approved package domains", "Curated dependency access", "Change review records"],
    ["Default Deny", "Block all unmatched egress", "No implicit internet access", "Blocked flow evidence"],
])

table_slide("Compute Profiles", ["Runtime", "Sizing Pattern", "Workload"], [
    ["Data Engineering", "DS4/DS5 pools + autoscale", "DLT/ETL and quality workflows"],
    ["Analytics SQL", "Serverless small-medium", "Ad-hoc and dashboard acceleration"],
    ["MLOps", "CPU/GPU mixed pools", "Training, feature generation, serving"],
    ["SAS Compute Server", "E-series memory optimized", "Actuarial and risk batch"],
    ["Fabric Capacity", "F-SKU sized for BI only", "Direct Lake serving"],
])

flow("Disaster Recovery Sequence", [
    "Detect Incident",
    "Activate DR Runbook",
    "Restore Data Paths",
    "Recover Compute",
    "Validate SLAs",
], "Primary: Canada Central | DR: Canada East")

control_matrix("Resilience Targets", [
    ["RPO", "Data loss tolerance by domain", "Geo-redundant storage strategy", "Backup/replication metrics"],
    ["RTO", "Service restoration target", "Automated infra and workload playbooks", "DR drill outcomes"],
    ["Critical Pipelines", "Priority recovery order", "Dependency-aware restart", "Operational logs"],
    ["Security During DR", "Policies remain enforced", "Failover with private networking", "Control attestations"],
])

table_slide("Infrastructure as Code and CI/CD", ["Stage", "Objective", "Gate"], [
    ["Validate", "Lint, policy checks, static analysis", "Fail-fast non-compliance"],
    ["Plan", "Deterministic diff and approvals", "Peer + architecture review"],
    ["Apply Non-Prod", "Integration and smoke tests", "Automated test pass"],
    ["Apply Prod", "Controlled rollout", "Dual approval + change window"],
])

# Performance, operations, roadmap (68-76)
section("Operations, Performance, and Execution", "Run model and delivery plan", "blue")

table_slide("Performance Engineering Levers", ["Layer", "Optimization Lever", "Expected Benefit"], [
    ["Ingestion", "Incremental loading and checkpointing", "Reduced latency and retries"],
    ["Bronze/Silver", "Partitioning and file sizing", "Efficient scan and compaction"],
    ["Gold", "Materialized aggregates and caching", "Faster BI and API response"],
    ["SQL Warehouses", "Autosuspend/scaling policies", "Balanced throughput/cost"],
    ["ML Serving", "Adaptive autoscale", "SLA adherence under burst"],
])

control_matrix("Observability and SRE Controls", [
    ["Pipeline Health", "Task-level metrics and SLA timers", "Alert routing by severity", "Ops dashboard + incident logs"],
    ["Data Quality", "Rule breach telemetry", "Quarantine and steward ticket", "DQ trend boards"],
    ["Cost", "Tag-based spend and anomalies", "Budget thresholds and alerts", "FinOps monthly packs"],
    ["Security", "SIEM correlation and threat alerts", "SOC incident response", "Audit evidence"],
])

flow("Incident Management Workflow", [
    "Detect",
    "Triage",
    "Contain",
    "Recover",
    "Postmortem",
], "Integrated with platform and security operations")

two_col("FinOps Model", "Cost Drivers", [
    "Compute DBU consumption by workload class",
    "Storage growth and retention profile",
    "Fabric BI capacity utilization",
    "Network egress and private endpoint scale",
], "Optimization Actions", [
    "Cluster policies and auto-termination",
    "Workload scheduling and right-sizing",
    "Storage lifecycle and compaction",
    "Chargeback transparency by domain",
])

table_slide("Delivery Roadmap", ["Horizon", "Focus", "Milestones"], [
    ["H1 (0-12m)", "Foundation", "Landing zone, core pipelines, governance baseline"],
    ["H2 (12-24m)", "Scale", "Domain product expansion, semantic acceleration"],
    ["H3 (24-36m)", "Optimize", "Advanced AI governance and automation depth"],
])

bullet("Architecture Review Board Gates", [
    "Gate 1: workload placement and anti-pattern checks.",
    "Gate 2: security/privacy controls and policy mapping.",
    "Gate 3: data quality, lineage, and certification readiness.",
    "Gate 4: operational readiness, SLOs, and runbook completeness.",
    "Gate 5: post-implementation evidence and KPI outcomes.",
])

table_slide("Executive Technical Decisions", ["Decision Point", "Option Recommended", "Technical Rationale"], [
    ["Primary Platform", "Databricks-primary", "End-to-end control and reduced overlap"],
    ["BI Serving", "Fabric Direct Lake", "Cost/performance at enterprise scale"],
    ["Governance Stack", "Purview + UC + Manta", "Full business + technical lineage"],
    ["SAS Integration", "JDBC-first policy", "Preserve UC row/column enforcement"],
    ["Delivery Strategy", "Phased with ARB gates", "Risk-controlled execution"],
], "Steering committee inputs required")

bullet("Closing: Implementation Priorities", [
    "Confirm guardrails in architecture standards and CI controls.",
    "Sequence high-value domains for Gold product certification.",
    "Harden lineage and DQ evidence for regulated reporting.",
    "Operationalize FinOps and reliability SLO dashboards.",
    "Prepare Fabric IQ evaluation backlog under strict governance.",
], "Next technical action baseline")

# Explicit check (not `assert`: asserts are stripped under `python -O`).
if not 70 <= len(prs.slides) <= 80:
    raise RuntimeError(f"Slide count out of range: {len(prs.slides)}")
prs.save(OUTPUT)
print(f"Created {OUTPUT} with {len(prs.slides)} slides")