From a5e6cf2ac0aca1f788347fb6237f55bcf05cb40d Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Wed, 25 Mar 2026 09:55:24 +0100
Subject: [PATCH] docs: update algorithm docs and generation time estimate

Update algorithm.md to reflect the rewritten per-article classify/summarize
pipeline (no batch classification, no rewrite pass). Update generation time
estimate from 1 minute to 10 minutes in frontend i18n and docs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docs/algorithm.md                            | 156 +++++--------------
 docs/implementation-plan/03-frontend-plan.md |   2 +-
 frontend/src/i18n/fr.ts                      |   2 +-
 3 files changed, 40 insertions(+), 120 deletions(-)
diff --git a/docs/algorithm.md b/docs/algorithm.md
index 51f2ff6..6829034 100644
--- a/docs/algorithm.md
+++ b/docs/algorithm.md
@@ -4,13 +4,13 @@
 
 1. **Load user settings** from DB (categories, provider, models, max_items, etc.)
 2. **Cleanup** — delete old article history entries (>N days, dropped only) + truncate old LLM call logs
-3. **Validate** — fail if no categories configured
+3. **Validate** — if no categories are configured, only the default category "Autre" is used.
 4. **Load user sources** (personalized URLs like `https://openai.com/blog`)
 5. **Resolve LLM provider** — decrypt user's API key, create provider instance (`Arc<dyn LlmProvider>`)
 6. **Resolve models** — research model + writing model (user override or admin default)
 7. **Setup rate limiter** — per-user or global provider limiter
 8. **Prepare LLM scraping option** — if `use_llm_for_article_extraction` enabled, clone provider+model for concurrent use
-9. **Initialize tracking structures** — `filled_counts` (per-category article count), `all_scraped` (category→articles), `all_overflow` (dropped overflow), `seen_urls` (cross-phase dedup), classification categories (user categories + "Autre")
+9. **Initialize tracking structures** — `article_scraped` (category→articles), `source_counts` (per-source article count), `url_source` (per-article source), `filled_counts` (per-category article count), `seen_urls` (cross-phase dedup), classification categories (user categories + "Autre")
 
 ---
 
@@ -18,54 +18,36 @@
 
 **Skipped entirely if user has 0 sources.**
 
-### 1a. Extract article links from source pages
-
-- For each source (max 10), fetch the source page HTML
-- If `use_llm_for_source_links` enabled: send HTML `<head>` + first 8000 chars of `<body>` to LLM → extract article URLs (falls back to heuristic if LLM fails)
-- Otherwise: parse HTML `<a href>` links, filter by same-domain, non-homepage path, exclude `/tag/`, `/login/`, static assets, etc.
-- Over-fetch: `2 × max_articles_per_source` candidates per source
-- Deduplicate candidate URLs
-
-### 1b. Scrape candidate articles
-
-- Fetch each URL (bounded concurrency: 5 with LLM extraction, 10 without)
-- SSRF check (no private IPs), 15s timeout, 5MB body limit
-- If `use_llm_for_article_extraction` enabled: send `<head>` + body text to LLM → extract title, date, body, error detection (falls back to heuristic if LLM fails)
-- Otherwise: HTML parsing heuristics for title (`<title>`, `og:title`), date (meta tags, JSON-LD, `<time>`), body (strip scripts/nav), soft-404 detection
-- Capture final URL after redirects (canonical URL)
-
-### 1c. Filter empty content
-
-- Remove articles where scraped body text is empty (scrape failure, soft 404, too old)
-- Trace dropped articles in `article_history` with `status: filtered_empty`
-
-### 1d. Filter against article history
-
-- Hash each URL (normalized: lowercase, strip fragments/UTM params/trailing slashes)
-- Query `article_history` for existing hashes → remove matches
-- Trace dropped articles with `status: filtered_history`
-
-### 1e. Retry if under-filled
-
-- If valid articles < `categories × max_items_per_category` and history is enabled
-- Re-scrape source pages for NEW links (exclude already-fetched URLs)
-- Scrape + filter empty + filter history on retry candidates
-- Merge with existing valid articles
-- Only 1 retry attempt
-
-### 1f. LLM classification
-
-- Send articles (title + first 500 chars of body) + categories + "Autre" to LLM
-- LLM returns `{assignments: [{index, category}]}` mapping each article to a category
-- Overflow: articles that exceed both target category AND "Autre" limits → collected in `all_overflow`
-- **LLM call logged** with full prompt/response/timing
-
-### 1g. Enforce source diversity
-
-- Count domains across all categories
-- Remove articles where domain exceeds `max_articles_per_source`
-- Trace dropped articles with `status: filtered_diversity`
-- Recount category fill levels
+### 1a. Extract article links from source pages and filter against article history
+
+- Query `article_history` for the last source used. Reorder the personalized sources so that the first source is the one following the last source used (rolling window)
+- For each source, fetch the source page HTML:
+  - If `use_llm_for_source_links` enabled: send HTML `<head>` + first 8000 chars of `<body>` to LLM → extract all article URLs up to a maximum of 10, with the most recent first. If LLM call fails, fall back to HTML parsing as described below.
+    - **LLM call logged** with full prompt/response/timing
+  - Otherwise: parse HTML `<a href>` links, filter by same-domain, non-homepage path, exclude `/tag/`, `/login/`, `/contact/`, `/presentation/`, `/newsletter/`, static assets, etc., and keep only the first 10 links found
+  - Deduplicate candidate URLs
+  - Hash each URL (normalized: lowercase, strip fragments/UTM params/trailing slashes)
+  - Query `article_history` for existing hashes → remove matches
+  - Trace dropped articles with `status: filtered_history`
+  - Add the URL to `url_source`
+
+### 1b. Scrape, classify and summarize articles
+
+- For each url from step 1a:
+  - if the number of articles in `source_counts` for the source of the current url exceeds `max_articles_per_source`:
+    - Trace dropped article with `status: filtered_diversity`
+    - Move to next url
+  - Fetch each URL (bounded concurrency: 5 with LLM extraction, 10 without). 
+  - SSRF check (no private IPs), 15s timeout, 5MB body limit. 
+  - HTML parsing heuristics for title (`<title>`, `og:title`), date (meta tags, JSON-LD, `<time>`), body (strip scripts/nav), soft-404 detection
+  - If article scraped body text is empty (scrape failure, soft 404, too old): 
+    - Trace dropped articles in `article_history` with `status: filtered_empty`
+    - Move to next url
+  - Send article (title + first 500 chars of body) + categories + "Autre" to LLM. LLM returns `{title, summary, category}` mapping the article to a category. The LLM generates the summary, and also a title if the provided title is empty
+    - **LLM call logged** with full prompt/response/timing
+  - Add the article to `article_scraped` and increase `filled_counts`
+  - If the number of articles in the category of this article exceeds `max_items_per_category`: change the article category to "Autre"
+  - If the total number of articles in `article_scraped` exceeds `number of categories (including Autre) × max_items_per_category` then exit the loop and move to synthesis generation
 
 ---
 
@@ -78,21 +60,11 @@
 - For each user category: `needed = max - already_filled`
 - Only proceed if any category needs more
 
-### 2b. Load recent domains for diversity
-
-- If `source_diversity_window > 0`: extract domains from last N syntheses' JSONB sections
-- Used as soft "avoid if possible" instruction in search prompt
-
-### 2c. LLM web search pass
-
-- Build search prompt with theme, categories, gap counts ("find N articles for AI News, M for Cybersecurity"), recent domains to avoid, personalized source URLs
-- Call `provider.generate_search_pass()` with web search tool enabled
-- **LLM call logged** with full prompt/response/timing
-- Returns structured JSON: `{category_0: [{title, url, summary}], category_1: [...]}`
-
-### 2d. Filter pipeline on search results
+### 2b. LLM web search pass
 
-- **Parse** LLM output into `(category_key, Vec<NewsItem>)`
+- Build search prompt with theme, categories, gap counts ("find N articles for category_1, M for category_2")
+- Send search prompt to LLM. LLM returns structured JSON: `{category_0: [{title, url, summary}], category_1: [...]}`
+  - **LLM call logged** with full prompt/response/timing
 - **Filter homepage URLs** — drop articles with path `/` or empty
 - **Cross-phase dedup** — drop URLs already seen in Phase 1
 - **Dedup by URL** — drop duplicate URLs within Phase 2 (case-insensitive)
@@ -100,41 +72,13 @@
 - **Filter against article history** — BEFORE scraping (saves HTTP requests), drop already-seen URLs
 - Each drop traced in `article_history` with appropriate status
 
-### 2e. Scrape web search results
+### 2c. Scrape web search results
 
 - Same scraping as Phase 1 (bounded concurrency, SSRF check, optional LLM extraction)
 - Filter empty content (scrape failures, soft 404, too old)
 - Trace drops
-
-### 2f. LLM classification
-
-- Same as Phase 1 classification but with Phase 2 articles
-- `filled_counts` carries over from Phase 1 — categories already partially filled
-- Overflow collected
-- **LLM call logged**
 - Merge results into `all_scraped`
-
----
-
-## "Autre" Fill-Up
-
-- Count total articles across all categories
-- Target = `75% × (categories × max_items_per_category)` (user categories only, "Autre" excluded from denominator)
-- If shortfall > 0 and overflow exists:
-  - For each overflow article: check if domain is under `max_articles_per_source` limit
-  - Add to `all_scraped["category_autre"]` up to the shortfall
-
----
-
-## Combined Rewrite Pass
-
-- **Fail if no articles** — return error if all categories are empty
-- **Build rewrite prompt** — serialize all scraped articles with body content, instruct LLM to rewrite title + summary (4-5 lines) faithfully based on scraped content
-- **Build rewrite schema** — `minItems`/`maxItems` set to ACTUAL count per category (not user max), empty categories omitted, "Autre" included if non-empty
-- **LLM rewrite pass** — call `provider.generate_rewrite_pass()` with writing model
-- **LLM call logged** with full prompt/response/timing
-- **Build final sections** — map `category_N` keys to user category names, add "Autre" section if present, omit empty categories
-- **Restore scraped URLs** — replace any hallucinated URLs from LLM rewrite with the validated scraped URLs (matched by category + position)
+- Move to synthesis generation
 
 ---
 
@@ -144,27 +88,3 @@
 - **Save synthesis** — insert into `syntheses` table with `job_id`, `week` (ISO week), `sections` (JSONB), `status: completed`
 - **Record used articles** — insert each article URL into `article_history` with `status: used`, `synthesis_id`, `job_id`, and category name (for future dedup + provenance)
 
----
-
-## Summary of LLM Calls (up to 4 per generation)
-
-| # | Call | When | Model |
-|---|---|---|---|
-| 1 | Classification Phase 1 | After Phase 1 scraping | research |
-| 2 | Web Search | Phase 2 start | research |
-| 3 | Classification Phase 2 | After Phase 2 scraping | research |
-| 4 | Rewrite | After both phases | writing |
-
-Plus optionally per-article calls for LLM link extraction and LLM article extraction (when those settings are enabled).
-
-## Summary of Filtering Steps
-
-| Step | Phase | What's dropped |
-|---|---|---|
-| Empty content | 1 & 2 | Scrape failures, soft 404s, too old |
-| Article history | 1 & 2 | Already used in previous syntheses |
-| Homepage URLs | 2 | Path is `/` or empty |
-| Cross-phase dedup | 2 | URLs already found in Phase 1 |
-| URL dedup | 2 | Duplicate URLs within Phase 2 |
-| Source diversity | 1 & 2 | Domain exceeds `max_articles_per_source` |
-| Category overflow | 1 & 2 | Category + "Autre" both full |
diff --git a/docs/implementation-plan/03-frontend-plan.md b/docs/implementation-plan/03-frontend-plan.md
index 8de19a0..7029f66 100644
--- a/docs/implementation-plan/03-frontend-plan.md
+++ b/docs/implementation-plan/03-frontend-plan.md
@@ -1512,7 +1512,7 @@ const fr = {
   // Generate
   'generate.title': 'Generer la Synthese Hebdomadaire',
   'generate.description': "Cette action va lancer l'analyse des actualites des {days} derniers jours sur le theme \"{theme}\" via {provider} ({model}).",
-  'generate.note': 'Note : La generation peut prendre jusqu\'a une minute.',
+  'generate.note': 'Note : La generation peut prendre jusqu\'a 10 minutes.',
   'generate.noWebSearch': "Note : Le fournisseur selectionne ne dispose pas de la recherche web integree. Les resultats seront bases sur les connaissances du modele uniquement.",
   'generate.start': 'Lancer la generation',
   'generate.canLeave': 'Vous pouvez quitter cette page. La generation continuera en arriere-plan.',
diff --git a/frontend/src/i18n/fr.ts b/frontend/src/i18n/fr.ts
index 15c7971..770e4f4 100644
--- a/frontend/src/i18n/fr.ts
+++ b/frontend/src/i18n/fr.ts
@@ -68,7 +68,7 @@ const fr = {
   'generate.title': 'Generer la Synthese Hebdomadaire',
   'generate.description':
     "Cette action va lancer l'analyse des actualites des {days} derniers jours sur le theme \"{theme}\" via l'IA.",
-  'generate.note': 'Note : La generation peut prendre jusqu\'a une minute.',
+  'generate.note': 'Note : La generation peut prendre jusqu\'a 10 minutes.',
   'generate.launch': 'Lancer la generation',
   'generate.inProgress': 'Generation en cours...',
   'generate.step.search': 'Recherche d\'actualites',