diff --git a/backend/src/services/mod.rs b/backend/src/services/mod.rs
index ae7a96c..4432534 100644
--- a/backend/src/services/mod.rs
+++ b/backend/src/services/mod.rs
@@ -7,5 +7,6 @@ pub mod llm;
 pub mod prompts;
 pub mod rate_limiter;
 pub mod scraper;
+pub mod source_scraper;
 pub mod synthesis;
 pub mod turnstile;
diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs
new file mode 100644
index 0000000..2d64482
--- /dev/null
+++ b/backend/src/services/source_scraper.rs
@@ -0,0 +1,205 @@
+//! Source page scraper: fetches a source URL and extracts article links.
+//!
+//! Used in Phase 1 of the generation pipeline to discover articles
+//! from user-configured sources before falling back to LLM web search.
+
+use crate::errors::AppError;
+use scraper::{Html, Selector};
+use url::Url;
+
+/// Lowercase path substrings marking non-article pages (matched with `contains`, any position).
+const EXCLUDED_PATH_PATTERNS: &[&str] = &[
+    "/tag/", "/category/", "/author/", "/page/", "/login", "/signup",
+    "/privacy", "/terms", "/search", "/contact", "/about",
+];
+
+/// Lowercase file extensions (matched with `ends_with` on the URL path) of assets, not articles.
+const EXCLUDED_EXTENSIONS: &[&str] = &[
+    ".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg",
+    ".pdf", ".zip", ".xml", ".json", ".ico", ".woff", ".woff2",
+];
+
+/// Fetch `source_url` and return up to `max_links` same-host article candidates.
+///
+/// The response body is filtered by [`extract_links_from_html`]. A non-success HTTP
+/// status is logged and yields an empty list rather than an error.
+///
+/// # Errors
+/// - [`AppError::BadRequest`]: `source_url` fails to parse, is not `http`/`https`, or has no host.
+/// - [`AppError::Internal`]: the request cannot be sent or the response body cannot be read.
+pub async fn extract_article_links(
+    http_client: &reqwest::Client,
+    source_url: &str,
+    max_links: usize,
+) -> Result<Vec<String>, AppError> {
+    let base_url = Url::parse(source_url)
+        .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
+    // Reject non-HTTP(S) or hostless sources as bad input before issuing any request.
+    let base_domain = match (base_url.scheme(), base_url.host_str()) {
+        ("http" | "https", Some(host)) => host.to_lowercase(),
+        _ => return Err(AppError::BadRequest("Invalid source URL: not an http(s) URL".to_string())),
+    };
+
+    let response = http_client.get(source_url).send().await.map_err(|e| {
+        tracing::warn!(url = source_url, error = %e, "Failed to fetch source page");
+        AppError::Internal(anyhow::anyhow!("Failed to fetch source page"))
+    })?;
+    if !response.status().is_success() {
+        tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200");
+        return Ok(Vec::new());
+    }
+    let html_text = response.text().await.map_err(|e| {
+        AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
+    })?;
+    let mut links = extract_links_from_html(&html_text, &base_url, &base_domain);
+    links.truncate(max_links);
+    Ok(links)
+}
+
+/// Extract and filter article links from HTML content.
+///
+/// This is a pure function (no I/O) for easy testing. Each `<a href>` is resolved
+/// against `base_url` and returned, in document order, only if it:
+/// - is `http`/`https` and its host equals `base_domain` (ASCII case-insensitive);
+/// - has a path other than `/` that hits neither `EXCLUDED_PATH_PATTERNS` nor
+///   `EXCLUDED_EXTENSIONS` (both checked against the lowercased path);
+/// - is not `base_url` itself (same path and query), e.g. `href="#top"`;
+/// - was not already emitted once its `#fragment` is stripped.
+pub fn extract_links_from_html(
+    html: &str,
+    base_url: &Url,
+    base_domain: &str,
+) -> Vec<String> {
+    let document = Html::parse_document(html);
+    let selector = Selector::parse("a[href]").expect("`a[href]` is a valid CSS selector");
+    // Trailing slashes are ignored so `/blog` and `/blog/` both count as the source page.
+    let own = (base_url.path().trim_end_matches('/'), base_url.query());
+    let mut seen = std::collections::HashSet::new();
+    let mut links = Vec::new();
+
+    for href in document.select(&selector).filter_map(|a| a.value().attr("href")) {
+        let mut resolved = match base_url.join(href) {
+            Ok(u) => u,
+            Err(_) => continue,
+        };
+        let same_host = resolved.host_str().map_or(false, |h| h.eq_ignore_ascii_case(base_domain));
+        if !matches!(resolved.scheme(), "http" | "https") || !same_host {
+            continue;
+        }
+
+        let path_lower = resolved.path().to_lowercase();
+        if path_lower.is_empty() || path_lower == "/" {
+            continue;
+        }
+        if EXCLUDED_PATH_PATTERNS.iter().any(|p| path_lower.contains(p))
+            || EXCLUDED_EXTENSIONS.iter().any(|ext| path_lower.ends_with(ext))
+        {
+            continue;
+        }
+        // A link back to the source page (`#top`, nav self-link) is not an article.
+        if (resolved.path().trim_end_matches('/'), resolved.query()) == own {
+            continue;
+        }
+
+        // Drop the fragment before deduplicating so `/a` and `/a#x` collapse to one entry.
+        resolved.set_fragment(None);
+        let url_str = resolved.to_string();
+        if seen.insert(url_str.clone()) {
+            links.push(url_str);
+        }
+    }
+
+    links
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Runs the extractor over `html` as though it were served from `page_url`;
+    /// every fixture lives on `example.com`.
+    fn scrape(html: &str, page_url: &str) -> Vec<String> {
+        let page = Url::parse(page_url).unwrap();
+        extract_links_from_html(html, &page, "example.com")
+    }
+
+    /// Root-relative article links are kept in order; the bare `/` home link is dropped.
+    #[test]
+    fn extracts_article_links_from_html() {
+        let html = r#"
+        <html><body>
+            <a href="/blog/article-1">Article 1</a>
+            <a href="/blog/article-2">Article 2</a>
+            <a href="/">Home</a>
+        </body></html>"#;
+        let found = scrape(html, "https://example.com/blog");
+        assert_eq!(found.len(), 2);
+        assert!(found[0].contains("/blog/article-1"));
+        assert!(found[1].contains("/blog/article-2"));
+    }
+
+    /// Links pointing at another host are discarded.
+    #[test]
+    fn filters_external_links() {
+        let html = r#"<a href="https://other.com/article">External</a>"#;
+        assert!(scrape(html, "https://example.com").is_empty());
+    }
+
+    /// Tag, category, author and login pages are not treated as articles.
+    #[test]
+    fn filters_non_article_patterns() {
+        let html = r#"
+        <a href="/tag/ai">Tag</a>
+        <a href="/category/tech">Category</a>
+        <a href="/author/john">Author</a>
+        <a href="/login">Login</a>
+        "#;
+        assert!(scrape(html, "https://example.com").is_empty());
+    }
+
+    /// Stylesheets, scripts and images are rejected by their file extension.
+    #[test]
+    fn filters_static_assets() {
+        let html = r#"
+        <a href="/style.css">CSS</a>
+        <a href="/script.js">JS</a>
+        <a href="/logo.png">Image</a>
+        "#;
+        assert!(scrape(html, "https://example.com").is_empty());
+    }
+
+    /// Repeated hrefs, including fragment-only variants, collapse to one entry.
+    #[test]
+    fn deduplicates_links() {
+        let html = r#"
+        <a href="/article">Link 1</a>
+        <a href="/article">Link 2</a>
+        <a href="/article#section">Link 3</a>
+        "#;
+        let found = scrape(html, "https://example.com");
+        assert_eq!(found.len(), 1);
+    }
+
+    /// A path-relative href is resolved against the directory of the source page.
+    #[test]
+    fn resolves_relative_urls() {
+        let html = r#"<a href="my-post">Relative</a>"#;
+        let found = scrape(html, "https://example.com/blog/");
+        assert_eq!(found.len(), 1);
+        assert!(found[0].contains("/blog/my-post"));
+    }
+
+    /// A single path segment directly under the root still counts as an article.
+    #[test]
+    fn allows_single_segment_paths() {
+        let html = r#"<a href="/my-great-article">Article</a>"#;
+        let found = scrape(html, "https://example.com");
+        assert_eq!(found.len(), 1);
+    }
+
+    /// An empty document yields no links.
+    #[test]
+    fn empty_html_returns_empty() {
+        assert!(scrape("", "https://example.com").is_empty());
+    }
+}