Added unsubscribe email recap
parent
14942a88cc
commit
3333a4e06d
@ -0,0 +1,282 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from email.message import EmailMessage
|
||||||
|
import html
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
|
||||||
|
|
||||||
|
logger = logging.getLogger("personal-agent.unsubscribe")


# Marketing/analytics query parameters. These are stripped during URL
# normalization so the same unsubscribe link dedupes across campaigns.
TRACKING_QUERY_KEYS = set(
    "fbclid gclid mc_cid mc_eid _hsenc _hsmi "
    "utm_campaign utm_content utm_id utm_medium utm_name utm_source utm_term".split()
)


# Substrings that mark a URL as an unsubscribe / preference-management link.
UNSUBSCRIBE_HINTS = {
    "unsubscribe",
    "optout",
    "opt-out",
    "email-preferences",
    "manage-subscriptions",
}


# Matches http(s) URLs in free text; stops at whitespace and common delimiters.
URL_PATTERN = re.compile(r"https?://[^\s<>'\"()]+", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class UnsubscribeDigestResult:
    """Summary of one ``UnsubscribeDigestAgent.scan_and_send_digest`` run."""

    # Number of Gmail messages returned by the search query and scanned.
    scanned_messages: int
    # Count of distinct unsubscribe links extracted across all scanned messages.
    extracted_unique_links: int
    # Count of links not already recorded in the state file (never emailed before).
    new_links: int
    # Address the digest was sent to; None when no email was sent this run.
    sent_to: str | None
    # True when a digest email was actually dispatched this run.
    email_sent: bool
|
||||||
|
|
||||||
|
|
||||||
|
class UnsubscribeDigestAgent:
    """Scan Gmail for unsubscribe links and email a recap digest.

    The agent searches messages matching a Gmail query, extracts unsubscribe
    URLs from both the ``List-Unsubscribe`` header and the message bodies,
    and emails any links not covered by a previous digest. Links that have
    been sent are persisted in a JSON state file so each link is reported
    at most once across runs.
    """

    def __init__(
        self,
        *,
        gmail_service: Any,
        query: str,
        state_file: str,
        recipient_email: str | None = None,
        send_empty_digest: bool = False,
    ) -> None:
        """Configure the agent.

        Args:
            gmail_service: Authenticated Gmail API service object
                (presumably built via ``googleapiclient.discovery.build`` —
                only the ``users().messages()`` / ``getProfile`` surface is used).
            query: Gmail search query selecting the messages to scan.
            state_file: Path of the JSON file recording already-sent links.
            recipient_email: Digest recipient; when None the authenticated
                account's own address is resolved at send time.
            send_empty_digest: When True, a digest is sent even if no new
                links were found.
        """
        self.gmail_service = gmail_service
        self.query = query
        self.state_file = Path(state_file)
        self.recipient_email = recipient_email
        self.send_empty_digest = send_empty_digest

    def scan_and_send_digest(self, max_results: int = 500) -> UnsubscribeDigestResult:
        """Scan matching messages, email new unsubscribe links, persist state.

        Args:
            max_results: Upper bound passed to ``messages.list``.
                NOTE(review): only a single page is fetched — no
                ``nextPageToken`` pagination; confirm this is intentional.

        Returns:
            An ``UnsubscribeDigestResult`` summarizing the run.
        """
        messages = (
            self.gmail_service.users()
            .messages()
            .list(userId="me", q=self.query, maxResults=max_results)
            .execute()
            .get("messages", [])
        )

        # Union of normalized links found across every scanned message.
        extracted_links: set[str] = set()
        for message in messages:
            extracted_links.update(self._extract_links_from_message(message["id"]))

        sent_links = self._load_sent_links()
        # Sorted for a deterministic ordering in the digest email body.
        new_links = sorted(link for link in extracted_links if link not in sent_links)

        should_send = bool(new_links) or self.send_empty_digest
        sent_to: str | None = None
        email_sent = False

        if should_send:
            sent_to = self._resolve_recipient_email()
            self._send_digest_email(
                recipient_email=sent_to,
                new_links=new_links,
                scanned_messages=len(messages),
            )
            email_sent = True

        # State is only persisted after a successful send: if sending raised,
        # the links stay unrecorded and will be retried on the next run.
        if new_links:
            sent_links.update(new_links)
            self._save_sent_links(sent_links)

        return UnsubscribeDigestResult(
            scanned_messages=len(messages),
            extracted_unique_links=len(extracted_links),
            new_links=len(new_links),
            sent_to=sent_to,
            email_sent=email_sent,
        )

    def _extract_links_from_message(self, message_id: str) -> set[str]:
        """Return all normalized unsubscribe links found in one message.

        Combines links from the ``List-Unsubscribe`` header (taken as-is)
        with URLs found in text/HTML body parts (kept only when they look
        like unsubscribe links).
        """
        message = (
            self.gmail_service.users()
            .messages()
            .get(userId="me", id=message_id, format="full")
            .execute()
        )
        payload = message.get("payload", {})
        links: set[str] = set()

        # Header links are trusted without keyword filtering.
        for url in self._extract_list_unsubscribe_links(payload):
            normalized = _normalize_url(url)
            if normalized:
                links.add(normalized)

        # Body links must contain an unsubscribe keyword to be kept.
        for text_block in self._extract_text_blocks(payload):
            # Unescape HTML entities (e.g. &amp;) before URL matching.
            for url in URL_PATTERN.findall(html.unescape(text_block)):
                if not _looks_like_unsubscribe(url):
                    continue
                normalized = _normalize_url(url)
                if normalized:
                    links.add(normalized)

        return links

    def _extract_list_unsubscribe_links(self, payload: dict[str, Any]) -> set[str]:
        """Return http(s) URLs from the message's ``List-Unsubscribe`` header.

        The header conventionally holds comma-separated ``<...>``-wrapped
        entries (RFC 2369); ``mailto:`` entries are ignored. A comma-split
        fallback handles senders that omit the angle brackets.
        """
        headers = {
            header.get("name", "").lower(): header.get("value", "")
            for header in payload.get("headers", [])
        }
        header_value = headers.get("list-unsubscribe", "")
        if not header_value:
            return set()

        results: set[str] = set()
        for candidate in re.findall(r"<([^>]+)>", header_value):
            candidate = candidate.strip()
            if candidate.lower().startswith(("http://", "https://")):
                results.add(candidate)

        # Fallback: no bracketed entries found, try bare comma-separated URLs.
        if not results:
            for token in header_value.split(","):
                candidate = token.strip().strip("<>").strip()
                if candidate.lower().startswith(("http://", "https://")):
                    results.add(candidate)

        return results

    def _extract_text_blocks(self, payload: dict[str, Any]) -> list[str]:
        """Return decoded text/plain and text/html bodies from the MIME tree.

        Recursively walks ``payload`` and its nested ``parts``; parts whose
        base64 body fails to decode are skipped silently.
        """
        blocks: list[str] = []

        def walk(part: dict[str, Any]) -> None:
            # Only leaf parts carry body data; containers just have "parts".
            mime_type = part.get("mimeType", "")
            body_data = part.get("body", {}).get("data")
            if body_data and mime_type in {"text/plain", "text/html"}:
                decoded = _decode_base64(body_data)
                if decoded:
                    blocks.append(decoded)

            for child in part.get("parts", []):
                walk(child)

        walk(payload)
        return blocks

    def _resolve_recipient_email(self) -> str:
        """Return the digest recipient, defaulting to the account's own address.

        Raises:
            RuntimeError: if no explicit recipient is configured and the Gmail
                profile lookup yields no address.
        """
        if self.recipient_email:
            return self.recipient_email

        profile = self.gmail_service.users().getProfile(userId="me").execute()
        email = profile.get("emailAddress", "").strip()
        if not email:
            raise RuntimeError(
                "Could not resolve recipient email. Set UNSUBSCRIBE_DIGEST_RECIPIENT in .env."
            )
        return email

    def _send_digest_email(
        self,
        *,
        recipient_email: str,
        new_links: list[str],
        scanned_messages: int,
    ) -> None:
        """Compose and send the plain-text digest via the Gmail send API.

        No ``From`` header is set; the Gmail API fills in the authenticated
        sender.
        """
        now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        subject = f"Unsubscribe recap: {len(new_links)} new link(s)"

        body_lines = [
            "Here is your unsubscribe digest.",
            "",
            f"Generated at: {now_utc}",
            f"Advertising messages scanned: {scanned_messages}",
            f"New unsubscribe links: {len(new_links)}",
            "",
        ]
        if new_links:
            body_lines.append("Links:")
            body_lines.extend([f"- {link}" for link in new_links])
        else:
            # Reached only when send_empty_digest forced a send with no links.
            body_lines.append("No new unsubscribe links found.")

        message = EmailMessage()
        message["To"] = recipient_email
        message["Subject"] = subject
        message.set_content("\n".join(body_lines))

        # Gmail's send API expects the raw RFC 2822 message, base64url-encoded.
        raw = base64.urlsafe_b64encode(message.as_bytes()).decode("utf-8")
        (
            self.gmail_service.users()
            .messages()
            .send(userId="me", body={"raw": raw})
            .execute()
        )

    def _load_sent_links(self) -> set[str]:
        """Load previously-sent links from the state file.

        Returns an empty set when the file is missing, holds invalid JSON,
        or lacks a list-valued ``sent_links`` key — a fresh start rather
        than a crash (links may then be re-sent once).
        """
        if not self.state_file.exists():
            return set()

        try:
            payload = json.loads(self.state_file.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            logger.warning("State file is invalid JSON: %s", self.state_file)
            return set()

        links = payload.get("sent_links", [])
        if not isinstance(links, list):
            return set()
        # Coerce entries to str and drop blanks to tolerate hand-edited files.
        return {str(link) for link in links if str(link).strip()}

    def _save_sent_links(self, links: set[str]) -> None:
        """Persist the full set of sent links as sorted JSON (overwrites)."""
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        self.state_file.write_text(
            json.dumps({"sent_links": sorted(links)}, indent=2),
            encoding="utf-8",
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_base64(data: str) -> str:
|
||||||
|
padded = data + "=" * (-len(data) % 4)
|
||||||
|
try:
|
||||||
|
return base64.urlsafe_b64decode(padded.encode("utf-8")).decode(
|
||||||
|
"utf-8", errors="replace"
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_unsubscribe(url: str) -> bool:
    """Return True when *url* contains any known unsubscribe keyword."""
    haystack = url.lower()
    for hint in UNSUBSCRIBE_HINTS:
        if hint in haystack:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_url(url: str) -> str | None:
    """Canonicalize *url* for deduplication.

    Lowercases the scheme and host, trims a trailing slash from non-root
    paths, strips known tracking query parameters and the fragment, and
    removes trailing punctuation picked up by the URL regex. Returns None
    unless the input is a valid absolute http(s) URL.
    """
    candidate = url.strip().strip(".,;)")
    parts = urlsplit(candidate)
    if parts.scheme.lower() not in {"http", "https"} or not parts.netloc:
        return None

    normalized_path = parts.path if parts.path else "/"
    if normalized_path != "/":
        normalized_path = normalized_path.rstrip("/")

    kept_pairs = [
        pair
        for pair in parse_qsl(parts.query, keep_blank_values=True)
        if pair[0].lower() not in TRACKING_QUERY_KEYS
    ]

    return urlunsplit(
        (
            parts.scheme.lower(),
            parts.netloc.lower(),
            normalized_path,
            urlencode(kept_pairs, doseq=True),
            "",
        )
    )
|
||||||
Loading…
Reference in New Issue