"""Collect unsubscribe links from Gmail messages and email a digest of new ones."""

from __future__ import annotations

import base64
from dataclasses import dataclass
from datetime import datetime, timezone
from email.message import EmailMessage
import html
import json
import logging
from pathlib import Path
import re
from typing import Any
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

logger = logging.getLogger("personal-agent.unsubscribe")

# Query parameters dropped during normalization so that the same link with
# different tracking parameters is treated as one link.
TRACKING_QUERY_KEYS = {
    "fbclid",
    "gclid",
    "mc_cid",
    "mc_eid",
    "_hsenc",
    "_hsmi",
    "utm_campaign",
    "utm_content",
    "utm_id",
    "utm_medium",
    "utm_name",
    "utm_source",
    "utm_term",
}

# Substrings that mark a URL found in a message body as an unsubscribe link.
UNSUBSCRIBE_HINTS = {
    "unsubscribe",
    "optout",
    "opt-out",
    "email-preferences",
    "manage-subscriptions",
}

URL_PATTERN = re.compile(r"https?://[^\s<>'\"()]+", re.IGNORECASE)


@dataclass(frozen=True)
class UnsubscribeDigestResult:
    """Summary of one ``scan_and_send_digest`` run."""

    # Number of messages returned by the Gmail list call.
    scanned_messages: int
    # Number of distinct normalized links found across those messages.
    extracted_unique_links: int
    # Number of extracted links that were not already in the state file.
    new_links: int
    # Address the digest was sent to, or None when no email was sent.
    sent_to: str | None
    # True when a digest email was sent during this run.
    email_sent: bool


class UnsubscribeDigestAgent:
    """Scan Gmail messages for unsubscribe links and email the new ones.

    Links that have already been reported are remembered in a JSON state file
    so that each link is only included in a digest once.
    """

    def __init__(
        self,
        *,
        gmail_service: Any,
        query: str,
        state_file: str,
        recipient_email: str | None = None,
        send_empty_digest: bool = False,
    ) -> None:
        """Store the configuration; no I/O is performed here.

        Args:
            gmail_service: Object exposing the ``users().messages()`` /
                ``users().getProfile()`` call chains used by this class.
            query: Search query passed as ``q`` to the message list call.
            state_file: Path of the JSON file that records already-sent links.
            recipient_email: Digest recipient; when falsy, the address is
                taken from the ``getProfile`` response at send time.
            send_empty_digest: Send a digest even when there are no new links.
        """
        self.gmail_service = gmail_service
        self.query = query
        self.state_file = Path(state_file)
        self.recipient_email = recipient_email
        self.send_empty_digest = send_empty_digest

    def scan_and_send_digest(self, max_results: int = 500) -> UnsubscribeDigestResult:
        """Scan matching messages, email new unsubscribe links, update state.

        Args:
            max_results: Value passed as ``maxResults`` to the list call.

        Returns:
            Counts for the run and whether/where a digest was sent.
        """
        messages = (
            self.gmail_service.users()
            .messages()
            .list(userId="me", q=self.query, maxResults=max_results)
            .execute()
            .get("messages", [])
        )

        extracted_links: set[str] = set()
        for message in messages:
            extracted_links.update(self._extract_links_from_message(message["id"]))

        sent_links = self._load_sent_links()
        new_links = sorted(link for link in extracted_links if link not in sent_links)

        should_send = bool(new_links) or self.send_empty_digest
        sent_to: str | None = None
        email_sent = False

        if should_send:
            sent_to = self._resolve_recipient_email()
            self._send_digest_email(
                recipient_email=sent_to,
                new_links=new_links,
                scanned_messages=len(messages),
            )
            email_sent = True

        # State is written only after the send call has returned, so a failed
        # send leaves the links eligible for the next run.
        if new_links:
            sent_links.update(new_links)
            self._save_sent_links(sent_links)

        return UnsubscribeDigestResult(
            scanned_messages=len(messages),
            extracted_unique_links=len(extracted_links),
            new_links=len(new_links),
            sent_to=sent_to,
            email_sent=email_sent,
        )

    def _extract_links_from_message(self, message_id: str) -> set[str]:
        """Fetch one message and return its normalized unsubscribe links.

        Links come from the ``List-Unsubscribe`` header (always accepted) and
        from text/HTML bodies (accepted only when the URL contains one of
        ``UNSUBSCRIBE_HINTS``).
        """
        message = (
            self.gmail_service.users()
            .messages()
            .get(userId="me", id=message_id, format="full")
            .execute()
        )
        payload = message.get("payload", {})
        links: set[str] = set()

        for url in self._extract_list_unsubscribe_links(payload):
            normalized = _normalize_url(url)
            if normalized:
                links.add(normalized)

        for text_block in self._extract_text_blocks(payload):
            # Unescape first so entities such as ``&amp;`` inside href values
            # do not end up in the extracted query string.
            for url in URL_PATTERN.findall(html.unescape(text_block)):
                if not _looks_like_unsubscribe(url):
                    continue
                normalized = _normalize_url(url)
                if normalized:
                    links.add(normalized)

        return links

    def _extract_list_unsubscribe_links(self, payload: dict[str, Any]) -> set[str]:
        """Return the http(s) URLs listed in the ``List-Unsubscribe`` header.

        Non-http entries (for example ``mailto:``) are ignored.
        """
        headers = {
            header.get("name", "").lower(): header.get("value", "")
            for header in payload.get("headers", [])
        }
        header_value = headers.get("list-unsubscribe", "")
        if not header_value:
            return set()

        results: set[str] = set()
        for candidate in re.findall(r"<([^>]+)>", header_value):
            candidate = candidate.strip()
            if candidate.lower().startswith(("http://", "https://")):
                results.add(candidate)

        # Fallback for header values that are not wrapped in angle brackets.
        if not results:
            for token in header_value.split(","):
                candidate = token.strip().strip("<>").strip()
                if candidate.lower().startswith(("http://", "https://")):
                    results.add(candidate)

        return results

    def _extract_text_blocks(self, payload: dict[str, Any]) -> list[str]:
        """Return decoded ``text/plain`` and ``text/html`` bodies of all parts.

        The payload is walked recursively through its nested ``parts``.
        """
        blocks: list[str] = []

        def walk(part: dict[str, Any]) -> None:
            mime_type = part.get("mimeType", "")
            body_data = part.get("body", {}).get("data")
            if body_data and mime_type in {"text/plain", "text/html"}:
                decoded = _decode_base64(body_data)
                if decoded:
                    blocks.append(decoded)
            for child in part.get("parts", []):
                walk(child)

        walk(payload)
        return blocks

    def _resolve_recipient_email(self) -> str:
        """Return the configured recipient, or the profile's own address.

        Raises:
            RuntimeError: If no recipient is configured and the profile
                response carries no ``emailAddress``.
        """
        if self.recipient_email:
            return self.recipient_email

        profile = self.gmail_service.users().getProfile(userId="me").execute()
        email = profile.get("emailAddress", "").strip()
        if not email:
            raise RuntimeError(
                "Could not resolve recipient email. Set UNSUBSCRIBE_DIGEST_RECIPIENT in .env."
            )
        return email

    def _send_digest_email(
        self,
        *,
        recipient_email: str,
        new_links: list[str],
        scanned_messages: int,
    ) -> None:
        """Build the plain-text digest and send it through the Gmail service.

        Args:
            recipient_email: Address placed in the ``To`` header.
            new_links: Links to list in the body (may be empty).
            scanned_messages: Count reported in the body.
        """
        now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        subject = f"Unsubscribe recap: {len(new_links)} new link(s)"

        body_lines = [
            "Here is your unsubscribe digest.",
            "",
            f"Generated at: {now_utc}",
            f"Advertising messages scanned: {scanned_messages}",
            f"New unsubscribe links: {len(new_links)}",
            "",
        ]
        if new_links:
            body_lines.append("Links:")
            body_lines.extend([f"- {link}" for link in new_links])
        else:
            body_lines.append("No new unsubscribe links found.")

        message = EmailMessage()
        message["To"] = recipient_email
        message["Subject"] = subject
        message.set_content("\n".join(body_lines))

        # The send call takes the whole message as URL-safe base64 under "raw".
        raw = base64.urlsafe_b64encode(message.as_bytes()).decode("utf-8")
        (
            self.gmail_service.users()
            .messages()
            .send(userId="me", body={"raw": raw})
            .execute()
        )

    def _load_sent_links(self) -> set[str]:
        """Return the set of links recorded in the state file.

        A missing, undecodable, malformed, or unexpectedly shaped state file
        yields an empty set rather than an exception.
        """
        if not self.state_file.exists():
            return set()

        try:
            payload = json.loads(self.state_file.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError):
            logger.warning("State file is invalid JSON: %s", self.state_file)
            return set()

        # Valid JSON that is not an object (e.g. a bare list) has no ``.get``;
        # treat it like any other unusable state file.
        if not isinstance(payload, dict):
            logger.warning(
                "State file has unexpected structure: %s", self.state_file
            )
            return set()

        links = payload.get("sent_links", [])
        if not isinstance(links, list):
            return set()
        return {str(link) for link in links if str(link).strip()}

    def _save_sent_links(self, links: set[str]) -> None:
        """Write ``links`` (sorted) to the state file, creating parent dirs."""
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        self.state_file.write_text(
            json.dumps({"sent_links": sorted(links)}, indent=2),
            encoding="utf-8",
        )


def _decode_base64(data: str) -> str:
    """Decode URL-safe base64 text to a string, or return "" when invalid.

    Missing ``=`` padding is restored before decoding, and undecodable bytes
    are replaced rather than raising.
    """
    padded = data + "=" * (-len(data) % 4)
    try:
        return base64.urlsafe_b64decode(padded.encode("utf-8")).decode(
            "utf-8", errors="replace"
        )
    except ValueError:
        # binascii.Error (bad base64) and UnicodeEncodeError both derive from
        # ValueError; anything else is a programming error and should surface.
        return ""


def _looks_like_unsubscribe(url: str) -> bool:
    """Return True when ``url`` contains any of ``UNSUBSCRIBE_HINTS``."""
    lowered = url.lower()
    return any(hint in lowered for hint in UNSUBSCRIBE_HINTS)


def _normalize_url(url: str) -> str | None:
    """Return a canonical form of ``url`` for de-duplication, or None.

    Normalization lowercases scheme and host, strips surrounding punctuation,
    removes a trailing slash from non-root paths, drops tracking query
    parameters and discards the fragment. Non-http(s) or unparseable URLs
    yield None.
    """
    cleaned = url.strip().strip(".,;)")
    try:
        split = urlsplit(cleaned)
    except ValueError:
        # urlsplit rejects netlocs with unbalanced "[" / "]" ("Invalid IPv6
        # URL"); one malformed link must not abort the whole scan.
        return None

    if split.scheme.lower() not in {"http", "https"} or not split.netloc:
        return None

    scheme = split.scheme.lower()
    netloc = split.netloc.lower()
    path = split.path or "/"
    if path != "/":
        path = path.rstrip("/")

    query_pairs = parse_qsl(split.query, keep_blank_values=True)
    filtered_pairs = [
        (key, value)
        for key, value in query_pairs
        if key.lower() not in TRACKING_QUERY_KEYS
    ]
    query = urlencode(filtered_pairs, doseq=True)
    return urlunsplit((scheme, netloc, path, query, ""))