Added unsubscribe email recap

master
oabrivard 1 week ago
parent 14942a88cc
commit 3333a4e06d

@ -8,4 +8,10 @@ LLM_TIMEOUT_SECONDS=20
LLM_FALLBACK_TO_RULES=false LLM_FALLBACK_TO_RULES=false
GMAIL_SCAN_INTERVAL_MINUTES=5 GMAIL_SCAN_INTERVAL_MINUTES=5
GMAIL_QUERY=in:inbox -label:AgentProcessed newer_than:7d GMAIL_QUERY=in:inbox -label:AgentProcessed newer_than:7d
UNSUBSCRIBE_DIGEST_INTERVAL_MINUTES=1440
UNSUBSCRIBE_QUERY=label:Advertising
UNSUBSCRIBE_MAX_RESULTS=500
UNSUBSCRIBE_STATE_FILE=data/sent_unsubscribe_links.json
UNSUBSCRIBE_DIGEST_RECIPIENT=
UNSUBSCRIBE_SEND_EMPTY_DIGEST=false
LOG_LEVEL=INFO LOG_LEVEL=INFO

1
.gitignore vendored

@ -5,3 +5,4 @@ __pycache__/
*.pyc *.pyc
credentials.json credentials.json
token.json token.json
data/

@ -6,6 +6,7 @@ This project runs a small local API service that:
- classifies emails with an LLM as `LINKEDIN`, `ADVERTISING`, or `OTHER` - classifies emails with an LLM as `LINKEDIN`, `ADVERTISING`, or `OTHER`
- moves LinkedIn emails to a `LinkedIn` label/folder - moves LinkedIn emails to a `LinkedIn` label/folder
- moves advertising emails to an `Advertising` label/folder - moves advertising emails to an `Advertising` label/folder
- scans the `Advertising` label and emails you new unsubscribe links (deduplicated)
- exposes a secure availability endpoint powered by Google Calendar free/busy - exposes a secure availability endpoint powered by Google Calendar free/busy
## 1) Prerequisites ## 1) Prerequisites
@ -27,6 +28,7 @@ This project runs a small local API service that:
3. Save it in this project as `credentials.json`. 3. Save it in this project as `credentials.json`.
The first run opens a browser window for consent and creates `token.json`. The first run opens a browser window for consent and creates `token.json`.
If your existing token was created before `gmail.send` was added, you may be prompted again.
## 3) Install and configure ## 3) Install and configure
@ -39,6 +41,7 @@ Edit `.env` and set:
- `AGENT_API_KEY` to a strong secret for agent-to-agent calls - `AGENT_API_KEY` to a strong secret for agent-to-agent calls
- `LLM_API_KEY` and optional `LLM_MODEL` / `LLM_BASE_URL` - `LLM_API_KEY` and optional `LLM_MODEL` / `LLM_BASE_URL`
- optional unsubscribe digest settings (`UNSUBSCRIBE_*`)
- optional scan frequency and Gmail query - optional scan frequency and Gmail query
## 4) Run ## 4) Run
@ -79,6 +82,13 @@ curl -X POST "http://127.0.0.1:8000/availability" \
If `available` is `true`, there are no busy slots in that range. If `available` is `true`, there are no busy slots in that range.
### Manual unsubscribe digest
```bash
curl -X POST "http://127.0.0.1:8000/unsubscribe-digest?max_results=500" \
-H "X-API-Key: your-secret"
```
## Classification behavior ## Classification behavior
- LLM classification is used for each email (`LINKEDIN`, `ADVERTISING`, `OTHER`). - LLM classification is used for each email (`LINKEDIN`, `ADVERTISING`, `OTHER`).
@ -86,6 +96,14 @@ If `available` is `true`, there are no busy slots in that range.
- Set `LLM_FALLBACK_TO_RULES=true` only if you want rules-based backup when LLM calls fail. - Set `LLM_FALLBACK_TO_RULES=true` only if you want rules-based backup when LLM calls fail.
- Every scanned message gets an `AgentProcessed` label to avoid reprocessing loops. - Every scanned message gets an `AgentProcessed` label to avoid reprocessing loops.
## Unsubscribe digest behavior
- Reads emails from `UNSUBSCRIBE_QUERY` (default `label:Advertising`).
- Extracts unsubscribe URLs from `List-Unsubscribe` headers and message content.
- Removes duplicates within the run and across runs.
- Persists already sent links in `UNSUBSCRIBE_STATE_FILE`.
- Sends only new links by email, unless `UNSUBSCRIBE_SEND_EMPTY_DIGEST=true`.
## Notes ## Notes
- Gmail "folders" are labels. This agent creates: - Gmail "folders" are labels. This agent creates:

@ -7,6 +7,7 @@ load_dotenv()
GOOGLE_SCOPES = ( GOOGLE_SCOPES = (
"https://www.googleapis.com/auth/gmail.modify", "https://www.googleapis.com/auth/gmail.modify",
"https://www.googleapis.com/auth/gmail.send",
"https://www.googleapis.com/auth/calendar.readonly", "https://www.googleapis.com/auth/calendar.readonly",
) )
@ -23,11 +24,18 @@ class Settings:
llm_base_url: str | None llm_base_url: str | None
llm_timeout_seconds: float llm_timeout_seconds: float
llm_fallback_to_rules: bool llm_fallback_to_rules: bool
unsubscribe_digest_interval_minutes: int
unsubscribe_query: str
unsubscribe_max_results: int
unsubscribe_state_file: str
unsubscribe_digest_recipient: str | None
unsubscribe_send_empty_digest: bool
log_level: str log_level: str
def get_settings() -> Settings: def get_settings() -> Settings:
llm_base_url = os.getenv("LLM_BASE_URL", "").strip() llm_base_url = os.getenv("LLM_BASE_URL", "").strip()
unsubscribe_digest_recipient = os.getenv("UNSUBSCRIBE_DIGEST_RECIPIENT", "").strip()
return Settings( return Settings(
google_client_secrets_file=os.getenv("GOOGLE_CLIENT_SECRETS_FILE", "credentials.json"), google_client_secrets_file=os.getenv("GOOGLE_CLIENT_SECRETS_FILE", "credentials.json"),
google_token_file=os.getenv("GOOGLE_TOKEN_FILE", "token.json"), google_token_file=os.getenv("GOOGLE_TOKEN_FILE", "token.json"),
@ -41,6 +49,18 @@ def get_settings() -> Settings:
llm_base_url=llm_base_url or None, llm_base_url=llm_base_url or None,
llm_timeout_seconds=float(os.getenv("LLM_TIMEOUT_SECONDS", "20")), llm_timeout_seconds=float(os.getenv("LLM_TIMEOUT_SECONDS", "20")),
llm_fallback_to_rules=_as_bool(os.getenv("LLM_FALLBACK_TO_RULES", "false")), llm_fallback_to_rules=_as_bool(os.getenv("LLM_FALLBACK_TO_RULES", "false")),
unsubscribe_digest_interval_minutes=int(
os.getenv("UNSUBSCRIBE_DIGEST_INTERVAL_MINUTES", "1440")
),
unsubscribe_query=os.getenv("UNSUBSCRIBE_QUERY", "label:Advertising"),
unsubscribe_max_results=int(os.getenv("UNSUBSCRIBE_MAX_RESULTS", "500")),
unsubscribe_state_file=os.getenv(
"UNSUBSCRIBE_STATE_FILE", "data/sent_unsubscribe_links.json"
),
unsubscribe_digest_recipient=unsubscribe_digest_recipient or None,
unsubscribe_send_empty_digest=_as_bool(
os.getenv("UNSUBSCRIBE_SEND_EMPTY_DIGEST", "false")
),
log_level=os.getenv("LOG_LEVEL", "INFO"), log_level=os.getenv("LOG_LEVEL", "INFO"),
) )

@ -15,6 +15,8 @@ def get_google_credentials(settings: Settings) -> Credentials:
creds = Credentials.from_authorized_user_file( creds = Credentials.from_authorized_user_file(
settings.google_token_file, GOOGLE_SCOPES settings.google_token_file, GOOGLE_SCOPES
) )
if not creds.has_scopes(GOOGLE_SCOPES):
creds = None
if not creds or not creds.valid: if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token: if creds and creds.expired and creds.refresh_token:

@ -14,6 +14,7 @@ from app.config import get_settings
from app.gmail_agent import GmailTriageAgent from app.gmail_agent import GmailTriageAgent
from app.google_clients import build_calendar_service, build_gmail_service from app.google_clients import build_calendar_service, build_gmail_service
from app.llm_classifier import LLMEmailClassifier from app.llm_classifier import LLMEmailClassifier
from app.unsubscribe_agent import UnsubscribeDigestAgent
settings = get_settings() settings = get_settings()
logging.basicConfig(level=getattr(logging, settings.log_level.upper(), logging.INFO)) logging.basicConfig(level=getattr(logging, settings.log_level.upper(), logging.INFO))
@ -22,6 +23,7 @@ logger = logging.getLogger("personal-agent")
app = FastAPI(title="Personal Agent", version="0.1.0") app = FastAPI(title="Personal Agent", version="0.1.0")
scheduler: AsyncIOScheduler | None = None scheduler: AsyncIOScheduler | None = None
scan_lock: asyncio.Lock | None = None scan_lock: asyncio.Lock | None = None
unsubscribe_lock: asyncio.Lock | None = None
llm_key_warning_logged = False llm_key_warning_logged = False
@ -53,6 +55,14 @@ class AvailabilityResponse(BaseModel):
checked_calendars: list[str] checked_calendars: list[str]
class UnsubscribeDigestResponse(BaseModel):
    """API response summarizing one unsubscribe-digest run (scheduled or manual)."""
    # Number of Gmail messages matched by the unsubscribe query and scanned.
    scanned_messages: int
    # Distinct normalized unsubscribe links extracted in this run.
    extracted_unique_links: int
    # Of those, how many had not been sent in any previous digest.
    new_links: int
    # Recipient the digest was emailed to, or None when no email was sent.
    sent_to: str | None
    # Whether a digest email was actually sent this run.
    email_sent: bool
def verify_api_key( def verify_api_key(
x_api_key: Annotated[str | None, Header(alias="X-API-Key")] = None, x_api_key: Annotated[str | None, Header(alias="X-API-Key")] = None,
authorization: Annotated[str | None, Header()] = None, authorization: Annotated[str | None, Header()] = None,
@ -91,6 +101,26 @@ def _run_scan_once(max_results: int) -> ScanResponse:
) )
def _run_unsubscribe_digest_once(max_results: int) -> UnsubscribeDigestResponse:
    """Run one synchronous unsubscribe-digest pass and return its summary.

    Builds a fresh Gmail service and agent per call, runs the scan/send cycle,
    and converts the agent's result into the API response model. Intended to be
    executed off the event loop via ``asyncio.to_thread``.
    """
    # Clamp to the Gmail list API's practical page limit regardless of input.
    bounded_max_results = max(1, min(max_results, 500))
    gmail_service = build_gmail_service(settings)
    unsubscribe_agent = UnsubscribeDigestAgent(
        gmail_service=gmail_service,
        query=settings.unsubscribe_query,
        state_file=settings.unsubscribe_state_file,
        recipient_email=settings.unsubscribe_digest_recipient,
        send_empty_digest=settings.unsubscribe_send_empty_digest,
    )
    result = unsubscribe_agent.scan_and_send_digest(max_results=bounded_max_results)
    # Mirror the agent's dataclass field-for-field into the response model.
    return UnsubscribeDigestResponse(
        scanned_messages=result.scanned_messages,
        extracted_unique_links=result.extracted_unique_links,
        new_links=result.new_links,
        sent_to=result.sent_to,
        email_sent=result.email_sent,
    )
def _build_llm_classifier() -> LLMEmailClassifier | None: def _build_llm_classifier() -> LLMEmailClassifier | None:
global llm_key_warning_logged global llm_key_warning_logged
@ -127,6 +157,13 @@ def _get_scan_lock() -> asyncio.Lock:
return scan_lock return scan_lock
def _get_unsubscribe_lock() -> asyncio.Lock:
    """Lazily create and return the module-wide unsubscribe-digest lock.

    Mirrors the scan lock's lazy initialization; the lock serializes scheduled
    and manual digest runs.
    """
    global unsubscribe_lock
    if unsubscribe_lock is None:
        unsubscribe_lock = asyncio.Lock()
    return unsubscribe_lock
async def _scheduled_scan() -> None: async def _scheduled_scan() -> None:
lock = _get_scan_lock() lock = _get_scan_lock()
if lock.locked(): if lock.locked():
@ -141,10 +178,27 @@ async def _scheduled_scan() -> None:
logger.exception("Scheduled scan failed") logger.exception("Scheduled scan failed")
async def _scheduled_unsubscribe_digest() -> None:
    """Scheduler tick: run the unsubscribe digest unless one is already running."""
    lock = _get_unsubscribe_lock()
    # Skip overlapping runs instead of queueing them behind the lock.
    if lock.locked():
        logger.info("Previous unsubscribe digest still running, skipping this tick.")
        return
    async with lock:
        try:
            # The Gmail client is blocking; keep it off the event loop.
            result = await asyncio.to_thread(
                _run_unsubscribe_digest_once, settings.unsubscribe_max_results
            )
            logger.info("Scheduled unsubscribe digest complete: %s", result.model_dump())
        except Exception:
            # Log and swallow so one failed run never kills the scheduler job.
            logger.exception("Scheduled unsubscribe digest failed")
@app.on_event("startup") @app.on_event("startup")
async def startup_event() -> None: async def startup_event() -> None:
global scheduler global scheduler
_get_scan_lock() _get_scan_lock()
_get_unsubscribe_lock()
scheduler = AsyncIOScheduler() scheduler = AsyncIOScheduler()
scheduler.add_job( scheduler.add_job(
_scheduled_scan, _scheduled_scan,
@ -152,9 +206,17 @@ async def startup_event() -> None:
minutes=settings.gmail_scan_interval_minutes, minutes=settings.gmail_scan_interval_minutes,
next_run_time=datetime.now(), next_run_time=datetime.now(),
) )
scheduler.add_job(
_scheduled_unsubscribe_digest,
"interval",
minutes=settings.unsubscribe_digest_interval_minutes,
next_run_time=datetime.now(),
)
scheduler.start() scheduler.start()
logger.info( logger.info(
"Scheduler started (interval=%s min)", settings.gmail_scan_interval_minutes "Scheduler started (scan interval=%s min, unsubscribe interval=%s min)",
settings.gmail_scan_interval_minutes,
settings.unsubscribe_digest_interval_minutes,
) )
@ -169,6 +231,7 @@ def health() -> dict[str, object]:
return { return {
"status": "ok", "status": "ok",
"scan_interval_minutes": settings.gmail_scan_interval_minutes, "scan_interval_minutes": settings.gmail_scan_interval_minutes,
"unsubscribe_digest_interval_minutes": settings.unsubscribe_digest_interval_minutes,
} }
@ -227,3 +290,26 @@ async def availability(request: AvailabilityRequest) -> AvailabilityResponse:
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Availability lookup failed: {exc}", detail=f"Availability lookup failed: {exc}",
) from exc ) from exc
@app.post(
    "/unsubscribe-digest",
    response_model=UnsubscribeDigestResponse,
    dependencies=[Depends(verify_api_key)],
)
async def unsubscribe_digest_now(
    max_results: int = Query(default=settings.unsubscribe_max_results, ge=1, le=500),
) -> UnsubscribeDigestResponse:
    """Trigger an unsubscribe digest on demand (API-key protected).

    Shares the scheduler's lock so manual and scheduled runs never overlap;
    a manual call waits for an in-flight scheduled run to finish.
    """
    async with _get_unsubscribe_lock():
        try:
            return await asyncio.to_thread(_run_unsubscribe_digest_once, max_results)
        except FileNotFoundError as exc:
            # Surfaced with its own message — presumably a missing credentials
            # or token file; NOTE(review): confirm the intended source.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=str(exc),
            ) from exc
        except Exception as exc:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Unsubscribe digest failed: {exc}",
            ) from exc

@ -0,0 +1,282 @@
from __future__ import annotations
import base64
from dataclasses import dataclass
from datetime import datetime, timezone
from email.message import EmailMessage
import html
import json
import logging
from pathlib import Path
import re
from typing import Any
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
logger = logging.getLogger("personal-agent.unsubscribe")

# Query-string parameters stripped during URL normalization so the same
# unsubscribe link carrying different tracking tags deduplicates to one entry.
TRACKING_QUERY_KEYS = {
    "fbclid",
    "gclid",
    "mc_cid",
    "mc_eid",
    "_hsenc",
    "_hsmi",
    "utm_campaign",
    "utm_content",
    "utm_id",
    "utm_medium",
    "utm_name",
    "utm_source",
    "utm_term",
}

# Substrings that mark a URL found in message text as a likely
# unsubscribe/opt-out link (compared case-insensitively by the caller).
UNSUBSCRIBE_HINTS = {
    "unsubscribe",
    "optout",
    "opt-out",
    "email-preferences",
    "manage-subscriptions",
}

# Matches http(s) URLs in decoded message text; excludes whitespace, quotes,
# angle brackets, and parentheses so surrounding markup is not captured.
URL_PATTERN = re.compile(r"https?://[^\s<>'\"()]+", re.IGNORECASE)
@dataclass(frozen=True)
class UnsubscribeDigestResult:
    """Immutable summary of one digest run, produced by scan_and_send_digest."""
    # Number of Gmail messages matched by the query and inspected.
    scanned_messages: int
    # Distinct normalized unsubscribe links extracted across those messages.
    extracted_unique_links: int
    # Subset of extracted links not already recorded in the state file.
    new_links: int
    # Address the digest was emailed to, or None if nothing was sent.
    sent_to: str | None
    # True when a digest email was sent (including empty digests, if enabled).
    email_sent: bool
class UnsubscribeDigestAgent:
    """Scans a Gmail query for unsubscribe links and emails a deduplicated digest.

    Links are taken from the ``List-Unsubscribe`` header and from http(s) URLs
    in message bodies, normalized, deduplicated against a persisted JSON state
    file, and any links never sent before are emailed as a plain-text digest.
    """

    def __init__(
        self,
        *,
        gmail_service: Any,
        query: str,
        state_file: str,
        recipient_email: str | None = None,
        send_empty_digest: bool = False,
    ) -> None:
        # Authenticated Gmail API resource; typed Any to keep this module
        # decoupled from the googleapiclient types.
        self.gmail_service = gmail_service
        # Gmail search query selecting the messages to scan.
        self.query = query
        # JSON file recording links already sent in previous digests.
        self.state_file = Path(state_file)
        # Explicit recipient; when None, the account's own address is resolved.
        self.recipient_email = recipient_email
        # When True, a digest is sent even if no new links were found.
        self.send_empty_digest = send_empty_digest

    def scan_and_send_digest(self, max_results: int = 500) -> UnsubscribeDigestResult:
        """Run one digest cycle: scan, dedupe against state, send, persist.

        Scans up to ``max_results`` matching messages, extracts and normalizes
        unsubscribe links, emails only the links not present in the state file,
        then persists them. Raises on Gmail API failures and when no recipient
        can be resolved.
        """
        messages = (
            self.gmail_service.users()
            .messages()
            .list(userId="me", q=self.query, maxResults=max_results)
            .execute()
            .get("messages", [])
        )
        extracted_links: set[str] = set()
        for message in messages:
            extracted_links.update(self._extract_links_from_message(message["id"]))
        sent_links = self._load_sent_links()
        # Sorted for a stable, readable ordering in the digest email.
        new_links = sorted(link for link in extracted_links if link not in sent_links)
        should_send = bool(new_links) or self.send_empty_digest
        sent_to: str | None = None
        email_sent = False
        if should_send:
            sent_to = self._resolve_recipient_email()
            self._send_digest_email(
                recipient_email=sent_to,
                new_links=new_links,
                scanned_messages=len(messages),
            )
            email_sent = True
        if new_links:
            # Only reached when the send above succeeded (exceptions propagate),
            # so a failed send leaves state untouched and the links are retried.
            sent_links.update(new_links)
            self._save_sent_links(sent_links)
        return UnsubscribeDigestResult(
            scanned_messages=len(messages),
            extracted_unique_links=len(extracted_links),
            new_links=len(new_links),
            sent_to=sent_to,
            email_sent=email_sent,
        )

    def _extract_links_from_message(self, message_id: str) -> set[str]:
        """Return all normalized unsubscribe links found in one message.

        Combines List-Unsubscribe header links with body URLs that contain an
        unsubscribe-style hint.
        """
        message = (
            self.gmail_service.users()
            .messages()
            .get(userId="me", id=message_id, format="full")
            .execute()
        )
        payload = message.get("payload", {})
        links: set[str] = set()
        # Header links are unsubscribe-specific by definition — take them as-is.
        for url in self._extract_list_unsubscribe_links(payload):
            normalized = _normalize_url(url)
            if normalized:
                links.add(normalized)
        # Body links must additionally look like unsubscribe links to be kept.
        for text_block in self._extract_text_blocks(payload):
            for url in URL_PATTERN.findall(html.unescape(text_block)):
                if not _looks_like_unsubscribe(url):
                    continue
                normalized = _normalize_url(url)
                if normalized:
                    links.add(normalized)
        return links

    def _extract_list_unsubscribe_links(self, payload: dict[str, Any]) -> set[str]:
        """Extract http(s) URLs from the message's ``List-Unsubscribe`` header.

        Prefers angle-bracketed entries (``<url>, <mailto:...>``); non-http
        entries such as mailto: are ignored. Falls back to splitting the header
        on commas when no bracketed entries were found.
        """
        headers = {
            header.get("name", "").lower(): header.get("value", "")
            for header in payload.get("headers", [])
        }
        header_value = headers.get("list-unsubscribe", "")
        if not header_value:
            return set()
        results: set[str] = set()
        for candidate in re.findall(r"<([^>]+)>", header_value):
            candidate = candidate.strip()
            if candidate.lower().startswith(("http://", "https://")):
                results.add(candidate)
        if not results:
            for token in header_value.split(","):
                candidate = token.strip().strip("<>").strip()
                if candidate.lower().startswith(("http://", "https://")):
                    results.add(candidate)
        return results

    def _extract_text_blocks(self, payload: dict[str, Any]) -> list[str]:
        """Decode and return every text/plain and text/html body in the payload.

        Walks the (possibly nested) MIME part tree depth-first; parts whose
        base64 body fails to decode are skipped.
        """
        blocks: list[str] = []

        def walk(part: dict[str, Any]) -> None:
            # Each Gmail part carries its own mimeType and base64url body data.
            mime_type = part.get("mimeType", "")
            body_data = part.get("body", {}).get("data")
            if body_data and mime_type in {"text/plain", "text/html"}:
                decoded = _decode_base64(body_data)
                if decoded:
                    blocks.append(decoded)
            for child in part.get("parts", []):
                walk(child)

        walk(payload)
        return blocks

    def _resolve_recipient_email(self) -> str:
        """Return the digest recipient, defaulting to the account's own address.

        Raises RuntimeError when no explicit recipient is configured and the
        Gmail profile lookup yields no address.
        """
        if self.recipient_email:
            return self.recipient_email
        profile = self.gmail_service.users().getProfile(userId="me").execute()
        email = profile.get("emailAddress", "").strip()
        if not email:
            raise RuntimeError(
                "Could not resolve recipient email. Set UNSUBSCRIBE_DIGEST_RECIPIENT in .env."
            )
        return email

    def _send_digest_email(
        self,
        *,
        recipient_email: str,
        new_links: list[str],
        scanned_messages: int,
    ) -> None:
        """Compose and send the plain-text digest via the Gmail send API."""
        now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        subject = f"Unsubscribe recap: {len(new_links)} new link(s)"
        body_lines = [
            "Here is your unsubscribe digest.",
            "",
            f"Generated at: {now_utc}",
            f"Advertising messages scanned: {scanned_messages}",
            f"New unsubscribe links: {len(new_links)}",
            "",
        ]
        if new_links:
            body_lines.append("Links:")
            body_lines.extend([f"- {link}" for link in new_links])
        else:
            # Only reachable when send_empty_digest is enabled.
            body_lines.append("No new unsubscribe links found.")
        message = EmailMessage()
        # "From" is left unset; the Gmail API sends as the authenticated user.
        message["To"] = recipient_email
        message["Subject"] = subject
        message.set_content("\n".join(body_lines))
        # The send endpoint expects the full MIME message base64url-encoded
        # in the "raw" field.
        raw = base64.urlsafe_b64encode(message.as_bytes()).decode("utf-8")
        (
            self.gmail_service.users()
            .messages()
            .send(userId="me", body={"raw": raw})
            .execute()
        )

    def _load_sent_links(self) -> set[str]:
        """Load the set of previously sent links from the state file.

        A missing file, invalid JSON, or a malformed ``sent_links`` entry
        degrades to an empty set (the digest may then repeat links rather
        than crash).
        """
        if not self.state_file.exists():
            return set()
        try:
            payload = json.loads(self.state_file.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            logger.warning("State file is invalid JSON: %s", self.state_file)
            return set()
        links = payload.get("sent_links", [])
        if not isinstance(links, list):
            return set()
        # Coerce entries to strings and drop blanks.
        return {str(link) for link in links if str(link).strip()}

    def _save_sent_links(self, links: set[str]) -> None:
        """Persist the full set of sent links as sorted JSON, creating parent dirs."""
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        self.state_file.write_text(
            json.dumps({"sent_links": sorted(links)}, indent=2),
            encoding="utf-8",
        )
def _decode_base64(data: str) -> str:
padded = data + "=" * (-len(data) % 4)
try:
return base64.urlsafe_b64decode(padded.encode("utf-8")).decode(
"utf-8", errors="replace"
)
except Exception:
return ""
def _looks_like_unsubscribe(url: str) -> bool:
    """Return True when the URL contains any known unsubscribe/opt-out hint."""
    candidate = url.lower()
    for hint in UNSUBSCRIBE_HINTS:
        if hint in candidate:
            return True
    return False
def _normalize_url(url: str) -> str | None:
cleaned = url.strip().strip(".,;)")
split = urlsplit(cleaned)
if split.scheme.lower() not in {"http", "https"} or not split.netloc:
return None
scheme = split.scheme.lower()
netloc = split.netloc.lower()
path = split.path or "/"
if path != "/":
path = path.rstrip("/")
query_pairs = parse_qsl(split.query, keep_blank_values=True)
filtered_pairs = [
(key, value)
for key, value in query_pairs
if key.lower() not in TRACKING_QUERY_KEYS
]
query = urlencode(filtered_pairs, doseq=True)
return urlunsplit((scheme, netloc, path, query, ""))
Loading…
Cancel
Save