Added openai classifier

master
oabrivard 1 week ago
parent a14b02ad3c
commit d9c7497acb

@ -1,6 +1,11 @@
GOOGLE_CLIENT_SECRETS_FILE=credentials.json GOOGLE_CLIENT_SECRETS_FILE=credentials.json
GOOGLE_TOKEN_FILE=token.json GOOGLE_TOKEN_FILE=token.json
AGENT_API_KEY=change-me AGENT_API_KEY=change-me
LLM_API_KEY=
LLM_MODEL=gpt-4.1-mini
LLM_BASE_URL=
LLM_TIMEOUT_SECONDS=20
LLM_FALLBACK_TO_RULES=false
GMAIL_SCAN_INTERVAL_MINUTES=5 GMAIL_SCAN_INTERVAL_MINUTES=5
GMAIL_QUERY=in:inbox -label:AgentProcessed newer_than:7d GMAIL_QUERY=in:inbox -label:AgentProcessed newer_than:7d
LOG_LEVEL=INFO LOG_LEVEL=INFO

@ -3,6 +3,7 @@
This project runs a small local API service that: This project runs a small local API service that:
- scans new Gmail inbox messages - scans new Gmail inbox messages
- classifies emails with an LLM as `LINKEDIN`, `ADVERTISING`, or `OTHER`
- moves LinkedIn emails to a `LinkedIn` label/folder - moves LinkedIn emails to a `LinkedIn` label/folder
- moves advertising emails to an `Advertising` label/folder - moves advertising emails to an `Advertising` label/folder
- exposes a secure availability endpoint powered by Google Calendar free/busy - exposes a secure availability endpoint powered by Google Calendar free/busy
@ -11,6 +12,7 @@ This project runs a small local API service that:
- Python 3.11+ - Python 3.11+
- A Google account - A Google account
- An OpenAI-compatible API key for the LLM classifier
- A Google Cloud project with: - A Google Cloud project with:
- Gmail API enabled - Gmail API enabled
- Google Calendar API enabled - Google Calendar API enabled
@ -37,6 +39,7 @@ cp .env.example .env
Edit `.env` and set: Edit `.env` and set:
- `AGENT_API_KEY` to a strong secret for agent-to-agent calls - `AGENT_API_KEY` to a strong secret for agent-to-agent calls
- `LLM_API_KEY` and optional `LLM_MODEL` / `LLM_BASE_URL`
- optional scan frequency and Gmail query - optional scan frequency and Gmail query
## 4) Run ## 4) Run
@ -79,8 +82,9 @@ If `available` is `true`, there are no busy slots in that range.
## Classification behavior ## Classification behavior
- LinkedIn detection: sender or subject contains `linkedin` (LinkedIn has priority). - LLM classification is used for each email (`LINKEDIN`, `ADVERTISING`, `OTHER`).
- Advertising detection: Gmail promotion category, `List-Unsubscribe`, `Precedence: bulk/list/junk`, common promo keywords, and marketing sender hints. - LinkedIn has priority over advertising inside the classifier prompt.
- Set `LLM_FALLBACK_TO_RULES=true` only if you want a rules-based backup when LLM calls fail; when it is `false`, an email whose LLM call fails is treated as `OTHER`.
- Every scanned message gets an `AgentProcessed` label to avoid reprocessing loops. - Every scanned message gets an `AgentProcessed` label to avoid reprocessing loops.
## Notes ## Notes

@ -18,10 +18,16 @@ class Settings:
gmail_scan_interval_minutes: int gmail_scan_interval_minutes: int
gmail_query: str gmail_query: str
agent_api_key: str agent_api_key: str
llm_api_key: str
llm_model: str
llm_base_url: str | None
llm_timeout_seconds: float
llm_fallback_to_rules: bool
log_level: str log_level: str
def get_settings() -> Settings: def get_settings() -> Settings:
llm_base_url = os.getenv("LLM_BASE_URL", "").strip()
return Settings( return Settings(
google_client_secrets_file=os.getenv("GOOGLE_CLIENT_SECRETS_FILE", "credentials.json"), google_client_secrets_file=os.getenv("GOOGLE_CLIENT_SECRETS_FILE", "credentials.json"),
google_token_file=os.getenv("GOOGLE_TOKEN_FILE", "token.json"), google_token_file=os.getenv("GOOGLE_TOKEN_FILE", "token.json"),
@ -30,5 +36,14 @@ def get_settings() -> Settings:
"GMAIL_QUERY", "in:inbox -label:AgentProcessed newer_than:7d" "GMAIL_QUERY", "in:inbox -label:AgentProcessed newer_than:7d"
), ),
agent_api_key=os.getenv("AGENT_API_KEY", ""), agent_api_key=os.getenv("AGENT_API_KEY", ""),
llm_api_key=os.getenv("LLM_API_KEY", ""),
llm_model=os.getenv("LLM_MODEL", "gpt-4.1-mini"),
llm_base_url=llm_base_url or None,
llm_timeout_seconds=float(os.getenv("LLM_TIMEOUT_SECONDS", "20")),
llm_fallback_to_rules=_as_bool(os.getenv("LLM_FALLBACK_TO_RULES", "false")),
log_level=os.getenv("LOG_LEVEL", "INFO"), log_level=os.getenv("LOG_LEVEL", "INFO"),
) )
def _as_bool(value: str) -> bool:
return value.strip().lower() in {"1", "true", "yes", "on"}

@ -5,6 +5,7 @@ from email.utils import parseaddr
import logging import logging
from typing import Any from typing import Any
from app.llm_classifier import LLMEmailClassifier
METADATA_HEADERS = [ METADATA_HEADERS = [
"From", "From",
@ -49,9 +50,18 @@ class ScanResult:
class GmailTriageAgent: class GmailTriageAgent:
def __init__(self, gmail_service: Any, query: str) -> None: def __init__(
self,
gmail_service: Any,
query: str,
*,
classifier: LLMEmailClassifier | None = None,
fallback_to_rules: bool = True,
) -> None:
self.gmail_service = gmail_service self.gmail_service = gmail_service
self.query = query self.query = query
self.classifier = classifier
self.fallback_to_rules = fallback_to_rules
def ensure_labels(self) -> dict[str, str]: def ensure_labels(self) -> dict[str, str]:
labels_response = ( labels_response = (
@ -134,24 +144,28 @@ class GmailTriageAgent:
sender = headers.get("from", "") sender = headers.get("from", "")
subject = headers.get("subject", "") subject = headers.get("subject", "")
snippet = message.get("snippet", "")
list_unsubscribe = headers.get("list-unsubscribe", "")
precedence = headers.get("precedence", "")
should_linkedin = self._is_linkedin_email(sender=sender, subject=subject) label = self._classify_email(
should_advertising = self._is_advertising_email( message_id=message_id,
sender=sender, sender=sender,
subject=subject, subject=subject,
list_unsubscribe=headers.get("list-unsubscribe", ""), snippet=snippet,
precedence=headers.get("precedence", ""), list_unsubscribe=list_unsubscribe,
precedence=precedence,
message_label_ids=label_ids, message_label_ids=label_ids,
) )
add_labels = [label_by_name["AgentProcessed"]] add_labels = [label_by_name["AgentProcessed"]]
remove_labels = [] remove_labels = []
if should_linkedin: if label == "LINKEDIN":
add_labels.insert(0, label_by_name["LinkedIn"]) add_labels.insert(0, label_by_name["LinkedIn"])
remove_labels.append("INBOX") remove_labels.append("INBOX")
outcome = "linkedin" outcome = "linkedin"
elif should_advertising: elif label == "ADVERTISING":
add_labels.insert(0, label_by_name["Advertising"]) add_labels.insert(0, label_by_name["Advertising"])
remove_labels.append("INBOX") remove_labels.append("INBOX")
outcome = "advertising" outcome = "advertising"
@ -177,6 +191,53 @@ class GmailTriageAgent:
logger.exception("Failed to route message %s", message_id) logger.exception("Failed to route message %s", message_id)
return "failed" return "failed"
def _classify_email(
self,
*,
message_id: str,
sender: str,
subject: str,
snippet: str,
list_unsubscribe: str,
precedence: str,
message_label_ids: set[str],
) -> str:
if self.classifier:
try:
llm_result = self.classifier.classify(
sender=sender,
subject=subject,
snippet=snippet,
list_unsubscribe=list_unsubscribe,
precedence=precedence,
message_label_ids=message_label_ids,
)
logger.info(
"Message %s classified by LLM as %s (confidence=%.2f)",
message_id,
llm_result.label,
llm_result.confidence,
)
return llm_result.label
except Exception:
logger.exception("LLM classification failed for %s", message_id)
if not self.fallback_to_rules:
return "OTHER"
if self.fallback_to_rules:
if self._is_linkedin_email(sender=sender, subject=subject):
return "LINKEDIN"
if self._is_advertising_email(
sender=sender,
subject=subject,
list_unsubscribe=list_unsubscribe,
precedence=precedence,
message_label_ids=message_label_ids,
):
return "ADVERTISING"
return "OTHER"
def _is_linkedin_email(self, sender: str, subject: str) -> bool: def _is_linkedin_email(self, sender: str, subject: str) -> bool:
sender_lower = sender.lower() sender_lower = sender.lower()
subject_lower = subject.lower() subject_lower = subject.lower()

@ -0,0 +1,120 @@
from __future__ import annotations
from dataclasses import dataclass
import json
import logging
import re
from openai import OpenAI
# Module-level logger for the LLM classifier.
logger = logging.getLogger("personal-agent.llm")
# Closed set of labels a classification may carry; any other label returned by
# the model is replaced with "OTHER" by LLMEmailClassifier.classify.
ALLOWED_LABELS = {"LINKEDIN", "ADVERTISING", "OTHER"}
# System prompt sent with every classification request. It defines the three
# labels, gives LinkedIn priority over advertising, prefers OTHER when unsure,
# and asks for a JSON-only reply with "label", "confidence" and "reason".
# The text is part of the program's behavior: edit it deliberately.
SYSTEM_PROMPT = """You classify incoming emails into exactly one label:
- LINKEDIN: official LinkedIn platform emails (job alerts, invites, network updates, LinkedIn newsletters).
- ADVERTISING: marketing/promotional/sales emails, newsletters, coupons, deals, brand campaigns.
- OTHER: anything else.
Rules:
1) If sender/content clearly belongs to LinkedIn, choose LINKEDIN even if promotional.
2) If uncertain between ADVERTISING and OTHER, choose OTHER.
3) Return only JSON with this schema:
{"label":"LINKEDIN|ADVERTISING|OTHER","confidence":0.0-1.0,"reason":"short reason"}"""
@dataclass(frozen=True)
class LLMClassification:
    """Immutable outcome of classifying a single email with the LLM.

    Attributes:
        label: Classification label chosen for the email.
        confidence: Confidence score reported for the label.
        reason: Short explanation reported for the label.
    """

    label: str
    confidence: float
    reason: str
class LLMEmailClassifier:
    """Classify emails through an OpenAI-compatible chat completions client."""

    def __init__(
        self,
        *,
        api_key: str,
        model: str,
        base_url: str | None = None,
        timeout_seconds: float = 20.0,
    ) -> None:
        """Create the API client used for classification.

        Args:
            api_key: API key handed to the ``OpenAI`` client.
            model: Model name sent with every request.
            base_url: Optional base URL passed through to the client.
            timeout_seconds: Timeout passed through to the client.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("LLM API key is required for LLM classification.")

        self.model = model
        client_options = {
            "api_key": api_key,
            "base_url": base_url,
            "timeout": timeout_seconds,
        }
        self.client = OpenAI(**client_options)

    def classify(
        self,
        *,
        sender: str,
        subject: str,
        snippet: str,
        list_unsubscribe: str,
        precedence: str,
        message_label_ids: set[str],
    ) -> LLMClassification:
        """Ask the model for a label for one email.

        The email metadata is serialized to JSON and sent as the user message
        next to ``SYSTEM_PROMPT``. Exceptions raised by the client call are
        not caught here.

        Args:
            sender: Value of the ``From`` header.
            subject: Value of the ``Subject`` header.
            snippet: Message snippet.
            list_unsubscribe: ``List-Unsubscribe`` header value; only whether
                it is non-blank is sent to the model.
            precedence: Value of the ``Precedence`` header.
            message_label_ids: Gmail label ids on the message; sent sorted.

        Returns:
            An ``LLMClassification`` whose label is one of ``ALLOWED_LABELS``
            (anything else is logged and replaced with ``"OTHER"``).
        """
        user_message = json.dumps(
            {
                "sender": sender,
                "subject": subject,
                "snippet": snippet,
                "list_unsubscribe_present": bool(list_unsubscribe.strip()),
                "precedence": precedence,
                "gmail_label_ids": sorted(message_label_ids),
            },
            ensure_ascii=True,
        )

        response = self.client.chat.completions.create(
            model=self.model,
            temperature=0,
            response_format={"type": "json_object"},
            max_tokens=120,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
        )

        # An empty reply is parsed as an empty object, so the defaults apply.
        raw_reply = response.choices[0].message.content or "{}"
        fields = _parse_json(raw_reply)

        label = str(fields.get("label", "OTHER")).upper().strip()
        if label not in ALLOWED_LABELS:
            logger.warning("Unexpected LLM label '%s', falling back to OTHER.", label)
            label = "OTHER"

        return LLMClassification(
            label=label,
            confidence=_to_confidence(fields.get("confidence", 0.0)),
            reason=str(fields.get("reason", "")).strip(),
        )
def _parse_json(content: str) -> dict:
if not content:
return {}
try:
return json.loads(content)
except json.JSONDecodeError:
match = re.search(r"\{.*\}", content, re.DOTALL)
if not match:
return {}
try:
return json.loads(match.group(0))
except json.JSONDecodeError:
return {}
def _to_confidence(raw_value: object) -> float:
try:
confidence = float(raw_value)
except (TypeError, ValueError):
return 0.0
if confidence < 0:
return 0.0
if confidence > 1:
return 1.0
return confidence

@ -13,6 +13,7 @@ from app.calendar_agent import CalendarAvailabilityAgent
from app.config import get_settings from app.config import get_settings
from app.gmail_agent import GmailTriageAgent from app.gmail_agent import GmailTriageAgent
from app.google_clients import build_calendar_service, build_gmail_service from app.google_clients import build_calendar_service, build_gmail_service
from app.llm_classifier import LLMEmailClassifier
settings = get_settings() settings = get_settings()
logging.basicConfig(level=getattr(logging, settings.log_level.upper(), logging.INFO)) logging.basicConfig(level=getattr(logging, settings.log_level.upper(), logging.INFO))
@ -21,6 +22,7 @@ logger = logging.getLogger("personal-agent")
app = FastAPI(title="Personal Agent", version="0.1.0") app = FastAPI(title="Personal Agent", version="0.1.0")
scheduler: AsyncIOScheduler | None = None scheduler: AsyncIOScheduler | None = None
scan_lock: asyncio.Lock | None = None scan_lock: asyncio.Lock | None = None
llm_key_warning_logged = False
class ScanResponse(BaseModel): class ScanResponse(BaseModel):
@ -73,7 +75,12 @@ def verify_api_key(
def _run_scan_once(max_results: int) -> ScanResponse: def _run_scan_once(max_results: int) -> ScanResponse:
gmail_service = build_gmail_service(settings) gmail_service = build_gmail_service(settings)
gmail_agent = GmailTriageAgent(gmail_service=gmail_service, query=settings.gmail_query) gmail_agent = GmailTriageAgent(
gmail_service=gmail_service,
query=settings.gmail_query,
classifier=_build_llm_classifier(),
fallback_to_rules=settings.llm_fallback_to_rules,
)
result = gmail_agent.scan_and_route_messages(max_results=max_results) result = gmail_agent.scan_and_route_messages(max_results=max_results)
return ScanResponse( return ScanResponse(
scanned=result.scanned, scanned=result.scanned,
@ -84,6 +91,35 @@ def _run_scan_once(max_results: int) -> ScanResponse:
) )
def _build_llm_classifier() -> LLMEmailClassifier | None:
    """Build the LLM classifier described by the application settings.

    Returns:
        A configured ``LLMEmailClassifier``, or ``None`` when the rules
        fallback is enabled and either no API key is set or the classifier
        could not be constructed.

    Raises:
        RuntimeError: If no API key is set and the rules fallback is disabled.
        Exception: Any construction error, re-raised when the rules fallback
            is disabled.
    """
    global llm_key_warning_logged

    if settings.llm_api_key:
        try:
            return LLMEmailClassifier(
                api_key=settings.llm_api_key,
                model=settings.llm_model,
                base_url=settings.llm_base_url,
                timeout_seconds=settings.llm_timeout_seconds,
            )
        except Exception:
            if not settings.llm_fallback_to_rules:
                raise
            logger.exception("Could not initialize LLM classifier; using rules fallback.")
            return None

    # No API key from here on.
    if not settings.llm_fallback_to_rules:
        raise RuntimeError(
            "LLM_API_KEY is required when LLM_FALLBACK_TO_RULES is disabled."
        )

    # Warn only once per process; this function runs on every scan.
    if not llm_key_warning_logged:
        logger.warning(
            "LLM_API_KEY not set. Falling back to rules-based classification."
        )
        llm_key_warning_logged = True
    return None
def _get_scan_lock() -> asyncio.Lock: def _get_scan_lock() -> asyncio.Lock:
global scan_lock global scan_lock
if scan_lock is None: if scan_lock is None:

@ -3,5 +3,6 @@ fastapi
google-api-python-client google-api-python-client
google-auth google-auth
google-auth-oauthlib google-auth-oauthlib
openai
python-dotenv python-dotenv
uvicorn[standard] uvicorn[standard]

Loading…
Cancel
Save