You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

353 lines
11 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
from email.utils import parseaddr
import logging
from typing import Any
from app.strands_classifier import StrandsEmailClassifier
# Header names requested from the Gmail API when a message is fetched in
# "metadata" format; the routing rules read nothing else from the payload.
METADATA_HEADERS = ["From", "Subject", "List-Unsubscribe", "Precedence"]

# Lower-case fragments; a subject containing any of them (substring match) is
# treated as advertising by the rule-based classifier.
AD_SUBJECT_KEYWORDS = {
    "coupon",
    "deal",
    "discount",
    "limited time",
    "newsletter",
    "offer",
    "promo",
    "sale",
    "save",
    "special offer",
}

# Lower-case fragments; a "From" header containing any of them (substring
# match) is treated as advertising by the rule-based classifier.
AD_SENDER_HINTS = {
    "deals",
    "marketing",
    "newsletter",
    "no-reply",
    "noreply",
    "offers",
    "promotions",
}

# Logger shared by everything in this module.
logger = logging.getLogger("personal-agent.gmail")
@dataclass(frozen=True)
class ScanResult:
    """Immutable summary of one inbox scan.

    The five per-outcome counters partition the scanned messages, so they add
    up to ``scanned``.
    """

    # Number of messages returned by the Gmail search for this scan.
    scanned: int
    # Messages labelled "LinkedIn" and removed from the inbox.
    linkedin: int
    # Messages labelled "Advertising" and removed from the inbox.
    advertising: int
    # Messages labelled "VeilleTechno" and removed from the inbox.
    veille_techno: int
    # Messages that matched no category; only tagged "AgentProcessed".
    skipped: int
    # Messages whose routing did not produce one of the outcomes above.
    failed: int
class GmailTriageAgent:
def __init__(
self,
gmail_service: Any,
query: str,
*,
classifier: StrandsEmailClassifier | None = None,
fallback_to_rules: bool = True,
) -> None:
self.gmail_service = gmail_service
self.query = query
self.classifier = classifier
self.fallback_to_rules = fallback_to_rules
def ensure_labels(self) -> dict[str, str]:
labels_response = (
self.gmail_service.users().labels().list(userId="me").execute()
)
labels = labels_response.get("labels", [])
label_by_name = {label["name"]: label["id"] for label in labels}
for required_name in ("LinkedIn", "Advertising", "AgentProcessed", "VeilleTechno"):
if required_name not in label_by_name:
created = (
self.gmail_service.users()
.labels()
.create(
userId="me",
body={
"name": required_name,
"labelListVisibility": "labelShow",
"messageListVisibility": "show",
},
)
.execute()
)
label_by_name[required_name] = created["id"]
return label_by_name
def scan_and_route_messages(self, max_results: int = 100) -> ScanResult:
label_by_name = self.ensure_labels()
effective_query = self._build_effective_query()
inbox_response = (
self.gmail_service.users()
.messages()
.list(userId="me", q=effective_query, maxResults=max_results)
.execute()
)
inbox_messages = inbox_response.get("messages", [])
linkedin = 0
advertising = 0
veille_techno = 0
skipped = 0
failed = 0
logger.info(
"Gmail scan base_query='%s' effective_query='%s' matched=%s "
"(resultSizeEstimate=%s, max_results=%s)",
self.query,
effective_query,
len(inbox_messages),
inbox_response.get("resultSizeEstimate", 0),
max_results,
)
if not inbox_messages:
self._log_query_diagnostics(effective_query)
for message in inbox_messages:
outcome = self._route_message(message["id"], label_by_name)
logger.info("Message %s routed with outcome: %s", message["id"], outcome)
if outcome == "linkedin":
linkedin += 1
elif outcome == "advertising":
advertising += 1
elif outcome == "veille_techno":
veille_techno += 1
elif outcome == "skipped":
skipped += 1
else:
failed += 1
return ScanResult(
scanned=len(inbox_messages),
linkedin=linkedin,
advertising=advertising,
veille_techno=veille_techno,
skipped=skipped,
failed=failed,
)
def _build_effective_query(self) -> str:
# Hard requirement: scan only unread messages in root inbox.
base_query = (self.query or "").strip()
query_lower = base_query.lower()
required_terms = []
if "in:inbox" not in query_lower:
required_terms.append("in:inbox")
if "is:unread" not in query_lower:
required_terms.append("is:unread")
if not base_query:
return " ".join(required_terms)
if not required_terms:
return base_query
return f"{base_query} {' '.join(required_terms)}"
def _log_query_diagnostics(self, effective_query: str) -> None:
diagnostic_queries = [
"in:inbox",
"in:inbox is:unread",
"in:inbox newer_than:30d",
"in:inbox is:unread -label:AgentProcessed",
]
logger.info("No messages matched current query. Running diagnostics...")
for diagnostic_query in diagnostic_queries:
if diagnostic_query == effective_query:
continue
try:
response = (
self.gmail_service.users()
.messages()
.list(userId="me", q=diagnostic_query, maxResults=1)
.execute()
)
logger.info(
"Diagnostic query='%s' resultSizeEstimate=%s",
diagnostic_query,
response.get("resultSizeEstimate", 0),
)
except Exception:
logger.exception(
"Failed to run diagnostic query='%s'", diagnostic_query
)
def _route_message(self, message_id: str, label_by_name: dict[str, str]) -> str:
try:
message = (
self.gmail_service.users()
.messages()
.get(
userId="me",
id=message_id,
format="metadata",
metadataHeaders=METADATA_HEADERS,
)
.execute()
)
headers = {
h["name"].lower(): h["value"]
for h in message.get("payload", {}).get("headers", [])
}
label_ids = set(message.get("labelIds", []))
sender = headers.get("from", "")
subject = headers.get("subject", "")
snippet = message.get("snippet", "")
list_unsubscribe = headers.get("list-unsubscribe", "")
precedence = headers.get("precedence", "")
label = self._classify_email(
message_id=message_id,
sender=sender,
subject=subject,
snippet=snippet,
list_unsubscribe=list_unsubscribe,
precedence=precedence,
message_label_ids=label_ids,
)
add_labels = [label_by_name["AgentProcessed"]]
remove_labels = []
if label == "LINKEDIN":
add_labels.insert(0, label_by_name["LinkedIn"])
remove_labels.append("INBOX")
outcome = "linkedin"
elif label == "ADVERTISING":
add_labels.insert(0, label_by_name["Advertising"])
remove_labels.append("INBOX")
outcome = "advertising"
elif label == "VEILLE_TECHNO":
add_labels.insert(0, label_by_name["VeilleTechno"])
remove_labels.append("INBOX")
outcome = "veille_techno"
else:
outcome = "skipped"
(
self.gmail_service.users()
.messages()
.modify(
userId="me",
id=message_id,
body={
"addLabelIds": add_labels,
"removeLabelIds": remove_labels,
},
)
.execute()
)
return outcome
except Exception:
logger.exception("Failed to route message %s", message_id)
return "failed"
def _classify_email(
self,
*,
message_id: str,
sender: str,
subject: str,
snippet: str,
list_unsubscribe: str,
precedence: str,
message_label_ids: set[str],
) -> str:
if self.classifier:
try:
classifier_result = self.classifier.classify(
sender=sender,
subject=subject,
snippet=snippet,
list_unsubscribe=list_unsubscribe,
precedence=precedence,
message_label_ids=message_label_ids,
)
logger.info(
"Message %s classified by model as %s (confidence=%.2f)",
message_id,
classifier_result.label,
classifier_result.confidence,
)
return classifier_result.label
except Exception:
logger.exception("Model classification failed for %s", message_id)
if not self.fallback_to_rules:
return "OTHER"
if self.fallback_to_rules:
if self._is_linkedin_email(sender=sender, subject=subject):
return "LINKEDIN"
if self._is_advertising_email(
sender=sender,
subject=subject,
list_unsubscribe=list_unsubscribe,
precedence=precedence,
message_label_ids=message_label_ids,
):
return "ADVERTISING"
if self._is_veille_techno_email(sender=sender, subject=subject):
return "VEILLE_TECHNO"
return "OTHER"
def _is_linkedin_email(self, sender: str, subject: str) -> bool:
sender_lower = sender.lower()
subject_lower = subject.lower()
if "linkedin" in sender_lower or "linkedin" in subject_lower:
return True
parsed_address = parseaddr(sender)[1].lower()
return parsed_address.endswith("@linkedin.com")
def _is_advertising_email(
self,
sender: str,
subject: str,
list_unsubscribe: str,
precedence: str,
message_label_ids: set[str],
) -> bool:
sender_lower = sender.lower()
subject_lower = subject.lower()
precedence_lower = precedence.lower()
if "CATEGORY_PROMOTIONS" in message_label_ids:
return True
if list_unsubscribe.strip():
return True
if precedence_lower in {"bulk", "list", "junk"}:
return True
if any(keyword in subject_lower for keyword in AD_SUBJECT_KEYWORDS):
return True
return any(hint in sender_lower for hint in AD_SENDER_HINTS)
def _is_veille_techno_email(self, sender: str, subject: str) -> bool:
sender_lower = sender.lower()
subject_lower = subject.lower()
if "cybernetica" in sender_lower or "cybernetica" in subject_lower:
return True
return False