Added VeilleTechno to Gmail classifier

3 months ago · 39d47b5570
parent 518658c96d
commit 39d47b5570
4 changed files with 31 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -3,9 +3,10 @@
 This project runs a small local API service that:

 - scans unread emails in the root Gmail inbox
- classifies emails with **Strands** (`LINKEDIN`, `ADVERTISING`, `OTHER`)
+- classifies emails with **Strands** (`LINKEDIN`, `ADVERTISING`, `VEILLE_TECHNO`, `OTHER`)
 - moves LinkedIn emails to a `LinkedIn` label/folder
 - moves advertising emails to an `Advertising` label/folder
+- moves veille techno (technology watch) emails to a `VeilleTechno` label/folder
 - scans the `Advertising` label and emails you new unsubscribe links (deduplicated)
 - discovers unsubscribe-ready mailing lists for human review, then auto-unsubscribes selected lists
 - exposes a secure availability endpoint powered by Google Calendar free/busy
@ -124,7 +125,7 @@ curl -X POST "http://127.0.0.1:8000/unsubscribe/auto-run?max_results=500" \

 - Scan scope is always forced to `in:inbox is:unread` (root inbox + unread).
 - `GMAIL_QUERY` is treated as additional filters (for example `-label:AgentProcessed`).
- Strands classification is used for each email (`LINKEDIN`, `ADVERTISING`, `OTHER`).
+- Strands classification is used for each email (`LINKEDIN`, `ADVERTISING`, `VEILLE_TECHNO`, `OTHER`).
 - LinkedIn has priority over advertising inside the classifier prompt.
 - Set `LLM_FALLBACK_TO_RULES=true` only if you want rules-based backup when LLM calls fail.
 - Every scanned message gets an `AgentProcessed` label to avoid reprocessing loops.
@ -150,6 +151,7 @@ curl -X POST "http://127.0.0.1:8000/unsubscribe/auto-run?max_results=500" \

 - Gmail "folders" are labels. This agent creates:
  - `LinkedIn`
+  - `VeilleTechno`
  - `Advertising`
  - `AgentProcessed`
- Messages classified as LinkedIn/Advertising are removed from `INBOX` (moved out of inbox).
+- Messages classified as LinkedIn/Advertising/VeilleTechno are removed from `INBOX` (moved out of inbox).
--- a/app/gmail_agent.py
+++ b/app/gmail_agent.py
@ -45,6 +45,7 @@ class ScanResult:
    scanned: int
    linkedin: int
    advertising: int
+    veille_techno: int
    skipped: int
    failed: int

@ -70,7 +71,7 @@ class GmailTriageAgent:
        labels = labels_response.get("labels", [])
        label_by_name = {label["name"]: label["id"] for label in labels}

-        for required_name in ("LinkedIn", "Advertising", "AgentProcessed"):
+        for required_name in ("LinkedIn", "Advertising", "AgentProcessed", "VeilleTechno"):
            if required_name not in label_by_name:
                created = (
                    self.gmail_service.users()
@ -102,6 +103,7 @@ class GmailTriageAgent:

        linkedin = 0
        advertising = 0
+        veille_techno = 0
        skipped = 0
        failed = 0

@ -124,6 +126,8 @@ class GmailTriageAgent:
                linkedin += 1
            elif outcome == "advertising":
                advertising += 1
+            elif outcome == "veille_techno":
+                veille_techno += 1
            elif outcome == "skipped":
                skipped += 1
            else:
@ -133,6 +137,7 @@ class GmailTriageAgent:
            scanned=len(inbox_messages),
            linkedin=linkedin,
            advertising=advertising,
+            veille_techno=veille_techno,
            skipped=skipped,
            failed=failed,
        )
@ -227,6 +232,10 @@ class GmailTriageAgent:
                add_labels.insert(0, label_by_name["Advertising"])
                remove_labels.append("INBOX")
                outcome = "advertising"
+            elif label == "VEILLE_TECHNO":
+                add_labels.insert(0, label_by_name["VeilleTechno"])
+                remove_labels.append("INBOX")
+                outcome = "veille_techno"
            else:
                outcome = "skipped"

@ -293,6 +302,8 @@ class GmailTriageAgent:
                message_label_ids=message_label_ids,
            ):
                return "ADVERTISING"
+            if self._is_veille_techno_email(sender=sender, subject=subject):
+                return "VEILLE_TECHNO"

        return "OTHER"

@ -331,3 +342,12 @@ class GmailTriageAgent:
            return True

        return any(hint in sender_lower for hint in AD_SENDER_HINTS)
+
+    def _is_veille_techno_email(self, sender: str, subject: str) -> bool:
+        sender_lower = sender.lower()
+        subject_lower = subject.lower()
+
+        if "cybernetica" in sender_lower or "cybernetica" in subject_lower:
+            return True
+
+        return False
--- a/app/main.py
+++ b/app/main.py
@ -36,6 +36,7 @@ class ScanResponse(BaseModel):
    scanned: int
    linkedin: int
    advertising: int
+    veille_techno: int
    skipped: int
    failed: int

@ -154,6 +155,7 @@ def _run_scan_once(max_results: int) -> ScanResponse:
        scanned=result.scanned,
        linkedin=result.linkedin,
        advertising=result.advertising,
+        veille_techno=result.veille_techno,
        skipped=result.skipped,
        failed=result.failed,
    )
--- a/app/strands_classifier.py
+++ b/app/strands_classifier.py
@ -10,18 +10,19 @@ from strands.models.openai import OpenAIModel

 logger = logging.getLogger("personal-agent.strands")

-ALLOWED_LABELS = {"LINKEDIN", "ADVERTISING", "OTHER"}
+ALLOWED_LABELS = {"LINKEDIN", "ADVERTISING", "VEILLE_TECHNO", "OTHER"}

 SYSTEM_PROMPT = """You classify incoming emails into exactly one label:
 - LINKEDIN: official LinkedIn platform emails (job alerts, invites, network updates, LinkedIn newsletters).
 - ADVERTISING: marketing/promotional/sales emails, newsletters, coupons, deals, brand campaigns. Do not label as ADVERTISING if the email is purely transactional (e.g. order confirmation, password reset) even if it contains some marketing language. Also do not label as ADVERTISING if the sender is Cybernetica. But if the sender is Cybernetica and the content is clearly promotional (e.g. "Check out our new product"), then label as ADVERTISING. And if the sender is Castorama and the content is about Communauté d'entraide, the label should be ADVERTISING.
+- VEILLE_TECHNO: Cybernetica emails that are clearly about technology watch, sharing interesting articles, insights, trends, etc. without a promotional angle.
 - OTHER: anything else.

 Rules:
 1) If sender/content clearly belongs to LinkedIn, choose LINKEDIN even if promotional.
 2) If uncertain between ADVERTISING and OTHER, choose OTHER.
 3) Return only JSON with this schema:
-{"label":"LINKEDIN|ADVERTISING|OTHER","confidence":0.0-1.0,"reason":"short reason"}"""
+{"label":"LINKEDIN|ADVERTISING|VEILLE_TECHNO|OTHER","confidence":0.0-1.0,"reason":"short reason"}"""


@dataclass(frozen=True)