feat: add custom route to export data for LLM

2026-06-11 19:07:18 +04:00
parent b565224fcd
commit a000af5c8b
4 changed files with 215 additions and 0 deletions
@@ -77,6 +77,50 @@ ___
 ### `/paroles/count`
 - `GET` : Récupère le nombre de textes

+### `/paroles/export` ⚙️ Token requis
+- `GET` : Exporte les paroles et leurs traductions au format JSONL ou JSON pour l'entraînement de modèles LLM
+
+**Paramètres de requête :**
+
+| Paramètre | Valeurs acceptées | Défaut | Description |
+|-----------|-------------------|--------|-------------|
+| `type`    | `pairs` \| `instruct` | `pairs` | Format des exemples d'entraînement |
+| `lang`    | `fr,en,es,de,it` | toutes | Langues cibles à inclure (séparées par des virgules) |
+| `format`  | `jsonl` \| `json` | `jsonl` | Format de la réponse |
+
+**Type `pairs`** — corpus parallèle source/cible, adapté aux modèles de traduction :
+```json
+{"source_lang":"ka","target_lang":"fr","source":"Mwen ka palé épi ou…","target":"Je suis en train de te parler…","title":"Titre","artists":["Artiste"]}
+```
+
+**Type `instruct`** — format instruction/chat, adapté au fine-tuning de modèles d'instruction :
+```json
+{"messages":[{"role":"system","content":"Tu es un expert en langue KA…"},{"role":"user","content":"Tradui an fransé :\n\nMwen ka palé épi ou…"},{"role":"assistant","content":"Je suis en train de te parler…"}]}
+```
+
+**Format `jsonl`** : la première ligne contient les métadonnées (champ `_metadata: true`). Pour les filtrer :
+```bash
+jq -c 'select(._metadata | not)' export.jsonl
+```
+
+**Métadonnées incluses** (`_metadata: true` en JSONL, clé `metadata` en JSON) :
+
+| Champ | Description |
+|-------|-------------|
+| `exported_at` | Horodatage de l'export |
+| `total_paroles` | Nombre de paroles traitées |
+| `total_pairs` | Nombre d'exemples d'entraînement générés |
+| `languages` | Nombre de paires par langue |
+| `missing_translations` | Paroles avec des traductions manquantes, par langue |
+| `non_ka_transcriptions` | Paroles dont la transcription est suspectée d'être dans une autre langue (ex. français) |
+
+**Exemple :**
+```bash
+curl -H "Authorization: Bearer <token>" \
+  "https://api.pawol.nu/api/paroles/export?type=instruct&lang=fr,en&format=jsonl" \
+  -o dataset.jsonl
+```
+
 ## License

 Copyright (C) 2024 Cédric Famibelle-Pronzola & ORGANISATION KA INTERNATIONALE (OKI)
@@ -2,7 +2,43 @@

 const { createCoreController } = require('@strapi/strapi').factories;

+const VALID_LANGS = new Set(['fr', 'en', 'es', 'de', 'it']) // target-language codes accepted by the `lang` query parameter of the export action
+
 module.exports = createCoreController('api::parole.parole', ({strapi}) => ({
+  async export(ctx) {
+    const { type = 'pairs', lang, format = 'jsonl' } = ctx.query
+
+    const langs = lang
+      ? lang.split(',').map(l => l.trim()).filter(l => VALID_LANGS.has(l))
+      : null
+
+    if (lang && (!langs || langs.length === 0)) {
+      return ctx.badRequest('Langue(s) invalide(s). Valeurs acceptées : fr, en, es, de, it.')
+    }
+
+    if (!['pairs', 'instruct'].includes(type)) {
+      return ctx.badRequest('type invalide. Valeurs acceptées : pairs, instruct.')
+    }
+
+    const paroles = await strapi.service('api::parole.parole').fetchAllParoles()
+    const { metadata, pairs } = strapi.service('api::parole.parole').buildExport(paroles, type, langs)
+
+    if (format === 'json') {
+      return ctx.send({ metadata, data: pairs })
+    }
+
+    // JSONL : première ligne = métadonnées, suivies des exemples d'entraînement.
+    // Pour filtrer la ligne de métadonnées : jq 'select(._metadata | not)'
+    const lines = [
+      JSON.stringify({ _metadata: true, ...metadata }),
+      ...pairs.map(p => JSON.stringify(p)),
+    ]
+
+    ctx.set('Content-Type', 'application/x-ndjson')
+    ctx.set('Content-Disposition', `attachment; filename="pawol-nu-export-${Date.now()}.jsonl"`)
+    ctx.body = lines.join('\n')
+  },
+
  async findOne(documentId) {
    const parole = await strapi.documents('api::parole.parole').findOne({
      documentId,
@@ -0,0 +1,15 @@
+'use strict';
+
+module.exports = {
+  routes: [
+    { // custom GET route for the export endpoint
+      method: 'GET',
+      path: '/paroles/export', // NOTE(review): could be shadowed by a core `/paroles/:id` route depending on route load order — confirm
+      handler: 'parole.export', // presumably resolves to the `export` action of the parole controller
+      config: { // NOTE(review): no auth setting here although the README says a token is required — presumably enforced by framework defaults, confirm
+        policies: [],
+        middlewares: [],
+      },
+    },
+  ],
+};
@@ -6,6 +6,42 @@ const Diff = require('diff')
 const { createCoreService } = require('@strapi/strapi').factories;
 const { ApplicationError } = require("@strapi/utils").errors

+const LANG_MAP = { // per target language: translation field read from `traductions`, code emitted as `target_lang`, and the user prompt used in instruct mode
+  fr: { field: 'francais', targetLang: 'fr', userPrompt: 'Tradui an fransé' },
+  en: { field: 'anglais',  targetLang: 'en', userPrompt: 'Translate to English' },
+  es: { field: 'espagnol', targetLang: 'es', userPrompt: 'Traduce al español' },
+  de: { field: 'allemand', targetLang: 'de', userPrompt: 'Übersetze auf Deutsch' },
+  it: { field: 'italien',  targetLang: 'it', userPrompt: 'Traduci in italiano' },
+}
+
+const ALL_LANGS = Object.keys(LANG_MAP) // every supported language code; used when no subset is requested and for the missing-translation report
+
+function stripMarkdown(text) {
+  if (!text) return ''
+  return text
+    .replace(/#{1,6}\s+/g, '')
+    .replace(/\*\*(.*?)\*\*/gs, '$1')
+    .replace(/\*(.*?)\*/gs, '$1')
+    .replace(/__(.*?)__/gs, '$1')
+    .replace(/_(.*?)_/gs, '$1')
+    .replace(/\[([^\]]+)\]\([^\)]+\)/g, '$1')
+    .replace(/^[>\-\*\+]\s+/gm, '')
+    .replace(/\n{3,}/g, '\n\n')
+    .trim()
+}
+
+// Détecte si une transcription est probablement en français plutôt qu'en KA.
+// Heuristique : si les pronoms personnels français représentent > 4 % des mots.
+const FR_PRONOUNS = new Set(['je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils', 'elles'])
+
+function suspectFrench(text) {
+  if (!text) return false
+  const words = text.toLowerCase().match(/\b[a-zàâäéèêëîïôöùûüç]+\b/g) || []
+  if (words.length < 10) return false
+  const frCount = words.filter(w => FR_PRONOUNS.has(w)).length
+  return frCount / words.length > 0.04
+}
+
 class Translator {
  constructor() {
    this.deeplApi = process.env.DEEPL_URL || 'api-free.deepl.com'
@@ -72,6 +108,90 @@ module.exports = createCoreService('api::parole.parole', ({strapi}) => ({
      throw new ApplicationError('La transcription doit contenir au moins 10 caractères.')
    }
  },
+  async fetchAllParoles() {
+    const pageSize = 100
+    let start = 0
+    const all = []
+
+    while (true) {
+      const batch = await strapi.documents('api::parole.parole').findMany({
+        status: 'published',
+        populate: ['artistes', 'traductions'],
+        fields: ['documentId', 'titre', 'slug', 'transcription', 'annee'],
+        limit: pageSize,
+        start,
+      })
+      all.push(...batch)
+      if (batch.length < pageSize) break
+      start += pageSize
+    }
+
+    return all
+  },
+
+  buildExport(paroles, type, langs) {
+    const targetLangs = langs && langs.length ? langs : ALL_LANGS
+    const pairs = []
+    const missing = []
+    const nonKa = []
+    const langCounts = {}
+
+    for (const parole of paroles) {
+      const source = stripMarkdown(parole.transcription)
+      const artists = (parole.artistes || []).map(a => a.alias)
+      const paroleMeta = { title: parole.titre, artists }
+
+      if (suspectFrench(source)) {
+        nonKa.push({ documentId: parole.documentId, slug: parole.slug, ...paroleMeta, suspected_lang: 'fr' })
+      }
+
+      const missingLangs = ALL_LANGS.filter(lang => !parole.traductions?.[LANG_MAP[lang].field])
+      if (missingLangs.length > 0) {
+        missing.push({ documentId: parole.documentId, slug: parole.slug, ...paroleMeta, missing: missingLangs })
+      }
+
+      for (const lang of targetLangs) {
+        const { field, targetLang, userPrompt } = LANG_MAP[lang]
+        const target = stripMarkdown(parole.traductions?.[field])
+        if (!target) continue
+
+        langCounts[lang] = (langCounts[lang] || 0) + 1
+
+        if (type === 'instruct') {
+          pairs.push({
+            messages: [
+              {
+                role: 'system',
+                content: 'Tu es un expert en langue KA (créole guadeloupéen/martiniquais). Traduis le texte KA suivant.',
+              },
+              { role: 'user',      content: `${userPrompt} :\n\n${source}` },
+              { role: 'assistant', content: target },
+            ],
+          })
+        } else {
+          pairs.push({
+            source_lang: 'ka',
+            target_lang: targetLang,
+            source,
+            target,
+            ...paroleMeta,
+          })
+        }
+      }
+    }
+
+    const metadata = {
+      exported_at: new Date().toISOString(),
+      total_paroles: paroles.length,
+      total_pairs: pairs.length,
+      languages: langCounts,
+      missing_translations: missing,
+      non_ka_transcriptions: nonKa,
+    }
+
+    return { metadata, pairs }
+  },
+
  parolesDiff(titre = '', oldString, newString) {
    const patch = Diff.createPatch(titre, oldString, newString, 'supprimée', 'ajoutée')
    const parsePatch = Diff.parsePatch(patch)