feat: add custom route to export data for LLM
Déploiement API PROD / build (push) Successful in 2m8s
Déploiement API PROD / deploy (push) Successful in 58s

This commit is contained in:
2026-06-11 19:07:18 +04:00
parent b565224fcd
commit a000af5c8b
4 changed files with 215 additions and 0 deletions
+120
View File
@@ -6,6 +6,42 @@ const Diff = require('diff')
const { createCoreService } = require('@strapi/strapi').factories;
const { ApplicationError } = require("@strapi/utils").errors
/**
 * Supported export languages, keyed by ISO 639-1 code.
 *
 * For each language:
 *  - `field`      : key read on a parole's `traductions` object
 *  - `targetLang` : value written to `target_lang` in exported pairs
 *  - `userPrompt` : instruction placed in the user message of instruct samples
 */
const LANG_MAP = {
  fr: {
    field: 'francais',
    targetLang: 'fr',
    userPrompt: 'Tradui an fransé',
  },
  en: {
    field: 'anglais',
    targetLang: 'en',
    userPrompt: 'Translate to English',
  },
  es: {
    field: 'espagnol',
    targetLang: 'es',
    userPrompt: 'Traduce al español',
  },
  de: {
    field: 'allemand',
    targetLang: 'de',
    userPrompt: 'Übersetze auf Deutsch',
  },
  it: {
    field: 'italien',
    targetLang: 'it',
    userPrompt: 'Traduci in italiano',
  },
}

// Every supported language code, in declaration order.
const ALL_LANGS = Object.keys(LANG_MAP)
/**
 * Reduces Markdown-formatted text to plain text.
 *
 * Line-level markers (ATX headings, block quotes, list bullets) are removed
 * BEFORE inline emphasis: the emphasis patterns use the dot-all flag, so a
 * leading `*` bullet would otherwise be paired with the bullet of the next
 * line and treated as one emphasis span.
 *
 * @param {string|null|undefined} text - Markdown source; any falsy value yields ''.
 * @returns {string} Trimmed plain text with runs of 3+ newlines collapsed to 2.
 */
function stripMarkdown(text) {
  if (!text) return ''
  return text
    // Heading markers only count at the start of a line (up to 3 spaces of
    // indent); `[ \t]+` instead of `\s+` so a line break is never swallowed
    // and a mid-line "# " (e.g. "C# major") is left alone.
    .replace(/^[ \t]{0,3}#{1,6}[ \t]+/gm, '')
    // Block-quote and list markers at the start of a line.
    .replace(/^[>\-*+][ \t]+/gm, '')
    // Inline emphasis: strong first so `**x**` is not consumed as two `*` spans.
    .replace(/\*\*(.*?)\*\*/gs, '$1')
    .replace(/\*(.*?)\*/gs, '$1')
    .replace(/__(.*?)__/gs, '$1')
    .replace(/_(.*?)_/gs, '$1')
    // Links: keep the label, drop the URL.
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    .replace(/\n{3,}/g, '\n\n')
    .trim()
}
// Detects whether a transcription is probably written in French rather than KA.
// Heuristic: French personal pronouns account for more than 4% of the words.
const FR_PRONOUNS = new Set(['je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils', 'elles'])

/**
 * Heuristically flags a text as French.
 *
 * Words are maximal runs of Unicode letters. The tokenizer is Unicode-aware
 * on purpose: the ASCII-only `\b` assertion treats accented letters as
 * non-word characters, which splits or truncates words such as "pòté" or
 * "été" and skews the pronoun ratio.
 *
 * @param {string|null|undefined} text - Plain text to inspect.
 * @returns {boolean} true when the text has at least 10 words and more than
 *   4% of them are French personal pronouns; false otherwise (including for
 *   falsy input).
 */
function suspectFrench(text) {
  if (!text) return false
  // NFC first so letters written with combining accents form a single \p{L} run.
  const words = text.normalize('NFC').toLowerCase().match(/\p{L}+/gu) || []
  // Too little material for the ratio to mean anything.
  if (words.length < 10) return false
  const frCount = words.filter(w => FR_PRONOUNS.has(w)).length
  return frCount / words.length > 0.04
}
class Translator {
constructor() {
this.deeplApi = process.env.DEEPL_URL || 'api-free.deepl.com'
@@ -72,6 +108,90 @@ module.exports = createCoreService('api::parole.parole', ({strapi}) => ({
throw new ApplicationError('La transcription doit contenir au moins 10 caractères.')
}
},
async fetchAllParoles() {
const pageSize = 100
let start = 0
const all = []
while (true) {
const batch = await strapi.documents('api::parole.parole').findMany({
status: 'published',
populate: ['artistes', 'traductions'],
fields: ['documentId', 'titre', 'slug', 'transcription', 'annee'],
limit: pageSize,
start,
})
all.push(...batch)
if (batch.length < pageSize) break
start += pageSize
}
return all
},
buildExport(paroles, type, langs) {
const targetLangs = langs && langs.length ? langs : ALL_LANGS
const pairs = []
const missing = []
const nonKa = []
const langCounts = {}
for (const parole of paroles) {
const source = stripMarkdown(parole.transcription)
const artists = (parole.artistes || []).map(a => a.alias)
const paroleMeta = { title: parole.titre, artists }
if (suspectFrench(source)) {
nonKa.push({ documentId: parole.documentId, slug: parole.slug, ...paroleMeta, suspected_lang: 'fr' })
}
const missingLangs = ALL_LANGS.filter(lang => !parole.traductions?.[LANG_MAP[lang].field])
if (missingLangs.length > 0) {
missing.push({ documentId: parole.documentId, slug: parole.slug, ...paroleMeta, missing: missingLangs })
}
for (const lang of targetLangs) {
const { field, targetLang, userPrompt } = LANG_MAP[lang]
const target = stripMarkdown(parole.traductions?.[field])
if (!target) continue
langCounts[lang] = (langCounts[lang] || 0) + 1
if (type === 'instruct') {
pairs.push({
messages: [
{
role: 'system',
content: 'Tu es un expert en langue KA (créole guadeloupéen/martiniquais). Traduis le texte KA suivant.',
},
{ role: 'user', content: `${userPrompt} :\n\n${source}` },
{ role: 'assistant', content: target },
],
})
} else {
pairs.push({
source_lang: 'ka',
target_lang: targetLang,
source,
target,
...paroleMeta,
})
}
}
}
const metadata = {
exported_at: new Date().toISOString(),
total_paroles: paroles.length,
total_pairs: pairs.length,
languages: langCounts,
missing_translations: missing,
non_ka_transcriptions: nonKa,
}
return { metadata, pairs }
},
parolesDiff(titre = '', oldString, newString) {
const patch = Diff.createPatch(titre, oldString, newString, 'supprimée', 'ajoutée')
const parsePatch = Diff.parsePatch(patch)