feat: add custom route to export data for LLM
This commit is contained in:
@@ -2,7 +2,43 @@
|
||||
|
||||
const { createCoreController } = require('@strapi/strapi').factories;
|
||||
|
||||
// Language codes accepted in the `lang` query parameter of the export action;
// any other code is filtered out before the export is built.
const VALID_LANGS = new Set(['fr', 'en', 'es', 'de', 'it'])
|
||||
|
||||
module.exports = createCoreController('api::parole.parole', ({strapi}) => ({
|
||||
async export(ctx) {
|
||||
const { type = 'pairs', lang, format = 'jsonl' } = ctx.query
|
||||
|
||||
const langs = lang
|
||||
? lang.split(',').map(l => l.trim()).filter(l => VALID_LANGS.has(l))
|
||||
: null
|
||||
|
||||
if (lang && (!langs || langs.length === 0)) {
|
||||
return ctx.badRequest('Langue(s) invalide(s). Valeurs acceptées : fr, en, es, de, it.')
|
||||
}
|
||||
|
||||
if (!['pairs', 'instruct'].includes(type)) {
|
||||
return ctx.badRequest('type invalide. Valeurs acceptées : pairs, instruct.')
|
||||
}
|
||||
|
||||
const paroles = await strapi.service('api::parole.parole').fetchAllParoles()
|
||||
const { metadata, pairs } = strapi.service('api::parole.parole').buildExport(paroles, type, langs)
|
||||
|
||||
if (format === 'json') {
|
||||
return ctx.send({ metadata, data: pairs })
|
||||
}
|
||||
|
||||
// JSONL : première ligne = métadonnées, suivies des exemples d'entraînement.
|
||||
// Pour filtrer la ligne de métadonnées : jq 'select(._metadata | not)'
|
||||
const lines = [
|
||||
JSON.stringify({ _metadata: true, ...metadata }),
|
||||
...pairs.map(p => JSON.stringify(p)),
|
||||
]
|
||||
|
||||
ctx.set('Content-Type', 'application/x-ndjson')
|
||||
ctx.set('Content-Disposition', `attachment; filename="pawol-nu-export-${Date.now()}.jsonl"`)
|
||||
ctx.body = lines.join('\n')
|
||||
},
|
||||
|
||||
async findOne(documentId) {
|
||||
const parole = await strapi.documents('api::parole.parole').findOne({
|
||||
documentId,
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
'use strict';
|
||||
|
||||
module.exports = {
|
||||
routes: [
|
||||
{
|
||||
method: 'GET',
|
||||
path: '/paroles/export',
|
||||
handler: 'parole.export',
|
||||
config: {
|
||||
policies: [],
|
||||
middlewares: [],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
@@ -6,6 +6,42 @@ const Diff = require('diff')
|
||||
const { createCoreService } = require('@strapi/strapi').factories;
|
||||
const { ApplicationError } = require("@strapi/utils").errors
|
||||
|
||||
// Per-language export settings, keyed by language code:
//  - field      : property read on `parole.traductions` to get the translation
//  - targetLang : value emitted as `target_lang` in 'pairs' exports
//  - userPrompt : instruction placed before the source text in the user
//                 message of 'instruct' exports
// Frozen (including each entry) because these tables are shared by every
// request and must never be mutated at runtime.
const LANG_MAP = Object.freeze({
  fr: Object.freeze({ field: 'francais', targetLang: 'fr', userPrompt: 'Tradui an fransé' }),
  en: Object.freeze({ field: 'anglais', targetLang: 'en', userPrompt: 'Translate to English' }),
  es: Object.freeze({ field: 'espagnol', targetLang: 'es', userPrompt: 'Traduce al español' }),
  de: Object.freeze({ field: 'allemand', targetLang: 'de', userPrompt: 'Übersetze auf Deutsch' }),
  it: Object.freeze({ field: 'italien', targetLang: 'it', userPrompt: 'Traduci in italiano' }),
});

// Every supported language code, in declaration order.
const ALL_LANGS = Object.freeze(Object.keys(LANG_MAP));
|
||||
|
||||
/**
 * Removes common Markdown markup from a text and returns the plain content.
 *
 * Handles heading markers, blockquote and list markers, bold/italic emphasis
 * written with `*` or `_`, and inline links (only the link label is kept).
 * Runs of three or more newlines are collapsed to a single blank line and the
 * result is trimmed.
 *
 * @param {string} text - Markdown source; any falsy value yields ''.
 * @returns {string} The text without the markup listed above.
 */
function stripMarkdown(text) {
  if (!text) return '';

  return text
    // Block markers are only meaningful at the start of a line, so they are
    // anchored there: a '#' inside a sentence ("C# minor") is left alone.
    // They are removed BEFORE emphasis so that "* item" bullets are not
    // mistaken for the opening '*' of an italic span running across lines.
    // The group repeats to unwrap combinations such as "> # Title".
    .replace(/^(?:(?:#{1,6}|[>\-*+])\s+)+/gm, '')
    // Strong emphasis before plain emphasis, otherwise '**x**' would be
    // consumed as two empty '*' spans.
    .replace(/\*\*(.*?)\*\*/gs, '$1')
    .replace(/\*(.*?)\*/gs, '$1')
    .replace(/__(.*?)__/gs, '$1')
    .replace(/_(.*?)_/gs, '$1')
    // [label](url) -> label
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
|
||||
|
||||
// French personal pronouns used to spot transcriptions that are probably
// written in French rather than in KA.
const FR_PRONOUNS = new Set(['je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils', 'elles'])

/**
 * Heuristically detects whether a transcription is probably French rather
 * than KA: it is flagged when French personal pronouns make up more than 4 %
 * of its words. Texts with fewer than 10 words are never flagged.
 *
 * @param {string} text - Plain-text transcription; falsy values yield false.
 * @returns {boolean} true when the pronoun ratio exceeds 0.04.
 */
function suspectFrench(text) {
  if (!text) return false;

  // Tokenise on Unicode letters. The ASCII-only `\b` anchor treats accented
  // letters as non-word characters, so words starting or ending with one
  // ("été", "à") were truncated or dropped, and letters missing from a
  // hand-written class (e.g. "ò") split words in two, skewing the ratio.
  // NFC normalisation makes decomposed accents (letter + combining mark)
  // count as a single letter.
  const words = text.normalize('NFC').toLowerCase().match(/\p{L}+/gu) || [];
  if (words.length < 10) return false;

  const frCount = words.filter((word) => FR_PRONOUNS.has(word)).length;
  return frCount / words.length > 0.04;
}
|
||||
|
||||
class Translator {
|
||||
constructor() {
|
||||
this.deeplApi = process.env.DEEPL_URL || 'api-free.deepl.com'
|
||||
@@ -72,6 +108,90 @@ module.exports = createCoreService('api::parole.parole', ({strapi}) => ({
|
||||
throw new ApplicationError('La transcription doit contenir au moins 10 caractères.')
|
||||
}
|
||||
},
|
||||
async fetchAllParoles() {
|
||||
const pageSize = 100
|
||||
let start = 0
|
||||
const all = []
|
||||
|
||||
while (true) {
|
||||
const batch = await strapi.documents('api::parole.parole').findMany({
|
||||
status: 'published',
|
||||
populate: ['artistes', 'traductions'],
|
||||
fields: ['documentId', 'titre', 'slug', 'transcription', 'annee'],
|
||||
limit: pageSize,
|
||||
start,
|
||||
})
|
||||
all.push(...batch)
|
||||
if (batch.length < pageSize) break
|
||||
start += pageSize
|
||||
}
|
||||
|
||||
return all
|
||||
},
|
||||
|
||||
buildExport(paroles, type, langs) {
|
||||
const targetLangs = langs && langs.length ? langs : ALL_LANGS
|
||||
const pairs = []
|
||||
const missing = []
|
||||
const nonKa = []
|
||||
const langCounts = {}
|
||||
|
||||
for (const parole of paroles) {
|
||||
const source = stripMarkdown(parole.transcription)
|
||||
const artists = (parole.artistes || []).map(a => a.alias)
|
||||
const paroleMeta = { title: parole.titre, artists }
|
||||
|
||||
if (suspectFrench(source)) {
|
||||
nonKa.push({ documentId: parole.documentId, slug: parole.slug, ...paroleMeta, suspected_lang: 'fr' })
|
||||
}
|
||||
|
||||
const missingLangs = ALL_LANGS.filter(lang => !parole.traductions?.[LANG_MAP[lang].field])
|
||||
if (missingLangs.length > 0) {
|
||||
missing.push({ documentId: parole.documentId, slug: parole.slug, ...paroleMeta, missing: missingLangs })
|
||||
}
|
||||
|
||||
for (const lang of targetLangs) {
|
||||
const { field, targetLang, userPrompt } = LANG_MAP[lang]
|
||||
const target = stripMarkdown(parole.traductions?.[field])
|
||||
if (!target) continue
|
||||
|
||||
langCounts[lang] = (langCounts[lang] || 0) + 1
|
||||
|
||||
if (type === 'instruct') {
|
||||
pairs.push({
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
content: 'Tu es un expert en langue KA (créole guadeloupéen/martiniquais). Traduis le texte KA suivant.',
|
||||
},
|
||||
{ role: 'user', content: `${userPrompt} :\n\n${source}` },
|
||||
{ role: 'assistant', content: target },
|
||||
],
|
||||
})
|
||||
} else {
|
||||
pairs.push({
|
||||
source_lang: 'ka',
|
||||
target_lang: targetLang,
|
||||
source,
|
||||
target,
|
||||
...paroleMeta,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const metadata = {
|
||||
exported_at: new Date().toISOString(),
|
||||
total_paroles: paroles.length,
|
||||
total_pairs: pairs.length,
|
||||
languages: langCounts,
|
||||
missing_translations: missing,
|
||||
non_ka_transcriptions: nonKa,
|
||||
}
|
||||
|
||||
return { metadata, pairs }
|
||||
},
|
||||
|
||||
parolesDiff(titre = '', oldString, newString) {
|
||||
const patch = Diff.createPatch(titre, oldString, newString, 'supprimée', 'ajoutée')
|
||||
const parsePatch = Diff.parsePatch(patch)
|
||||
|
||||
Reference in New Issue
Block a user