{
  "id": "ds-enrich-12",
  "meta": {
    "instanceId": "vorlux-hub"
  },
  "name": "Vorlux AI | Dataset Enricher (Instruction Variants)",
  "active": true,
  "nodes": [
    {
      "id": "f2a3b4c5-0012-4fff-8012-000000000001",
      "name": "Webhook Trigger",
      "type": "n8n-nodes-base.webhook",
      "typeVersion": 2,
      "position": [220, 300],
      "parameters": {
        "path": "dataset-enrich",
        "httpMethod": "POST",
        "responseMode": "onReceived",
        "options": {}
      }
    },
    {
      "id": "f2a3b4c5-0012-4fff-8012-000000000002",
      "name": "Read Training Data",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [460, 300],
      "parameters": {
        "method": "GET",
        "url": "={{$env.VORLUX_HUB_URL}}/api/admin/finetune/dataset?action=read&file=combined-finetune-v2.jsonl&limit=200",
        "options": {
          "timeout": 30000
        }
      },
      "notes": "Reads existing training examples from combined-finetune-v2.jsonl (request uses limit=200); these are the source examples for variant generation"
    },
    {
      "id": "f2a3b4c5-0012-4fff-8012-000000000003",
      "name": "Generate Variants",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [700, 300],
      "notes": "For each sampled training example, generates up to 3 instruction variants using Ollama to diversify the dataset. Falls back to templated variants when the model reply contains no usable JSON array; examples whose request fails are skipped and counted in failedCount",
      "parameters": {
        "mode": "runOnceForAllItems",
        "jsCode": "// Generates instruction variants for a batch of training examples via Ollama.\n// Output: one item with newExamples, originalCount, enrichedCount, failedCount and ratio.\nconst rawExamples = $input.first().json.data;\nconst examples = Array.isArray(rawExamples) ? rawExamples : [];\nconst ollamaUrl = $env.OLLAMA_BASE_URL || 'http://localhost:11434';\nconst newExamples = [];\nlet failedCount = 0;\n\n// batchSize comes from the webhook body: it may be absent, a numeric string, or invalid\nconst requestedSize = Math.floor(Number($('Webhook Trigger').first().json.body?.batchSize));\nconst batchSize = Number.isFinite(requestedSize) && requestedSize > 0 ? requestedSize : 50;\n\n// Sample a subset to enrich (don't process all at once)\nconst sample = examples.slice(0, batchSize);\n\nfor (const ex of sample) {\n  if (!ex || typeof ex.instruction !== 'string' || !ex.instruction || !ex.output) continue;\n\n  try {\n    const res = await fetch(ollamaUrl + '/api/generate', {\n      method: 'POST',\n      headers: {'Content-Type':'application/json'},\n      body: JSON.stringify({\n        model: 'qwen2.5:7b',\n        stream: false,\n        prompt: `Generate exactly 3 alternative instruction phrasings for the following task. Each should ask for the same thing but with different wording. Return as a JSON array of 3 strings.\\n\\nOriginal instruction: \"${ex.instruction.substring(0, 500)}\"\\n\\nReturn ONLY a JSON array like: [\"variant 1\", \"variant 2\", \"variant 3\"]`,\n        options: { temperature: 0.7 }\n      }),\n      signal: AbortSignal.timeout(30000)\n    });\n    // A non-2xx reply carries no usable completion; skip the example like any other request failure\n    if (!res.ok) throw new Error('Ollama returned HTTP ' + res.status);\n    const data = await res.json();\n    const text = typeof data.response === 'string' ? data.response : '';\n    let variants = [];\n\n    // Try the widest [...] span first (a ']' inside a variant must not truncate the array), then the narrowest\n    for (const pattern of [/\\[[\\s\\S]*\\]/, /\\[[\\s\\S]*?\\]/]) {\n      const match = text.match(pattern);\n      if (!match) break;\n      try {\n        const parsed = JSON.parse(match[0]);\n        if (Array.isArray(parsed)) {\n          variants = parsed;\n          break;\n        }\n      } catch {\n        // Not valid JSON for this span: try the next pattern\n      }\n    }\n    variants = variants.filter(v => typeof v === 'string' && v.length > 10).slice(0, 3);\n\n    // Fallback: simple templated variants when the model reply held no usable JSON array\n    if (variants.length === 0) {\n      const task = ex.instruction.replace(/^(create|build|design|make)\\s+(an?\\s+)?/i, '').substring(0, 200);\n      variants = [\n        'Build an automation that handles ' + task,\n        'I need a workflow for ' + task,\n        'Set up automation for ' + task\n      ];\n    }\n\n    for (const v of variants) {\n      newExamples.push({\n        instruction: v,\n        input: ex.input || '',\n        output: ex.output,\n        source: 'enriched_variant',\n        original_instruction: ex.instruction.substring(0, 200)\n      });\n    }\n  } catch {\n    // Best effort: skip this example on request or timeout errors, but count it so failures are visible\n    failedCount++;\n  }\n}\n\n// Keep ratio a string in every case (toFixed returns a string)\nconst ratio = sample.length ? (newExamples.length / sample.length).toFixed(1) : '0.0';\nreturn [{ json: { newExamples, originalCount: sample.length, enrichedCount: newExamples.length, failedCount, ratio } }];"
      }
    },
    {
      "id": "f2a3b4c5-0012-4fff-8012-000000000004",
      "name": "Validate Variants",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [940, 300],
      "notes": "Validates each enriched example: skips malformed entries, checks instruction and output quality, then deduplicates by the first 100 characters of the lowercased instruction",
      "parameters": {
        "mode": "runOnceForAllItems",
        "jsCode": "// Validates enriched examples: drops malformed, too-short, corrupted and duplicate entries.\n// Output: one item with validExamples, validCount and filteredOut.\nconst incoming = $input.first().json.newExamples;\nconst examples = Array.isArray(incoming) ? incoming : [];\nconst seen = new Set();\nconst valid = [];\n\nfor (const ex of examples) {\n  // Skip malformed entries instead of throwing on a missing or non-string instruction\n  if (!ex || typeof ex.instruction !== 'string') continue;\n  const instruction = ex.instruction.trim();\n\n  // Skip too short instructions (whitespace padding does not count)\n  if (instruction.length < 15) continue;\n\n  // Skip if instruction is same as output (corruption)\n  if (ex.instruction === ex.output) continue;\n\n  // Validate output is present and reasonable\n  if (!ex.output || ex.output.length < 20) continue;\n\n  // Skip duplicates. Checked last so a rejected example cannot block a later valid one with the same instruction\n  const key = instruction.toLowerCase().substring(0, 100);\n  if (seen.has(key)) continue;\n  seen.add(key);\n\n  valid.push({ instruction: ex.instruction, input: ex.input || '', output: ex.output });\n}\n\nreturn [{ json: { validExamples: valid, validCount: valid.length, filteredOut: examples.length - valid.length } }];"
      }
    },
    {
      "id": "f2a3b4c5-0012-4fff-8012-000000000005",
      "name": "Append to Training File",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [1180, 200],
      "parameters": {
        "method": "POST",
        "url": "={{$env.VORLUX_HUB_URL}}/api/admin/finetune/append",
        "sendBody": true,
        "specifyBody": "json",
        "jsonBody": "={{ JSON.stringify({ examples: $json.validExamples, targetFile: 'combined-finetune-v2.jsonl' }) }}",
        "options": {
          "timeout": 30000
        }
      }
    },
    {
      "id": "f2a3b4c5-0012-4fff-8012-000000000006",
      "name": "Discord Notify",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [1180, 400],
      "parameters": {
        "method": "POST",
        "url": "={{$env.DISCORD_OPS_WEBHOOK}}",
        "sendBody": true,
        "specifyBody": "json",
        "jsonBody": "={\"embeds\":[{\"title\":\"Dataset Enricher Complete\",\"description\":\"Original examples processed: {{ $('Generate Variants').first().json.originalCount }}\\nNew variants generated: {{ $('Generate Variants').first().json.enrichedCount }}\\nValid after filtering: {{ $json.validCount }}\\nFiltered out: {{ $json.filteredOut }}\\n\\nRatio: {{ $('Generate Variants').first().json.ratio }}x enrichment\",\"color\":3066993,\"footer\":{\"text\":\"Dataset Enricher\"}}]}",
        "options": {
          "timeout": 10000
        }
      }
    }
  ],
  "connections": {
    "Webhook Trigger": {
      "main": [
        [
          {
            "node": "Read Training Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Read Training Data": {
      "main": [
        [
          {
            "node": "Generate Variants",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Generate Variants": {
      "main": [
        [
          {
            "node": "Validate Variants",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Validate Variants": {
      "main": [
        [
          {
            "node": "Append to Training File",
            "type": "main",
            "index": 0
          },
          {
            "node": "Discord Notify",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  },
  "settings": {
    "executionOrder": "v1",
    "saveManualExecutions": true,
    "saveExecutionProgress": true
  },
  "tags": [
    { "name": "ai" },
    { "name": "dataset" },
    { "name": "enrichment" }
  ],
  "versionId": "2"
}