// webdev-pipeline/tests/website-enrichment-action.test.ts

import assert from "node:assert/strict";
import { existsSync, readFileSync } from "node:fs";
import path from "node:path";
import test from "node:test";
import ts from "typescript";

/**
 * Parses TypeScript source text into a syntax tree.
 *
 * The fourth `createSourceFile` argument (`setParentNodes`) is passed as `true`.
 */
function parseTypeScriptSource(fileName: string, sourceText: string) {
  return ts.createSourceFile(
    fileName,
    sourceText,
    ts.ScriptTarget.ES2022,
    true,
    ts.ScriptKind.TS,
  );
}

// All inspected files are resolved relative to the current working directory.
const projectRoot = process.cwd();

const convexConfigPath = path.join(projectRoot, "convex.json");
const websiteEnrichmentPath = path.join(
  projectRoot,
  "convex/websiteEnrichment.ts",
);
const actionPath = path.join(projectRoot, "convex/websiteEnrichmentAction.ts");

// Files are read eagerly at module load; a missing file throws here, before
// any test body runs.
const convexConfigSource = readFileSync(convexConfigPath, "utf8");
const websiteEnrichmentSource = readFileSync(websiteEnrichmentPath, "utf8");
const actionSource = readFileSync(actionPath, "utf8");

const websiteEnrichmentSourceFile = parseTypeScriptSource(
  "websiteEnrichment.ts",
  websiteEnrichmentSource,
);
const actionSourceFile = parseTypeScriptSource(
  "websiteEnrichmentAction.ts",
  actionSource,
);

/**
 * Collects the names bound by `export const` variable statements found
 * anywhere in the given source file.
 *
 * Only declarations whose binding name is a plain identifier are recorded;
 * destructuring patterns are skipped.
 *
 * @param file Parsed source file to walk.
 * @returns The set of exported const identifier names.
 */
function getExportedConstNames(file: ts.SourceFile) {
  const exportedConstNames = new Set<string>();

  const collect = (node: ts.Node): void => {
    if (ts.isVariableStatement(node)) {
      const hasExportModifier =
        node.modifiers?.some(
          (modifier) => modifier.kind === ts.SyntaxKind.ExportKeyword,
        ) ?? false;
      const hasConstFlag =
        (node.declarationList.flags & ts.NodeFlags.Const) !== 0;

      if (hasExportModifier && hasConstFlag) {
        for (const { name } of node.declarationList.declarations) {
          if (ts.isIdentifier(name)) {
            exportedConstNames.add(name.text);
          }
        }
      }
    }

    // Children are always visited, whether or not the node matched above.
    ts.forEachChild(node, collect);
  };

  ts.forEachChild(file, collect);
  return exportedConstNames;
}

/**
 * Reports whether `pattern` matches anywhere in `source`.
 *
 * Uses `String.prototype.search` rather than `RegExp.prototype.test` so the
 * result does not depend on the regex's `lastIndex`: with a global or sticky
 * regex, `test` resumes from `lastIndex` and advances it, which makes repeated
 * calls with the same regex object return alternating results. `search` always
 * starts at index 0 and leaves `lastIndex` as it found it.
 *
 * @param source Text to search.
 * @param pattern Regular expression to look for.
 * @returns `true` when at least one match exists.
 */
function hasPattern(source: string, pattern: RegExp) {
  return source.search(pattern) !== -1;
}

/**
 * Returns the first brace-delimited region that follows the text
 * `export const <name> = ` in `source`, including the outer braces.
 *
 * Braces are counted character by character; braces inside string literals,
 * comments or regular expressions are not treated specially.
 *
 * @param source Full text of the module to search.
 * @param name Identifier of the exported const.
 * @returns The text from the first `{` after the declaration up to its
 *   matching `}`.
 * @throws AssertionError when the declaration is not found, when no `{`
 *   follows it, or when the braces never balance.
 */
function extractExportSource(source: string, name: string) {
  const marker = `export const ${name} = `;
  const declarationIndex = source.indexOf(marker);
  assert.notEqual(declarationIndex, -1, `Expected declaration for ${name}`);

  const openBraceIndex = source.indexOf("{", declarationIndex);
  // Without this guard an index of -1 would make the scan below start at the
  // beginning of the file and the final slice return unrelated (or empty) text.
  assert.notEqual(openBraceIndex, -1, `Expected opening brace for ${name}`);

  let depth = 0;
  let end = -1;

  for (let index = openBraceIndex; index < source.length; index += 1) {
    const char = source[index];
    if (char === "{") {
      depth += 1;
    } else if (char === "}") {
      depth -= 1;
      if (depth === 0) {
        // Matching close brace of the first open brace.
        end = index;
        break;
      }
    }
  }

  assert.notEqual(end, -1, `Expected balanced braces for ${name}`);
  return source.slice(openBraceIndex, end + 1);
}

// The mutation module must exist on disk and must not contain a line that
// starts with the `"use node";` directive.
test("website enrichment mutation module exists and has runtime assertions", () => {
  const moduleIsPresent = existsSync(websiteEnrichmentPath);
  assert.ok(moduleIsPresent, "websiteEnrichment.ts should be present");

  const declaresNodeRuntime = hasPattern(
    websiteEnrichmentSource,
    /^"use node";/m,
  );
  assert.ok(
    !declaresNodeRuntime,
    "websiteEnrichment.ts should not declare a Node runtime",
  );
});

// The action module must exist on disk and must contain a line that starts
// with the `"use node";` directive.
test("website enrichment action module exists and uses Node runtime", () => {
  const moduleIsPresent = existsSync(actionPath);
  assert.ok(moduleIsPresent, "websiteEnrichmentAction.ts should be present");

  const declaresNodeRuntime = hasPattern(actionSource, /^"use node";/m);
  assert.ok(
    declaresNodeRuntime,
    "websiteEnrichmentAction.ts should declare Node runtime",
  );
});

// Each module must export a fixed list of names as `export const`; the
// mutation module is checked first, then the action module.
test("module exports are split across mutations and action", () => {
  const expectationsByModule = [
    {
      label: "mutation export in websiteEnrichment.ts",
      exported: getExportedConstNames(websiteEnrichmentSourceFile),
      required: [
        "queueLeadEnrichment",
        "startLeadEnrichmentRun",
        "persistLeadEnrichmentResult",
        "finishLeadEnrichmentRun",
        "patchLeadFromWebsiteEnrichment",
      ],
    },
    {
      label: "action export in websiteEnrichmentAction.ts",
      exported: getExportedConstNames(actionSourceFile),
      required: ["processLeadEnrichment"],
    },
  ];

  for (const { label, exported, required } of expectationsByModule) {
    for (const exportName of required) {
      assert.ok(exported.has(exportName), `Expected ${label}: ${exportName}`);
    }
  }
});

// After the `queueLeadEnrichment = internalMutation(` assignment, the source
// must contain a `ctx.scheduler.runAfter(0, ...)` call targeting the action.
test("queueLeadEnrichment schedules internal.websiteEnrichmentAction.processLeadEnrichment", () => {
  const schedulesActionPattern =
    /queueLeadEnrichment\s*=\s*internalMutation\([\s\S]*?ctx\.scheduler\.runAfter\(\s*0,\s*internal\.websiteEnrichmentAction\.processLeadEnrichment/;

  assert.ok(
    hasPattern(websiteEnrichmentSource, schedulesActionPattern),
    "Queue mutation should schedule action with runAfter(0, internal.websiteEnrichmentAction.processLeadEnrichment)",
  );
});

// Inspects the text between `export const queueLeadEnrichment` and the next
// `export const startLeadEnrichmentRun`: it must query the
// type+status+leadId index for both "pending" and "running" runs and must not
// contain `take(50)`.
test("queueLeadEnrichment uses lead-aware run index and does not use fixed-size .take(50) windows", () => {
  const queueBlockPattern =
    /export const queueLeadEnrichment[\s\S]*?(?=\nexport const startLeadEnrichmentRun)/;
  const queueBlockMatch = queueBlockPattern.exec(websiteEnrichmentSource);
  const queueBody = queueBlockMatch?.[0] ?? "";

  assert.ok(
    queueBlockMatch !== null,
    "queueLeadEnrichment block should be parseable for source assertions",
  );

  for (const status of ["pending", "running"]) {
    const indexLookupPattern = new RegExp(
      String.raw`withIndex\("by_type_and_status_and_leadId"[\s\S]*?eq\("type",\s*"website_enrichment"\)[\s\S]*?eq\("status",\s*"${status}"\)[\s\S]*?eq\("leadId",\s*args\.leadId\)`,
    );
    assert.ok(
      hasPattern(queueBody, indexLookupPattern),
      `Queue dedupe for ${status} runs should use direct type+status+leadId index.`,
    );
  }

  assert.ok(
    !hasPattern(queueBody, /take\(50\)/),
    "No fixed-size .take(50) window in dedupe queries.",
  );
});

// Source-level checks on how the action imports Playwright and Chromium,
// together with the matching `node.externalPackages` entries in convex.json.
// Checks run in a fixed order, so the first violated one is what gets reported.
test("website enrichment action uses Chromium desktop/mobile devices and runtime Playwright import", () => {
  const expectInAction = (pattern: RegExp, message: string) => {
    assert.ok(hasPattern(actionSource, pattern), message);
  };
  const expectAbsentFromAction = (pattern: RegExp, message: string) => {
    assert.ok(!hasPattern(actionSource, pattern), message);
  };

  expectInAction(
    /import\s+type\s+\{[^\n]*BrowserContext[^\n]*\}\s+from\s+["']playwright-core["']/,
    "Action should import BrowserContext type for typed helper signatures",
  );
  expectInAction(
    /loadPlaywrightModules\(\)/,
    "Action should load Playwright at runtime from inside action",
  );
  expectInAction(
    /import\("playwright-core"\)/,
    "Action should use a dynamic import for playwright-core that Convex can detect as an external package",
  );
  expectInAction(
    /import\("@sparticuz\/chromium-min"\)/,
    "Action should use a dynamic import for @sparticuz/chromium-min as the lightweight browser package",
  );
  expectInAction(
    /TASK8_BROWSER_ASSET_URL/,
    "Action should reference TASK8_BROWSER_ASSET_URL when loading browser assets",
  );
  // Case-insensitive: one of the listed keywords must appear within 240
  // characters after the variable name.
  expectInAction(
    /TASK8_BROWSER_ASSET_URL[\s\S]{0,240}(throw|Error|required|missing|not configured|configured|konfiguriert|setze)/i,
    "Action should surface a clear error when the browser asset URL is not configured",
  );
  expectAbsentFromAction(
    /import\("@sparticuz\/chromium"\)/,
    "Action should not import the oversized @sparticuz/chromium package",
  );

  const parsedConvexConfig = JSON.parse(convexConfigSource);
  const externalPackages = parsedConvexConfig.node?.externalPackages;
  assert.ok(
    Array.isArray(externalPackages),
    "convex.json should define node.externalPackages",
  );

  const packageExpectations: ReadonlyArray<readonly [string, boolean, string]> = [
    [
      "playwright-core",
      true,
      "convex.json must include playwright-core in externalPackages",
    ],
    [
      "@sparticuz/chromium-min",
      true,
      "convex.json should include @sparticuz/chromium-min for browser runtime",
    ],
    [
      "@sparticuz/chromium",
      false,
      "convex.json should not include the oversized @sparticuz/chromium package",
    ],
  ];
  for (const [packageName, shouldBeListed, message] of packageExpectations) {
    assert.strictEqual(
      externalPackages.includes(packageName),
      shouldBeListed,
      message,
    );
  }

  expectInAction(
    /serverlessChromium/,
    "Runtime bootstrap should still use a serverless Chromium wrapper object for launch config",
  );
  expectInAction(
    /devices\["Desktop Chrome"\]/,
    "Desktop context should use Playwright Desktop Chrome device profile",
  );
  expectInAction(
    /devices\["iPhone 11"\]/,
    "Mobile context should use Playwright iPhone 11 device profile",
  );
});

// Every pattern below must occur in the action source; they are checked in
// list order and each carries its own failure message.
test("website enrichment action invalidates stale @sparticuz/chromium-min cache when source changes", () => {
  const requiredPatterns: ReadonlyArray<readonly [RegExp, string]> = [
    [
      /CHROMIUM_SOURCE_MARKER_FILE/,
      "Action should declare a temporary marker file path for Chromium executable source cache tracking.",
    ],
    [
      /tmpdir\(\)/,
      "Action should derive temporary cache paths from os.tmpdir().",
    ],
    [
      /getChromiumSourceMarker\(/,
      "Action should hash executable sources into a stable marker.",
    ],
    [
      /clearChromiumCacheForSourceMismatch\(/,
      "Action should centralize cache invalidation in a dedicated helper.",
    ],
    [
      /rm\(CHROMIUM_EXECUTABLE_PATH,\s*\{ force: true, recursive: true \}\),/,
      "Action should remove /tmp/chromium when executable source changes.",
    ],
    [
      /rm\(CHROMIUM_PACK_PATH,\s*\{ force: true, recursive: true \}\),/,
      "Action should remove /tmp/chromium-pack when executable source changes.",
    ],
    [
      /clearChromiumCacheForSourceMismatch\(executableSource\)[\s\S]*?chromium\.executablePath\(executableSource\)/,
      "Action should clear stale cache before resolving Chromium executable path.",
    ],
    [
      /writeFile\([\s\S]*?CHROMIUM_SOURCE_MARKER_FILE,[\s\S]*?getChromiumSourceMarker\(executableSource\)/,
      "Action should persist the source marker after executable path resolution.",
    ],
  ];

  for (const [pattern, message] of requiredPatterns) {
    assert.ok(hasPattern(actionSource, pattern), message);
  }
});

// Three source-level checks on the action:
//   1. it mentions either inflate + setupLambdaEnvironment, or LD_LIBRARY_PATH;
//   2. it mentions al2023.tar.br, CHROMIUM_PACK_PATH and an al2023/lib path;
//   3. by first-occurrence position, the latest setup marker sits after the
//      resolveChromiumExecutablePath call and before `chromium.launch({`.
test("website enrichment action prepares Chromium AL2023 shared libraries for Convex runtime", () => {
  const mentions = (pattern: RegExp) => hasPattern(actionSource, pattern);

  const usesInflateAndSetup =
    mentions(/inflate/) && mentions(/setupLambdaEnvironment/);
  const preparesRuntimeEnvironment =
    usesInflateAndSetup || mentions(/LD_LIBRARY_PATH/);
  assert.ok(
    preparesRuntimeEnvironment,
    "Action should explicitly prepare chromium-min runtime environment for AL2023 shared libraries to avoid `/tmp/chromium: error while loading shared libraries: libnspr4.so` (inflate/setupLambdaEnvironment or LD_LIBRARY_PATH).",
  );

  const joinsAl2023LibPath = mentions(
    /path\.join\(\s*tmpdir\(\),\s*["']al2023["'],\s*["']lib["']\s*\)/,
  );
  const exportsAl2023LibPath =
    mentions(/LD_LIBRARY_PATH/) && mentions(/al2023\/lib/);
  const mentionsRuntimeArchive = mentions(/al2023\.tar\.br/);
  const mentionsPackPath = mentions(/CHROMIUM_PACK_PATH/);
  assert.ok(
    mentionsRuntimeArchive &&
      mentionsPackPath &&
      (joinsAl2023LibPath || exportsAl2023LibPath),
    "Action should reference al2023.tar.br, track CHROMIUM_PACK_PATH, and ensure /tmp/al2023/lib is prepared for Convex launch.",
  );

  const resolveAt = actionSource.indexOf(
    "const executablePath = await resolveChromiumExecutablePath(",
  );
  const launchAt = actionSource.indexOf("chromium.launch({");
  const setupMarkers = [
    "setupLambdaEnvironment(",
    "LD_LIBRARY_PATH",
    "path.join(tmpdir(), \"al2023\", \"lib\")",
  ];
  // Markers that are absent contribute -1 and therefore never win the max.
  const latestSetupAt = Math.max(
    ...setupMarkers.map((marker) => actionSource.indexOf(marker)),
  );
  assert.ok(
    resolveAt >= 0 && latestSetupAt > resolveAt && latestSetupAt < launchAt,
    "Executable resolution and AL2023 shared-library setup should happen before chromium launch in the action runtime path.",
  );
});

// The action source must contain, in order after a `try {`: module loading,
// executable resolution, browser launch, and creation of both contexts; and a
// `catch (error) {` followed by finish, appendEvent and lead patch references.
test("processLeadEnrichment wraps Playwright bootstrap in protected try/catch", () => {
  const bootstrapInsideTry =
    /try\s*\{[\s\S]*?const \{ playwrightCore, serverlessChromium \}\s*=\s*await loadPlaywrightModules\(\);[\s\S]*?const executablePath = await resolveChromiumExecutablePath\(\s*serverlessChromium,\s*\);[\s\S]*?browser = await playwrightCore\.chromium\.launch\([\s\S]*?executablePath,[\s\S]*?desktopContext = await browser\.newContext\([\s\S]*?mobileContext = await browser\.newContext\(/;
  const failureHandlingInCatch =
    /catch\s*\(error\)\s*\{[\s\S]*?finishLeadEnrichmentRun[\s\S]*?runs\.appendEvent[\s\S]*?patchLeadFromWebsiteEnrichment/;

  assert.ok(
    hasPattern(actionSource, bootstrapInsideTry),
    "Playwright runtime bootstrap should use resolveChromiumExecutablePath() inside the action's try/catch-protected block",
  );
  assert.ok(
    hasPattern(actionSource, failureHandlingInCatch),
    "Bootstrap failures should be handled by finish + error event + lead patch in catch",
  );
});

// The action source must define both persistence limits with the expected
// values and apply them via `slice(` to links and e-mail candidates.
test("persistence caps candidates and links before writing", () => {
  const requiredPatterns: ReadonlyArray<readonly [RegExp, string]> = [
    [
      /MAX_PERSISTED_LINKS\s*=\s*120/,
      "Action should define MAX_PERSISTED_LINKS with value 120.",
    ],
    [
      /MAX_PERSISTED_EMAIL_CANDIDATES\s*=\s*40/,
      "Action should define MAX_PERSISTED_EMAIL_CANDIDATES with value 40.",
    ],
    [
      /deduplicateCrawlLinks\(allLinks\)[\s\S]*?slice\([\s\S]*?MAX_PERSISTED_LINKS/,
      "Action should dedupe and cap link persistence at MAX_PERSISTED_LINKS.",
    ],
    [
      /validCandidates\.slice\([\s\S]*?MAX_PERSISTED_EMAIL_CANDIDATES/,
      "Action should cap candidate persistence at MAX_PERSISTED_EMAIL_CANDIDATES.",
    ],
  ];

  for (const [pattern, message] of requiredPatterns) {
    assert.ok(hasPattern(actionSource, pattern), message);
  }
});

// The action source must call `ctx.storage.store(` and construct a `Blob`
// whose arguments reference SCREENSHOT_MIME_TYPE.
test("website enrichment process stores homepage screenshots in Convex storage as PNG", () => {
  const storesBlob = hasPattern(actionSource, /ctx\.storage\.store\(/);
  assert.ok(
    storesBlob,
    "Action should store screenshot blobs via ctx.storage.store",
  );

  const wrapsScreenshotInBlob = hasPattern(
    actionSource,
    /new\s+Blob\(\[[\s\S]*?SCREENSHOT_MIME_TYPE/,
  );
  assert.ok(
    wrapsScreenshotInBlob,
    "Action should wrap screenshots in Blob with image/png MIME type",
  );
});

// After an `if (!lead.websiteUrl) {` guard, the mutation source must set
// status "failed" and the exact German contactStatusReason text.
test("startLeadEnrichmentRun marks missing website lead with contact status reason", () => {
  const missingWebsiteBranch =
    /if \(!lead\.websiteUrl\)\s*\{[\s\S]*?status:\s*"failed"[\s\S]*?contactStatusReason:\s*"Website-URL fehlt für das Website-Enrichment\."/;

  assert.ok(
    hasPattern(websiteEnrichmentSource, missingWebsiteBranch),
    "Missing websiteUrl should set a specific contactStatusReason on the lead",
  );
});

// The mutation source must contain a `ctx.db.insert("<table>"` call (single or
// double quoted) for every evidence table listed here.
test("website enrichment persistence inserts all required evidence table rows", () => {
  const evidenceTables = [
    "websiteCrawlPages",
    "websiteCrawlLinks",
    "websiteEmailCandidates",
    "websiteCrawlScreenshots",
    "websiteTechnicalChecks",
  ] as const;

  evidenceTables.forEach((tableName) => {
    const insertCallPattern = new RegExp(
      `ctx\\.db\\.insert\\(["']${tableName}["']`,
      "s",
    );
    assert.ok(
      hasPattern(websiteEnrichmentSource, insertCallPattern),
      `persistLeadEnrichmentResult should insert into ${tableName}`,
    );
  });
});

// Three checks on the action source (helper call, patch mutation payload,
// forwarded contact status) followed by one on the mutation source (the
// comparison against "missing_contact").
test("website enrichment flow uses TASK-7 email selection helper for lead patching", () => {
  const actionPatterns: ReadonlyArray<readonly [RegExp, string]> = [
    [
      /getUsableContactEmailFromEntries\([\s\S]*?\)/,
      "Action should call getUsableContactEmailFromEntries",
    ],
    [
      /runMutation\(\s*internal\.websiteEnrichment\.patchLeadFromWebsiteEnrichment[\s\S]*?\{[\s\S]*?email:\s*usable\.email/,
      "Action should patch lead from usable email result",
    ],
    [
      /currentContactStatus\s*:\s*started\.lead\.contactStatus/,
      "Action should pass lead contact status to patchLeadFromWebsiteEnrichment",
    ],
  ];

  for (const [pattern, message] of actionPatterns) {
    assert.ok(hasPattern(actionSource, pattern), message);
  }

  assert.ok(
    hasPattern(
      websiteEnrichmentSource,
      /args\.currentContactStatus\s*===\s*\"missing_contact\"/,
    ),
    "Lead patch mutation should only set new status for missing_contact",
  );
});

// The action source must contain: a finish mutation call with status "failed",
// an appendEvent call with level "error" and the German failure message, and
// both contactStatusReason variants (templated summary and invalid-URL text).
test("failure handling marks run as failed and writes lead-facing reason", () => {
  const requiredPatterns: ReadonlyArray<readonly [RegExp, string]> = [
    [
      /runMutation\(\s*internal\.websiteEnrichment\.finishLeadEnrichmentRun[\s\S]*?status:\s*"failed"/,
      "Action should persist failed run state on fatal crawl errors",
    ],
    [
      /runMutation\(\s*api\.runs\.appendEvent[\s\S]*?level:\s*"error"[\s\S]*?message:\s*"Website-Enrichment fehlgeschlagen/,
      "Action should append a visible error event on failure",
    ],
    [
      /contactStatusReason:\s*`Website-Enrichment fehlgeschlagen:\s*\$\{errorSummary\}`/,
      "Action should patch the lead with an actionable failure reason",
    ],
    [
      /contactStatusReason:\s*"Website-Enrichment fehlgeschlagen: Ungültige Website-URL\."/,
      "Invalid-url failure should also update lead contact status reason",
    ],
  ];

  for (const [pattern, message] of requiredPatterns) {
    assert.ok(hasPattern(actionSource, pattern), message);
  }
});

// The action source must mention the TASK8_CRAWL_TIMEOUT_MS override and
// define the default crawl timeout (60_000) and default page limit (5).
test("website enrichment enforces TASK-8 crawler limits and runtime timeboxes", () => {
  assert.equal(
    // No `g` flag: this is a pure existence check, and a global regex would
    // make `RegExp.prototype.test` depend on (and mutate) `lastIndex`.
    hasPattern(actionSource, /TASK8_CRAWL_TIMEOUT_MS/),
    true,
    "TASK8_CRAWL_TIMEOUT_MS environment override should be used",
  );
  assert.equal(
    hasPattern(actionSource, /DEFAULT_CRAWL_TIMEOUT_MS\s*=\s*60_000/),
    true,
    "Default crawl timeout should be 60s",
  );
  assert.equal(
    hasPattern(actionSource, /DEFAULT_CRAWL_MAX_PAGES\s*=\s*5/),
    true,
    "Default max crawl page count should be 5",
  );
});

// Within processLeadEnrichment, the PageSpeed queue call must appear after the
// persist call and before the first finish call that follows persistence, and
// the queue call must be followed by the lead ID and parent run ID fields.
test("processLeadEnrichment schedules PageSpeed audit jobs after successful enrichment", () => {
  const processBody = extractExportSource(actionSource, "processLeadEnrichment");

  const persistIndex = processBody.indexOf(
    "internal.websiteEnrichment.persistLeadEnrichmentResult",
  );
  // Checked before persistIndex is used as a search offset: indexOf treats a
  // negative offset as 0, so a missing persist call would otherwise let the
  // searches below scan the whole body and report a misleading failure.
  assert.notEqual(persistIndex, -1, "processLeadEnrichment should persist website enrichment result");

  const queueIndex = processBody.indexOf(
    "internal.pageSpeed.queueLeadPageSpeedAudit",
    persistIndex,
  );
  const finishIndex = processBody.indexOf(
    "internal.websiteEnrichment.finishLeadEnrichmentRun",
    persistIndex,
  );

  assert.notEqual(queueIndex, -1, "processLeadEnrichment should queue PageSpeed audits");
  assert.notEqual(finishIndex, -1, "processLeadEnrichment should finish enrichment run");
  assert.equal(
    hasPattern(
      processBody,
      /runMutation\(\s*internal\.pageSpeed\.queueLeadPageSpeedAudit[\s\S]*leadId:\s*started\.lead\._id[\s\S]*parentRunId:\s*runId[\s\S]*\)/,
    ),
    true,
    "Queue call should pass lead ID and parent run ID",
  );

  assert.equal(queueIndex > persistIndex, true, "PageSpeed queueing should happen after persistence");
  assert.equal(queueIndex < finishIndex, true, "PageSpeed queueing should happen before success finish");
});

// Within processLeadEnrichment there must be a `try {` ... queue call ...
// `} catch (...) {` ... appendEvent ... level "warning" sequence, and the
// German warning text must be present.
test("processLeadEnrichment records warning on PageSpeed queue failure and continues", () => {
  const processBody = extractExportSource(actionSource, "processLeadEnrichment");

  const guardedQueueCall =
    /try\s*\{[\s\S]*internal\.pageSpeed\.queueLeadPageSpeedAudit[\s\S]*\}\s*catch\s*\([^)]*\)\s*\{[\s\S]*api\.runs\.appendEvent[\s\S]*level:\s*"warning"/;
  const warningText =
    /PageSpeed-Analyse konnte nicht in die Warteschlange gesetzt werden\./;

  assert.ok(
    hasPattern(processBody, guardedQueueCall),
    "Queueing PageSpeed should be wrapped in warning-safe try/catch",
  );
  assert.ok(
    hasPattern(processBody, warningText),
    "Warning event should describe queue failure",
  );
});

// Looks at the text between `if (!rootUrl)` and the next `return null;` in
// processLeadEnrichment: a PageSpeed queue call must sit inside that span and
// the span must carry the lead ID followed by the parent run ID.
test("processLeadEnrichment regression: queue PageSpeed on invalid URL failure when started lead exists", () => {
  const processBody = extractExportSource(actionSource, "processLeadEnrichment");

  const guardAt = processBody.indexOf("if (!rootUrl)");
  assert.notStrictEqual(guardAt, -1, "Invalid URL guard should exist");

  const returnAt = processBody.indexOf("return null;", guardAt);
  assert.notStrictEqual(returnAt, -1, "Invalid URL branch should return null");

  const queueAt = processBody.indexOf(
    "internal.pageSpeed.queueLeadPageSpeedAudit",
    guardAt,
  );
  const queuedInsideBranch = guardAt < queueAt && queueAt < returnAt;
  assert.ok(
    queuedInsideBranch,
    "Invalid URL failure path should queue PageSpeed before returning.",
  );

  const branchSource = processBody.slice(guardAt, returnAt);
  assert.ok(
    hasPattern(
      branchSource,
      /leadId:\s*started\.lead\._id[\s\S]*?parentRunId:\s*runId/,
    ),
    "Invalid URL queue payload should use started.lead._id and parentRunId runId.",
  );
});

// Looks at the text between the last `catch (error)` in processLeadEnrichment
// and the next `return null;`: a PageSpeed queue call must appear there, after
// an `if (started)` guard, with the lead ID followed by the parent run ID.
test("processLeadEnrichment regression: queue PageSpeed in fatal catch path with started lead", () => {
  const processBody = extractExportSource(actionSource, "processLeadEnrichment");

  const catchAt = processBody.lastIndexOf("catch (error)");
  assert.notStrictEqual(catchAt, -1, "Outer catch block should exist");

  const startedGuardAt = processBody.indexOf("if (started)", catchAt);
  assert.notStrictEqual(
    startedGuardAt,
    -1,
    "Outer catch should guard lead patch by started check.",
  );

  const returnAt = processBody.indexOf("return null;", catchAt);
  assert.notStrictEqual(
    returnAt,
    -1,
    "Outer catch should return null on unrecoverable errors.",
  );

  const queueAt = processBody.indexOf(
    "internal.pageSpeed.queueLeadPageSpeedAudit",
    catchAt,
  );
  const queuedInsideCatch =
    queueAt > catchAt && queueAt > startedGuardAt && queueAt < returnAt;
  assert.ok(
    queuedInsideCatch,
    "Fatal catch path should queue PageSpeed before returning, while started lead exists.",
  );

  const catchSource = processBody.slice(catchAt, returnAt);
  assert.ok(
    hasPattern(
      catchSource,
      /leadId:\s*started\.lead\._id[\s\S]*?parentRunId:\s*runId/,
    ),
    "Catch-path PageSpeed queue payload should use started.lead._id and parentRunId runId.",
  );
});