"use node"; import type { Browser, BrowserContext } from "playwright-core"; import { createHash } from "node:crypto"; import { access, readFile, rm, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; import { v } from "convex/values"; import { buildTechnicalChecks, discoverRelevantSubpageUrls, extractContactSignalsFromHtmlLikeText, isSameRegistrableHostishDomain, normalizeCrawlUrl, } from "../lib/website-crawler"; import { getUsableContactEmailFromEntries, normalizeEmailAddress, } from "../lib/lead-discovery-google"; import { api, internal } from "./_generated/api"; import type { Doc, Id } from "./_generated/dataModel"; import { internalAction, type ActionCtx } from "./_generated/server"; const DEFAULT_CRAWL_TIMEOUT_MS = 60_000; const DEFAULT_CRAWL_MAX_PAGES = 5; const MAX_PERSISTED_LINKS = 120; const MAX_PERSISTED_EMAIL_CANDIDATES = 40; const SCREENSHOT_MIME_TYPE = "image/png"; const CHROMIUM_SOURCE_MARKER_FILE = path.join(tmpdir(), "chromium-source.sha256"); const CHROMIUM_EXECUTABLE_PATH = path.join(tmpdir(), "chromium"); const CHROMIUM_PACK_PATH = path.join(tmpdir(), "chromium-pack"); const GENERIC_EMAIL_LOCALS = new Set([ "info", "kontakt", "contact", "sales", "team", "support", "service", "hello", "marketing", "admin", "office", "impressum", "post", ]); const CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS = [ "TASK8_BROWSER_ASSET_URL", "TASK8_CHROMIUM_EXECUTABLE_URL", "TASK8_CHROMIUM_EXECUTABLE", ]; type EnrichmentPageKind = | "homepage" | "contact" | "impressum" | "services" | "about" | "team" | "other"; type CrawlPageLink = { href: string; text: string; isInternal: boolean; }; type PersistedCrawlLink = CrawlPageLink & { pageUrl: string; }; type PageResult = { sourceUrl: string; finalUrl: string; pageKind: EnrichmentPageKind; title: string; metaDescription: string; headings: string[]; visibleText: string; links: CrawlPageLink[]; emailCandidates: Array<{ email: string; emailSource: string; contactPerson: string | null; isBusinessContactAddress: boolean; isGeneric: boolean; sourceUrl: string; accepted: boolean; normalizedEmail: string; }>; hasContactFormSignal: boolean; hasContactCtaSignal: boolean; }; type StoredScreenshot = { storageId: Id<"_storage">; viewport: "desktop" | "mobile"; sourceUrl: string; capturedAt: number; width: number; height: number; mimeType: string; }; type WebsiteLead = Pick< Doc<"leads">, "_id" | "websiteUrl" | "contactStatus" >; type StartedLead = { lead: WebsiteLead; }; type ServerlessChromiumModule = { args: string[]; executablePath: (input?: string) => Promise; inflate: (filePath: string) => Promise; setupLambdaEnvironment: (baseLibPath: string) => void; }; function messageFromError(error: unknown) { return error instanceof Error ? error.message : String(error); } function readPositiveIntEnv(key: string, fallback: number) { const raw = process.env[key]?.trim(); if (!raw) { return fallback; } const parsed = Number.parseInt(raw, 10); return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; } function crawlTimeoutMs() { return readPositiveIntEnv("TASK8_CRAWL_TIMEOUT_MS", DEFAULT_CRAWL_TIMEOUT_MS); } function crawlMaxPages() { return Math.max( 1, Math.min( DEFAULT_CRAWL_MAX_PAGES, readPositiveIntEnv("TASK8_CRAWL_MAX_PAGES", DEFAULT_CRAWL_MAX_PAGES), ), ); } function makePageKind(url: string, rootUrl: string): EnrichmentPageKind { const normalizedRoot = normalizeCrawlUrl(rootUrl); if (!normalizedRoot) { return "other"; } const homepagePath = new URL(normalizedRoot).pathname.replace(/\/$/, "") || "/"; let pageUrl: string; try { pageUrl = new URL(url).pathname.toLowerCase(); } catch { return "other"; } if (pageUrl === homepagePath || pageUrl === homepagePath.replace(/\/$/, "")) { return "homepage"; } const normalizedPath = pageUrl.toLowerCase(); if (/(?:^|\/)(kontakt|contact)(?:[-/]|$)/.test(normalizedPath)) { return "contact"; } if (/(?:^|\/)(impressum|imprint)(?:[-/]|$)/.test(normalizedPath)) { return "impressum"; } if (/(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/.test(normalizedPath)) { return "services"; } if (/(?:^|\/)(ueber|über|about|team)(?:[-/]|$)/.test(normalizedPath)) { return "about"; } return "other"; } function trimExcerpt(value: string) { return value.replace(/\s+/g, " ").trim().slice(0, 1200); } function isGenericBusinessEmail(email: string) { const local = email.split("@")[0]?.toLowerCase() ?? ""; const base = local.split("+")[0] ?? ""; return GENERIC_EMAIL_LOCALS.has(base); } async function loadPlaywrightModules() { const [playwrightCore, chromiumPackage] = await Promise.all([ import("playwright-core"), import("@sparticuz/chromium-min"), ]); return { playwrightCore, serverlessChromium: { args: chromiumPackage.default.args, executablePath: chromiumPackage.default.executablePath, inflate: chromiumPackage.inflate, setupLambdaEnvironment: chromiumPackage.setupLambdaEnvironment, } as ServerlessChromiumModule, }; } function getChromiumExecutableSource() { for (const key of CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS) { const value = process.env[key]?.trim(); if (value) { return value; } } return null; } function getChromiumSourceMarker(source: string) { return createHash("sha256").update(source).digest("hex"); } async function clearChromiumCacheForSourceMismatch(executableSource: string) { const nextMarker = getChromiumSourceMarker(executableSource); const marker = await readFile(CHROMIUM_SOURCE_MARKER_FILE, "utf8").catch(() => null); if ((marker ?? "").trim() === nextMarker) { return; } await Promise.all([ rm(CHROMIUM_EXECUTABLE_PATH, { force: true, recursive: true }), rm(CHROMIUM_PACK_PATH, { force: true, recursive: true }), ]); } async function resolveChromiumExecutablePath( chromium: ServerlessChromiumModule, ) { const executableSource = getChromiumExecutableSource(); if (!executableSource) { throw new Error( `Set TASK8_BROWSER_ASSET_URL (or legacy TASK8_CHROMIUM_EXECUTABLE_URL / TASK8_CHROMIUM_EXECUTABLE) to configure the Chromium source; no source is configured.`, ); } await clearChromiumCacheForSourceMismatch(executableSource); const executablePath = await chromium.executablePath(executableSource); await writeFile( CHROMIUM_SOURCE_MARKER_FILE, getChromiumSourceMarker(executableSource), ); return executablePath; } async function captureHomepageScreenshot( ctx: ActionCtx, context: BrowserContext, homepageUrl: string, viewport: "desktop" | "mobile", timeoutMs: number, ) { const page = await context.newPage(); try { await page.goto(homepageUrl, { waitUntil: "domcontentloaded", timeout: timeoutMs, }); const sourceUrl = page.url(); const screenshot = await page.screenshot({ fullPage: true, type: "png", }); const storageId = await ctx.storage.store( new Blob([new Uint8Array(screenshot)], { type: SCREENSHOT_MIME_TYPE }), ); const viewportSize = page.viewportSize() ?? { width: 0, height: 0 }; return { storageId, viewport, sourceUrl, capturedAt: Date.now(), width: viewportSize.width, height: viewportSize.height, mimeType: SCREENSHOT_MIME_TYPE, } satisfies StoredScreenshot; } finally { await page.close(); } } async function crawlPage( context: BrowserContext, targetUrl: string, rootUrl: string, timeoutMs: number, ) { const page = await context.newPage(); try { const response = await page.goto(targetUrl, { waitUntil: "domcontentloaded", timeout: timeoutMs, }); if (!response) { return null; } const finalUrl = page.url(); const title = await page.title().catch(() => ""); const metaDescription = await page .evaluate(() => { const meta = document.querySelector( "meta[name='description']", ) as HTMLMetaElement | null; return meta?.content ?? ""; }) .catch(() => ""); const content = await page.content(); const signals = extractContactSignalsFromHtmlLikeText(content); const headings = await page .evaluate(() => Array.from(document.querySelectorAll("h1, h2, h3")) .map((element) => element.textContent?.trim() ?? "") .filter((heading) => heading.length > 0), ) .catch(() => []); const visibleText = await page.evaluate(() => { return document.body?.innerText ?? ""; }); const rawLinks = await page .evaluate(() => Array.from(document.querySelectorAll("a[href]")).map((anchor) => ({ href: anchor.getAttribute("href") ?? "", text: anchor.textContent?.trim() ?? "", })), ) .catch(() => []); const normalizedLinks = rawLinks .map((link) => { const normalizedHref = normalizeCrawlUrl(link.href, finalUrl); if (!normalizedHref) { return null; } return { href: normalizedHref, text: link.text, isInternal: isSameRegistrableHostishDomain(normalizedHref, rootUrl), }; }) .filter( (entry): entry is { href: string; text: string; isInternal: boolean } => entry !== null, ); const emailCandidates = signals.emailCandidates .map((entry) => { const normalizedEmail = normalizeEmailAddress(entry.email); if (!normalizedEmail) { return null; } return { email: normalizedEmail, emailSource: finalUrl, contactPerson: entry.contactPerson ?? null, isBusinessContactAddress: entry.isBusinessContactAddress, isGeneric: isGenericBusinessEmail(normalizedEmail), sourceUrl: finalUrl, accepted: false, normalizedEmail, }; }) .filter((entry): entry is NonNullable => entry !== null); return { sourceUrl: finalUrl, finalUrl, pageKind: makePageKind(targetUrl, rootUrl), title, metaDescription, headings, visibleText, links: normalizedLinks, emailCandidates, hasContactFormSignal: signals.hasContactFormSignal, hasContactCtaSignal: signals.hasContactCtaSignal, } satisfies PageResult; } finally { await page.close(); } } function deduplicateLeadEmailCandidates( candidates: PageResult["emailCandidates"], ) { const unique = new Map(); for (const candidate of candidates) { if (!unique.has(candidate.normalizedEmail)) { unique.set(candidate.normalizedEmail, candidate); } } return [...unique.values()]; } function deduplicateCrawlLinks(links: PersistedCrawlLink[]) { const unique = new Map(); for (const link of links) { if (!unique.has(link.href)) { unique.set(link.href, link); } } return [...unique.values()]; } export const processLeadEnrichment = internalAction({ args: { runId: v.id("agentRuns") }, handler: async (ctx, args) => { let started: StartedLead | null = null; const runId = args.runId; let browser: Browser | null = null; let desktopContext: BrowserContext | null = null; let mobileContext: BrowserContext | null = null; try { started = await ctx.runMutation(internal.websiteEnrichment.startLeadEnrichmentRun, { runId, }); if (!started) { return null; } const rootUrl = normalizeCrawlUrl(started.lead.websiteUrl); if (!rootUrl) { await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, { runId, status: "failed", currentStep: "website_enrichment", errorSummary: "Ungültige Website-URL.", errors: 1, }); await ctx.runMutation(api.runs.appendEvent, { runId, level: "error", message: "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.", details: [{ label: "Lead", value: started.lead._id }], }); await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, { leadId: started.lead._id, currentContactStatus: started.lead.contactStatus, contactStatusReason: "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.", }); return null; } const timeoutMs = crawlTimeoutMs(); const maxPages = crawlMaxPages(); const { playwrightCore, serverlessChromium } = await loadPlaywrightModules(); const executablePath = await resolveChromiumExecutablePath( serverlessChromium, ); const prepareChromiumSharedLibraries = async ( chromiumRuntime: ServerlessChromiumModule, ) => { const runtimeArchivePath = path.join( CHROMIUM_PACK_PATH, "al2023.tar.br", ); await access(runtimeArchivePath).catch(() => { throw new Error( `AL2023 shared library archive not found at ${runtimeArchivePath}; cannot prepare Chromium shared libraries.`, ); }); await chromiumRuntime.inflate(runtimeArchivePath); chromiumRuntime.setupLambdaEnvironment(path.join(tmpdir(), "al2023", "lib")); }; await prepareChromiumSharedLibraries(serverlessChromium); browser = await playwrightCore.chromium.launch({ headless: true, executablePath, args: serverlessChromium.args, }); const { devices } = playwrightCore; desktopContext = await browser.newContext({ ...devices["Desktop Chrome"], }); mobileContext = await browser.newContext({ ...devices["iPhone 11"], }); const homepage = await crawlPage(desktopContext, rootUrl, rootUrl, timeoutMs); if (!homepage) { throw new Error("Homepage konnte nicht geladen werden."); } const requestedPages = discoverRelevantSubpageUrls( homepage.links.map((link) => link.href), rootUrl, ); const crawlTargets = requestedPages.slice(0, maxPages); const crawledPages: PageResult[] = [homepage]; for (const pageUrl of crawlTargets.slice(1)) { const crawled = await crawlPage(desktopContext, pageUrl, rootUrl, timeoutMs); if (crawled) { crawledPages.push(crawled); } } const allLinks: PersistedCrawlLink[] = crawledPages.flatMap((page) => page.links.map((link) => ({ ...link, pageUrl: page.finalUrl, })), ); const internalLinks = allLinks.filter((link) => link.isInternal); const uniqueInternalLinks = [...new Set(internalLinks.map((link) => link.href))]; const checkMap = new Map< string, { status: number | null; isBroken: boolean } >(); for (const href of uniqueInternalLinks.slice(0, 30)) { try { const response = await desktopContext.request.get(href, { timeout: Math.max(1_000, timeoutMs - 1_000), }); const status = response.status(); checkMap.set(href, { status, isBroken: status < 200 || status >= 400, }); } catch { checkMap.set(href, { status: null, isBroken: true, }); } } const desktopScreenshot = await captureHomepageScreenshot( ctx, desktopContext, homepage.finalUrl, "desktop", timeoutMs, ); const mobileScreenshot = await captureHomepageScreenshot( ctx, mobileContext, homepage.finalUrl, "mobile", timeoutMs, ); const technicalInput = buildTechnicalChecks({ rootUrl, finalUrl: homepage.finalUrl, title: homepage.title, metaDescription: homepage.metaDescription, visibleText: homepage.visibleText, checkedUrls: crawledPages.map((page) => page.finalUrl), links: allLinks.map((link) => { const check = checkMap.get(link.href); return { href: link.href, status: check?.status ?? undefined, statusCode: check?.status ?? undefined, isBroken: check?.isBroken, }; }), }); const validCandidates = deduplicateLeadEmailCandidates( crawledPages.flatMap((page) => page.emailCandidates), ); const persistedLinks = deduplicateCrawlLinks(allLinks).slice( 0, MAX_PERSISTED_LINKS, ); const persistedCandidates = validCandidates.slice( 0, MAX_PERSISTED_EMAIL_CANDIDATES, ); const usable = getUsableContactEmailFromEntries( validCandidates.map((candidate) => ({ email: candidate.email, emailSource: candidate.emailSource, contactPerson: candidate.contactPerson, isBusinessContactAddress: candidate.isBusinessContactAddress, })), ); await ctx.runMutation(internal.websiteEnrichment.persistLeadEnrichmentResult, { runId, leadId: started.lead._id, pages: crawledPages.map((page) => ({ sourceUrl: page.sourceUrl, finalUrl: page.finalUrl, pageKind: page.pageKind, title: page.title, metaDescription: page.metaDescription, headings: page.headings, visibleTextExcerpt: trimExcerpt(page.visibleText), hasContactFormSignal: page.hasContactFormSignal, hasContactCtaSignal: page.hasContactCtaSignal, })), links: persistedLinks.map((link) => ({ pageUrl: link.pageUrl, href: link.href, text: link.text, isInternal: link.isInternal, isBroken: checkMap.get(link.href)?.isBroken, })), emailCandidates: persistedCandidates.map((candidate) => ({ email: candidate.email, normalizedEmail: candidate.normalizedEmail, emailSource: candidate.emailSource, sourceUrl: candidate.sourceUrl, contactPerson: candidate.contactPerson ?? undefined, isBusinessContactAddress: candidate.isBusinessContactAddress, isGeneric: candidate.isGeneric, accepted: usable !== null && candidate.normalizedEmail === usable.email, })), screenshots: [ ...(desktopScreenshot ? [desktopScreenshot] : []), ...(mobileScreenshot ? [mobileScreenshot] : []), ], technicalChecks: [ { sourceUrl: homepage.sourceUrl, finalUrl: homepage.finalUrl, usesHttps: technicalInput.https, missingTitle: technicalInput.missingTitle, missingMetaDescription: technicalInput.missingMetaDescription, hasVisibleContactPath: technicalInput.hasVisibleContactPath, brokenInternalLinkCount: technicalInput.brokenInternalLinks.length, }, ], }); if (usable) { await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, { leadId: started.lead._id, email: usable.email, emailSource: usable.emailSource ?? undefined, contactPerson: usable.contactPerson ?? undefined, currentContactStatus: started.lead.contactStatus, }); } else { await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, { leadId: started.lead._id, currentContactStatus: started.lead.contactStatus, contactStatusReason: "Kein verwertbarer Kontakt auf der Website gefunden.", }); } await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, { runId, status: "succeeded", currentStep: "website_enrichment", errors: 0, }); await ctx.runMutation(api.runs.appendEvent, { runId, level: "info", message: usable ? "Website-Enrichment erfolgreich mit nutzbarer E-Mail abgeschlossen." : "Website-Enrichment abgeschlossen, aber ohne nutzbare E-Mail.", }); return runId; } catch (error) { const errorSummary = messageFromError(error); await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, { runId, status: "failed", currentStep: "website_enrichment", errorSummary, errors: 1, }); await ctx.runMutation(api.runs.appendEvent, { runId, level: "error", message: "Website-Enrichment fehlgeschlagen.", details: [ { label: "Fehler", value: errorSummary, source: "website_enrichment" }, ], }); if (started) { await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, { leadId: started.lead._id, currentContactStatus: started.lead.contactStatus, contactStatusReason: `Website-Enrichment fehlgeschlagen: ${errorSummary}`, }); } return null; } finally { if (desktopContext) { await desktopContext.close(); } if (mobileContext) { await mobileContext.close(); } if (browser) { await browser.close(); } } }, });