"use node";

/**
 * Website enrichment action.
 *
 * For one lead this module crawls the lead's website (homepage plus a few
 * relevant subpages), extracts contact signals and e-mail candidates, runs
 * basic technical checks, optionally captures desktop/mobile screenshots, and
 * persists everything through internal mutations.
 *
 * Two execution paths exist:
 *  - a Playwright/Chromium path, used when a Chromium source is configured via
 *    one of the `TASK8_*` environment variables, and
 *  - a browserless `fetch` fallback, used otherwise.
 *
 * Every network/browser step is raced against the remaining action budget.
 */

import type { Browser, BrowserContext } from "playwright-core";
import { createHash } from "node:crypto";
import { access, readFile, rm, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import path from "node:path";
import { v } from "convex/values";
import {
  buildTechnicalChecks,
  discoverRelevantSubpageUrls,
  extractContactSignalsFromHtmlLikeText,
  isSameRegistrableHostishDomain,
  normalizeCrawlUrl,
} from "../lib/website-crawler";
import {
  getUsableContactEmailFromEntries,
  normalizeEmailAddress,
} from "../lib/lead-discovery-google";
import { internal } from "./_generated/api";
import type { Doc, Id } from "./_generated/dataModel";
import { internalAction, type ActionCtx } from "./_generated/server";

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

const DEFAULT_CRAWL_TIMEOUT_MS = 60_000;
const DEFAULT_CRAWL_MAX_PAGES = 5;
const DEFAULT_ACTION_BUDGET_MS = 120_000;
const MIN_ACTION_BUDGET_MS = 30_000;
const MAX_ACTION_BUDGET_MS = 140_000;
/** Safety margin subtracted from the budget when computing remaining time. */
const ACTION_TIMEOUT_BUFFER_MS = 5_000;
const MAX_PERSISTED_LINKS = 120;
const MAX_PERSISTED_EMAIL_CANDIDATES = 40;
/** Upper bound on internal links probed for broken-link detection. */
const MAX_INTERNAL_LINK_CHECKS = 30;
const SCREENSHOT_MIME_TYPE = "image/png";
/** Browserless responses are truncated to this many bytes before parsing. */
const MAX_BROWSERLESS_PAGE_BYTES = 750_000;
const MAX_BROWSERLESS_LINK_TEXT_CHARS = 180;
/** Paths tried by the browserless crawler after the links found on the homepage. */
const BROWSERLESS_CRAWL_PATHS = [
  "/",
  "/kontakt",
  "/impressum",
  "/leistungen",
  "/ueber-uns",
];
const BROWSERLESS_USER_AGENT =
  "Mozilla/5.0 (compatible; WebDevPipelineBot/1.0; +https://webdev-pipeline.local)";
const CHROMIUM_SOURCE_MARKER_FILE = path.join(tmpdir(), "chromium-source.sha256");
const CHROMIUM_EXECUTABLE_PATH = path.join(tmpdir(), "chromium");
const CHROMIUM_PACK_PATH = path.join(tmpdir(), "chromium-pack");
/** Local parts (before `@`, ignoring `+tag`) that mark a shared mailbox. */
const GENERIC_EMAIL_LOCALS = new Set([
  "info",
  "kontakt",
  "contact",
  "sales",
  "team",
  "support",
  "service",
  "hello",
  "marketing",
  "admin",
  "office",
  "impressum",
  "post",
]);
/** Environment variables checked, in order, for the Chromium source. */
const CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS = [
  "TASK8_BROWSER_ASSET_URL",
  "TASK8_CHROMIUM_EXECUTABLE_URL",
  "TASK8_CHROMIUM_EXECUTABLE",
];
/** Named entities handled by `decodeHtmlText` (numeric ones are decoded generically). */
const NAMED_HTML_ENTITIES: Record<string, string> = {
  nbsp: " ",
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

// NOTE(review): "team" is part of the union but `makePageKind` never returns
// it ("/team" is classified as "about"). Confirm whether that is intended.
type EnrichmentPageKind =
  | "homepage"
  | "contact"
  | "impressum"
  | "services"
  | "about"
  | "team"
  | "other";

type CrawlPageLink = {
  href: string;
  text: string;
  isInternal: boolean;
};

type PersistedCrawlLink = CrawlPageLink & {
  pageUrl: string;
};

type PageResult = {
  sourceUrl: string;
  finalUrl: string;
  pageKind: EnrichmentPageKind;
  title: string;
  metaDescription: string;
  headings: string[];
  visibleText: string;
  links: CrawlPageLink[];
  emailCandidates: Array<{
    email: string;
    emailSource: string;
    contactPerson: string | null;
    isBusinessContactAddress: boolean;
    isGeneric: boolean;
    sourceUrl: string;
    accepted: boolean;
    normalizedEmail: string;
  }>;
  hasContactFormSignal: boolean;
  hasContactCtaSignal: boolean;
};

type PageEmailCandidate = PageResult["emailCandidates"][number];

type ExtractedEmailCandidates = ReturnType<
  typeof extractContactSignalsFromHtmlLikeText
>["emailCandidates"];

type StoredScreenshot = {
  storageId: Id<"_storage">;
  viewport: "desktop" | "mobile";
  sourceUrl: string;
  capturedAt: number;
  width: number;
  height: number;
  mimeType: string;
};

type WebsiteLead = Pick<
  Doc<"leads">,
  "_id" | "websiteUrl" | "contactStatus"
>;

type StartedLead = {
  lead: WebsiteLead;
};

// NOTE(review): the Promise type arguments below were unreadable in the
// reviewed source; `string`/`void` are assumed from how the values are used
// here (paths passed on as strings, `close()` result ignored). Confirm against
// the `@sparticuz/chromium-min` and Playwright typings.
type ServerlessChromiumModule = {
  args: string[];
  executablePath: (input?: string) => Promise<string>;
  inflate: (filePath: string) => Promise<string>;
  setupLambdaEnvironment: (baseLibPath: string) => void;
};

type PlaywrightClosableResource = {
  close: () => Promise<void>;
};

// ---------------------------------------------------------------------------
// Generic helpers
// ---------------------------------------------------------------------------

/** Returns a printable message for any thrown value. */
function messageFromError(error: unknown) {
  return error instanceof Error ? error.message : String(error);
}

/** True when the error message indicates the Playwright target was already closed. */
function isPlaywrightTargetClosedError(error: unknown) {
  const message = messageFromError(error);
  return /Target page, context or browser has been closed|Target closed|Browser has been closed|Context has been closed|Page has been closed/i.test(
    message,
  );
}

/**
 * Closes a Playwright page/context/browser without ever throwing.
 * "Already closed" errors are ignored silently; anything else is logged.
 */
async function closePlaywrightResourceSafely(
  resource: PlaywrightClosableResource | null,
  label: string,
) {
  if (!resource) {
    return;
  }

  try {
    await resource.close();
  } catch (error) {
    if (isPlaywrightTargetClosedError(error)) {
      return;
    }
    console.warn(`Playwright cleanup ignored failed close for ${label}.`, {
      error: messageFromError(error),
    });
  }
}

/** Reads a positive integer from `process.env[key]`, or returns `fallback`. */
function readPositiveIntEnv(key: string, fallback: number) {
  const raw = process.env[key]?.trim();
  if (!raw) {
    return fallback;
  }

  const parsed = Number.parseInt(raw, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}

/** Per-page navigation/fetch timeout (`TASK8_CRAWL_TIMEOUT_MS`). */
function crawlTimeoutMs() {
  return readPositiveIntEnv("TASK8_CRAWL_TIMEOUT_MS", DEFAULT_CRAWL_TIMEOUT_MS);
}

/** Pages per lead (`TASK8_CRAWL_MAX_PAGES`), clamped to 1..DEFAULT_CRAWL_MAX_PAGES. */
function crawlMaxPages() {
  return Math.max(
    1,
    Math.min(
      DEFAULT_CRAWL_MAX_PAGES,
      readPositiveIntEnv("TASK8_CRAWL_MAX_PAGES", DEFAULT_CRAWL_MAX_PAGES),
    ),
  );
}

/** Total action budget (`TASK8_ACTION_BUDGET_MS`), clamped to the MIN/MAX constants. */
function actionBudgetMs() {
  return Math.max(
    MIN_ACTION_BUDGET_MS,
    Math.min(
      MAX_ACTION_BUDGET_MS,
      readPositiveIntEnv("TASK8_ACTION_BUDGET_MS", DEFAULT_ACTION_BUDGET_MS),
    ),
  );
}

/**
 * Milliseconds left in the budget after subtracting the safety buffer.
 * Never returns less than 1000, so a step always gets a minimal timeout.
 */
function remainingActionBudgetMs(startedAt: number, budgetMs: number) {
  const elapsed = Date.now() - startedAt;
  return Math.max(1_000, budgetMs - elapsed - ACTION_TIMEOUT_BUFFER_MS);
}

/**
 * Races `promise` against a timer and rejects with a labelled error when the
 * timer wins. The underlying operation is not cancelled; it is merely no
 * longer awaited.
 */
async function withActionTimeout<T>(
  promise: Promise<T>,
  timeoutMs: number,
  label: string,
): Promise<T> {
  let timeout: ReturnType<typeof setTimeout> | null = null;

  try {
    return await Promise.race([
      promise,
      new Promise<never>((_, reject) => {
        timeout = setTimeout(() => {
          reject(
            new Error(
              `Website-Enrichment Zeitbudget ueberschritten: ${label}.`,
            ),
          );
        }, Math.max(1, timeoutMs));
      }),
    ]);
  } finally {
    if (timeout) {
      clearTimeout(timeout);
    }
  }
}

// ---------------------------------------------------------------------------
// Page classification and text helpers
// ---------------------------------------------------------------------------

/**
 * Lower-cases a URL pathname, percent-decodes it when possible and strips
 * trailing slashes so that "/de/" and "/de" compare equal.
 */
function toComparablePath(pathname: string) {
  let decoded = pathname;
  try {
    decoded = decodeURI(pathname);
  } catch {
    // Malformed escape sequence: fall back to the raw pathname.
  }
  return decoded.toLowerCase().replace(/\/+$/, "") || "/";
}

/**
 * Classifies `url` relative to the site root by inspecting its path.
 *
 * Both paths are normalised with `toComparablePath` before comparing, so a
 * trailing slash or an upper-case segment no longer prevents the homepage
 * match, and percent-encoded umlauts ("%C3%BCber") match the "über" keyword.
 */
function makePageKind(url: string, rootUrl: string): EnrichmentPageKind {
  const normalizedRoot = normalizeCrawlUrl(rootUrl);
  if (!normalizedRoot) {
    return "other";
  }

  let homepagePath: string;
  let pagePath: string;
  try {
    homepagePath = toComparablePath(new URL(normalizedRoot).pathname);
    pagePath = toComparablePath(new URL(url).pathname);
  } catch {
    return "other";
  }

  if (pagePath === homepagePath) {
    return "homepage";
  }
  if (/(?:^|\/)(kontakt|contact)(?:[-/]|$)/.test(pagePath)) {
    return "contact";
  }
  if (/(?:^|\/)(impressum|imprint)(?:[-/]|$)/.test(pagePath)) {
    return "impressum";
  }
  if (/(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/.test(pagePath)) {
    return "services";
  }
  if (/(?:^|\/)(ueber|über|about|team)(?:[-/]|$)/.test(pagePath)) {
    return "about";
  }
  return "other";
}

/** Collapses whitespace and caps the text at 1200 characters for persistence. */
function trimExcerpt(value: string) {
  return value.replace(/\s+/g, " ").trim().slice(0, 1200);
}

/** True when the local part (ignoring any `+tag`) is a shared-mailbox name. */
function isGenericBusinessEmail(email: string) {
  const local = email.split("@")[0]?.toLowerCase() ?? "";
  const base = local.split("+")[0] ?? "";
  return GENERIC_EMAIL_LOCALS.has(base);
}

/** Decodes one numeric character reference; returns "" for invalid code points. */
function decodeHtmlCodePoint(rawCode: string, radix: number) {
  const codePoint = Number.parseInt(rawCode, radix);
  if (!Number.isFinite(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
    return "";
  }

  try {
    return String.fromCodePoint(codePoint);
  } catch {
    return "";
  }
}

/**
 * Decodes numeric and a small set of named HTML entities, then collapses
 * whitespace.
 *
 * Decoding happens in a single pass so that double-encoded input such as
 * "&amp;lt;" yields the literal text "&lt;" instead of being decoded twice.
 *
 * NOTE(review): the named-entity patterns were unreadable in the reviewed
 * source; the set handled here (nbsp, amp, lt, gt, quot, apos) is a
 * reconstruction from the visible replacement values. Confirm against history.
 */
function decodeHtmlText(input: string) {
  return input
    .replace(
      /&(?:#(\d+)|#x([0-9a-f]+)|(nbsp|amp|lt|gt|quot|apos));/gi,
      (entity: string, decimal?: string, hex?: string, named?: string) => {
        if (decimal !== undefined) {
          return decodeHtmlCodePoint(decimal, 10);
        }
        if (hex !== undefined) {
          return decodeHtmlCodePoint(hex, 16);
        }
        return NAMED_HTML_ENTITIES[(named ?? "").toLowerCase()] ?? entity;
      },
    )
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Reduces an HTML fragment to plain label text.
 *
 * NOTE(review): the first two patterns were unreadable in the reviewed source
 * and are reconstructed as "drop <script> and <style> elements including
 * their content". Confirm against history.
 */
function stripHtmlForLabel(input: string) {
  return decodeHtmlText(
    input
      .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ")
      .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ")
      .replace(/<[^>]*>/g, " "),
  );
}

/**
 * Extracts one attribute value from a tag's source text ("" when absent).
 *
 * The name must be preceded by whitespace or the start of the string, so that
 * asking for `href` does not match `data-href`. `attribute` is interpolated
 * into a regular expression and must be a plain attribute name.
 */
function getHtmlAttribute(tag: string, attribute: string) {
  const match = new RegExp(
    `(?:^|\\s)${attribute}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s>]+))`,
    "i",
  ).exec(tag);
  const value = match?.[1] ?? match?.[2] ?? match?.[3];
  return value ? decodeHtmlText(value) : "";
}

/** Returns the plain text of the first `<tagName>` element, or "". */
function extractFirstTagText(html: string, tagName: string) {
  const match = new RegExp(
    `<${tagName}\\b[^>]*>([\\s\\S]*?)<\\/${tagName}>`,
    "i",
  ).exec(html);
  return match?.[1] ? stripHtmlForLabel(match[1]) : "";
}

/**
 * Returns the content of the first description-like meta tag
 * (description, og:description, twitter:description), or "".
 */
function extractMetaDescriptionFromHtml(html: string) {
  // Pattern reconstructed (opening "<meta" was unreadable in the reviewed source).
  for (const match of html.matchAll(/<meta\b[^>]*>/gi)) {
    const tag = match[0] ?? "";
    const name =
      getHtmlAttribute(tag, "name") || getHtmlAttribute(tag, "property");
    if (!/^(description|og:description|twitter:description)$/i.test(name)) {
      continue;
    }
    const content = getHtmlAttribute(tag, "content");
    if (content) {
      return content;
    }
  }
  return "";
}

/** Returns up to 12 non-empty h1–h3 heading texts in document order. */
function extractHeadingsFromHtml(html: string) {
  // Pattern reconstructed (opening "<h[1-3]" was unreadable in the reviewed source).
  return Array.from(html.matchAll(/<h[1-3]\b[^>]*>([\s\S]*?)<\/h[1-3]>/gi))
    .map((match) => stripHtmlForLabel(match[1] ?? ""))
    .filter((heading) => heading.length > 0)
    .slice(0, 12);
}

/**
 * Extracts anchors from raw HTML, resolving each href against `finalUrl` and
 * flagging whether it belongs to the same site as `rootUrl`. Anchors whose
 * href cannot be normalised are dropped.
 */
function extractAnchorLinksFromHtml(
  html: string,
  finalUrl: string,
  rootUrl: string,
): CrawlPageLink[] {
  // Pattern reconstructed (opening "<a" was unreadable in the reviewed source).
  return Array.from(
    html.matchAll(/<a\b([^>]*)>([\s\S]*?)<\/a>/gi),
  ).flatMap((match): CrawlPageLink[] => {
    const href = getHtmlAttribute(match[1] ?? "", "href");
    const normalizedHref = normalizeCrawlUrl(href, finalUrl);
    if (!normalizedHref) {
      return [];
    }

    return [
      {
        href: normalizedHref,
        text: stripHtmlForLabel(match[2] ?? "").slice(
          0,
          MAX_BROWSERLESS_LINK_TEXT_CHARS,
        ),
        isInternal: isSameRegistrableHostishDomain(normalizedHref, rootUrl),
      },
    ];
  });
}

/**
 * Turns extracted e-mail signals into page-level candidates attributed to
 * `pageUrl`. Entries whose address cannot be normalised are dropped.
 */
function toPageEmailCandidates(
  entries: ExtractedEmailCandidates,
  pageUrl: string,
): PageEmailCandidate[] {
  return entries.flatMap((entry): PageEmailCandidate[] => {
    const normalizedEmail = normalizeEmailAddress(entry.email);
    if (!normalizedEmail) {
      return [];
    }

    return [
      {
        email: normalizedEmail,
        emailSource: pageUrl,
        contactPerson: entry.contactPerson ?? null,
        isBusinessContactAddress: entry.isBusinessContactAddress,
        isGeneric: isGenericBusinessEmail(normalizedEmail),
        sourceUrl: pageUrl,
        accepted: false,
        normalizedEmail,
      },
    ];
  });
}

// ---------------------------------------------------------------------------
// Browserless crawling
// ---------------------------------------------------------------------------

/**
 * Builds the ordered, de-duplicated list of URLs for the browserless crawler:
 * root first, then subpages discovered from the homepage links, then the
 * fixed fallback paths; capped at `maxPages`.
 */
function makeBrowserlessCrawlTargets(
  rootUrl: string,
  homepageLinks: string[],
  maxPages: number,
) {
  const normalizedRoot = normalizeCrawlUrl(rootUrl);
  if (!normalizedRoot) {
    return [];
  }

  const discoveredUrls = discoverRelevantSubpageUrls(
    homepageLinks,
    normalizedRoot,
  );
  const fallbackUrls = BROWSERLESS_CRAWL_PATHS.map((pathname) =>
    normalizeCrawlUrl(pathname, normalizedRoot),
  ).filter((url): url is string => url !== null);

  const seen = new Set<string>();
  const targets: string[] = [];

  for (const candidate of [normalizedRoot, ...discoveredUrls, ...fallbackUrls]) {
    const normalized = normalizeCrawlUrl(candidate, normalizedRoot);
    if (!normalized || seen.has(normalized)) {
      continue;
    }
    seen.add(normalized);
    targets.push(normalized);
    if (targets.length >= maxPages) {
      break;
    }
  }

  return targets;
}

/**
 * Reads at most `MAX_BROWSERLESS_PAGE_BYTES` from the response body and
 * decodes it with the default `TextDecoder`. The stream is cancelled once the
 * cap is reached.
 *
 * @throws Error when `signal` is already aborted between chunks.
 */
async function readLimitedBrowserlessResponseText(
  response: Response,
  signal?: AbortSignal,
) {
  if (!response.body) {
    return "";
  }

  const reader = response.body.getReader();
  const chunks: Uint8Array[] = [];
  let totalBytes = 0;

  try {
    while (true) {
      if (signal?.aborted) {
        throw new Error("Website-Enrichment Fetch wurde abgebrochen.");
      }

      const { done, value } = await reader.read();
      if (done) {
        break;
      }
      if (!value) {
        continue;
      }

      // Keep only the part of the chunk that still fits under the cap.
      const nextChunk = value.slice(
        0,
        Math.max(0, MAX_BROWSERLESS_PAGE_BYTES - totalBytes),
      );
      if (nextChunk.length > 0) {
        chunks.push(nextChunk);
        totalBytes += nextChunk.length;
      }

      if (totalBytes >= MAX_BROWSERLESS_PAGE_BYTES) {
        await reader.cancel().catch(() => undefined);
        break;
      }
    }
  } finally {
    reader.releaseLock();
  }

  const output = new Uint8Array(totalBytes);
  let offset = 0;
  for (const chunk of chunks) {
    output.set(chunk, offset);
    offset += chunk.length;
  }

  return new TextDecoder().decode(output);
}

/**
 * Fetches `targetUrl` with redirects followed and an abort timer.
 * Returns `null` for HTTP status >= 400 or a non-textual content type.
 */
async function fetchBrowserlessPage(targetUrl: string, timeoutMs: number) {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), Math.max(1, timeoutMs));

  try {
    const response = await fetch(targetUrl, {
      headers: { "User-Agent": BROWSERLESS_USER_AGENT },
      redirect: "follow",
      signal: controller.signal,
    });

    const contentType = response.headers.get("content-type") ?? "";
    if (
      response.status >= 400 ||
      (contentType && !/text|html|xml|xhtml/i.test(contentType))
    ) {
      await response.body?.cancel().catch(() => undefined);
      return null;
    }

    return {
      finalUrl:
        normalizeCrawlUrl(response.url || targetUrl, targetUrl) ?? targetUrl,
      // Awaited inside the try block so the abort timer also covers the body read.
      html: await readLimitedBrowserlessResponseText(
        response,
        controller.signal,
      ),
      status: response.status,
    };
  } finally {
    clearTimeout(timeout);
  }
}

/**
 * Crawls one page via plain `fetch` and regex-based HTML extraction.
 * Returns `null` when the page could not be fetched or is empty.
 */
async function crawlPageWithoutBrowser(
  targetUrl: string,
  rootUrl: string,
  timeoutMs: number,
) {
  const fetched = await fetchBrowserlessPage(targetUrl, timeoutMs);
  if (!fetched || !fetched.html.trim()) {
    return null;
  }

  const finalUrl = fetched.finalUrl;
  const signals = extractContactSignalsFromHtmlLikeText(fetched.html);

  return {
    sourceUrl: targetUrl,
    finalUrl,
    pageKind: makePageKind(finalUrl, rootUrl),
    title: extractFirstTagText(fetched.html, "title"),
    metaDescription: extractMetaDescriptionFromHtml(fetched.html),
    headings: extractHeadingsFromHtml(fetched.html),
    visibleText: signals.visibleText,
    links: extractAnchorLinksFromHtml(fetched.html, finalUrl, rootUrl),
    emailCandidates: toPageEmailCandidates(signals.emailCandidates, finalUrl),
    hasContactFormSignal: signals.hasContactFormSignal,
    hasContactCtaSignal: signals.hasContactCtaSignal,
  } satisfies PageResult;
}

// ---------------------------------------------------------------------------
// Chromium / Playwright setup
// ---------------------------------------------------------------------------

/** Lazily imports Playwright and the serverless Chromium package. */
async function loadPlaywrightModules() {
  const [playwrightCore, chromiumPackage] = await Promise.all([
    import("playwright-core"),
    import("@sparticuz/chromium-min"),
  ]);

  return {
    playwrightCore,
    serverlessChromium: {
      args: chromiumPackage.default.args,
      executablePath: chromiumPackage.default.executablePath,
      inflate: chromiumPackage.inflate,
      setupLambdaEnvironment: chromiumPackage.setupLambdaEnvironment,
    } as ServerlessChromiumModule,
  };
}

/** Returns the first non-empty Chromium source env value, or `null`. */
function getChromiumExecutableSource() {
  for (const key of CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS) {
    const value = process.env[key]?.trim();
    if (value) {
      return value;
    }
  }
  return null;
}

/** SHA-256 hex digest identifying a Chromium source string. */
function getChromiumSourceMarker(source: string) {
  return createHash("sha256").update(source).digest("hex");
}

/**
 * Removes the cached Chromium executable and pack directory when the marker
 * file is missing or was written for a different source.
 */
async function clearChromiumCacheForSourceMismatch(executableSource: string) {
  const nextMarker = getChromiumSourceMarker(executableSource);
  const marker = await readFile(CHROMIUM_SOURCE_MARKER_FILE, "utf8").catch(
    () => null,
  );

  if ((marker ?? "").trim() === nextMarker) {
    return;
  }

  await Promise.all([
    rm(CHROMIUM_EXECUTABLE_PATH, { force: true, recursive: true }),
    rm(CHROMIUM_PACK_PATH, { force: true, recursive: true }),
  ]);
}

/**
 * Resolves the Chromium executable for the configured source and records the
 * source marker afterwards.
 *
 * @throws Error when no source environment variable is set.
 */
async function resolveChromiumExecutablePath(
  chromium: ServerlessChromiumModule,
) {
  const executableSource = getChromiumExecutableSource();
  if (!executableSource) {
    throw new Error(
      `Set TASK8_BROWSER_ASSET_URL (or legacy TASK8_CHROMIUM_EXECUTABLE_URL / TASK8_CHROMIUM_EXECUTABLE) to configure the Chromium source; no source is configured.`,
    );
  }

  await clearChromiumCacheForSourceMismatch(executableSource);
  const executablePath = await chromium.executablePath(executableSource);
  await writeFile(
    CHROMIUM_SOURCE_MARKER_FILE,
    getChromiumSourceMarker(executableSource),
  );
  return executablePath;
}

/**
 * Inflates the AL2023 shared-library archive from the Chromium pack directory
 * and points the runtime environment at the extracted `lib` directory.
 *
 * @throws Error when the archive is not present.
 */
async function prepareChromiumSharedLibraries(
  chromiumRuntime: ServerlessChromiumModule,
) {
  const runtimeArchivePath = path.join(CHROMIUM_PACK_PATH, "al2023.tar.br");

  await access(runtimeArchivePath).catch(() => {
    throw new Error(
      `AL2023 shared library archive not found at ${runtimeArchivePath}; cannot prepare Chromium shared libraries.`,
    );
  });

  await chromiumRuntime.inflate(runtimeArchivePath);
  chromiumRuntime.setupLambdaEnvironment(path.join(tmpdir(), "al2023", "lib"));
}

// ---------------------------------------------------------------------------
// Browser-based crawling
// ---------------------------------------------------------------------------

/**
 * Opens the homepage in `context`, takes a full-page PNG screenshot and stores
 * it in Convex file storage. The page is always closed afterwards.
 */
async function captureHomepageScreenshot(
  ctx: ActionCtx,
  context: BrowserContext,
  homepageUrl: string,
  viewport: "desktop" | "mobile",
  timeoutMs: number,
) {
  const page = await context.newPage();

  try {
    await page.goto(homepageUrl, {
      waitUntil: "domcontentloaded",
      timeout: timeoutMs,
    });

    const sourceUrl = page.url();
    const screenshot = await page.screenshot({
      fullPage: true,
      type: "png",
    });
    const storageId = await ctx.storage.store(
      new Blob([new Uint8Array(screenshot)], { type: SCREENSHOT_MIME_TYPE }),
    );
    const viewportSize = page.viewportSize() ?? { width: 0, height: 0 };

    return {
      storageId,
      viewport,
      sourceUrl,
      capturedAt: Date.now(),
      // Viewport dimensions, not the dimensions of the full-page image.
      width: viewportSize.width,
      height: viewportSize.height,
      mimeType: SCREENSHOT_MIME_TYPE,
    } satisfies StoredScreenshot;
  } finally {
    await closePlaywrightResourceSafely(page, "homepage screenshot page");
  }
}

/**
 * Crawls one page in a Playwright context and extracts title, meta
 * description, headings, visible text, links and e-mail candidates.
 * Returns `null` when navigation yields no response.
 *
 * NOTE(review): unlike `crawlPageWithoutBrowser`, `sourceUrl` is the final URL
 * and `pageKind` is derived from the requested URL. Kept as found; confirm
 * which convention is intended.
 */
async function crawlPage(
  context: BrowserContext,
  targetUrl: string,
  rootUrl: string,
  timeoutMs: number,
) {
  const page = await context.newPage();

  try {
    const response = await page.goto(targetUrl, {
      waitUntil: "domcontentloaded",
      timeout: timeoutMs,
    });
    if (!response) {
      return null;
    }

    const finalUrl = page.url();
    const title = await page.title().catch(() => "");
    const metaDescription = await page
      .evaluate(() => {
        const meta = document.querySelector(
          "meta[name='description']",
        ) as HTMLMetaElement | null;
        return meta?.content ?? "";
      })
      .catch(() => "");

    const content = await page.content();
    const signals = extractContactSignalsFromHtmlLikeText(content);

    const headings = await page
      .evaluate(() =>
        Array.from(document.querySelectorAll("h1, h2, h3"))
          .map((element) => element.textContent?.trim() ?? "")
          .filter((heading) => heading.length > 0),
      )
      .catch(() => []);

    const visibleText = await page.evaluate(() => {
      return document.body?.innerText ?? "";
    });

    const rawLinks = await page
      .evaluate(() =>
        Array.from(document.querySelectorAll("a[href]")).map((anchor) => ({
          href: anchor.getAttribute("href") ?? "",
          text: anchor.textContent?.trim() ?? "",
        })),
      )
      .catch(() => []);

    const normalizedLinks = rawLinks.flatMap((link): CrawlPageLink[] => {
      const normalizedHref = normalizeCrawlUrl(link.href, finalUrl);
      if (!normalizedHref) {
        return [];
      }
      return [
        {
          href: normalizedHref,
          text: link.text,
          isInternal: isSameRegistrableHostishDomain(normalizedHref, rootUrl),
        },
      ];
    });

    return {
      sourceUrl: finalUrl,
      finalUrl,
      pageKind: makePageKind(targetUrl, rootUrl),
      title,
      metaDescription,
      headings,
      visibleText,
      links: normalizedLinks,
      emailCandidates: toPageEmailCandidates(signals.emailCandidates, finalUrl),
      hasContactFormSignal: signals.hasContactFormSignal,
      hasContactCtaSignal: signals.hasContactCtaSignal,
    } satisfies PageResult;
  } finally {
    await closePlaywrightResourceSafely(page, "crawl page");
  }
}

// ---------------------------------------------------------------------------
// Result shaping
// ---------------------------------------------------------------------------

/** Keeps the first candidate seen for each normalised e-mail address. */
function deduplicateLeadEmailCandidates(
  candidates: PageResult["emailCandidates"],
) {
  const unique = new Map<string, PageEmailCandidate>();
  for (const candidate of candidates) {
    if (!unique.has(candidate.normalizedEmail)) {
      unique.set(candidate.normalizedEmail, candidate);
    }
  }
  return [...unique.values()];
}

/** Keeps the first link seen for each href. */
function deduplicateCrawlLinks(links: PersistedCrawlLink[]) {
  const unique = new Map<string, PersistedCrawlLink>();
  for (const link of links) {
    if (!unique.has(link.href)) {
      unique.set(link.href, link);
    }
  }
  return [...unique.values()];
}

/** Attaches the URL of the containing page to every link of every crawled page. */
function collectCrawlLinks(pages: PageResult[]): PersistedCrawlLink[] {
  return pages.flatMap((page) =>
    page.links.map((link) => ({
      ...link,
      pageUrl: page.finalUrl,
    })),
  );
}

/** Maps a crawled page to the shape sent to `persistLeadEnrichmentResult`. */
function toPersistedPage(page: PageResult) {
  return {
    sourceUrl: page.sourceUrl,
    finalUrl: page.finalUrl,
    pageKind: page.pageKind,
    title: page.title,
    metaDescription: page.metaDescription,
    headings: page.headings,
    visibleTextExcerpt: trimExcerpt(page.visibleText),
    hasContactFormSignal: page.hasContactFormSignal,
    hasContactCtaSignal: page.hasContactCtaSignal,
  };
}

/**
 * Maps an e-mail candidate to the persisted shape. A candidate is marked
 * accepted when its normalised address equals `acceptedEmail`.
 */
function toPersistedEmailCandidate(
  candidate: PageEmailCandidate,
  acceptedEmail: string | null,
) {
  return {
    email: candidate.email,
    normalizedEmail: candidate.normalizedEmail,
    emailSource: candidate.emailSource,
    sourceUrl: candidate.sourceUrl,
    contactPerson: candidate.contactPerson ?? undefined,
    isBusinessContactAddress: candidate.isBusinessContactAddress,
    isGeneric: candidate.isGeneric,
    accepted:
      acceptedEmail !== null && candidate.normalizedEmail === acceptedEmail,
  };
}

/** Picks the usable contact e-mail (if any) from de-duplicated candidates. */
function pickUsableContact(candidates: PageEmailCandidate[]) {
  return getUsableContactEmailFromEntries(
    candidates.map((candidate) => ({
      email: candidate.email,
      emailSource: candidate.emailSource,
      contactPerson: candidate.contactPerson,
      isBusinessContactAddress: candidate.isBusinessContactAddress,
    })),
  );
}

/**
 * Queues the PageSpeed audit for the lead. A failure to queue is recorded as
 * a warning event on the run instead of failing the enrichment.
 */
async function queuePageSpeedAuditSafely(
  ctx: ActionCtx,
  runId: Id<"agentRuns">,
  leadId: WebsiteLead["_id"],
) {
  try {
    await ctx.runMutation(internal.pageSpeed.queueLeadPageSpeedAudit, {
      leadId,
      parentRunId: runId,
    });
  } catch (pageSpeedQueueError) {
    await ctx.runMutation(internal.runs.appendEventInternal, {
      runId,
      level: "warning",
      message:
        "PageSpeed-Analyse konnte nicht in die Warteschlange gesetzt werden.",
      details: [
        { label: "Lead", value: leadId },
        {
          label: "Fehler",
          value: messageFromError(pageSpeedQueueError),
          source: "pagespeed_queue",
        },
      ],
    });
  }
}

// ---------------------------------------------------------------------------
// Browserless enrichment flow
// ---------------------------------------------------------------------------

/**
 * Runs the enrichment with plain HTTP fetches (no screenshots, no broken-link
 * probing), persists the result, updates the lead and finishes the run.
 *
 * @returns the run id on success.
 * @throws when the homepage cannot be loaded or a step exceeds the budget; the
 *   caller's catch block records the failure.
 */
async function processLeadEnrichmentWithoutBrowser(
  ctx: ActionCtx,
  args: {
    runId: Id<"agentRuns">;
    lead: WebsiteLead;
    rootUrl: string;
    timeoutMs: number;
    maxPages: number;
    actionStartedAt: number;
    actionBudget: number;
  },
): Promise<Id<"agentRuns">> {
  const {
    runId,
    lead,
    rootUrl,
    timeoutMs,
    maxPages,
    actionStartedAt,
    actionBudget,
  } = args;

  await ctx.runMutation(internal.runs.appendEventInternal, {
    runId,
    level: "warning",
    message:
      "Chromium ist nicht konfiguriert; Website-Enrichment nutzt browserlosen Fetch-Fallback.",
    details: [{ label: "Lead", value: lead._id }],
  });

  const homepage = await withActionTimeout(
    crawlPageWithoutBrowser(
      rootUrl,
      rootUrl,
      Math.min(timeoutMs, remainingActionBudgetMs(actionStartedAt, actionBudget)),
    ),
    remainingActionBudgetMs(actionStartedAt, actionBudget),
    "Homepage browserlos crawlen",
  );
  if (!homepage) {
    throw new Error(
      "Homepage konnte im browserlosen Fallback nicht geladen werden.",
    );
  }

  const crawlTargets = makeBrowserlessCrawlTargets(
    rootUrl,
    homepage.links.map((link) => link.href),
    maxPages,
  );

  const crawledPages: PageResult[] = [homepage];
  // Final URLs already crawled, so redirect targets are not fetched twice.
  const crawledUrls = new Set<string>();
  const normalizedHomepageUrl = normalizeCrawlUrl(homepage.finalUrl, rootUrl);
  if (normalizedHomepageUrl) {
    crawledUrls.add(normalizedHomepageUrl);
  }

  // The first target is the root itself, which was crawled above.
  for (const pageUrl of crawlTargets.slice(1)) {
    const normalizedTarget = normalizeCrawlUrl(pageUrl, rootUrl);
    if (!normalizedTarget || crawledUrls.has(normalizedTarget)) {
      continue;
    }

    const crawled = await withActionTimeout(
      crawlPageWithoutBrowser(
        normalizedTarget,
        rootUrl,
        Math.min(
          timeoutMs,
          remainingActionBudgetMs(actionStartedAt, actionBudget),
        ),
      ),
      remainingActionBudgetMs(actionStartedAt, actionBudget),
      `Unterseite browserlos crawlen: ${normalizedTarget}`,
    );

    if (crawled) {
      crawledPages.push(crawled);
      const normalizedCrawledUrl = normalizeCrawlUrl(crawled.finalUrl, rootUrl);
      if (normalizedCrawledUrl) {
        crawledUrls.add(normalizedCrawledUrl);
      }
    }
  }

  const allLinks = collectCrawlLinks(crawledPages);

  const technicalInput = buildTechnicalChecks({
    rootUrl,
    finalUrl: homepage.finalUrl,
    title: homepage.title,
    metaDescription: homepage.metaDescription,
    visibleText: homepage.visibleText,
    checkedUrls: crawledPages.map((page) => page.finalUrl),
    links: allLinks.map((link) => link.href),
  });

  const validCandidates = deduplicateLeadEmailCandidates(
    crawledPages.flatMap((page) => page.emailCandidates),
  );
  const persistedLinks = deduplicateCrawlLinks(allLinks).slice(
    0,
    MAX_PERSISTED_LINKS,
  );
  const persistedCandidates = validCandidates.slice(
    0,
    MAX_PERSISTED_EMAIL_CANDIDATES,
  );
  const usable = pickUsableContact(validCandidates);
  const acceptedEmail = usable?.email ?? null;

  await ctx.runMutation(internal.websiteEnrichment.persistLeadEnrichmentResult, {
    runId,
    leadId: lead._id,
    pages: crawledPages.map(toPersistedPage),
    links: persistedLinks.map((link) => ({
      pageUrl: link.pageUrl,
      href: link.href,
      text: link.text,
      isInternal: link.isInternal,
    })),
    emailCandidates: persistedCandidates.map((candidate) =>
      toPersistedEmailCandidate(candidate, acceptedEmail),
    ),
    screenshots: [],
    technicalChecks: [
      {
        sourceUrl: homepage.sourceUrl,
        finalUrl: homepage.finalUrl,
        usesHttps: technicalInput.https,
        missingTitle: technicalInput.missingTitle,
        missingMetaDescription: technicalInput.missingMetaDescription,
        hasVisibleContactPath: technicalInput.hasVisibleContactPath,
        brokenInternalLinkCount: technicalInput.brokenInternalLinks.length,
      },
    ],
  });

  if (usable) {
    await ctx.runMutation(
      internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
      {
        leadId: lead._id,
        email: usable.email,
        emailSource: usable.emailSource ?? undefined,
        contactPerson: usable.contactPerson ?? undefined,
        currentContactStatus: lead.contactStatus,
      },
    );
  } else {
    await ctx.runMutation(
      internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
      {
        leadId: lead._id,
        currentContactStatus: lead.contactStatus,
        contactStatusReason:
          "Browserloses Website-Enrichment abgeschlossen, aber kein verwertbarer Kontakt gefunden.",
      },
    );
  }

  await queuePageSpeedAuditSafely(ctx, runId, lead._id);

  await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
    runId,
    status: "succeeded",
    currentStep: "website_enrichment",
    errors: 0,
  });
  await ctx.runMutation(internal.runs.appendEventInternal, {
    runId,
    level: "info",
    message: usable
      ? "Website-Enrichment browserlos mit nutzbarer E-Mail abgeschlossen."
      : "Website-Enrichment browserlos abgeschlossen, aber ohne nutzbare E-Mail.",
  });

  return runId;
}

// ---------------------------------------------------------------------------
// Action entry point
// ---------------------------------------------------------------------------

/**
 * Internal action that enriches the lead attached to `runId`.
 *
 * Flow: start the run → validate the website URL → use the browserless
 * fallback when no Chromium source is configured, otherwise launch Chromium,
 * crawl the homepage and relevant subpages, probe internal links, capture
 * screenshots → persist → patch the lead → queue the PageSpeed audit → finish
 * the run.
 *
 * @returns the run id on success; `null` when the run could not be started,
 *   the URL is invalid, or any step failed (the failure is recorded on the
 *   run and on the lead).
 */
export const processLeadEnrichment = internalAction({
  args: { runId: v.id("agentRuns") },
  handler: async (ctx, args): Promise<Id<"agentRuns"> | null> => {
    let started: StartedLead | null = null;
    const runId = args.runId;
    const actionStartedAt = Date.now();
    const actionBudget = actionBudgetMs();
    let browser: Browser | null = null;
    let desktopContext: BrowserContext | null = null;
    let mobileContext: BrowserContext | null = null;

    try {
      started = await ctx.runMutation(
        internal.websiteEnrichment.startLeadEnrichmentRun,
        { runId },
      );
      if (!started) {
        return null;
      }
      const lead = started.lead;

      const rootUrl = normalizeCrawlUrl(lead.websiteUrl);
      if (!rootUrl) {
        // PageSpeed is still queued so the pipeline continues for this lead.
        await queuePageSpeedAuditSafely(ctx, runId, lead._id);

        await ctx.runMutation(
          internal.websiteEnrichment.finishLeadEnrichmentRun,
          {
            runId,
            status: "failed",
            currentStep: "website_enrichment",
            errorSummary: "Ungültige Website-URL.",
            errors: 1,
          },
        );
        await ctx.runMutation(internal.runs.appendEventInternal, {
          runId,
          level: "error",
          message: "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.",
          details: [{ label: "Lead", value: lead._id }],
        });
        await ctx.runMutation(
          internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
          {
            leadId: lead._id,
            currentContactStatus: lead.contactStatus,
            contactStatusReason:
              "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.",
          },
        );
        return null;
      }

      const timeoutMs = crawlTimeoutMs();
      const maxPages = crawlMaxPages();

      if (!getChromiumExecutableSource()) {
        return await processLeadEnrichmentWithoutBrowser(ctx, {
          runId,
          lead,
          rootUrl,
          timeoutMs,
          maxPages,
          actionStartedAt,
          actionBudget,
        });
      }

      // --- Browser setup -------------------------------------------------
      const { playwrightCore, serverlessChromium } = await withActionTimeout(
        loadPlaywrightModules(),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Playwright-Module laden",
      );
      const executablePath = await withActionTimeout(
        resolveChromiumExecutablePath(serverlessChromium),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Chromium executable vorbereiten",
      );
      await withActionTimeout(
        prepareChromiumSharedLibraries(serverlessChromium),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Chromium-Bibliotheken vorbereiten",
      );

      browser = await withActionTimeout(
        playwrightCore.chromium.launch({
          headless: true,
          executablePath,
          args: serverlessChromium.args,
          timeout: remainingActionBudgetMs(actionStartedAt, actionBudget),
        }),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Chromium starten",
      );

      const { devices } = playwrightCore;
      desktopContext = await withActionTimeout(
        browser.newContext({
          ...devices["Desktop Chrome"],
        }),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Desktop-Kontext erstellen",
      );
      mobileContext = await withActionTimeout(
        browser.newContext({
          ...devices["iPhone 11"],
        }),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Mobile-Kontext erstellen",
      );

      // --- Crawl ---------------------------------------------------------
      const homepage = await withActionTimeout(
        crawlPage(
          desktopContext,
          rootUrl,
          rootUrl,
          Math.min(
            timeoutMs,
            remainingActionBudgetMs(actionStartedAt, actionBudget),
          ),
        ),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Homepage crawlen",
      );
      if (!homepage) {
        throw new Error("Homepage konnte nicht geladen werden.");
      }

      // Subpage targets exclude the homepage explicitly instead of assuming
      // that the discovery helper returns the root as its first element;
      // otherwise the first (usually most relevant) subpage could be skipped.
      const plannedUrls = new Set<string>([rootUrl]);
      const normalizedHomepageUrl = normalizeCrawlUrl(
        homepage.finalUrl,
        rootUrl,
      );
      if (normalizedHomepageUrl) {
        plannedUrls.add(normalizedHomepageUrl);
      }

      const subpageTargets: string[] = [];
      const discoveredUrls = discoverRelevantSubpageUrls(
        homepage.links.map((link) => link.href),
        rootUrl,
      );
      for (const candidate of discoveredUrls) {
        // The homepage counts towards `maxPages`.
        if (subpageTargets.length >= maxPages - 1) {
          break;
        }
        const normalized = normalizeCrawlUrl(candidate, rootUrl);
        if (!normalized || plannedUrls.has(normalized)) {
          continue;
        }
        plannedUrls.add(normalized);
        subpageTargets.push(normalized);
      }

      const crawledPages: PageResult[] = [homepage];
      for (const pageUrl of subpageTargets) {
        const crawled = await withActionTimeout(
          crawlPage(
            desktopContext,
            pageUrl,
            rootUrl,
            Math.min(
              timeoutMs,
              remainingActionBudgetMs(actionStartedAt, actionBudget),
            ),
          ),
          remainingActionBudgetMs(actionStartedAt, actionBudget),
          `Unterseite crawlen: ${pageUrl}`,
        );
        if (crawled) {
          crawledPages.push(crawled);
        }
      }

      // --- Internal link probing -------------------------------------------
      const allLinks = collectCrawlLinks(crawledPages);
      const uniqueInternalLinks = [
        ...new Set(
          allLinks.filter((link) => link.isInternal).map((link) => link.href),
        ),
      ];

      const checkMap = new Map<
        string,
        { status: number | null; isBroken: boolean }
      >();
      for (const href of uniqueInternalLinks.slice(0, MAX_INTERNAL_LINK_CHECKS)) {
        try {
          const response = await desktopContext.request.get(href, {
            timeout: Math.min(
              Math.max(1_000, timeoutMs - 1_000),
              remainingActionBudgetMs(actionStartedAt, actionBudget),
            ),
          });
          const status = response.status();
          checkMap.set(href, {
            status,
            isBroken: status < 200 || status >= 400,
          });
        } catch {
          // Network errors and timeouts count as broken with unknown status.
          checkMap.set(href, {
            status: null,
            isBroken: true,
          });
        }
      }

      // --- Screenshots -----------------------------------------------------
      const desktopScreenshot = await withActionTimeout(
        captureHomepageScreenshot(
          ctx,
          desktopContext,
          homepage.finalUrl,
          "desktop",
          Math.min(
            timeoutMs,
            remainingActionBudgetMs(actionStartedAt, actionBudget),
          ),
        ),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Desktop-Screenshot erfassen",
      );
      const mobileScreenshot = await withActionTimeout(
        captureHomepageScreenshot(
          ctx,
          mobileContext,
          homepage.finalUrl,
          "mobile",
          Math.min(
            timeoutMs,
            remainingActionBudgetMs(actionStartedAt, actionBudget),
          ),
        ),
        remainingActionBudgetMs(actionStartedAt, actionBudget),
        "Mobile-Screenshot erfassen",
      );

      // --- Persist ---------------------------------------------------------
      const technicalInput = buildTechnicalChecks({
        rootUrl,
        finalUrl: homepage.finalUrl,
        title: homepage.title,
        metaDescription: homepage.metaDescription,
        visibleText: homepage.visibleText,
        checkedUrls: crawledPages.map((page) => page.finalUrl),
        links: allLinks.map((link) => {
          const check = checkMap.get(link.href);
          return {
            href: link.href,
            status: check?.status ?? undefined,
            statusCode: check?.status ?? undefined,
            isBroken: check?.isBroken,
          };
        }),
      });

      const validCandidates = deduplicateLeadEmailCandidates(
        crawledPages.flatMap((page) => page.emailCandidates),
      );
      const persistedLinks = deduplicateCrawlLinks(allLinks).slice(
        0,
        MAX_PERSISTED_LINKS,
      );
      const persistedCandidates = validCandidates.slice(
        0,
        MAX_PERSISTED_EMAIL_CANDIDATES,
      );
      const usable = pickUsableContact(validCandidates);
      const acceptedEmail = usable?.email ?? null;

      await ctx.runMutation(
        internal.websiteEnrichment.persistLeadEnrichmentResult,
        {
          runId,
          leadId: lead._id,
          pages: crawledPages.map(toPersistedPage),
          links: persistedLinks.map((link) => ({
            pageUrl: link.pageUrl,
            href: link.href,
            text: link.text,
            isInternal: link.isInternal,
            isBroken: checkMap.get(link.href)?.isBroken,
          })),
          emailCandidates: persistedCandidates.map((candidate) =>
            toPersistedEmailCandidate(candidate, acceptedEmail),
          ),
          screenshots: [
            ...(desktopScreenshot ? [desktopScreenshot] : []),
            ...(mobileScreenshot ? [mobileScreenshot] : []),
          ],
          technicalChecks: [
            {
              sourceUrl: homepage.sourceUrl,
              finalUrl: homepage.finalUrl,
              usesHttps: technicalInput.https,
              missingTitle: technicalInput.missingTitle,
              missingMetaDescription: technicalInput.missingMetaDescription,
              hasVisibleContactPath: technicalInput.hasVisibleContactPath,
              brokenInternalLinkCount:
                technicalInput.brokenInternalLinks.length,
            },
          ],
        },
      );

      if (usable) {
        await ctx.runMutation(
          internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
          {
            leadId: lead._id,
            email: usable.email,
            emailSource: usable.emailSource ?? undefined,
            contactPerson: usable.contactPerson ?? undefined,
            currentContactStatus: lead.contactStatus,
          },
        );
      } else {
        await ctx.runMutation(
          internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
          {
            leadId: lead._id,
            currentContactStatus: lead.contactStatus,
            contactStatusReason:
              "Kein verwertbarer Kontakt auf der Website gefunden.",
          },
        );
      }

      await queuePageSpeedAuditSafely(ctx, runId, lead._id);

      await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
        runId,
        status: "succeeded",
        currentStep: "website_enrichment",
        errors: 0,
      });
      await ctx.runMutation(internal.runs.appendEventInternal, {
        runId,
        level: "info",
        message: usable
          ? "Website-Enrichment erfolgreich mit nutzbarer E-Mail abgeschlossen."
          : "Website-Enrichment abgeschlossen, aber ohne nutzbare E-Mail.",
      });

      return runId;
    } catch (error) {
      const errorSummary = messageFromError(error);

      await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
        runId,
        status: "failed",
        currentStep: "website_enrichment",
        errorSummary,
        errors: 1,
      });
      await ctx.runMutation(internal.runs.appendEventInternal, {
        runId,
        level: "error",
        message: "Website-Enrichment fehlgeschlagen.",
        details: [
          { label: "Fehler", value: errorSummary, source: "website_enrichment" },
        ],
      });

      if (started) {
        // Even after a failed crawl the PageSpeed audit is still queued.
        await queuePageSpeedAuditSafely(ctx, runId, started.lead._id);
        await ctx.runMutation(
          internal.websiteEnrichment.patchLeadFromWebsiteEnrichment,
          {
            leadId: started.lead._id,
            currentContactStatus: started.lead.contactStatus,
            contactStatusReason: `Website-Enrichment fehlgeschlagen: ${errorSummary}`,
          },
        );
      }

      return null;
    } finally {
      // Contexts are closed before the browser; each close is best-effort and
      // a no-op for resources that were never created.
      await closePlaywrightResourceSafely(
        desktopContext,
        "desktop browser context",
      );
      await closePlaywrightResourceSafely(
        mobileContext,
        "mobile browser context",
      );
      await closePlaywrightResourceSafely(browser, "browser");
    }
  },
});