Files
pitchfast/convex/websiteEnrichmentAction.ts

726 lines
21 KiB
TypeScript

"use node";
import type { Browser, BrowserContext } from "playwright-core";
import { createHash } from "node:crypto";
import { access, readFile, rm, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import path from "node:path";
import { v } from "convex/values";
import {
buildTechnicalChecks,
discoverRelevantSubpageUrls,
extractContactSignalsFromHtmlLikeText,
isSameRegistrableHostishDomain,
normalizeCrawlUrl,
} from "../lib/website-crawler";
import {
getUsableContactEmailFromEntries,
normalizeEmailAddress,
} from "../lib/lead-discovery-google";
import { api, internal } from "./_generated/api";
import type { Doc, Id } from "./_generated/dataModel";
import { internalAction, type ActionCtx } from "./_generated/server";
// Fallback timeout in milliseconds, used by crawlTimeoutMs() when
// TASK8_CRAWL_TIMEOUT_MS is unset or not a positive integer.
const DEFAULT_CRAWL_TIMEOUT_MS = 60_000;
// Default page budget per run. crawlMaxPages() also uses this value as the
// upper bound, so TASK8_CRAWL_MAX_PAGES can only lower the budget.
const DEFAULT_CRAWL_MAX_PAGES = 5;
// Caps on how many de-duplicated links / email candidates are persisted per run.
const MAX_PERSISTED_LINKS = 120;
const MAX_PERSISTED_EMAIL_CANDIDATES = 40;
// MIME type attached to stored screenshots (screenshots are captured as PNG).
const SCREENSHOT_MIME_TYPE = "image/png";
// Marker file holding the SHA-256 of the Chromium source string that was last
// resolved; compared against the configured source to detect a stale cache.
const CHROMIUM_SOURCE_MARKER_FILE = path.join(tmpdir(), "chromium-source.sha256");
// Temp-dir paths that are deleted when the configured Chromium source changes.
// CHROMIUM_PACK_PATH is also where the "al2023.tar.br" archive is looked up.
const CHROMIUM_EXECUTABLE_PATH = path.join(tmpdir(), "chromium");
const CHROMIUM_PACK_PATH = path.join(tmpdir(), "chromium-pack");
// Lower-case local parts (the text before "@") that mark an address as a
// generic mailbox; consulted by isGenericBusinessEmail().
const GENERIC_EMAIL_LOCALS = new Set([
"info",
"kontakt",
"contact",
"sales",
"team",
"support",
"service",
"hello",
"marketing",
"admin",
"office",
"impressum",
"post",
]);
// Environment variables that may name the Chromium source, checked in this
// order; the first one with a non-empty trimmed value wins.
const CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS = [
"TASK8_BROWSER_ASSET_URL",
"TASK8_CHROMIUM_EXECUTABLE_URL",
"TASK8_CHROMIUM_EXECUTABLE",
];
/**
 * Classification assigned to a crawled page.
 * NOTE(review): "team" is declared here, but makePageKind() in this file maps
 * team paths to "about" and never returns "team" — confirm whether intended.
 */
type EnrichmentPageKind =
| "homepage"
| "contact"
| "impressum"
| "services"
| "about"
| "team"
| "other";
/** A normalized anchor found on a crawled page. */
type CrawlPageLink = {
// Normalized absolute URL of the link target.
href: string;
// Trimmed text content of the anchor element.
text: string;
// Whether the target is considered to belong to the crawled site's domain.
isInternal: boolean;
};
/** A crawl link annotated with the final URL of the page it was found on. */
type PersistedCrawlLink = CrawlPageLink & {
pageUrl: string;
};
/** Everything crawlPage() extracts from a single page. */
type PageResult = {
// crawlPage() sets both sourceUrl and finalUrl to the URL after navigation.
sourceUrl: string;
finalUrl: string;
pageKind: EnrichmentPageKind;
title: string;
metaDescription: string;
// Non-empty text of h1/h2/h3 elements.
headings: string[];
// innerText of the document body (untrimmed; shortened only when persisted).
visibleText: string;
links: CrawlPageLink[];
emailCandidates: Array<{
// crawlPage() stores the normalized address in both email and normalizedEmail.
email: string;
emailSource: string;
contactPerson: string | null;
isBusinessContactAddress: boolean;
isGeneric: boolean;
sourceUrl: string;
// Always false when produced by crawlPage(); the action recomputes the flag
// against the selected usable email when persisting.
accepted: boolean;
normalizedEmail: string;
}>;
hasContactFormSignal: boolean;
hasContactCtaSignal: boolean;
};
/** Metadata for a homepage screenshot written to Convex file storage. */
type StoredScreenshot = {
storageId: Id<"_storage">;
viewport: "desktop" | "mobile";
sourceUrl: string;
// Epoch milliseconds (Date.now()) at capture time.
capturedAt: number;
// Taken from page.viewportSize(), not from the image itself; the screenshot is
// captured with fullPage, so the image may be taller than `height`.
width: number;
height: number;
mimeType: string;
};
/** The subset of lead fields this action reads. */
type WebsiteLead = Pick<
Doc<"leads">,
"_id" | "websiteUrl" | "contactStatus"
>;
/** Shape of the value the action keeps from the start-run mutation. */
type StartedLead = {
lead: WebsiteLead;
};
/**
 * The parts of the serverless Chromium package this file uses; assembled in
 * loadPlaywrightModules().
 */
type ServerlessChromiumModule = {
args: string[];
executablePath: (input?: string) => Promise<string>;
inflate: (filePath: string) => Promise<string>;
setupLambdaEnvironment: (baseLibPath: string) => void;
};
/**
 * Turns an arbitrary thrown value into a printable message.
 *
 * @param error - Whatever was caught.
 * @returns The `message` of an `Error` instance, otherwise `String(error)`.
 */
function messageFromError(error: unknown) {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
/**
 * Reads an environment variable as a positive integer.
 *
 * The value is trimmed and parsed with `Number.parseInt(…, 10)`, so a numeric
 * prefix is accepted ("15px" yields 15, "2.9" yields 2).
 *
 * @param key - Name of the environment variable.
 * @param fallback - Returned when the variable is unset, blank, not numeric,
 *   or not greater than zero.
 * @returns The parsed integer or `fallback`.
 */
function readPositiveIntEnv(key: string, fallback: number) {
  const configured = process.env[key]?.trim();
  if (configured === undefined || configured === "") {
    return fallback;
  }
  const value = Number.parseInt(configured, 10);
  if (!Number.isFinite(value) || value <= 0) {
    return fallback;
  }
  return value;
}
/**
 * Timeout in milliseconds for crawl navigation: TASK8_CRAWL_TIMEOUT_MS when it
 * is a positive integer, otherwise DEFAULT_CRAWL_TIMEOUT_MS.
 */
function crawlTimeoutMs() {
  const timeoutMs = readPositiveIntEnv(
    "TASK8_CRAWL_TIMEOUT_MS",
    DEFAULT_CRAWL_TIMEOUT_MS,
  );
  return timeoutMs;
}
/**
 * Page budget for one crawl: TASK8_CRAWL_MAX_PAGES clamped to the range
 * 1..DEFAULT_CRAWL_MAX_PAGES (the default doubles as the upper bound).
 */
function crawlMaxPages() {
  const requested = readPositiveIntEnv(
    "TASK8_CRAWL_MAX_PAGES",
    DEFAULT_CRAWL_MAX_PAGES,
  );
  const capped =
    requested > DEFAULT_CRAWL_MAX_PAGES ? DEFAULT_CRAWL_MAX_PAGES : requested;
  return capped < 1 ? 1 : capped;
}
/**
 * Classifies a page URL relative to the crawl root by inspecting its path.
 *
 * Paths are compared in a canonical form: percent-decoded, Unicode NFC,
 * lower-cased, and without trailing slashes. `URL.pathname` is percent-encoded,
 * so without decoding a path such as "/über-uns" could never match the "über"
 * keyword; without normalizing both sides, "/Shop" or "/shop/" would not be
 * recognized as the homepage of a root at "/Shop".
 *
 * @param url - Absolute URL of the page to classify.
 * @param rootUrl - Crawl root; normalized with `normalizeCrawlUrl` first.
 * @returns "homepage" when the page path equals the root path, otherwise the
 *   first keyword group that matches a path segment, otherwise "other". Also
 *   "other" when either URL cannot be parsed. Team pages are reported as
 *   "about" (the "team" kind is never returned).
 */
function makePageKind(url: string, rootUrl: string): EnrichmentPageKind {
  // Canonical path for comparisons; null when the input is not an absolute URL.
  const comparablePath = (absoluteUrl: string): string | null => {
    let pathname: string;
    try {
      pathname = new URL(absoluteUrl).pathname;
    } catch {
      return null;
    }
    try {
      pathname = decodeURIComponent(pathname);
    } catch {
      // Malformed percent escape: fall back to the raw (encoded) pathname.
    }
    return pathname.normalize("NFC").toLowerCase().replace(/\/+$/, "") || "/";
  };

  const normalizedRoot = normalizeCrawlUrl(rootUrl);
  if (!normalizedRoot) {
    return "other";
  }
  const homepagePath = comparablePath(normalizedRoot);
  const pagePath = comparablePath(url);
  if (homepagePath === null || pagePath === null) {
    return "other";
  }
  if (pagePath === homepagePath) {
    return "homepage";
  }
  // Each keyword must be a whole path segment or be followed by "-".
  if (/(?:^|\/)(kontakt|contact)(?:[-/]|$)/.test(pagePath)) {
    return "contact";
  }
  if (/(?:^|\/)(impressum|imprint)(?:[-/]|$)/.test(pagePath)) {
    return "impressum";
  }
  if (/(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/.test(pagePath)) {
    return "services";
  }
  if (/(?:^|\/)(ueber|über|about|team)(?:[-/]|$)/.test(pagePath)) {
    return "about";
  }
  return "other";
}
/**
 * Builds a compact excerpt: whitespace runs become single spaces, leading and
 * trailing whitespace is dropped, and the result is cut to 1200 characters.
 *
 * @param value - Raw text.
 * @returns The collapsed text, at most 1200 UTF-16 code units long.
 */
function trimExcerpt(value: string) {
  const words = value.split(/\s+/).filter((word) => word.length > 0);
  const collapsed = words.join(" ");
  return collapsed.slice(0, 1200);
}
/**
 * Tells whether an address belongs to a generic mailbox.
 *
 * The local part (text before the first "@") is lower-cased, any "+suffix" is
 * dropped, and the remainder is looked up in GENERIC_EMAIL_LOCALS.
 *
 * @param email - Email address to inspect.
 * @returns `true` when the base local part is a known generic name.
 */
function isGenericBusinessEmail(email: string) {
  const [localPart = ""] = email.split("@");
  const [baseLocal = ""] = localPart.toLowerCase().split("+");
  return GENERIC_EMAIL_LOCALS.has(baseLocal);
}
/**
 * Dynamically imports playwright-core and @sparticuz/chromium-min in parallel
 * and returns them together with the Chromium helpers this file uses.
 *
 * `executablePath` is exposed through a closure instead of being copied off
 * the class: a detached method is invoked with `this` undefined, which breaks
 * any implementation that refers to `this` internally.
 * NOTE(review): confirm against the installed @sparticuz/chromium-min version
 * that `inflate` and `setupLambdaEnvironment` are exported from the package
 * root; the type assertion below would hide a missing export until runtime.
 *
 * @returns `playwrightCore` (the imported module) and `serverlessChromium`
 *   (launch args, executable resolver, archive inflater, environment setup).
 */
async function loadPlaywrightModules() {
  const [playwrightCore, chromiumPackage] = await Promise.all([
    import("playwright-core"),
    import("@sparticuz/chromium-min"),
  ]);
  const chromium = chromiumPackage.default;
  return {
    playwrightCore,
    serverlessChromium: {
      args: chromium.args,
      // Keep `chromium` as the receiver of the call.
      executablePath: (input?: string) => chromium.executablePath(input),
      inflate: chromiumPackage.inflate,
      setupLambdaEnvironment: chromiumPackage.setupLambdaEnvironment,
    } as ServerlessChromiumModule,
  };
}
/**
 * Looks up the configured Chromium source.
 *
 * @returns The trimmed value of the first variable in
 *   CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS that is set and non-blank, or `null`
 *   when none is.
 */
function getChromiumExecutableSource() {
  const configured = CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS.map((name) =>
    process.env[name]?.trim(),
  ).find((candidate) => Boolean(candidate));
  return configured ?? null;
}
/**
 * Fingerprints a Chromium source string.
 *
 * @param source - The configured source (as read from the environment).
 * @returns Lower-case hex SHA-256 digest of `source`.
 */
function getChromiumSourceMarker(source: string) {
  const hasher = createHash("sha256");
  hasher.update(source);
  return hasher.digest("hex");
}
/**
 * Drops the cached Chromium files when they were produced from a different
 * source than the one currently configured.
 *
 * The stored marker (CHROMIUM_SOURCE_MARKER_FILE) is compared with the hash of
 * `executableSource`; an unreadable marker counts as a mismatch. On mismatch
 * CHROMIUM_EXECUTABLE_PATH and CHROMIUM_PACK_PATH are removed recursively
 * (missing paths are ignored). The marker file itself is not rewritten here.
 *
 * @param executableSource - The currently configured Chromium source.
 */
async function clearChromiumCacheForSourceMismatch(executableSource: string) {
  const expectedMarker = getChromiumSourceMarker(executableSource);
  let storedMarker: string;
  try {
    storedMarker = (await readFile(CHROMIUM_SOURCE_MARKER_FILE, "utf8")).trim();
  } catch {
    // No readable marker: treat as "never resolved".
    storedMarker = "";
  }
  if (storedMarker === expectedMarker) {
    return;
  }
  const staleTargets = [CHROMIUM_EXECUTABLE_PATH, CHROMIUM_PACK_PATH];
  await Promise.all(
    staleTargets.map((target) => rm(target, { force: true, recursive: true })),
  );
}
/**
 * Resolves the path of a runnable Chromium binary for the configured source.
 *
 * Steps: read the source from the environment, clear cached files if they
 * belong to another source, let the chromium module resolve the executable,
 * then record the source hash in the marker file.
 *
 * @param chromium - Chromium helper module.
 * @returns Path of the Chromium executable.
 * @throws Error when no Chromium source is configured; errors from the
 *   chromium module or the file system propagate.
 */
async function resolveChromiumExecutablePath(
  chromium: ServerlessChromiumModule,
) {
  const source = getChromiumExecutableSource();
  if (!source) {
    throw new Error(
      `Set TASK8_BROWSER_ASSET_URL (or legacy TASK8_CHROMIUM_EXECUTABLE_URL / TASK8_CHROMIUM_EXECUTABLE) to configure the Chromium source; no source is configured.`,
    );
  }

  await clearChromiumCacheForSourceMismatch(source);
  const resolvedPath = await chromium.executablePath(source);

  // Written only after a successful resolve, so a failed attempt is retried
  // from a clean cache next time.
  const marker = getChromiumSourceMarker(source);
  await writeFile(CHROMIUM_SOURCE_MARKER_FILE, marker);
  return resolvedPath;
}
/**
 * Opens the homepage in a fresh page of `context`, takes a full-page PNG
 * screenshot, and stores it in Convex file storage.
 *
 * @param ctx - Action context providing `storage.store`.
 * @param context - Browser context to open the page in.
 * @param homepageUrl - URL to navigate to.
 * @param viewport - Label recorded with the screenshot.
 * @param timeoutMs - Navigation timeout in milliseconds.
 * @returns Metadata for the stored screenshot. `width`/`height` are the page's
 *   viewport size (0×0 if unavailable), not the dimensions of the image.
 */
async function captureHomepageScreenshot(
  ctx: ActionCtx,
  context: BrowserContext,
  homepageUrl: string,
  viewport: "desktop" | "mobile",
  timeoutMs: number,
) {
  const tab = await context.newPage();
  try {
    await tab.goto(homepageUrl, {
      waitUntil: "domcontentloaded",
      timeout: timeoutMs,
    });
    // URL after any redirects.
    const resolvedUrl = tab.url();
    const pngBytes = await tab.screenshot({ fullPage: true, type: "png" });
    const imageBlob = new Blob([new Uint8Array(pngBytes)], {
      type: SCREENSHOT_MIME_TYPE,
    });
    const storageId = await ctx.storage.store(imageBlob);
    const { width, height } = tab.viewportSize() ?? { width: 0, height: 0 };
    return {
      storageId,
      viewport,
      sourceUrl: resolvedUrl,
      capturedAt: Date.now(),
      width,
      height,
      mimeType: SCREENSHOT_MIME_TYPE,
    } satisfies StoredScreenshot;
  } finally {
    await tab.close();
  }
}
/**
 * Loads one page and extracts the data the enrichment run persists.
 *
 * Title, meta description, headings and links fall back to empty values when
 * they cannot be read; reading the page HTML or the visible text has no
 * fallback, so failures there (and navigation errors) propagate to the caller.
 *
 * @param context - Browser context to open the page in.
 * @param targetUrl - URL to navigate to; also the URL used for classification.
 * @param rootUrl - Crawl root, used to classify the page and flag internal links.
 * @param timeoutMs - Navigation timeout in milliseconds.
 * @returns The extracted page data, or `null` when navigation produced no
 *   response.
 */
async function crawlPage(
  context: BrowserContext,
  targetUrl: string,
  rootUrl: string,
  timeoutMs: number,
) {
  const tab = await context.newPage();
  try {
    const navigation = await tab.goto(targetUrl, {
      waitUntil: "domcontentloaded",
      timeout: timeoutMs,
    });
    if (!navigation) {
      return null;
    }

    // URL after redirects; recorded as the source of everything found here.
    const finalUrl = tab.url();

    const title = await tab.title().catch(() => "");
    const metaDescription = await tab
      .evaluate(() => {
        const meta = document.querySelector(
          "meta[name='description']",
        ) as HTMLMetaElement | null;
        return meta?.content ?? "";
      })
      .catch(() => "");

    const html = await tab.content();
    const signals = extractContactSignalsFromHtmlLikeText(html);

    const headings = await tab
      .evaluate(() =>
        Array.from(document.querySelectorAll("h1, h2, h3"))
          .map((element) => element.textContent?.trim() ?? "")
          .filter((heading) => heading.length > 0),
      )
      .catch(() => []);

    const visibleText = await tab.evaluate(() => {
      return document.body?.innerText ?? "";
    });

    const anchors = await tab
      .evaluate(() =>
        Array.from(document.querySelectorAll("a[href]")).map((anchor) => ({
          href: anchor.getAttribute("href") ?? "",
          text: anchor.textContent?.trim() ?? "",
        })),
      )
      .catch(() => []);

    // Resolve hrefs against the final URL; anchors that do not normalize to a
    // crawlable URL are dropped.
    const links = anchors.flatMap((anchor): CrawlPageLink[] => {
      const href = normalizeCrawlUrl(anchor.href, finalUrl);
      if (!href) {
        return [];
      }
      return [
        {
          href,
          text: anchor.text,
          isInternal: isSameRegistrableHostishDomain(href, rootUrl),
        },
      ];
    });

    const emailCandidates: PageResult["emailCandidates"] = [];
    for (const entry of signals.emailCandidates) {
      const normalizedEmail = normalizeEmailAddress(entry.email);
      if (!normalizedEmail) {
        continue;
      }
      emailCandidates.push({
        email: normalizedEmail,
        emailSource: finalUrl,
        contactPerson: entry.contactPerson ?? null,
        isBusinessContactAddress: entry.isBusinessContactAddress,
        isGeneric: isGenericBusinessEmail(normalizedEmail),
        sourceUrl: finalUrl,
        // Decided later, once a usable address has been selected.
        accepted: false,
        normalizedEmail,
      });
    }

    return {
      sourceUrl: finalUrl,
      finalUrl,
      // Classified by the requested URL, not the post-redirect one.
      pageKind: makePageKind(targetUrl, rootUrl),
      title,
      metaDescription,
      headings,
      visibleText,
      links,
      emailCandidates,
      hasContactFormSignal: signals.hasContactFormSignal,
      hasContactCtaSignal: signals.hasContactCtaSignal,
    } satisfies PageResult;
  } finally {
    await tab.close();
  }
}
/**
 * Removes candidates whose `normalizedEmail` was already seen.
 *
 * @param candidates - Candidates in discovery order.
 * @returns A new array keeping the first occurrence of each address, in the
 *   original order.
 */
function deduplicateLeadEmailCandidates(
  candidates: PageResult["emailCandidates"],
) {
  const seenEmails = new Set<string>();
  return candidates.filter((candidate) => {
    if (seenEmails.has(candidate.normalizedEmail)) {
      return false;
    }
    seenEmails.add(candidate.normalizedEmail);
    return true;
  });
}
/**
 * Removes links whose `href` was already seen.
 *
 * @param links - Links in discovery order.
 * @returns A new array keeping the first occurrence of each `href`, in the
 *   original order.
 */
function deduplicateCrawlLinks(links: PersistedCrawlLink[]) {
  const seenHrefs = new Set<string>();
  const firstOccurrences: PersistedCrawlLink[] = [];
  for (const link of links) {
    if (seenHrefs.has(link.href)) {
      continue;
    }
    seenHrefs.add(link.href);
    firstOccurrences.push(link);
  }
  return firstOccurrences;
}
/**
 * Internal action that enriches one lead from its website.
 *
 * Flow: start the run, validate the lead's website URL, launch Chromium, crawl
 * the homepage plus a limited set of relevant subpages, probe internal links,
 * capture desktop and mobile homepage screenshots, persist the results, patch
 * the lead with a usable contact email (or a reason why none was found), and
 * finish the run.
 *
 * Any error thrown inside the main flow marks the run as failed, appends an
 * error event, and records the failure reason on the lead (if the run had
 * started). Browser resources are always released; cleanup failures are logged
 * and never replace the action's result.
 *
 * @returns The run id on success; `null` when the run could not be started,
 *   the website URL is invalid, or enrichment failed.
 */
export const processLeadEnrichment = internalAction({
  args: { runId: v.id("agentRuns") },
  handler: async (ctx, args) => {
    let started: StartedLead | null = null;
    const runId = args.runId;
    let browser: Browser | null = null;
    let desktopContext: BrowserContext | null = null;
    let mobileContext: BrowserContext | null = null;
    try {
      started = await ctx.runMutation(internal.websiteEnrichment.startLeadEnrichmentRun, {
        runId,
      });
      if (!started) {
        return null;
      }
      const rootUrl = normalizeCrawlUrl(started.lead.websiteUrl);
      if (!rootUrl) {
        // Invalid URL: fail the run, log it, and note the reason on the lead
        // without launching a browser.
        await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
          runId,
          status: "failed",
          currentStep: "website_enrichment",
          errorSummary: "Ungültige Website-URL.",
          errors: 1,
        });
        await ctx.runMutation(api.runs.appendEvent, {
          runId,
          level: "error",
          message: "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.",
          details: [{ label: "Lead", value: started.lead._id }],
        });
        await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
          leadId: started.lead._id,
          currentContactStatus: started.lead.contactStatus,
          contactStatusReason:
            "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.",
        });
        return null;
      }
      const timeoutMs = crawlTimeoutMs();
      const maxPages = crawlMaxPages();
      const { playwrightCore, serverlessChromium } =
        await loadPlaywrightModules();
      const executablePath = await resolveChromiumExecutablePath(
        serverlessChromium,
      );
      // Inflates the shared-library archive from the Chromium pack directory
      // and points the runtime environment at the extracted libraries.
      const prepareChromiumSharedLibraries = async (
        chromiumRuntime: ServerlessChromiumModule,
      ) => {
        const runtimeArchivePath = path.join(
          CHROMIUM_PACK_PATH,
          "al2023.tar.br",
        );
        await access(runtimeArchivePath).catch(() => {
          throw new Error(
            `AL2023 shared library archive not found at ${runtimeArchivePath}; cannot prepare Chromium shared libraries.`,
          );
        });
        await chromiumRuntime.inflate(runtimeArchivePath);
        chromiumRuntime.setupLambdaEnvironment(path.join(tmpdir(), "al2023", "lib"));
      };
      await prepareChromiumSharedLibraries(serverlessChromium);
      browser = await playwrightCore.chromium.launch({
        headless: true,
        executablePath,
        args: serverlessChromium.args,
      });
      const { devices } = playwrightCore;
      desktopContext = await browser.newContext({
        ...devices["Desktop Chrome"],
      });
      mobileContext = await browser.newContext({
        ...devices["iPhone 11"],
      });
      const homepage = await crawlPage(desktopContext, rootUrl, rootUrl, timeoutMs);
      if (!homepage) {
        throw new Error("Homepage konnte nicht geladen werden.");
      }
      const requestedPages = discoverRelevantSubpageUrls(
        homepage.links.map((link) => link.href),
        rootUrl,
      );
      const crawlTargets = requestedPages.slice(0, maxPages);
      const crawledPages: PageResult[] = [homepage];
      // NOTE(review): slice(1) skips the first discovered URL — presumably
      // discoverRelevantSubpageUrls returns the root first; confirm, otherwise
      // the first relevant subpage is never crawled.
      for (const pageUrl of crawlTargets.slice(1)) {
        const crawled = await crawlPage(desktopContext, pageUrl, rootUrl, timeoutMs);
        if (crawled) {
          crawledPages.push(crawled);
        }
      }
      const allLinks: PersistedCrawlLink[] = crawledPages.flatMap((page) =>
        page.links.map((link) => ({
          ...link,
          pageUrl: page.finalUrl,
        })),
      );
      const internalLinks = allLinks.filter((link) => link.isInternal);
      const uniqueInternalLinks = [...new Set(internalLinks.map((link) => link.href))];
      const checkMap = new Map<
        string,
        { status: number | null; isBroken: boolean }
      >();
      // Probe at most 30 distinct internal links, one after another. A request
      // error counts as broken with no status.
      for (const href of uniqueInternalLinks.slice(0, 30)) {
        try {
          const response = await desktopContext.request.get(href, {
            timeout: Math.max(1_000, timeoutMs - 1_000),
          });
          const status = response.status();
          checkMap.set(href, {
            status,
            isBroken: status < 200 || status >= 400,
          });
        } catch {
          checkMap.set(href, {
            status: null,
            isBroken: true,
          });
        }
      }
      const desktopScreenshot = await captureHomepageScreenshot(
        ctx,
        desktopContext,
        homepage.finalUrl,
        "desktop",
        timeoutMs,
      );
      const mobileScreenshot = await captureHomepageScreenshot(
        ctx,
        mobileContext,
        homepage.finalUrl,
        "mobile",
        timeoutMs,
      );
      const technicalInput = buildTechnicalChecks({
        rootUrl,
        finalUrl: homepage.finalUrl,
        title: homepage.title,
        metaDescription: homepage.metaDescription,
        visibleText: homepage.visibleText,
        checkedUrls: crawledPages.map((page) => page.finalUrl),
        links: allLinks.map((link) => {
          // Links that were not probed carry no status and no broken flag.
          const check = checkMap.get(link.href);
          return {
            href: link.href,
            status: check?.status ?? undefined,
            statusCode: check?.status ?? undefined,
            isBroken: check?.isBroken,
          };
        }),
      });
      const validCandidates = deduplicateLeadEmailCandidates(
        crawledPages.flatMap((page) => page.emailCandidates),
      );
      const persistedLinks = deduplicateCrawlLinks(allLinks).slice(
        0,
        MAX_PERSISTED_LINKS,
      );
      const persistedCandidates = validCandidates.slice(
        0,
        MAX_PERSISTED_EMAIL_CANDIDATES,
      );
      // The usable email is chosen from all candidates, not just the persisted
      // (capped) subset.
      const usable = getUsableContactEmailFromEntries(
        validCandidates.map((candidate) => ({
          email: candidate.email,
          emailSource: candidate.emailSource,
          contactPerson: candidate.contactPerson,
          isBusinessContactAddress: candidate.isBusinessContactAddress,
        })),
      );
      await ctx.runMutation(internal.websiteEnrichment.persistLeadEnrichmentResult, {
        runId,
        leadId: started.lead._id,
        pages: crawledPages.map((page) => ({
          sourceUrl: page.sourceUrl,
          finalUrl: page.finalUrl,
          pageKind: page.pageKind,
          title: page.title,
          metaDescription: page.metaDescription,
          headings: page.headings,
          visibleTextExcerpt: trimExcerpt(page.visibleText),
          hasContactFormSignal: page.hasContactFormSignal,
          hasContactCtaSignal: page.hasContactCtaSignal,
        })),
        links: persistedLinks.map((link) => ({
          pageUrl: link.pageUrl,
          href: link.href,
          text: link.text,
          isInternal: link.isInternal,
          isBroken: checkMap.get(link.href)?.isBroken,
        })),
        emailCandidates: persistedCandidates.map((candidate) => ({
          email: candidate.email,
          normalizedEmail: candidate.normalizedEmail,
          emailSource: candidate.emailSource,
          sourceUrl: candidate.sourceUrl,
          contactPerson: candidate.contactPerson ?? undefined,
          isBusinessContactAddress: candidate.isBusinessContactAddress,
          isGeneric: candidate.isGeneric,
          accepted:
            usable !== null && candidate.normalizedEmail === usable.email,
        })),
        screenshots: [
          ...(desktopScreenshot ? [desktopScreenshot] : []),
          ...(mobileScreenshot ? [mobileScreenshot] : []),
        ],
        technicalChecks: [
          {
            sourceUrl: homepage.sourceUrl,
            finalUrl: homepage.finalUrl,
            usesHttps: technicalInput.https,
            missingTitle: technicalInput.missingTitle,
            missingMetaDescription: technicalInput.missingMetaDescription,
            hasVisibleContactPath: technicalInput.hasVisibleContactPath,
            brokenInternalLinkCount: technicalInput.brokenInternalLinks.length,
          },
        ],
      });
      if (usable) {
        await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
          leadId: started.lead._id,
          email: usable.email,
          emailSource: usable.emailSource ?? undefined,
          contactPerson: usable.contactPerson ?? undefined,
          currentContactStatus: started.lead.contactStatus,
        });
      } else {
        await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
          leadId: started.lead._id,
          currentContactStatus: started.lead.contactStatus,
          contactStatusReason:
            "Kein verwertbarer Kontakt auf der Website gefunden.",
        });
      }
      await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
        runId,
        status: "succeeded",
        currentStep: "website_enrichment",
        errors: 0,
      });
      await ctx.runMutation(api.runs.appendEvent, {
        runId,
        level: "info",
        message: usable
          ? "Website-Enrichment erfolgreich mit nutzbarer E-Mail abgeschlossen."
          : "Website-Enrichment abgeschlossen, aber ohne nutzbare E-Mail.",
      });
      return runId;
    } catch (error) {
      const errorSummary = messageFromError(error);
      await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
        runId,
        status: "failed",
        currentStep: "website_enrichment",
        errorSummary,
        errors: 1,
      });
      await ctx.runMutation(api.runs.appendEvent, {
        runId,
        level: "error",
        message: "Website-Enrichment fehlgeschlagen.",
        details: [
          { label: "Fehler", value: errorSummary, source: "website_enrichment" },
        ],
      });
      if (started) {
        await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
          leadId: started.lead._id,
          currentContactStatus: started.lead.contactStatus,
          contactStatusReason: `Website-Enrichment fehlgeschlagen: ${errorSummary}`,
        });
      }
      return null;
    } finally {
      // Close each resource independently. A rejecting close() must not skip
      // the remaining closes (which would leave the Chromium process running)
      // and must not replace the value returned above with a cleanup error.
      const closeQuietly = async (
        label: string,
        resource: { close: () => Promise<void> } | null,
      ) => {
        if (!resource) {
          return;
        }
        try {
          await resource.close();
        } catch (closeError) {
          console.warn(
            `Website enrichment cleanup: failed to close ${label}: ${messageFromError(closeError)}`,
          );
        }
      };
      await closeQuietly("desktop context", desktopContext);
      await closeQuietly("mobile context", mobileContext);
      await closeQuietly("browser", browser);
    }
  },
});