1450 lines
42 KiB
TypeScript
1450 lines
42 KiB
TypeScript
"use node";
|
|
|
|
import type { Browser, BrowserContext } from "playwright-core";
|
|
import { createHash } from "node:crypto";
|
|
import { access, readFile, rm, writeFile } from "node:fs/promises";
|
|
import { tmpdir } from "node:os";
|
|
import path from "node:path";
|
|
import { v } from "convex/values";
|
|
import {
|
|
buildTechnicalChecks,
|
|
discoverRelevantSubpageUrls,
|
|
extractContactSignalsFromHtmlLikeText,
|
|
isSameRegistrableHostishDomain,
|
|
normalizeCrawlUrl,
|
|
} from "../lib/website-crawler";
|
|
import {
|
|
getUsableContactEmailFromEntries,
|
|
normalizeEmailAddress,
|
|
} from "../lib/lead-discovery-google";
|
|
import { internal } from "./_generated/api";
|
|
import type { Doc, Id } from "./_generated/dataModel";
|
|
import { internalAction, type ActionCtx } from "./_generated/server";
|
|
|
|
/** Per-page navigation/fetch timeout used when TASK8_CRAWL_TIMEOUT_MS is not set. */
const DEFAULT_CRAWL_TIMEOUT_MS = 60_000;
/** Default page count per lead; also the upper bound applied by `crawlMaxPages`. */
const DEFAULT_CRAWL_MAX_PAGES = 5;

/** Overall action budget used when TASK8_ACTION_BUDGET_MS is not set. */
const DEFAULT_ACTION_BUDGET_MS = 120_000;
/** Lower clamp for the configurable action budget. */
const MIN_ACTION_BUDGET_MS = 30_000;
/** Upper clamp for the configurable action budget. */
const MAX_ACTION_BUDGET_MS = 140_000;
/** Safety margin subtracted from the budget when computing the remaining time. */
const ACTION_TIMEOUT_BUFFER_MS = 5_000;

/** Maximum number of deduplicated links persisted per enrichment run. */
const MAX_PERSISTED_LINKS = 120;
/** Maximum number of deduplicated email candidates persisted per enrichment run. */
const MAX_PERSISTED_EMAIL_CANDIDATES = 40;
/** MIME type of stored screenshots. */
const SCREENSHOT_MIME_TYPE = "image/png";

/** Byte cap applied when reading a response body in the browserless fallback. */
const MAX_BROWSERLESS_PAGE_BYTES = 750_000;
/** Character cap applied to anchor text extracted in the browserless fallback. */
const MAX_BROWSERLESS_LINK_TEXT_CHARS = 180;
/** Well-known paths tried by the browserless fallback in addition to discovered links. */
const BROWSERLESS_CRAWL_PATHS = ["/", "/kontakt", "/impressum", "/leistungen", "/ueber-uns"];
/** User-Agent header sent by the browserless fallback. */
const BROWSERLESS_USER_AGENT =
  "Mozilla/5.0 (compatible; WebDevPipelineBot/1.0; +https://webdev-pipeline.local)";

/** File holding the SHA-256 marker of the Chromium source that was last prepared. */
const CHROMIUM_SOURCE_MARKER_FILE = path.join(tmpdir(), "chromium-source.sha256");
/** Temp-dir location of the Chromium executable that is purged on source change. */
const CHROMIUM_EXECUTABLE_PATH = path.join(tmpdir(), "chromium");
/** Temp-dir location of the Chromium pack directory that is purged on source change. */
const CHROMIUM_PACK_PATH = path.join(tmpdir(), "chromium-pack");

/** Mailbox names (local parts) treated as generic, non-personal business addresses. */
const GENERIC_EMAIL_LOCALS = new Set<string>([
  "info", "kontakt", "contact", "sales", "team", "support", "service",
  "hello", "marketing", "admin", "office", "impressum", "post",
]);

/** Environment variables consulted, in priority order, for the Chromium source. */
const CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS = [
  "TASK8_BROWSER_ASSET_URL",
  "TASK8_CHROMIUM_EXECUTABLE_URL",
  "TASK8_CHROMIUM_EXECUTABLE",
];
|
|
|
|
/** Coarse classification of a crawled page. */
type EnrichmentPageKind =
  | "homepage"
  | "contact"
  | "impressum"
  | "services"
  | "about"
  | "team"
  | "other";

/** A normalized anchor found on a crawled page. */
type CrawlPageLink = {
  href: string;
  text: string;
  /** Result of comparing the link's domain against the crawl root. */
  isInternal: boolean;
};

/** A crawl link annotated with the page it was found on. */
type PersistedCrawlLink = CrawlPageLink & { pageUrl: string };

/** Everything extracted from a single crawled page. */
type PageResult = {
  sourceUrl: string;
  finalUrl: string;
  pageKind: EnrichmentPageKind;
  title: string;
  metaDescription: string;
  headings: string[];
  visibleText: string;
  links: CrawlPageLink[];
  emailCandidates: {
    email: string;
    emailSource: string;
    contactPerson: string | null;
    isBusinessContactAddress: boolean;
    isGeneric: boolean;
    sourceUrl: string;
    accepted: boolean;
    normalizedEmail: string;
  }[];
  hasContactFormSignal: boolean;
  hasContactCtaSignal: boolean;
};

/** Metadata describing a screenshot that was written to file storage. */
type StoredScreenshot = {
  storageId: Id<"_storage">;
  viewport: "desktop" | "mobile";
  sourceUrl: string;
  capturedAt: number;
  width: number;
  height: number;
  mimeType: string;
};

/** The subset of lead fields this module reads. */
type WebsiteLead = Pick<Doc<"leads">, "_id" | "websiteUrl" | "contactStatus">;

/** Result shape of starting an enrichment run. */
type StartedLead = { lead: WebsiteLead };

/** The parts of the serverless Chromium package this module relies on. */
type ServerlessChromiumModule = {
  args: string[];
  executablePath: (input?: string) => Promise<string>;
  inflate: (filePath: string) => Promise<string>;
  setupLambdaEnvironment: (baseLibPath: string) => void;
};

/** Anything with an async `close()`, e.g. a Playwright page, context or browser. */
type PlaywrightClosableResource = { close: () => Promise<unknown> };
|
|
|
|
/**
 * Produces a printable message for any thrown value.
 *
 * @param error - The caught value; may or may not be an `Error`.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function messageFromError(error: unknown) {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
|
|
|
/**
 * Tells whether an error message indicates that the Playwright page, context
 * or browser was already closed.
 *
 * @param error - The caught value.
 * @returns `true` when the message matches one of the known "closed" phrases
 *   (case-insensitive).
 */
function isPlaywrightTargetClosedError(error: unknown) {
  const closedTargetPattern =
    /Target page, context or browser has been closed|Target closed|Browser has been closed|Context has been closed|Page has been closed/i;
  return closedTargetPattern.test(messageFromError(error));
}
|
|
|
|
/**
 * Closes a Playwright resource without ever throwing.
 *
 * "Already closed" errors are ignored silently; any other close failure is
 * logged as a warning and then ignored.
 *
 * @param resource - The resource to close, or `null` when nothing was opened.
 * @param label - Human-readable name used in the warning log.
 */
async function closePlaywrightResourceSafely(
  resource: PlaywrightClosableResource | null,
  label: string,
) {
  if (resource) {
    try {
      await resource.close();
    } catch (closeError) {
      // A target that is already gone is the expected outcome of cleanup.
      if (!isPlaywrightTargetClosedError(closeError)) {
        console.warn(`Playwright cleanup ignored failed close for ${label}.`, {
          error: messageFromError(closeError),
        });
      }
    }
  }
}
|
|
|
|
/**
 * Reads a strictly positive integer from an environment variable.
 *
 * Only plain decimal digits (with an optional leading `+`) are accepted.
 * Values that `parseInt` would partially parse, such as `"60_000"`, `"12abc"`,
 * `"1e3"` or `"1.5"`, are rejected, so a typo cannot silently turn into a
 * much smaller number.
 *
 * @param key - Name of the environment variable.
 * @param fallback - Value returned when the variable is unset, blank,
 *   malformed, zero, or not a safe integer.
 * @returns The parsed positive integer, or `fallback`.
 */
function readPositiveIntEnv(key: string, fallback: number) {
  const raw = process.env[key]?.trim();
  if (!raw || !/^\+?\d+$/.test(raw)) {
    return fallback;
  }
  const parsed = Number(raw);
  return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : fallback;
}
|
|
|
|
/**
 * Per-page crawl timeout in milliseconds.
 *
 * @returns The value of TASK8_CRAWL_TIMEOUT_MS, or the built-in default.
 */
function crawlTimeoutMs() {
  const configuredTimeout = readPositiveIntEnv(
    "TASK8_CRAWL_TIMEOUT_MS",
    DEFAULT_CRAWL_TIMEOUT_MS,
  );
  return configuredTimeout;
}
|
|
|
|
/**
 * Number of pages to crawl per lead.
 *
 * TASK8_CRAWL_MAX_PAGES can only lower the page count: the result is clamped
 * to the range 1..DEFAULT_CRAWL_MAX_PAGES.
 *
 * @returns The effective page limit.
 */
function crawlMaxPages() {
  const requested = readPositiveIntEnv(
    "TASK8_CRAWL_MAX_PAGES",
    DEFAULT_CRAWL_MAX_PAGES,
  );
  const capped = Math.min(DEFAULT_CRAWL_MAX_PAGES, requested);
  return Math.max(1, capped);
}
|
|
|
|
/**
 * Total time budget for one enrichment action, in milliseconds.
 *
 * @returns TASK8_ACTION_BUDGET_MS (or the default) clamped to
 *   MIN_ACTION_BUDGET_MS..MAX_ACTION_BUDGET_MS.
 */
function actionBudgetMs() {
  const requested = readPositiveIntEnv(
    "TASK8_ACTION_BUDGET_MS",
    DEFAULT_ACTION_BUDGET_MS,
  );
  const capped = Math.min(MAX_ACTION_BUDGET_MS, requested);
  return Math.max(MIN_ACTION_BUDGET_MS, capped);
}
|
|
|
|
/**
 * Time left in the action budget after reserving the safety buffer.
 *
 * @param startedAt - Epoch milliseconds at which the action started.
 * @param budgetMs - Total budget of the action in milliseconds.
 * @returns Remaining milliseconds, never less than 1000 even when the budget
 *   is already used up.
 */
function remainingActionBudgetMs(startedAt: number, budgetMs: number) {
  const spentMs = Date.now() - startedAt;
  const leftMs = budgetMs - spentMs - ACTION_TIMEOUT_BUFFER_MS;
  return Math.max(1_000, leftMs);
}
|
|
|
|
/**
 * Races a promise against a timer.
 *
 * The underlying work is not cancelled on timeout; only the returned promise
 * rejects. The timer is always cleared once the race settles.
 *
 * @param promise - The work to wait for.
 * @param timeoutMs - Deadline in milliseconds (values below 1 are raised to 1).
 * @param label - Step name included in the timeout error message.
 * @returns The value `promise` resolves with.
 * @throws Error when the timer fires first; otherwise whatever `promise` rejects with.
 */
async function withActionTimeout<T>(
  promise: Promise<T>,
  timeoutMs: number,
  label: string,
): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(
      () => {
        reject(
          new Error(`Website-Enrichment Zeitbudget ueberschritten: ${label}.`),
        );
      },
      Math.max(1, timeoutMs),
    );
  });

  try {
    return await Promise.race([promise, deadline]);
  } finally {
    clearTimeout(timer);
  }
}
|
|
|
|
/**
 * Classifies a page by comparing its URL path with the crawl root and with
 * well-known path keywords.
 *
 * Both paths are compared in the same normalized form: percent-decoded
 * (`URL.pathname` percent-encodes non-ASCII characters, so an umlaut keyword
 * would otherwise never match), NFC-normalized, lowercased, and without
 * trailing slashes. Paths containing "team" are classified as "about".
 *
 * @param url - Absolute URL of the page to classify.
 * @param rootUrl - URL of the crawl root (the lead's homepage).
 * @returns The page kind, or "other" when either URL cannot be parsed or no
 *   keyword matches.
 */
function makePageKind(url: string, rootUrl: string): EnrichmentPageKind {
  const normalizedRoot = normalizeCrawlUrl(rootUrl);
  if (!normalizedRoot) {
    return "other";
  }

  // Reduces a URL to a comparable path; returns null for unparsable URLs.
  const toComparablePath = (rawUrl: string): string | null => {
    let pathname: string;
    try {
      pathname = new URL(rawUrl).pathname;
    } catch {
      return null;
    }
    try {
      // decodeURI keeps reserved characters such as an encoded slash intact.
      pathname = decodeURI(pathname);
    } catch {
      // Malformed percent-escapes: keep the raw pathname.
    }
    return pathname.normalize("NFC").toLowerCase().replace(/\/+$/, "") || "/";
  };

  const homepagePath = toComparablePath(normalizedRoot);
  const pagePath = toComparablePath(url);
  if (homepagePath === null || pagePath === null) {
    return "other";
  }

  if (pagePath === homepagePath) {
    return "homepage";
  }

  if (/(?:^|\/)(kontakt|contact)(?:[-/]|$)/.test(pagePath)) {
    return "contact";
  }
  if (/(?:^|\/)(impressum|imprint)(?:[-/]|$)/.test(pagePath)) {
    return "impressum";
  }
  if (/(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/.test(pagePath)) {
    return "services";
  }
  if (/(?:^|\/)(ueber|über|about|team)(?:[-/]|$)/.test(pagePath)) {
    return "about";
  }

  return "other";
}
|
|
|
|
/**
 * Builds a compact excerpt of page text.
 *
 * @param value - Raw text.
 * @returns The text with whitespace runs collapsed to single spaces, trimmed,
 *   and cut to at most 1200 characters.
 */
function trimExcerpt(value: string) {
  const collapsed = value.replace(/\s+/g, " ");
  const trimmed = collapsed.trim();
  return trimmed.substring(0, 1200);
}
|
|
|
|
/**
 * Checks whether an address uses a generic mailbox name such as "info".
 *
 * The part before the first "@" is lowercased and any "+suffix" is dropped
 * before the lookup in GENERIC_EMAIL_LOCALS.
 *
 * @param email - Email address to inspect.
 * @returns `true` when the mailbox name is in the generic list.
 */
function isGenericBusinessEmail(email: string) {
  const atIndex = email.indexOf("@");
  const localPart = (atIndex === -1 ? email : email.slice(0, atIndex)).toLowerCase();
  const plusIndex = localPart.indexOf("+");
  const mailbox = plusIndex === -1 ? localPart : localPart.slice(0, plusIndex);
  return GENERIC_EMAIL_LOCALS.has(mailbox);
}
|
|
|
|
/**
 * Converts the digits of a numeric character reference into a string.
 *
 * @param rawCode - The digits of the reference, without the leading "#"/"x".
 * @param radix - 10 for decimal references, 16 for hexadecimal ones.
 * @returns The decoded character, or "" when the value is not a valid Unicode
 *   scalar value (out of range, or a lone surrogate that would yield
 *   ill-formed text).
 */
function decodeHtmlCodePoint(rawCode: string, radix: number) {
  const codePoint = Number.parseInt(rawCode, radix);
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (
    !Number.isFinite(codePoint) ||
    codePoint < 0 ||
    codePoint > 0x10ffff ||
    isSurrogate
  ) {
    return "";
  }

  return String.fromCodePoint(codePoint);
}

/**
 * Decodes HTML character references and normalizes whitespace.
 *
 * Handles decimal and hexadecimal numeric references plus the named entities
 * nbsp, amp, lt, gt, quot and apos (case-insensitive). All references are
 * resolved in one pass, so the output of one replacement is never decoded a
 * second time (an escaped "amp" followed by "lt;" stays an escaped "lt").
 * Unknown named entities are left untouched.
 *
 * @param input - Text that may contain character references.
 * @returns Decoded text with whitespace runs collapsed and both ends trimmed.
 */
function decodeHtmlText(input: string) {
  // A Map (not an object literal) so names like "constructor" cannot hit
  // inherited properties.
  const namedEntities = new Map<string, string>([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);

  return input
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
      if (body.charAt(0) === "#") {
        const isHex = body.charAt(1).toLowerCase() === "x";
        return isHex
          ? decodeHtmlCodePoint(body.slice(2), 16)
          : decodeHtmlCodePoint(body.slice(1), 10);
      }
      return namedEntities.get(body.toLowerCase()) ?? entity;
    })
    .replace(/\s+/g, " ")
    .trim();
}
|
|
|
|
/**
 * Turns an HTML fragment into plain label text.
 *
 * Script and style elements are removed together with their content, every
 * remaining tag is replaced by a space, and the result is passed through
 * `decodeHtmlText`.
 *
 * @param input - HTML fragment.
 * @returns Plain text suitable for titles, headings and link labels.
 */
function stripHtmlForLabel(input: string) {
  const withoutScripts = input.replace(/<script[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, " ");
  const withoutTags = withoutStyles.replace(/<[^>]*>/g, " ");
  return decodeHtmlText(withoutTags);
}
|
|
|
|
/**
 * Reads one attribute value from the text of a start tag.
 *
 * The tag is scanned attribute by attribute instead of searching for the name
 * anywhere in the text. This means that:
 *  - `href` no longer matches inside `data-href="..."`,
 *  - text inside another attribute's quoted value (`title="href=x"`) is skipped,
 *  - the attribute name is compared as plain text, not interpolated into a regex.
 * The first attribute with the requested name wins; later duplicates are ignored.
 *
 * @param tag - A start tag (`<meta ...>`) or just its attribute list.
 * @param attribute - Attribute name to look up (case-insensitive).
 * @returns The value passed through `decodeHtmlText`, or "" when the attribute
 *   is missing or has no/an empty value.
 */
function getHtmlAttribute(tag: string, attribute: string) {
  const wantedName = attribute.toLowerCase();
  // name, optionally followed by = and a double-quoted, single-quoted or bare value
  const attributePattern =
    /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;

  for (const match of tag.matchAll(attributePattern)) {
    if ((match[1] ?? "").toLowerCase() !== wantedName) {
      continue;
    }
    const value = match[2] ?? match[3] ?? match[4];
    return value ? decodeHtmlText(value) : "";
  }

  return "";
}
|
|
|
|
/**
 * Returns the text content of the first `<tagName>...</tagName>` element.
 *
 * @param html - HTML document text.
 * @param tagName - Element name to look for (matched case-insensitively).
 * @returns The inner content reduced to plain text, or "" when the element is
 *   missing or empty.
 */
function extractFirstTagText(html: string, tagName: string) {
  const elementPattern = new RegExp(
    "<" + tagName + "\\b[^>]*>([\\s\\S]*?)<\\/" + tagName + ">",
    "i",
  );
  const innerHtml = html.match(elementPattern)?.[1];
  if (!innerHtml) {
    return "";
  }
  return stripHtmlForLabel(innerHtml);
}
|
|
|
|
/**
 * Finds the page description in the document's meta tags.
 *
 * Meta tags are inspected in document order; the first tag whose `name` (or,
 * failing that, `property`) is description, og:description or
 * twitter:description and whose `content` is non-empty wins.
 *
 * @param html - HTML document text.
 * @returns The description, or "" when none is present.
 */
function extractMetaDescriptionFromHtml(html: string) {
  const descriptionKey = /^(description|og:description|twitter:description)$/i;

  for (const metaTag of html.match(/<meta\b[^>]*>/gi) ?? []) {
    const key =
      getHtmlAttribute(metaTag, "name") || getHtmlAttribute(metaTag, "property");
    if (descriptionKey.test(key)) {
      const description = getHtmlAttribute(metaTag, "content");
      if (description) {
        return description;
      }
    }
  }

  return "";
}
|
|
|
|
/**
 * Collects the text of h1-h3 headings.
 *
 * @param html - HTML document text.
 * @returns Up to 12 non-empty heading texts in document order.
 */
function extractHeadingsFromHtml(html: string) {
  const headings: string[] = [];

  for (const match of html.matchAll(/<h[1-3]\b[^>]*>([\s\S]*?)<\/h[1-3]>/gi)) {
    const label = stripHtmlForLabel(match[1] ?? "");
    if (label.length > 0) {
      headings.push(label);
    }
    if (headings.length === 12) {
      break;
    }
  }

  return headings;
}
|
|
|
|
/**
 * Extracts anchors from raw HTML.
 *
 * @param html - HTML document text.
 * @param finalUrl - URL the document was served from; base for relative hrefs.
 * @param rootUrl - Crawl root used to decide whether a link is internal.
 * @returns One entry per anchor whose href normalizes successfully, with the
 *   label cut to MAX_BROWSERLESS_LINK_TEXT_CHARS characters.
 */
function extractAnchorLinksFromHtml(
  html: string,
  finalUrl: string,
  rootUrl: string,
) {
  const links: Array<{ href: string; text: string; isInternal: boolean }> = [];

  for (const anchor of html.matchAll(/<a\b([^>]*)>([\s\S]*?)<\/a>/gi)) {
    const rawHref = getHtmlAttribute(anchor[1] ?? "", "href");
    const absoluteHref = normalizeCrawlUrl(rawHref, finalUrl);
    if (!absoluteHref) {
      continue;
    }

    const label = stripHtmlForLabel(anchor[2] ?? "");
    links.push({
      href: absoluteHref,
      text: label.slice(0, MAX_BROWSERLESS_LINK_TEXT_CHARS),
      isInternal: isSameRegistrableHostishDomain(absoluteHref, rootUrl),
    });
  }

  return links;
}
|
|
|
|
/**
 * Builds the ordered list of URLs for the browserless fallback crawl.
 *
 * Order of preference: the root itself, subpages discovered from the homepage
 * links, then the well-known fallback paths. Every candidate is normalized
 * against the root and duplicates are dropped.
 *
 * @param rootUrl - Crawl root.
 * @param homepageLinks - Hrefs found on the homepage.
 * @param maxPages - Stop once this many targets were collected.
 * @returns Unique normalized URLs, or an empty list when the root is invalid.
 */
function makeBrowserlessCrawlTargets(
  rootUrl: string,
  homepageLinks: string[],
  maxPages: number,
) {
  const normalizedRoot = normalizeCrawlUrl(rootUrl);
  if (!normalizedRoot) {
    return [];
  }

  const discoveredUrls = discoverRelevantSubpageUrls(homepageLinks, normalizedRoot);

  const fallbackUrls: string[] = [];
  for (const pathname of BROWSERLESS_CRAWL_PATHS) {
    const fallbackUrl = normalizeCrawlUrl(pathname, normalizedRoot);
    if (fallbackUrl !== null) {
      fallbackUrls.push(fallbackUrl);
    }
  }

  // A Set keeps insertion order, so it serves as both the "seen" filter and
  // the ordered result.
  const orderedTargets = new Set<string>();
  for (const candidate of [normalizedRoot, ...discoveredUrls, ...fallbackUrls]) {
    const normalized = normalizeCrawlUrl(candidate, normalizedRoot);
    if (!normalized || orderedTargets.has(normalized)) {
      continue;
    }
    orderedTargets.add(normalized);
    if (orderedTargets.size >= maxPages) {
      break;
    }
  }

  return [...orderedTargets];
}
|
|
|
|
/**
 * Reads a response body as UTF-8 text, stopping after a byte limit.
 *
 * When the body is cut at the limit, a multi-byte character split by the cut
 * is dropped instead of being decoded to a trailing U+FFFD replacement
 * character.
 *
 * @param response - Fetch response whose body should be read.
 * @param signal - Optional abort signal, checked before each chunk is read.
 * @param maxBytes - Maximum number of body bytes to keep; defaults to
 *   MAX_BROWSERLESS_PAGE_BYTES.
 * @returns The decoded text, or "" when the response has no body.
 * @throws Error when `signal` is aborted between chunks; read errors from the
 *   stream propagate unchanged.
 */
async function readLimitedBrowserlessResponseText(
  response: Response,
  signal?: AbortSignal,
  maxBytes: number = MAX_BROWSERLESS_PAGE_BYTES,
) {
  if (!response.body) {
    return "";
  }

  // NaN falls back to 0; fractional limits are rounded down.
  const byteLimit = Math.max(0, Math.floor(maxBytes) || 0);
  const reader = response.body.getReader();
  const chunks: Uint8Array[] = [];
  let totalBytes = 0;
  let truncated = false;

  try {
    while (true) {
      if (signal?.aborted) {
        throw new Error("Website-Enrichment Fetch wurde abgebrochen.");
      }

      const { done, value } = await reader.read();
      if (done) {
        break;
      }

      if (!value) {
        continue;
      }

      // subarray avoids copying; the chunk is only read from afterwards.
      const nextChunk = value.subarray(0, Math.max(0, byteLimit - totalBytes));
      if (nextChunk.length > 0) {
        chunks.push(nextChunk);
        totalBytes += nextChunk.length;
      }

      if (totalBytes >= byteLimit) {
        truncated = true;
        await reader.cancel().catch(() => undefined);
        break;
      }
    }
  } finally {
    reader.releaseLock();
  }

  const output = new Uint8Array(totalBytes);
  let offset = 0;
  for (const chunk of chunks) {
    output.set(chunk, offset);
    offset += chunk.length;
  }

  // In streaming mode the decoder holds back an incomplete trailing byte
  // sequence rather than emitting U+FFFD for it.
  return new TextDecoder().decode(output, { stream: truncated });
}
|
|
|
|
/**
 * Downloads a page with plain `fetch`, following redirects.
 *
 * The timeout covers both the request and the body download. Responses with
 * a status of 400 or above, or with a content type that does not look
 * textual, are discarded.
 *
 * @param targetUrl - URL to request.
 * @param timeoutMs - Abort the request after this many milliseconds (minimum 1).
 * @returns `{ finalUrl, html, status }`, or `null` for discarded responses.
 * @throws Whatever `fetch` or the body reader rejects with (network error, abort).
 */
async function fetchBrowserlessPage(targetUrl: string, timeoutMs: number) {
  const abortController = new AbortController();
  const abortTimer = setTimeout(
    () => {
      abortController.abort();
    },
    Math.max(1, timeoutMs),
  );

  try {
    const response = await fetch(targetUrl, {
      headers: { "User-Agent": BROWSERLESS_USER_AGENT },
      redirect: "follow",
      signal: abortController.signal,
    });

    const contentType = response.headers.get("content-type") ?? "";
    const isErrorStatus = response.status >= 400;
    // A missing content type is given the benefit of the doubt.
    const isUnsupportedType =
      contentType !== "" && !/text|html|xml|xhtml/i.test(contentType);

    if (isErrorStatus || isUnsupportedType) {
      await response.body?.cancel().catch(() => undefined);
      return null;
    }

    const finalUrl =
      normalizeCrawlUrl(response.url || targetUrl, targetUrl) ?? targetUrl;
    const html = await readLimitedBrowserlessResponseText(
      response,
      abortController.signal,
    );

    return { finalUrl, html, status: response.status };
  } finally {
    clearTimeout(abortTimer);
  }
}
|
|
|
|
/**
 * Crawls a single page without a browser: fetches the HTML and extracts the
 * title, description, headings, links, contact signals and email candidates.
 *
 * @param targetUrl - URL to fetch.
 * @param rootUrl - Crawl root, used for page classification and link scoping.
 * @param timeoutMs - Fetch timeout in milliseconds.
 * @returns The page result, or `null` when the page was discarded or is blank.
 */
async function crawlPageWithoutBrowser(
  targetUrl: string,
  rootUrl: string,
  timeoutMs: number,
) {
  const fetched = await fetchBrowserlessPage(targetUrl, timeoutMs);
  if (fetched === null || fetched.html.trim() === "") {
    return null;
  }

  const { finalUrl, html } = fetched;
  const signals = extractContactSignalsFromHtmlLikeText(html);
  const links = extractAnchorLinksFromHtml(html, finalUrl, rootUrl);

  const emailCandidates: PageResult["emailCandidates"] = [];
  for (const entry of signals.emailCandidates) {
    const normalizedEmail = normalizeEmailAddress(entry.email);
    if (!normalizedEmail) {
      continue;
    }
    emailCandidates.push({
      email: normalizedEmail,
      emailSource: finalUrl,
      contactPerson: entry.contactPerson ?? null,
      isBusinessContactAddress: entry.isBusinessContactAddress,
      isGeneric: isGenericBusinessEmail(normalizedEmail),
      sourceUrl: finalUrl,
      // Acceptance is decided later, once candidates of all pages are known.
      accepted: false,
      normalizedEmail,
    });
  }

  return {
    sourceUrl: targetUrl,
    finalUrl,
    pageKind: makePageKind(finalUrl, rootUrl),
    title: extractFirstTagText(html, "title"),
    metaDescription: extractMetaDescriptionFromHtml(html),
    headings: extractHeadingsFromHtml(html),
    visibleText: signals.visibleText,
    links,
    emailCandidates,
    hasContactFormSignal: signals.hasContactFormSignal,
    hasContactCtaSignal: signals.hasContactCtaSignal,
  } satisfies PageResult;
}
|
|
|
|
/**
 * Lazily imports Playwright and the serverless Chromium package in parallel.
 *
 * @returns The `playwright-core` module plus the Chromium helpers this file
 *   uses, gathered into one `ServerlessChromiumModule` object.
 */
async function loadPlaywrightModules() {
  const [playwrightCore, chromiumPackage] = await Promise.all([
    import("playwright-core"),
    import("@sparticuz/chromium-min"),
  ]);

  const { default: chromium, inflate, setupLambdaEnvironment } = chromiumPackage;
  const serverlessChromium = {
    args: chromium.args,
    executablePath: chromium.executablePath,
    inflate,
    setupLambdaEnvironment,
  } as ServerlessChromiumModule;

  return { playwrightCore, serverlessChromium };
}
|
|
|
|
/**
 * Looks up the configured Chromium source.
 *
 * @returns The trimmed value of the first non-blank variable listed in
 *   CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS, or `null` when none is set.
 */
function getChromiumExecutableSource() {
  const configured = CHROMIUM_EXECUTABLE_SOURCE_ENV_VARS
    .map((key) => process.env[key]?.trim())
    .find((value): value is string => Boolean(value));

  return configured ?? null;
}
|
|
|
|
/**
 * Fingerprints a Chromium source string.
 *
 * @param source - The configured Chromium source.
 * @returns The hex-encoded SHA-256 digest of `source`.
 */
function getChromiumSourceMarker(source: string) {
  const hasher = createHash("sha256");
  hasher.update(source);
  return hasher.digest("hex");
}
|
|
|
|
/**
 * Removes the cached Chromium executable and pack directory when they were
 * prepared from a different source than the one currently configured.
 *
 * A missing or unreadable marker file counts as a mismatch.
 *
 * @param executableSource - The currently configured Chromium source.
 */
async function clearChromiumCacheForSourceMismatch(executableSource: string) {
  const expectedMarker = getChromiumSourceMarker(executableSource);

  let storedMarker: string | null;
  try {
    storedMarker = await readFile(CHROMIUM_SOURCE_MARKER_FILE, "utf8");
  } catch {
    storedMarker = null;
  }

  if ((storedMarker ?? "").trim() !== expectedMarker) {
    await Promise.all([
      rm(CHROMIUM_EXECUTABLE_PATH, { force: true, recursive: true }),
      rm(CHROMIUM_PACK_PATH, { force: true, recursive: true }),
    ]);
  }
}
|
|
|
|
/**
 * Resolves the Chromium executable for the configured source.
 *
 * Clears caches left over from a different source, asks the Chromium package
 * for the executable path, then records the source fingerprint in the marker
 * file.
 *
 * @param chromium - Serverless Chromium helpers.
 * @returns Path of the Chromium executable.
 * @throws Error when no Chromium source is configured.
 */
async function resolveChromiumExecutablePath(
  chromium: ServerlessChromiumModule,
) {
  const configuredSource = getChromiumExecutableSource();
  if (configuredSource === null) {
    throw new Error(
      `Set TASK8_BROWSER_ASSET_URL (or legacy TASK8_CHROMIUM_EXECUTABLE_URL / TASK8_CHROMIUM_EXECUTABLE) to configure the Chromium source; no source is configured.`,
    );
  }

  await clearChromiumCacheForSourceMismatch(configuredSource);
  const resolvedPath = await chromium.executablePath(configuredSource);

  // Written only after the executable resolved, so a failed attempt does not
  // leave a marker behind.
  const marker = getChromiumSourceMarker(configuredSource);
  await writeFile(CHROMIUM_SOURCE_MARKER_FILE, marker);

  return resolvedPath;
}
|
|
|
|
/**
 * Takes a full-page PNG screenshot of the homepage and stores it in file storage.
 *
 * Note that `width`/`height` in the result are the page's viewport size, not
 * the pixel size of the full-page image.
 *
 * @param ctx - Action context providing file storage.
 * @param context - Browser context (desktop or mobile) to open the page in.
 * @param homepageUrl - URL to capture.
 * @param viewport - Label recorded with the screenshot.
 * @param timeoutMs - Navigation timeout in milliseconds.
 * @returns Metadata of the stored screenshot.
 */
async function captureHomepageScreenshot(
  ctx: ActionCtx,
  context: BrowserContext,
  homepageUrl: string,
  viewport: "desktop" | "mobile",
  timeoutMs: number,
) {
  const page = await context.newPage();

  try {
    await page.goto(homepageUrl, {
      waitUntil: "domcontentloaded",
      timeout: timeoutMs,
    });
    const sourceUrl = page.url();

    const pngBytes = await page.screenshot({ fullPage: true, type: "png" });
    const imageBlob = new Blob([new Uint8Array(pngBytes)], {
      type: SCREENSHOT_MIME_TYPE,
    });
    const storageId = await ctx.storage.store(imageBlob);

    const { width, height } = page.viewportSize() ?? { width: 0, height: 0 };

    return {
      storageId,
      viewport,
      sourceUrl,
      capturedAt: Date.now(),
      width,
      height,
      mimeType: SCREENSHOT_MIME_TYPE,
    } satisfies StoredScreenshot;
  } finally {
    await closePlaywrightResourceSafely(page, "homepage screenshot page");
  }
}
|
|
|
|
/**
 * Crawls a single page in a Playwright browser context and extracts the
 * title, description, headings, visible text, links, contact signals and
 * email candidates.
 *
 * `sourceUrl` in the result is the URL that was requested and `finalUrl` the
 * URL the browser ended up on after redirects, matching the browserless
 * crawler. Previously both fields held the final URL, so the requested URL
 * was lost.
 *
 * @param context - Browser context to open the page in.
 * @param targetUrl - URL to navigate to.
 * @param rootUrl - Crawl root, used for page classification and link scoping.
 * @param timeoutMs - Navigation timeout in milliseconds.
 * @returns The page result, or `null` when navigation produced no response.
 */
async function crawlPage(
  context: BrowserContext,
  targetUrl: string,
  rootUrl: string,
  timeoutMs: number,
) {
  const page = await context.newPage();
  try {
    const response = await page.goto(targetUrl, {
      waitUntil: "domcontentloaded",
      timeout: timeoutMs,
    });
    if (!response) {
      return null;
    }

    const finalUrl = page.url();
    const title = await page.title().catch(() => "");
    const metaDescription = await page
      .evaluate(() => {
        const meta = document.querySelector(
          "meta[name='description']",
        ) as HTMLMetaElement | null;
        return meta?.content ?? "";
      })
      .catch(() => "");
    const content = await page.content();
    const signals = extractContactSignalsFromHtmlLikeText(content);
    const headings = await page
      .evaluate(() =>
        Array.from(document.querySelectorAll("h1, h2, h3"))
          .map((element) => element.textContent?.trim() ?? "")
          .filter((heading) => heading.length > 0),
      )
      .catch(() => []);
    const visibleText = await page.evaluate(() => {
      return document.body?.innerText ?? "";
    });
    const rawLinks = await page
      .evaluate(() =>
        Array.from(document.querySelectorAll("a[href]")).map((anchor) => ({
          href: anchor.getAttribute("href") ?? "",
          text: anchor.textContent?.trim() ?? "",
        })),
      )
      .catch(() => []);

    // Resolve hrefs against the final URL so relative links survive redirects.
    const normalizedLinks = rawLinks
      .map((link) => {
        const normalizedHref = normalizeCrawlUrl(link.href, finalUrl);
        if (!normalizedHref) {
          return null;
        }
        return {
          href: normalizedHref,
          text: link.text,
          isInternal: isSameRegistrableHostishDomain(normalizedHref, rootUrl),
        };
      })
      .filter(
        (entry): entry is { href: string; text: string; isInternal: boolean } =>
          entry !== null,
      );

    const emailCandidates = signals.emailCandidates
      .map((entry) => {
        const normalizedEmail = normalizeEmailAddress(entry.email);
        if (!normalizedEmail) {
          return null;
        }
        return {
          email: normalizedEmail,
          emailSource: finalUrl,
          contactPerson: entry.contactPerson ?? null,
          isBusinessContactAddress: entry.isBusinessContactAddress,
          isGeneric: isGenericBusinessEmail(normalizedEmail),
          sourceUrl: finalUrl,
          // Acceptance is decided later, once candidates of all pages are known.
          accepted: false,
          normalizedEmail,
        };
      })
      .filter((entry): entry is NonNullable<typeof entry> => entry !== null);

    return {
      // The requested URL; finalUrl carries the post-redirect location.
      sourceUrl: targetUrl,
      finalUrl,
      pageKind: makePageKind(targetUrl, rootUrl),
      title,
      metaDescription,
      headings,
      visibleText,
      links: normalizedLinks,
      emailCandidates,
      hasContactFormSignal: signals.hasContactFormSignal,
      hasContactCtaSignal: signals.hasContactCtaSignal,
    } satisfies PageResult;
  } finally {
    await closePlaywrightResourceSafely(page, "crawl page");
  }
}
|
|
|
|
/**
 * Drops repeated email candidates.
 *
 * @param candidates - Candidates from all crawled pages, in crawl order.
 * @returns The first candidate seen for each `normalizedEmail`, in the
 *   original order.
 */
function deduplicateLeadEmailCandidates(
  candidates: PageResult["emailCandidates"],
) {
  const seenEmails = new Set<string>();

  return candidates.filter((candidate) => {
    if (seenEmails.has(candidate.normalizedEmail)) {
      return false;
    }
    seenEmails.add(candidate.normalizedEmail);
    return true;
  });
}
|
|
|
|
/**
 * Drops repeated links.
 *
 * @param links - Links from all crawled pages, in crawl order.
 * @returns The first link seen for each `href`, in the original order.
 */
function deduplicateCrawlLinks(links: PersistedCrawlLink[]) {
  const seenHrefs = new Set<string>();
  const uniqueLinks: PersistedCrawlLink[] = [];

  for (const link of links) {
    if (seenHrefs.has(link.href)) {
      continue;
    }
    seenHrefs.add(link.href);
    uniqueLinks.push(link);
  }

  return uniqueLinks;
}
|
|
|
|
/**
 * Runs website enrichment for one lead using plain HTTP fetches, for
 * deployments where no Chromium source is configured.
 *
 * Steps, in order:
 *  1. log a warning event that the browserless fallback is used,
 *  2. crawl the homepage (a failure here aborts the run by throwing),
 *  3. crawl further targets, skipping URLs that were already visited,
 *  4. persist pages, links, email candidates and technical checks
 *     (no screenshots are taken in this mode),
 *  5. patch the lead with the usable contact email, or with a reason when
 *     none was found,
 *  6. queue the PageSpeed audit (a failure is logged as a warning only),
 *  7. mark the run as succeeded and log the closing event.
 *
 * Every crawl is bounded by both the per-page timeout and the remaining
 * action budget.
 *
 * @param ctx - Action context used to run mutations.
 * @param args - Run id, lead, crawl root, limits and budget bookkeeping.
 * @returns The id of the processed run.
 * @throws Error when the homepage cannot be loaded, a crawl exceeds the time
 *   budget, or a mutation outside the PageSpeed queueing step fails.
 */
async function processLeadEnrichmentWithoutBrowser(
  ctx: ActionCtx,
  args: {
    runId: Id<"agentRuns">;
    lead: WebsiteLead;
    rootUrl: string;
    timeoutMs: number;
    maxPages: number;
    actionStartedAt: number;
    actionBudget: number;
  },
): Promise<Id<"agentRuns">> {
  const { runId, lead, rootUrl, timeoutMs, maxPages } = args;

  const remainingBudget = () =>
    remainingActionBudgetMs(args.actionStartedAt, args.actionBudget);

  // Starts the crawl first, then computes the outer deadline.
  const crawlWithinBudget = (targetUrl: string, label: string) =>
    withActionTimeout(
      crawlPageWithoutBrowser(
        targetUrl,
        rootUrl,
        Math.min(timeoutMs, remainingBudget()),
      ),
      remainingBudget(),
      label,
    );

  await ctx.runMutation(internal.runs.appendEventInternal, {
    runId,
    level: "warning",
    message:
      "Chromium ist nicht konfiguriert; Website-Enrichment nutzt browserlosen Fetch-Fallback.",
    details: [{ label: "Lead", value: lead._id }],
  });

  const homepage = await crawlWithinBudget(rootUrl, "Homepage browserlos crawlen");
  if (!homepage) {
    throw new Error("Homepage konnte im browserlosen Fallback nicht geladen werden.");
  }

  const crawlTargets = makeBrowserlessCrawlTargets(
    rootUrl,
    homepage.links.map((link) => link.href),
    maxPages,
  );
  const crawledPages: PageResult[] = [homepage];

  // Final URLs already crawled, so redirects to a known page are not fetched twice.
  const visitedUrls = new Set<string>();
  const rememberVisited = (finalUrl: string) => {
    const normalizedFinalUrl = normalizeCrawlUrl(finalUrl, rootUrl);
    if (normalizedFinalUrl) {
      visitedUrls.add(normalizedFinalUrl);
    }
  };
  rememberVisited(homepage.finalUrl);

  // Index 0 is the root, which was crawled above.
  for (let index = 1; index < crawlTargets.length; index += 1) {
    const normalizedTarget = normalizeCrawlUrl(crawlTargets[index], rootUrl);
    if (!normalizedTarget || visitedUrls.has(normalizedTarget)) {
      continue;
    }

    const subpage = await crawlWithinBudget(
      normalizedTarget,
      `Unterseite browserlos crawlen: ${normalizedTarget}`,
    );
    if (!subpage) {
      continue;
    }
    crawledPages.push(subpage);
    rememberVisited(subpage.finalUrl);
  }

  const allLinks: PersistedCrawlLink[] = [];
  for (const page of crawledPages) {
    for (const link of page.links) {
      allLinks.push({ ...link, pageUrl: page.finalUrl });
    }
  }

  const technicalInput = buildTechnicalChecks({
    rootUrl,
    finalUrl: homepage.finalUrl,
    title: homepage.title,
    metaDescription: homepage.metaDescription,
    visibleText: homepage.visibleText,
    checkedUrls: crawledPages.map((page) => page.finalUrl),
    links: allLinks.map((link) => link.href),
  });

  const validCandidates = deduplicateLeadEmailCandidates(
    crawledPages.flatMap((page) => page.emailCandidates),
  );
  const persistedLinks = deduplicateCrawlLinks(allLinks).slice(0, MAX_PERSISTED_LINKS);
  const persistedCandidates = validCandidates.slice(0, MAX_PERSISTED_EMAIL_CANDIDATES);

  // The usable email is picked from all candidates, not just the persisted ones.
  const usable = getUsableContactEmailFromEntries(
    validCandidates.map((candidate) => ({
      email: candidate.email,
      emailSource: candidate.emailSource,
      contactPerson: candidate.contactPerson,
      isBusinessContactAddress: candidate.isBusinessContactAddress,
    })),
  );

  const pagesPayload = crawledPages.map((page) => ({
    sourceUrl: page.sourceUrl,
    finalUrl: page.finalUrl,
    pageKind: page.pageKind,
    title: page.title,
    metaDescription: page.metaDescription,
    headings: page.headings,
    visibleTextExcerpt: trimExcerpt(page.visibleText),
    hasContactFormSignal: page.hasContactFormSignal,
    hasContactCtaSignal: page.hasContactCtaSignal,
  }));
  const linksPayload = persistedLinks.map((link) => ({
    pageUrl: link.pageUrl,
    href: link.href,
    text: link.text,
    isInternal: link.isInternal,
  }));
  const emailCandidatesPayload = persistedCandidates.map((candidate) => ({
    email: candidate.email,
    normalizedEmail: candidate.normalizedEmail,
    emailSource: candidate.emailSource,
    sourceUrl: candidate.sourceUrl,
    contactPerson: candidate.contactPerson ?? undefined,
    isBusinessContactAddress: candidate.isBusinessContactAddress,
    isGeneric: candidate.isGeneric,
    accepted: usable !== null && candidate.normalizedEmail === usable.email,
  }));

  await ctx.runMutation(internal.websiteEnrichment.persistLeadEnrichmentResult, {
    runId,
    leadId: lead._id,
    pages: pagesPayload,
    links: linksPayload,
    emailCandidates: emailCandidatesPayload,
    screenshots: [],
    technicalChecks: [
      {
        sourceUrl: homepage.sourceUrl,
        finalUrl: homepage.finalUrl,
        usesHttps: technicalInput.https,
        missingTitle: technicalInput.missingTitle,
        missingMetaDescription: technicalInput.missingMetaDescription,
        hasVisibleContactPath: technicalInput.hasVisibleContactPath,
        brokenInternalLinkCount: technicalInput.brokenInternalLinks.length,
      },
    ],
  });

  if (!usable) {
    await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
      leadId: lead._id,
      currentContactStatus: lead.contactStatus,
      contactStatusReason:
        "Browserloses Website-Enrichment abgeschlossen, aber kein verwertbarer Kontakt gefunden.",
    });
  } else {
    await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
      leadId: lead._id,
      email: usable.email,
      emailSource: usable.emailSource ?? undefined,
      contactPerson: usable.contactPerson ?? undefined,
      currentContactStatus: lead.contactStatus,
    });
  }

  // Queueing the PageSpeed audit is best-effort; a failure must not fail the run.
  try {
    await ctx.runMutation(internal.pageSpeed.queueLeadPageSpeedAudit, {
      leadId: lead._id,
      parentRunId: runId,
    });
  } catch (pageSpeedQueueError) {
    await ctx.runMutation(internal.runs.appendEventInternal, {
      runId,
      level: "warning",
      message: "PageSpeed-Analyse konnte nicht in die Warteschlange gesetzt werden.",
      details: [
        { label: "Lead", value: lead._id },
        {
          label: "Fehler",
          value: messageFromError(pageSpeedQueueError),
          source: "pagespeed_queue",
        },
      ],
    });
  }

  await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
    runId,
    status: "succeeded",
    currentStep: "website_enrichment",
    errors: 0,
  });

  const closingMessage = usable
    ? "Website-Enrichment browserlos mit nutzbarer E-Mail abgeschlossen."
    : "Website-Enrichment browserlos abgeschlossen, aber ohne nutzbare E-Mail.";
  await ctx.runMutation(internal.runs.appendEventInternal, {
    runId,
    level: "info",
    message: closingMessage,
  });

  return runId;
}
|
|
|
|
export const processLeadEnrichment = internalAction({
|
|
args: { runId: v.id("agentRuns") },
|
|
handler: async (ctx, args): Promise<Id<"agentRuns"> | null> => {
|
|
let started: StartedLead | null = null;
|
|
const runId = args.runId;
|
|
const actionStartedAt = Date.now();
|
|
const actionBudget = actionBudgetMs();
|
|
let browser: Browser | null = null;
|
|
let desktopContext: BrowserContext | null = null;
|
|
let mobileContext: BrowserContext | null = null;
|
|
|
|
try {
|
|
started = await ctx.runMutation(internal.websiteEnrichment.startLeadEnrichmentRun, {
|
|
runId,
|
|
});
|
|
|
|
if (!started) {
|
|
return null;
|
|
}
|
|
|
|
const rootUrl = normalizeCrawlUrl(started.lead.websiteUrl);
|
|
if (!rootUrl) {
|
|
try {
|
|
await ctx.runMutation(internal.pageSpeed.queueLeadPageSpeedAudit, {
|
|
leadId: started.lead._id,
|
|
parentRunId: runId,
|
|
});
|
|
} catch (pageSpeedQueueError) {
|
|
await ctx.runMutation(internal.runs.appendEventInternal, {
|
|
runId,
|
|
level: "warning",
|
|
message: "PageSpeed-Analyse konnte nicht in die Warteschlange gesetzt werden.",
|
|
details: [
|
|
{ label: "Lead", value: started.lead._id },
|
|
{
|
|
label: "Fehler",
|
|
value: messageFromError(pageSpeedQueueError),
|
|
source: "pagespeed_queue",
|
|
},
|
|
],
|
|
});
|
|
}
|
|
|
|
await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
|
|
runId,
|
|
status: "failed",
|
|
currentStep: "website_enrichment",
|
|
errorSummary: "Ungültige Website-URL.",
|
|
errors: 1,
|
|
});
|
|
await ctx.runMutation(internal.runs.appendEventInternal, {
|
|
runId,
|
|
level: "error",
|
|
message: "Website-Enrichment fehlgeschlagen: Ungültige Website-URL.",
|
|
details: [{ label: "Lead", value: started.lead._id }],
|
|
});
|
|
await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
|
|
leadId: started.lead._id,
|
|
currentContactStatus: started.lead.contactStatus,
|
|
contactStatusReason:
|
|
"Website-Enrichment fehlgeschlagen: Ungültige Website-URL.",
|
|
});
|
|
return null;
|
|
}
|
|
|
|
const timeoutMs = crawlTimeoutMs();
|
|
const maxPages = crawlMaxPages();
|
|
|
|
if (!getChromiumExecutableSource()) {
|
|
return await processLeadEnrichmentWithoutBrowser(ctx, {
|
|
runId,
|
|
lead: started.lead,
|
|
rootUrl,
|
|
timeoutMs,
|
|
maxPages,
|
|
actionStartedAt,
|
|
actionBudget,
|
|
});
|
|
}
|
|
|
|
const { playwrightCore, serverlessChromium } =
|
|
await withActionTimeout(
|
|
loadPlaywrightModules(),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Playwright-Module laden",
|
|
);
|
|
const executablePath = await withActionTimeout(
|
|
resolveChromiumExecutablePath(serverlessChromium),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Chromium executable vorbereiten",
|
|
);
|
|
|
|
const prepareChromiumSharedLibraries = async (
|
|
chromiumRuntime: ServerlessChromiumModule,
|
|
) => {
|
|
const runtimeArchivePath = path.join(
|
|
CHROMIUM_PACK_PATH,
|
|
"al2023.tar.br",
|
|
);
|
|
await access(runtimeArchivePath).catch(() => {
|
|
throw new Error(
|
|
`AL2023 shared library archive not found at ${runtimeArchivePath}; cannot prepare Chromium shared libraries.`,
|
|
);
|
|
});
|
|
|
|
await chromiumRuntime.inflate(runtimeArchivePath);
|
|
chromiumRuntime.setupLambdaEnvironment(path.join(tmpdir(), "al2023", "lib"));
|
|
};
|
|
|
|
await withActionTimeout(
|
|
prepareChromiumSharedLibraries(serverlessChromium),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Chromium-Bibliotheken vorbereiten",
|
|
);
|
|
browser = await withActionTimeout(
|
|
playwrightCore.chromium.launch({
|
|
headless: true,
|
|
executablePath,
|
|
args: serverlessChromium.args,
|
|
timeout: remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
}),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Chromium starten",
|
|
);
|
|
const { devices } = playwrightCore;
|
|
desktopContext = await withActionTimeout(
|
|
browser.newContext({
|
|
...devices["Desktop Chrome"],
|
|
}),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Desktop-Kontext erstellen",
|
|
);
|
|
mobileContext = await withActionTimeout(
|
|
browser.newContext({
|
|
...devices["iPhone 11"],
|
|
}),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Mobile-Kontext erstellen",
|
|
);
|
|
|
|
const homepage = await withActionTimeout(
|
|
crawlPage(
|
|
desktopContext,
|
|
rootUrl,
|
|
rootUrl,
|
|
Math.min(
|
|
timeoutMs,
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
),
|
|
),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Homepage crawlen",
|
|
);
|
|
if (!homepage) {
|
|
throw new Error("Homepage konnte nicht geladen werden.");
|
|
}
|
|
|
|
const requestedPages = discoverRelevantSubpageUrls(
|
|
homepage.links.map((link) => link.href),
|
|
rootUrl,
|
|
);
|
|
const crawlTargets = requestedPages.slice(0, maxPages);
|
|
const crawledPages: PageResult[] = [homepage];
|
|
|
|
for (const pageUrl of crawlTargets.slice(1)) {
|
|
const crawled = await withActionTimeout(
|
|
crawlPage(
|
|
desktopContext,
|
|
pageUrl,
|
|
rootUrl,
|
|
Math.min(
|
|
timeoutMs,
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
),
|
|
),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
`Unterseite crawlen: ${pageUrl}`,
|
|
);
|
|
if (crawled) {
|
|
crawledPages.push(crawled);
|
|
}
|
|
}
|
|
|
|
const allLinks: PersistedCrawlLink[] = crawledPages.flatMap((page) =>
|
|
page.links.map((link) => ({
|
|
...link,
|
|
pageUrl: page.finalUrl,
|
|
})),
|
|
);
|
|
const internalLinks = allLinks.filter((link) => link.isInternal);
|
|
const uniqueInternalLinks = [...new Set(internalLinks.map((link) => link.href))];
|
|
|
|
const checkMap = new Map<
|
|
string,
|
|
{ status: number | null; isBroken: boolean }
|
|
>();
|
|
|
|
for (const href of uniqueInternalLinks.slice(0, 30)) {
|
|
try {
|
|
const response = await desktopContext.request.get(href, {
|
|
timeout: Math.min(
|
|
Math.max(1_000, timeoutMs - 1_000),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
),
|
|
});
|
|
const status = response.status();
|
|
checkMap.set(href, {
|
|
status,
|
|
isBroken: status < 200 || status >= 400,
|
|
});
|
|
} catch {
|
|
checkMap.set(href, {
|
|
status: null,
|
|
isBroken: true,
|
|
});
|
|
}
|
|
}
|
|
|
|
const desktopScreenshot = await withActionTimeout(
|
|
captureHomepageScreenshot(
|
|
ctx,
|
|
desktopContext,
|
|
homepage.finalUrl,
|
|
"desktop",
|
|
Math.min(
|
|
timeoutMs,
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
),
|
|
),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Desktop-Screenshot erfassen",
|
|
);
|
|
const mobileScreenshot = await withActionTimeout(
|
|
captureHomepageScreenshot(
|
|
ctx,
|
|
mobileContext,
|
|
homepage.finalUrl,
|
|
"mobile",
|
|
Math.min(
|
|
timeoutMs,
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
),
|
|
),
|
|
remainingActionBudgetMs(actionStartedAt, actionBudget),
|
|
"Mobile-Screenshot erfassen",
|
|
);
|
|
|
|
const technicalInput = buildTechnicalChecks({
|
|
rootUrl,
|
|
finalUrl: homepage.finalUrl,
|
|
title: homepage.title,
|
|
metaDescription: homepage.metaDescription,
|
|
visibleText: homepage.visibleText,
|
|
checkedUrls: crawledPages.map((page) => page.finalUrl),
|
|
links: allLinks.map((link) => {
|
|
const check = checkMap.get(link.href);
|
|
return {
|
|
href: link.href,
|
|
status: check?.status ?? undefined,
|
|
statusCode: check?.status ?? undefined,
|
|
isBroken: check?.isBroken,
|
|
};
|
|
}),
|
|
});
|
|
|
|
const validCandidates = deduplicateLeadEmailCandidates(
|
|
crawledPages.flatMap((page) => page.emailCandidates),
|
|
);
|
|
const persistedLinks = deduplicateCrawlLinks(allLinks).slice(
|
|
0,
|
|
MAX_PERSISTED_LINKS,
|
|
);
|
|
const persistedCandidates = validCandidates.slice(
|
|
0,
|
|
MAX_PERSISTED_EMAIL_CANDIDATES,
|
|
);
|
|
const usable = getUsableContactEmailFromEntries(
|
|
validCandidates.map((candidate) => ({
|
|
email: candidate.email,
|
|
emailSource: candidate.emailSource,
|
|
contactPerson: candidate.contactPerson,
|
|
isBusinessContactAddress: candidate.isBusinessContactAddress,
|
|
})),
|
|
);
|
|
|
|
await ctx.runMutation(internal.websiteEnrichment.persistLeadEnrichmentResult, {
|
|
runId,
|
|
leadId: started.lead._id,
|
|
pages: crawledPages.map((page) => ({
|
|
sourceUrl: page.sourceUrl,
|
|
finalUrl: page.finalUrl,
|
|
pageKind: page.pageKind,
|
|
title: page.title,
|
|
metaDescription: page.metaDescription,
|
|
headings: page.headings,
|
|
visibleTextExcerpt: trimExcerpt(page.visibleText),
|
|
hasContactFormSignal: page.hasContactFormSignal,
|
|
hasContactCtaSignal: page.hasContactCtaSignal,
|
|
})),
|
|
links: persistedLinks.map((link) => ({
|
|
pageUrl: link.pageUrl,
|
|
href: link.href,
|
|
text: link.text,
|
|
isInternal: link.isInternal,
|
|
isBroken: checkMap.get(link.href)?.isBroken,
|
|
})),
|
|
emailCandidates: persistedCandidates.map((candidate) => ({
|
|
email: candidate.email,
|
|
normalizedEmail: candidate.normalizedEmail,
|
|
emailSource: candidate.emailSource,
|
|
sourceUrl: candidate.sourceUrl,
|
|
contactPerson: candidate.contactPerson ?? undefined,
|
|
isBusinessContactAddress: candidate.isBusinessContactAddress,
|
|
isGeneric: candidate.isGeneric,
|
|
accepted:
|
|
usable !== null && candidate.normalizedEmail === usable.email,
|
|
})),
|
|
screenshots: [
|
|
...(desktopScreenshot ? [desktopScreenshot] : []),
|
|
...(mobileScreenshot ? [mobileScreenshot] : []),
|
|
],
|
|
technicalChecks: [
|
|
{
|
|
sourceUrl: homepage.sourceUrl,
|
|
finalUrl: homepage.finalUrl,
|
|
usesHttps: technicalInput.https,
|
|
missingTitle: technicalInput.missingTitle,
|
|
missingMetaDescription: technicalInput.missingMetaDescription,
|
|
hasVisibleContactPath: technicalInput.hasVisibleContactPath,
|
|
brokenInternalLinkCount: technicalInput.brokenInternalLinks.length,
|
|
},
|
|
],
|
|
});
|
|
|
|
if (usable) {
|
|
await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
|
|
leadId: started.lead._id,
|
|
email: usable.email,
|
|
emailSource: usable.emailSource ?? undefined,
|
|
contactPerson: usable.contactPerson ?? undefined,
|
|
currentContactStatus: started.lead.contactStatus,
|
|
});
|
|
} else {
|
|
await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
|
|
leadId: started.lead._id,
|
|
currentContactStatus: started.lead.contactStatus,
|
|
contactStatusReason:
|
|
"Kein verwertbarer Kontakt auf der Website gefunden.",
|
|
});
|
|
}
|
|
|
|
try {
|
|
await ctx.runMutation(internal.pageSpeed.queueLeadPageSpeedAudit, {
|
|
leadId: started.lead._id,
|
|
parentRunId: runId,
|
|
});
|
|
} catch (pageSpeedQueueError) {
|
|
await ctx.runMutation(internal.runs.appendEventInternal, {
|
|
runId,
|
|
level: "warning",
|
|
message: "PageSpeed-Analyse konnte nicht in die Warteschlange gesetzt werden.",
|
|
details: [
|
|
{ label: "Lead", value: started.lead._id },
|
|
{
|
|
label: "Fehler",
|
|
value: messageFromError(pageSpeedQueueError),
|
|
source: "pagespeed_queue",
|
|
},
|
|
],
|
|
});
|
|
}
|
|
|
|
await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
|
|
runId,
|
|
status: "succeeded",
|
|
currentStep: "website_enrichment",
|
|
errors: 0,
|
|
});
|
|
|
|
await ctx.runMutation(internal.runs.appendEventInternal, {
|
|
runId,
|
|
level: "info",
|
|
message: usable
|
|
? "Website-Enrichment erfolgreich mit nutzbarer E-Mail abgeschlossen."
|
|
: "Website-Enrichment abgeschlossen, aber ohne nutzbare E-Mail.",
|
|
});
|
|
|
|
return runId;
|
|
|
|
} catch (error) {
|
|
const errorSummary = messageFromError(error);
|
|
|
|
await ctx.runMutation(internal.websiteEnrichment.finishLeadEnrichmentRun, {
|
|
runId,
|
|
status: "failed",
|
|
currentStep: "website_enrichment",
|
|
errorSummary,
|
|
errors: 1,
|
|
});
|
|
|
|
await ctx.runMutation(internal.runs.appendEventInternal, {
|
|
runId,
|
|
level: "error",
|
|
message: "Website-Enrichment fehlgeschlagen.",
|
|
details: [
|
|
{ label: "Fehler", value: errorSummary, source: "website_enrichment" },
|
|
],
|
|
});
|
|
|
|
if (started) {
|
|
try {
|
|
await ctx.runMutation(internal.pageSpeed.queueLeadPageSpeedAudit, {
|
|
leadId: started.lead._id,
|
|
parentRunId: runId,
|
|
});
|
|
} catch (pageSpeedQueueError) {
|
|
await ctx.runMutation(internal.runs.appendEventInternal, {
|
|
runId,
|
|
level: "warning",
|
|
message: "PageSpeed-Analyse konnte nicht in die Warteschlange gesetzt werden.",
|
|
details: [
|
|
{ label: "Lead", value: started.lead._id },
|
|
{
|
|
label: "Fehler",
|
|
value: messageFromError(pageSpeedQueueError),
|
|
source: "pagespeed_queue",
|
|
},
|
|
],
|
|
});
|
|
}
|
|
await ctx.runMutation(internal.websiteEnrichment.patchLeadFromWebsiteEnrichment, {
|
|
leadId: started.lead._id,
|
|
currentContactStatus: started.lead.contactStatus,
|
|
contactStatusReason: `Website-Enrichment fehlgeschlagen: ${errorSummary}`,
|
|
});
|
|
}
|
|
|
|
return null;
|
|
} finally {
|
|
if (desktopContext) {
|
|
await closePlaywrightResourceSafely(
|
|
desktopContext,
|
|
"desktop browser context",
|
|
);
|
|
}
|
|
if (mobileContext) {
|
|
await closePlaywrightResourceSafely(
|
|
mobileContext,
|
|
"mobile browser context",
|
|
);
|
|
}
|
|
if (browser) {
|
|
await closePlaywrightResourceSafely(browser, "browser");
|
|
}
|
|
}
|
|
},
|
|
});
|