feat: add website enrichment crawler

This commit is contained in:
2026-06-04 20:29:23 +02:00
parent ca42c8d5a6
commit 1f6e31c01c
25 changed files with 3539 additions and 56 deletions

605
lib/website-crawler.ts Normal file
View File

@@ -0,0 +1,605 @@
import { normalizeEmailAddress } from "./lead-discovery-google";
/** URL protocols accepted by normalizeCrawlUrl; any other protocol makes it return null. */
const HTTP_SCHEMES = new Set(["http:", "https:"]);
/**
 * Path patterns that mark a sub-page as worth crawling.
 * Order matters: discoverRelevantSubpageUrls uses the array index as the
 * priority bucket, and the first pattern that matches a path wins.
 */
const RELEVANT_PATH_PATTERNS = [
// contact pages
/(?:^|\/)(kontakt|contact)(?:[-/]|$)/i,
// imprint / legal notice pages
/(?:^|\/)(impressum|imprint)(?:[-/]|$)/i,
// service / offer pages
/(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/i,
// about / team pages
/(?:^|\/)(ueber|über|team|about)(?:[-/]|$)/i,
];
/**
 * Keywords that hasBusinessContactContext looks for (as plain substrings of
 * the lower-cased text) around an e-mail occurrence. Entries must therefore
 * be lower-case themselves.
 */
const CONTACT_CONTEXT_KEYWORDS = [
"ansprechpartner",
"kontakt",
"e-mail",
"email",
"team",
"impressum",
"geschäftsführung",
"imprint",
"footer",
"anfrage",
];
/**
 * Local parts (the text before "@") that collectObfuscatedEmailCandidates
 * accepts for loosely obfuscated addresses (bare "at" / "dot" / "punkt")
 * even when no contact keyword is found nearby.
 */
const GENERIC_BUSINESS_LOCALS = new Set([
"info",
"kontakt",
"contact",
"office",
"hello",
"sales",
"support",
"service",
"team",
"post",
]);
/** One e-mail address found in page markup, plus what the extractor inferred about it. */
export type WebsiteCrawlEmailCandidate = {
/** Address in the form returned by normalizeEmailAddress. */
email: string;
/** Origin of the address; the extraction helpers visible in this file always set it to null. */
emailSource: string | null;
/** Name or label associated with the address (mailto link text or a nearby name), or null. */
contactPerson: string | null;
/** True when a contact-related keyword was found close to the address in the source text. */
isBusinessContactAddress: boolean;
};
/** Result of extractContactSignalsFromHtmlLikeText. */
export type WebsiteCrawlContactSignals = {
/** Input with script/style blocks and tags removed and whitespace collapsed. */
visibleText: string;
/** Phone-like matches from the visible text; each hit is stored in raw and in digits-only ("+" kept) form. */
phoneNumbers: string[];
/** De-duplicated e-mail candidates (mailto links, plain addresses, obfuscated addresses). */
emailCandidates: WebsiteCrawlEmailCandidate[];
/** True when the markup mentions a contact/inquiry form or contains a <form> tag. */
hasContactFormSignal: boolean;
/** True when the markup contains a contact call-to-action phrase or a <form> tag. */
hasContactCtaSignal: boolean;
};
/** Raw crawl observations consumed by buildTechnicalChecks. Every field is optional. */
export type TechnicalChecksInput = {
/** Start URL of the crawl; used as the base for resolving the other URLs. */
rootUrl?: string | null;
/** URL that was finally loaded (presumably after redirects - confirm with the caller). */
finalUrl?: string | null;
/** Page title; blank or missing is reported as missingTitle. */
title?: string | null;
/** Meta description; blank or missing is reported as missingMetaDescription. */
metaDescription?: string | null;
/** Visible page text, scanned for contact hints. */
visibleText?: string | null;
/** When this yields at least one same-host URL, only links contained in it are evaluated as broken. */
checkedUrls?: string[];
/**
 * Links found on the page. Plain strings carry no status and are never
 * reported as broken; object entries are broken when isBroken is true or
 * when status / statusCode is >= 400 or <= 0.
 */
links?: Array<
| string
| {
href?: string;
status?: number;
statusCode?: number;
isBroken?: boolean;
}
>;
};
/** Output of buildTechnicalChecks. */
export type WebsiteTechnicalChecks = {
/** True when finalUrl starts with "https://". */
https: boolean;
/** Normalized final URL, or "" when neither finalUrl nor rootUrl could be normalized. */
finalUrl: string;
/** True when the trimmed title is empty. */
missingTitle: boolean;
/** True when the trimmed meta description is empty. */
missingMetaDescription: boolean;
/** True when the visible text or the final URL hints at a contact-related page. */
hasVisibleContactPath: boolean;
/** Canonical (queue-normalized) URLs of same-host links that were flagged as broken. */
brokenInternalLinks: string[];
};
/** Drops a single leading "www." label (any letter case) from a host name. */
function stripWww(host: string) {
  const leadingWww = /^www\./i;
  return host.replace(leadingWww, "");
}
/**
 * Returns the lower-cased host name of an absolute URL.
 * Anything the URL constructor rejects yields the empty string.
 */
function toLowerHost(value: string) {
  let parsed: URL;
  try {
    parsed = new URL(value);
  } catch {
    return "";
  }
  return parsed.hostname.toLowerCase();
}
/**
 * Normalizes a link into `protocol//host[:port]path[?query]`.
 *
 * The host is lower-cased and loses a leading "www."; the fragment is dropped.
 *
 * @param input Raw link text; null, undefined and blank values yield null.
 * @param base Optional absolute URL used to resolve relative links. Without
 *   a base only links containing "://" (and not starting with "//") are accepted.
 * @returns The normalized URL, or null when the link cannot be parsed or
 *   does not use an http(s) protocol.
 */
export function normalizeCrawlUrl(input?: string | null, base?: string) {
  const trimmed = input?.trim();
  if (!trimmed) {
    return null;
  }

  // Relative and protocol-relative links need a base to resolve against.
  const looksAbsolute = !trimmed.startsWith("//") && trimmed.includes("://");
  if (!base && !looksAbsolute) {
    return null;
  }

  let url: URL;
  try {
    url = new URL(trimmed, base);
  } catch {
    return null;
  }
  if (!HTTP_SCHEMES.has(url.protocol)) {
    return null;
  }

  const host = stripWww(url.hostname.toLowerCase());
  const portSuffix = url.port ? `:${url.port}` : "";
  const pathname = url.pathname || "/";
  return `${url.protocol}//${host}${portSuffix}${pathname}${url.search}`;
}
/**
 * Tells whether a link points at the same host as the crawl root.
 *
 * Hosts are compared after normalization, lower-casing and removal of a
 * leading "www."; other sub-domains count as different hosts.
 *
 * @param candidateUrl Link to test; relative links are resolved against the root.
 * @param rootUrl Absolute crawl root.
 * @returns false when either URL cannot be normalized.
 */
export function isSameRegistrableHostishDomain(
  candidateUrl: string,
  rootUrl: string,
) {
  const normalizedRoot = normalizeCrawlUrl(rootUrl);
  if (!normalizedRoot) {
    return false;
  }
  const normalizedCandidate = normalizeCrawlUrl(candidateUrl, normalizedRoot);
  if (!normalizedCandidate) {
    return false;
  }
  const rootHost = stripWww(toLowerHost(normalizedRoot));
  const candidateHost = stripWww(toLowerHost(normalizedCandidate));
  return candidateHost.length > 0 && candidateHost === rootHost;
}
/**
 * Builds the key used to de-duplicate URLs: `protocol//host[:port]path`.
 * The host is lower-cased without a leading "www.", one trailing slash is
 * removed from the path (the root path stays "/"), and query and fragment
 * are dropped.
 *
 * @returns The key, or null for empty or unparsable input.
 */
function normalizeForQueue(value: string | null) {
  if (!value) {
    return null;
  }
  try {
    const { protocol, hostname, port, pathname } = new URL(value);
    const portSuffix = port ? `:${port}` : "";
    const trimmedPath = pathname.replace(/\/$/, "") || "/";
    return `${protocol}//${stripWww(hostname.toLowerCase())}${portSuffix}${trimmedPath}`;
  } catch {
    return null;
  }
}
/**
 * Picks the pages worth crawling for a site: the homepage plus at most one
 * same-host link per entry of RELEVANT_PATH_PATTERNS, in pattern order.
 *
 * @param links Raw hrefs found on the site; relative links are resolved against rootUrl.
 * @param rootUrl Absolute URL of the site.
 * @returns Up to five canonical URLs, homepage first; an empty array when
 *   rootUrl cannot be normalized.
 */
export function discoverRelevantSubpageUrls(links: string[], rootUrl: string) {
  const root = normalizeCrawlUrl(rootUrl);
  if (!root) {
    return [];
  }
  const parsedRoot = new URL(root);
  const homepage = `${parsedRoot.protocol}//${stripWww(
    parsedRoot.hostname.toLowerCase(),
  )}${parsedRoot.port ? `:${parsedRoot.port}` : ""}/`;

  const seen = new Set<string>([homepage]);
  // One bucket per pattern; the bucket index is the pattern's priority.
  const buckets: string[][] = RELEVANT_PATH_PATTERNS.map(() => []);

  for (const link of links) {
    const normalized = normalizeCrawlUrl(link, rootUrl);
    if (!normalized || !isSameRegistrableHostishDomain(normalized, rootUrl)) {
      continue;
    }
    const canonical = normalizeForQueue(normalized);
    if (!canonical || seen.has(canonical)) {
      continue;
    }

    let path: string;
    try {
      const rawPath = new URL(normalized).pathname;
      // URL.pathname is percent-encoded ("/über-uns" -> "/%C3%BCber-uns"),
      // so decode it first or non-ASCII pattern alternatives never match.
      // decodeURI leaves reserved characters such as %2F untouched, which
      // keeps the segment structure intact.
      try {
        path = decodeURI(rawPath).toLowerCase();
      } catch {
        // Malformed escape sequence: fall back to the encoded form.
        path = rawPath.toLowerCase();
      }
    } catch {
      continue;
    }

    // Only the first matching pattern counts, and each bucket keeps its first hit.
    const priority = RELEVANT_PATH_PATTERNS.findIndex((pattern) => pattern.test(path));
    if (priority < 0) {
      continue;
    }
    const bucket = buckets[priority];
    if (!bucket || bucket.length > 0) {
      continue;
    }
    bucket.push(canonical);
    seen.add(canonical);
  }

  return [homepage, ...buckets.flat()].slice(0, 5);
}
/**
 * Reduces markup to its visible text: script and style blocks are removed
 * with their content, remaining tags become a space, and whitespace runs
 * are collapsed to single spaces.
 */
function stripHtml(input: string) {
  const withoutScripts = input.replace(/<script[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style[\s]*?[\s\S]*?<\/style>/gi, " ");
  const withoutTags = withoutStyles.replace(/<[^>]*>/g, " ");
  return withoutTags.replace(/\s+/g, " ").trim();
}
/**
 * Removes tags without inserting a separator, then collapses whitespace
 * runs to single spaces and trims the result.
 */
function stripLeadingToText(input: string) {
  const withoutTags = input.replace(/<[^>]*>/g, "");
  const collapsed = withoutTags.replace(/\s+/g, " ");
  return collapsed.trim();
}
/**
 * Decodes the HTML entities commonly used to hide e-mail addresses:
 * non-breaking space, "@" and ".". All other entities are left untouched.
 */
function decodeCommonEmailEntities(input: string) {
  const replacements: Array<[RegExp, string]> = [
    [/&nbsp;|&#xa0;|&#160;/gi, " "],
    [/&commat;|&#64;|&#x40;/gi, "@"],
    [/&period;|&#46;|&#x2e;/gi, "."],
  ];
  let decoded = input;
  for (const [entityPattern, character] of replacements) {
    decoded = decoded.replace(entityPattern, character);
  }
  return decoded;
}
/**
 * Prepares markup for e-mail matching: decodes the e-mail related entities,
 * removes script and style blocks, and collapses whitespace. Other tags are
 * kept.
 */
function normalizeEmailExtractionInput(input: string) {
  const decoded = decodeCommonEmailEntities(input);
  const withoutScripts = decoded.replace(/<script[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, " ");
  return withoutStyles.replace(/\s+/g, " ").trim();
}
/**
 * Extracts the address part of a mailto target: everything from the first
 * "?" on is cut off, a leading "mailto:" is removed, and percent-escapes are
 * decoded. A malformed escape sequence leaves the text undecoded.
 */
function normalizeMailtoAddress(value: string) {
  const [beforeQuery = ""] = value.split("?");
  const address = beforeQuery.replace(/^mailto:/i, "");
  let decoded = address;
  try {
    decoded = decodeURIComponent(address);
  } catch {
    // Keep the raw text when it is not valid percent-encoding.
  }
  return decoded.trim();
}
/**
 * Turns an obfuscated address such as "info [at] example (dot) de" or
 * "info at example punkt de" back into "info@example.de".
 *
 * The result is not validated and letter case is preserved.
 *
 * @param value Text of a single address candidate.
 * @returns The candidate with separators resolved and all whitespace removed.
 */
function denormalizeObfuscatedEmail(value: string) {
  // Bracketed separators must be resolved before the bare words: a bare-word
  // pass would otherwise turn "[dot]" into "[.]" and leave the brackets behind.
  let resolved = value
    .replace(/\[\s*at\s*\]|\(\s*at\s*\)|\{\s*at\s*\}/gi, "@")
    .replace(
      /\[\s*(?:dot|punkt)\s*\]|\(\s*(?:dot|punkt)\s*\)|\{\s*(?:dot|punkt)\s*\}/gi,
      ".",
    );

  // A bare " at " is only a separator when nothing else supplied the "@".
  // Only the first occurrence is replaced: an address has exactly one "@".
  if (!resolved.includes("@")) {
    resolved = resolved.replace(/\s+at\s+/i, "@");
  }

  return resolved
    // Bare words need whitespace on both sides, so a domain label that
    // happens to be "dot" (as in "info@dot.com") is left alone.
    .replace(/\s+(?:dot|punkt)\s+/gi, ".")
    .replace(/\s*@\s*/g, "@")
    .replace(/\s*\.\s*/g, ".")
    .replace(/\s+/g, "");
}
/**
 * Appends an e-mail candidate to `entries` unless the address is invalid or
 * was already recorded in `seen`.
 *
 * @param entries Result list, mutated in place.
 * @param seen Normalized addresses collected so far, mutated in place.
 * @param email Address text as matched in `source`.
 * @param source Text the address was found in.
 * @param index Offset of the match inside `source`.
 * @param length Length of the match.
 * @param explicitPersons Labels known per normalized address; a label found
 *   here takes precedence over the name heuristic.
 */
function addEmailCandidate(
  entries: WebsiteCrawlEmailCandidate[],
  seen: Set<string>,
  email: string,
  source: string,
  index: number,
  length: number,
  explicitPersons: Map<string, string>,
) {
  const address = normalizeEmailAddress(email);
  if (!address) {
    return;
  }
  if (seen.has(address)) {
    return;
  }

  const nearContactKeyword = hasBusinessContactContext(source, index, length);
  const person =
    explicitPersons.get(address) ?? getContactPersonForEmail(source, email, index);

  seen.add(address);
  entries.push({
    email: address,
    emailSource: null,
    contactPerson: person,
    isBusinessContactAddress: nearContactKeyword,
  });
}
/**
 * Finds e-mail addresses whose "@" and/or "." are written out, either in
 * brackets ("[at]", "(dot)", "{punkt}") or as bare words ("at", "dot",
 * "punkt").
 *
 * Matches that rely on a bare word are accepted only when the local part is
 * listed in GENERIC_BUSINESS_LOCALS or a contact keyword appears nearby.
 *
 * @param source Raw markup or text to scan.
 * @param explicitPersons Labels known per normalized address.
 * @returns Candidates de-duplicated by normalized address, in match order.
 */
function collectObfuscatedEmailCandidates(
  source: string,
  explicitPersons: Map<string, string>,
) {
  const text = normalizeEmailExtractionInput(source);

  // Building blocks of the matcher, kept separate for readability.
  const localPart = "[a-z0-9._%+-]{1,64}";
  const domainLabel = "[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?";
  const tld = "[a-z]{2,}";
  const strictAtSeparator =
    "(?:@|\\[\\s*at\\s*\\]|\\(\\s*at\\s*\\)|\\{\\s*at\\s*\\})";
  const looseAtSeparator = "\\bat\\b";
  const atSeparator = `(?:${strictAtSeparator}|${looseAtSeparator})`;
  const strictDotSeparator =
    "(?:\\.|\\[\\s*(?:dot|punkt)\\s*\\]|\\(\\s*(?:dot|punkt)\\s*\\)|\\{\\s*(?:dot|punkt)\\s*\\})";
  const looseDotSeparator = "\\b(?:dot|punkt)\\b";
  const dotSeparator = `(?:${strictDotSeparator}|${looseDotSeparator})`;
  const pattern = new RegExp(
    `\\b(?<local>${localPart})\\s*(?<at>${atSeparator})\\s*(?<domain>${domainLabel}(?:\\s*${dotSeparator}\\s*${domainLabel})*\\s*${dotSeparator}\\s*${tld})\\b`,
    "gi",
  );

  const found: WebsiteCrawlEmailCandidate[] = [];
  const knownAddresses = new Set<string>();

  for (const match of text.matchAll(pattern)) {
    const raw = match[0];
    const start = match.index ?? -1;
    if (!raw || start < 0) {
      continue;
    }

    const local = match.groups?.local ?? "";
    const atToken = match.groups?.at ?? "";
    const domain = match.groups?.domain ?? "";

    const usesBareAt = /\bat\b/i.test(atToken) && !/@|\[|\(|\{/.test(atToken);
    const usesBareDot = /\b(?:dot|punkt)\b/i.test(domain);
    const nearContactKeyword = hasBusinessContactContext(text, start, raw.length);

    // Bare-word obfuscation is easy to confuse with ordinary prose, so it
    // needs a generic mailbox name or a contact keyword as supporting evidence.
    const isLooselyObfuscated = usesBareAt || usesBareDot;
    const hasSupportingEvidence =
      GENERIC_BUSINESS_LOCALS.has(local.toLowerCase()) || nearContactKeyword;
    if (isLooselyObfuscated && !hasSupportingEvidence) {
      continue;
    }

    const address = normalizeEmailAddress(denormalizeObfuscatedEmail(raw));
    if (!address || knownAddresses.has(address)) {
      continue;
    }

    const person =
      explicitPersons.get(address) ?? getContactPersonForEmail(text, raw, start);
    found.push({
      email: address,
      emailSource: null,
      contactPerson: person,
      isBusinessContactAddress: nearContactKeyword,
    });
    knownAddresses.add(address);
  }

  return found;
}
/**
 * Heuristically looks for a person name or label that belongs to an e-mail
 * occurrence.
 *
 * Three strategies are tried in order and the first hit is returned:
 * 1. the text of the last complete <a>...</a> element in the 120 characters before the e-mail,
 * 2. a capitalized two- or three-word name at the end of the context window,
 * 3. a capitalized two-word name directly before the e-mail in the full text.
 *
 * @param text Text that contains the e-mail.
 * @param email The e-mail as it appears in `text`; only its length is used.
 * @param index Offset of the e-mail inside `text`.
 * @returns The name or label, or null when nothing matched.
 */
function getContactPersonForEmail(
text: string,
email: string,
index: number,
) {
// Context window: 120 characters on either side of the e-mail.
const windowStart = Math.max(0, index - 120);
const windowEnd = Math.min(text.length, index + email.length + 120);
const context = text.slice(windowStart, windowEnd);
// Part of the window that precedes the e-mail.
const beforeEmailContext = context.slice(0, index - windowStart);
// Strategy 1: nearest anchor element that closes before the e-mail.
const anchorMatches = Array.from(
beforeEmailContext.matchAll(/<a\b[^>]*>(.*?)<\/a>/gi),
);
const nearestAnchor = anchorMatches.at(-1);
if (nearestAnchor?.[1]) {
const anchorText = stripLeadingToText(nearestAnchor[1]).trim();
// Reject labels that are themselves an address or implausibly long.
if (anchorText && !/@/.test(anchorText) && anchorText.length < 120) {
return anchorText;
}
}
// Strategy 2: capitalized name anchored at the END of `context`.
// NOTE(review): `context` extends up to 120 characters past the e-mail, so
// this only matches a name at the very end of the window (after the
// e-mail). If a name right before the e-mail was intended, this should
// probably run on `beforeEmailContext` - confirm.
const nearMatch = context.match(
/(?:(?:^|[>\s])([A-ZÄÖÜ][a-zäöüßÄÖÜ]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+(?:\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)?))$/u,
);
if (nearMatch?.[1]) {
return stripLeadingToText(nearMatch[1]!).trim();
}
// Strategy 3: "Firstname Lastname" immediately before the e-mail in the
// full text, optionally followed by a comma or an opening parenthesis.
const directMatch = text.slice(0, index).match(
/([A-ZÄÖÜ][a-zäöüßÄÖÜ-]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)\s*(?:,|\s+\()?\s*$/u,
);
return directMatch?.[1]?.trim() ?? null;
}
/**
 * Reports whether any CONTACT_CONTEXT_KEYWORDS entry occurs within 140
 * characters before or after the given span. The comparison is a
 * case-insensitive substring search.
 *
 * @param text Text to inspect.
 * @param index Start offset of the span.
 * @param length Length of the span.
 */
function hasBusinessContactContext(text: string, index: number, length: number) {
  const radius = 140;
  const from = Math.max(0, index - radius);
  const to = Math.min(text.length, index + length + radius);
  const surroundings = text.slice(from, to).toLowerCase();
  for (const keyword of CONTACT_CONTEXT_KEYWORDS) {
    if (surroundings.includes(keyword)) {
      return true;
    }
  }
  return false;
}
/**
 * Collects phone-number-like sequences from text.
 *
 * Every hit with at least seven digit/"+" characters is stored twice: as it
 * appeared (trimmed) and reduced to digits and "+". Duplicates are removed
 * and first-seen order is kept.
 */
function makePhoneNumberSet(input: string) {
  const collected = new Set<string>();
  for (const match of input.matchAll(/(?:\+?\d[\d\s./()-]{7,}\d)/g)) {
    const raw = match[0] ?? "";
    const digitsOnly = raw.replace(/[^\d+]/g, "");
    if (digitsOnly.length < 7) {
      continue;
    }
    collected.add(raw.trim());
    collected.add(digitsOnly);
  }
  return [...collected].filter((value) => value.length >= 7);
}
/**
 * Extracts e-mail candidates from markup in three passes:
 * 1. `mailto:` anchors (the link text may name the contact person),
 * 2. plain addresses in the entity-decoded text,
 * 3. obfuscated addresses ("info [at] example [dot] de").
 *
 * Each address is reported once; the pass that finds it first wins.
 *
 * @param input Raw HTML-like text.
 * @returns Candidates in discovery order.
 */
function makeEmailCandidates(input: string) {
  const emailRegex = /[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}(?:\b)?/gi;
  const mailtoAnchors = input.matchAll(
    /href=["']mailto:([^"'>\s]+)["'][^>]*>(.*?)<\/a>/gi,
  );
  const normalizedInput = normalizeEmailExtractionInput(input);
  const explicitPersons = new Map<string, string>();
  const entries: WebsiteCrawlEmailCandidate[] = [];
  const seen = new Set<string>();

  // Pass 1: mailto anchors.
  for (const anchorMatch of mailtoAnchors) {
    const rawHref = normalizeMailtoAddress(anchorMatch[1] ?? "");
    const email = normalizeEmailAddress(rawHref);
    if (!email) {
      continue;
    }
    const label = stripLeadingToText(
      decodeCommonEmailEntities(anchorMatch[2] ?? ""),
    ).trim();
    const normalizedLabelEmail = normalizeEmailAddress(label);

    // A label can only name a person when it is short and is not itself an
    // e-mail address.
    const isUsableLabel = label.length > 0 && label.length <= 64 && !label.includes("@");
    if (isUsableLabel) {
      // Recorded even for already-seen addresses so later passes can use it.
      explicitPersons.set(email, label);
    }
    if (seen.has(email)) {
      continue;
    }
    const anchorIndex = anchorMatch.index ?? -1;
    if (anchorIndex < 0) {
      continue;
    }

    // Use the same label filter here as for explicitPersons; otherwise an
    // e-mail-like or over-long link text ends up as the contact person.
    const labelRepeatsAddress = Boolean(normalizedLabelEmail) && normalizedLabelEmail === email;
    const contactPerson = isUsableLabel && !labelRepeatsAddress ? label : null;

    entries.push({
      email,
      emailSource: null,
      contactPerson,
      isBusinessContactAddress: hasBusinessContactContext(
        input,
        anchorIndex,
        email.length,
      ),
    });
    seen.add(email);
  }

  // Pass 2: plain addresses in the decoded text.
  for (const match of normalizedInput.matchAll(emailRegex)) {
    const rawEmail = match[0] ?? "";
    const idx = match.index ?? -1;
    if (rawEmail.length === 0 || idx < 0) {
      continue;
    }
    addEmailCandidate(
      entries,
      seen,
      rawEmail,
      normalizedInput,
      idx,
      rawEmail.length,
      explicitPersons,
    );
  }

  // Pass 3: obfuscated addresses not already found above.
  for (const candidate of collectObfuscatedEmailCandidates(input, explicitPersons)) {
    if (seen.has(candidate.email)) {
      continue;
    }
    entries.push(candidate);
    seen.add(candidate.email);
  }

  return entries;
}
/**
 * Derives contact signals from a page's markup.
 *
 * Phone numbers are taken from the tag-stripped text. E-mail candidates and
 * the two form/CTA flags are derived from the raw markup.
 *
 * @param input Raw HTML-like text.
 * @returns Visible text, phone numbers, e-mail candidates and the two flags.
 */
export function extractContactSignalsFromHtmlLikeText(input: string) {
  const loweredMarkup = input.toLowerCase();
  const visibleText = stripHtml(input);

  return {
    visibleText,
    phoneNumbers: makePhoneNumberSet(visibleText),
    emailCandidates: makeEmailCandidates(input),
    hasContactFormSignal:
      /kontaktformular|anfrageformular|contact form|<form\b/i.test(loweredMarkup),
    hasContactCtaSignal:
      /kontaktformular|anfrageformular|anfrage\s*senden|anfrage\s*stellen|schreiben\s+sie\s+uns|kontaktieren\s+(?:sie|du)|kontakt\s+(?:und|mit|zu)|<form\b/i.test(
        loweredMarkup,
      ),
  };
}
/**
 * Checks whether a path or piece of text hints at a contact-relevant page:
 * a contact, imprint, services or about/team path segment, or the
 * standalone words "kontakt" / "kontaktformular". Matching is
 * case-insensitive.
 */
function isRelevantContactPathText(value: string) {
  const lowered = value.toLowerCase();
  const hints = [
    /(?:^|\/)(kontakt|contact)(?:[-/]|$)/,
    /(?:^|\/)(impressum|imprint)(?:[-/]|$)/,
    /(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/,
    /(?:^|\/)(ueber|über|about|team)(?:[-/]|$)/,
    /\bkontakt\b/,
    /\bkontaktformular\b/,
  ];
  return hints.some((hint) => hint.test(lowered));
}
/**
 * Condenses raw crawl observations into a small set of technical findings.
 *
 * @param input Crawl observations; every field is optional.
 * @returns
 *   - `https`: the final URL uses https,
 *   - `finalUrl`: normalized final URL (falls back to the root URL, then ""),
 *   - `missingTitle` / `missingMetaDescription`: the trimmed value is empty,
 *   - `hasVisibleContactPath`: the visible text, or the path/query of the
 *     final URL, hints at a contact-related page,
 *   - `brokenInternalLinks`: canonical URLs of same-host links flagged broken.
 */
export function buildTechnicalChecks(input: TechnicalChecksInput): WebsiteTechnicalChecks {
  const finalUrl =
    normalizeCrawlUrl(input.finalUrl ?? "", input.rootUrl ?? undefined) ??
    normalizeCrawlUrl(input.rootUrl ?? "", undefined) ??
    "";
  const normalizedRoot =
    normalizeCrawlUrl(input.rootUrl ?? finalUrl, undefined) ?? finalUrl;

  const title = input.title?.trim() ?? "";
  const metaDescription = input.metaDescription?.trim() ?? "";
  const visibleText = input.visibleText ?? "";

  // Only the path and query of the final URL are inspected. Matching the
  // full URL would let host names such as "team-mueller.de" or
  // "kontakt.example.de" count as a contact path.
  const { pathname, search } = new URL(finalUrl || "https://localhost");
  let decodedPath = pathname;
  try {
    // URL.pathname is percent-encoded; decode so "/über-uns" can match.
    decodedPath = decodeURI(pathname);
  } catch {
    // Malformed escape sequence: keep the encoded path.
  }
  const hasVisibleContactPath =
    isRelevantContactPathText(visibleText) ||
    isRelevantContactPathText(decodedPath) ||
    isRelevantContactPathText(`${decodedPath}${search}`);

  // Canonical forms of the same-host URLs that were actually checked.
  const checkedUrlSet = new Set<string>();
  for (const checkedUrl of input.checkedUrls ?? []) {
    const normalizedCheckedUrl = normalizeCrawlUrl(checkedUrl, normalizedRoot);
    if (
      !normalizedCheckedUrl ||
      !isSameRegistrableHostishDomain(normalizedCheckedUrl, normalizedRoot)
    ) {
      continue;
    }
    const canonicalCheckedUrl = normalizeForQueue(normalizedCheckedUrl);
    if (canonicalCheckedUrl) {
      checkedUrlSet.add(canonicalCheckedUrl);
    }
  }
  const hasCheckedUrls = checkedUrlSet.size > 0;

  const brokenInternalLinksSet = new Set<string>();
  for (const entry of input.links ?? []) {
    const href = typeof entry === "string" ? entry : (entry.href ?? "");
    const normalizedLink = normalizeCrawlUrl(href, normalizedRoot);
    if (!normalizedLink || !isSameRegistrableHostishDomain(normalizedLink, normalizedRoot)) {
      continue;
    }
    const canonical = normalizeForQueue(normalizedLink);
    if (!canonical) {
      continue;
    }
    // When a checked-URL list exists, links outside it carry no reliable status.
    if (hasCheckedUrls && !checkedUrlSet.has(canonical)) {
      continue;
    }
    // Plain string entries carry no status and can never be reported as broken.
    if (typeof entry === "string") {
      continue;
    }
    const status = entry.status ?? entry.statusCode;
    // Status <= 0 is treated as broken, same as HTTP errors (>= 400).
    const hasErrorStatus = typeof status === "number" && (status >= 400 || status <= 0);
    if (entry.isBroken === true || hasErrorStatus) {
      brokenInternalLinksSet.add(canonical);
    }
  }

  return {
    https: finalUrl.startsWith("https://"),
    finalUrl,
    missingTitle: title.length === 0,
    missingMetaDescription: metaDescription.length === 0,
    hasVisibleContactPath,
    brokenInternalLinks: Array.from(brokenInternalLinksSet),
  };
}