606 lines
16 KiB
TypeScript
606 lines
16 KiB
TypeScript
import { normalizeEmailAddress } from "./lead-discovery-google";
|
|
|
|
/** URL protocols (as reported by `URL.protocol`, including the trailing colon) that may be crawled. */
const HTTP_SCHEMES: Set<string> = new Set<string>(["http:", "https:"]);
|
|
|
|
/**
 * Path patterns for subpages worth crawling. The array index doubles as the
 * priority bucket in `discoverRelevantSubpageUrls`, so the order matters.
 * Each pattern requires the keyword to start a path segment and to be
 * followed by `-`, `/` or the end of the string.
 */
const RELEVANT_PATH_PATTERNS: RegExp[] = [
  // 0: contact pages
  /(?:^|\/)(kontakt|contact)(?:[-/]|$)/i,
  // 1: legal notice / imprint pages
  /(?:^|\/)(impressum|imprint)(?:[-/]|$)/i,
  // 2: service / offering pages
  /(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/i,
  // 3: about / team pages
  /(?:^|\/)(ueber|über|team|about)(?:[-/]|$)/i,
];
|
|
|
|
/**
 * Lower-case keywords that mark the text around an e-mail address as
 * "business contact" context (see `hasBusinessContactContext`, which does a
 * plain substring search on lower-cased text).
 */
const CONTACT_CONTEXT_KEYWORDS: string[] = [
  "ansprechpartner",
  "kontakt",
  "e-mail",
  "email",
  "team",
  "impressum",
  "geschäftsführung",
  "imprint",
  "footer",
  "anfrage",
];
|
|
|
|
/**
 * Lower-case local parts (the text before the `@`) of generic business
 * mailboxes. Loosely obfuscated addresses with one of these local parts are
 * accepted by `collectObfuscatedEmailCandidates` without extra context.
 */
const GENERIC_BUSINESS_LOCALS: Set<string> = new Set<string>([
  "info",
  "kontakt",
  "contact",
  "office",
  "hello",
  "sales",
  "support",
  "service",
  "team",
  "post",
]);
|
|
|
|
/** One e-mail address found on a crawled page, plus what was learned about it. */
export type WebsiteCrawlEmailCandidate = {
  /** The address after normalisation. */
  email: string;
  /** Origin of the address; the extractors in this file always set it to `null`. */
  emailSource: string | null;
  /** Name or label associated with the address, if one was found nearby. */
  contactPerson: string | null;
  /** True when a contact keyword appears in the text surrounding the address. */
  isBusinessContactAddress: boolean;
};
|
|
|
|
/** Contact-related signals extracted from one HTML-like document. */
export type WebsiteCrawlContactSignals = {
  /** Document text with script/style blocks and tags removed. */
  visibleText: string;
  /** Phone-number-like strings found in the visible text. */
  phoneNumbers: string[];
  /** E-mail addresses found in the document (mailto links, plain and obfuscated text). */
  emailCandidates: WebsiteCrawlEmailCandidate[];
  /** True when the document mentions a contact form or contains a `<form` tag. */
  hasContactFormSignal: boolean;
  /** True when the document contains a contact call-to-action phrase or a `<form` tag. */
  hasContactCtaSignal: boolean;
};
|
|
|
|
/** Raw crawl facts consumed by `buildTechnicalChecks`. Every field is optional. */
export type TechnicalChecksInput = {
  /** URL the crawl started from. */
  rootUrl?: string | null;
  /** URL that was finally loaded; may be relative to `rootUrl`. */
  finalUrl?: string | null;
  /** Document title. */
  title?: string | null;
  /** Content of the meta description. */
  metaDescription?: string | null;
  /** Visible page text. */
  visibleText?: string | null;
  /** When non-empty, only links whose canonical URL is in this list are evaluated. */
  checkedUrls?: string[];
  /** Links found on the page: either a bare href or an href with its check result. */
  links?: (
    | string
    | {
        href?: string;
        status?: number;
        statusCode?: number;
        isBroken?: boolean;
      }
  )[];
};
|
|
|
|
/** Result of `buildTechnicalChecks`. */
export type WebsiteTechnicalChecks = {
  /** True when the normalised final URL starts with `https://`. */
  https: boolean;
  /** Normalised final URL, or an empty string when none could be determined. */
  finalUrl: string;
  /** True when the trimmed title is empty. */
  missingTitle: boolean;
  /** True when the trimmed meta description is empty. */
  missingMetaDescription: boolean;
  /** True when the visible text or the final URL points at a contact-like page. */
  hasVisibleContactPath: boolean;
  /** Canonical URLs of same-site links that were reported as broken. */
  brokenInternalLinks: string[];
};
|
|
|
|
/** Drops one leading `www.` label (any letter case) from a host name. */
function stripWww(host: string) {
  const startsWithWww = /^www\./i.test(host);
  // "www." is four characters long.
  return startsWithWww ? host.slice(4) : host;
}
|
|
|
|
/**
 * Returns the lower-cased host name of an absolute URL, or an empty string
 * when `value` cannot be parsed as a URL.
 */
function toLowerHost(value: string) {
  let hostname = "";
  try {
    hostname = new URL(value).hostname.toLowerCase();
  } catch {
    // Unparsable input: keep the empty-string fallback.
  }
  return hostname;
}
|
|
|
|
/**
 * Normalises a link into a crawlable absolute URL.
 *
 * The result consists of protocol, lower-cased host without a leading
 * `www.`, optional port, path and query string. Fragment and credentials are
 * not included.
 *
 * @param input Raw href or URL; `null`, `undefined` and blank strings yield `null`.
 * @param base  Optional base URL used to resolve relative input. Without a
 *              base, only input containing `://` (and not starting with `//`)
 *              is accepted.
 * @returns The normalised URL, or `null` when the input cannot be parsed or
 *          does not use http/https.
 */
export function normalizeCrawlUrl(input?: string | null, base?: string) {
  const trimmed = input?.trim();
  if (!trimmed) {
    return null;
  }

  const looksAbsolute = trimmed.includes("://") && !trimmed.startsWith("//");
  if (!base && !looksAbsolute) {
    return null;
  }

  const tryParse = (value: string, baseUrl?: string): URL | null => {
    try {
      return new URL(value, baseUrl);
    } catch {
      return null;
    }
  };

  // `new URL(input, base)` throws when the base itself is unparsable, even if
  // the input is a complete absolute URL. Retry without the base in that case
  // so a bad base does not discard a perfectly valid link.
  const parsed = tryParse(trimmed, base) ?? (looksAbsolute ? tryParse(trimmed) : null);
  if (!parsed || !HTTP_SCHEMES.has(parsed.protocol)) {
    return null;
  }

  const host = stripWww(parsed.hostname.toLowerCase());
  const port = parsed.port ? `:${parsed.port}` : "";
  const path = parsed.pathname || "/";

  return `${parsed.protocol}//${host}${port}${path}${parsed.search}`;
}
|
|
|
|
/**
 * Tells whether `candidateUrl` (resolved against `rootUrl` if relative) is on
 * the same host as `rootUrl`, ignoring letter case and a leading `www.`.
 *
 * This is a plain host comparison, not a registrable-domain lookup: other
 * subdomains of the same site do not count as the same host.
 *
 * @returns `false` when either URL cannot be normalised.
 */
export function isSameRegistrableHostishDomain(
  candidateUrl: string,
  rootUrl: string,
) {
  const normalizedRoot = normalizeCrawlUrl(rootUrl);
  if (!normalizedRoot) {
    return false;
  }

  const normalizedCandidate = normalizeCrawlUrl(candidateUrl, normalizedRoot);
  if (!normalizedCandidate) {
    return false;
  }

  const rootHost = stripWww(toLowerHost(normalizedRoot));
  const candidateHost = stripWww(toLowerHost(normalizedCandidate));

  return candidateHost.length > 0 && candidateHost === rootHost;
}
|
|
|
|
/**
 * Builds the canonical key used to de-duplicate URLs: protocol, lower-cased
 * host without `www.`, optional port, and the path without one trailing
 * slash (`/` for the root). Query string and fragment are dropped.
 *
 * @returns `null` for empty input or input that is not an absolute URL.
 */
function normalizeForQueue(value: string | null) {
  if (!value) {
    return null;
  }

  try {
    const { protocol, hostname, port, pathname } = new URL(value);
    const bareHost = stripWww(hostname.toLowerCase());
    const hostWithPort = port ? `${bareHost}:${port}` : bareHost;
    const path = pathname.replace(/\/$/, "") || "/";
    return `${protocol}//${hostWithPort}${path}`;
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Picks the subpages worth crawling from a list of links.
 *
 * The result always starts with the site's homepage, followed by at most one
 * same-host URL per entry of `RELEVANT_PATH_PATTERNS`, in pattern order. For
 * each pattern the first matching link wins. At most five URLs are returned.
 *
 * @param links   Hrefs found on the page; may be relative to `rootUrl`.
 * @param rootUrl Absolute URL of the site being crawled.
 * @returns Canonical URLs (see `normalizeForQueue`), or `[]` when `rootUrl`
 *          cannot be normalised.
 */
export function discoverRelevantSubpageUrls(links: string[], rootUrl: string) {
  const root = normalizeCrawlUrl(rootUrl);
  if (!root) {
    return [];
  }

  const parsedRoot = new URL(root);
  const rootPort = parsedRoot.port ? `:${parsedRoot.port}` : "";
  const homepage = `${parsedRoot.protocol}//${stripWww(
    parsedRoot.hostname.toLowerCase(),
  )}${rootPort}/`;

  const seen = new Set<string>([homepage]);
  // One bucket per pattern, so adding a pattern needs no change here.
  const buckets: string[][] = RELEVANT_PATH_PATTERNS.map(() => []);

  for (const link of links) {
    const normalized = normalizeCrawlUrl(link, rootUrl);
    if (!normalized || !isSameRegistrableHostishDomain(normalized, rootUrl)) {
      continue;
    }

    const canonical = normalizeForQueue(normalized);
    if (!canonical || seen.has(canonical)) {
      continue;
    }

    let path: string;
    try {
      path = new URL(normalized).pathname;
    } catch {
      continue;
    }
    // `URL.pathname` is percent-encoded ("/über-uns" -> "/%C3%BCber-uns"), so
    // decode it before matching; otherwise non-ASCII keywords never match.
    try {
      path = decodeURIComponent(path);
    } catch {
      // Malformed escape sequence: match against the encoded path instead.
    }
    path = path.toLowerCase();

    // Only the first matching pattern counts; a full bucket skips the link.
    const priority = RELEVANT_PATH_PATTERNS.findIndex((pattern) => pattern.test(path));
    const bucket = buckets[priority];
    if (!bucket || bucket.length > 0) {
      continue;
    }

    bucket.push(canonical);
    seen.add(canonical);
  }

  return [homepage, ...buckets.flat()].slice(0, 5);
}
|
|
|
|
/**
 * Reduces HTML-like text to its visible text: script and style blocks are
 * dropped, every remaining tag becomes a space, and whitespace runs are
 * collapsed to single spaces.
 */
function stripHtml(input: string) {
  const withoutScripts = input.replace(/<script[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style[\s]*?[\s\S]*?<\/style>/gi, " ");
  const withoutTags = withoutStyles.replace(/<[^>]*>/g, " ");
  return withoutTags.replace(/\s+/g, " ").trim();
}
|
|
|
|
/**
 * Removes all tags from a short HTML fragment (without inserting spaces in
 * their place) and collapses whitespace runs to single spaces.
 */
function stripLeadingToText(input: string) {
  const withoutTags = input.replace(/<[^>]*>/g, "");
  const collapsed = withoutTags.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
|
|
/**
 * Decodes the HTML character references commonly used to hide e-mail
 * addresses from scrapers: non-breaking space, `@` and `.` in their named,
 * decimal and hexadecimal forms. All other text is returned unchanged.
 *
 * The patterns must name the entities explicitly. A pattern such as `/./`
 * matches every character and would turn the whole input into dots.
 */
function decodeCommonEmailEntities(input: string) {
  return input
    // Non-breaking space: &nbsp; &#160; &#xA0; and the literal U+00A0.
    .replace(/&nbsp;|&#0*160;|&#x0*a0;|\u00a0/gi, " ")
    // Commercial at: &commat; &#64; &#x40;
    .replace(/&commat;|&#0*64;|&#x0*40;/gi, "@")
    // Full stop: &period; &#46; &#x2E;
    .replace(/&period;|&#0*46;|&#x0*2e;/gi, ".");
}
|
|
|
|
/**
 * Prepares raw HTML for e-mail extraction: decodes the common e-mail
 * entities, removes script and style blocks, and collapses whitespace.
 * Other tags are kept so that surrounding markup stays available as context.
 */
function normalizeEmailExtractionInput(input: string) {
  const decoded = decodeCommonEmailEntities(input);
  const withoutScripts = decoded.replace(/<script[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, " ");
  return withoutStyles.replace(/\s+/g, " ").trim();
}
|
|
|
|
/**
 * Extracts the address part of a `mailto:` href: everything after the first
 * `?` is cut off, a leading `mailto:` is removed, percent-escapes are decoded
 * and the result is trimmed. If decoding fails, the undecoded text is used.
 */
function normalizeMailtoAddress(value: string) {
  const [beforeQuery = ""] = value.split("?");
  const address = beforeQuery.replace(/^mailto:/i, "");

  let decoded = address;
  try {
    decoded = decodeURIComponent(address);
  } catch {
    // Malformed percent-escape: fall back to the raw address.
  }
  return decoded.trim();
}
|
|
|
|
/**
 * Turns an obfuscated address such as `info [at] example (dot) de` or
 * `info at example punkt de` into `info@example.de`.
 *
 * Handled separators: `at`, `dot` and `punkt`, either wrapped in `[]`, `()`
 * or `{}`, or as bare words surrounded by whitespace. All remaining
 * whitespace is removed. The result is not validated.
 */
function denormalizeObfuscatedEmail(value: string) {
  // Bracketed forms must go first. Replacing the bare word first would turn
  // "[dot]" into "[.]" and leave the brackets behind.
  let result = value
    .replace(/\[\s*at\s*\]|\(\s*at\s*\)|\{\s*at\s*\}/gi, "@")
    .replace(
      /\[\s*(?:dot|punkt)\s*\]|\(\s*(?:dot|punkt)\s*\)|\{\s*(?:dot|punkt)\s*\}/gi,
      ".",
    );

  // A bare "at" is only a separator when no "@" exists yet, and an address
  // has exactly one, so only the first occurrence is replaced.
  if (!result.includes("@")) {
    result = result.replace(/\s+at\s+/i, "@");
  }

  // Bare "dot"/"punkt" must stand between whitespace, so that real labels
  // such as the "dot" in "info@dot.de" are left alone.
  result = result.replace(/\s+(?:dot|punkt)\s+/gi, ".");

  return result
    .replace(/\s*@\s*/g, "@")
    .replace(/\s*\.\s*/g, ".")
    .replace(/\s+/g, "");
}
|
|
|
|
/**
 * Normalises `email` and, unless it is invalid or already in `seen`, appends
 * a candidate to `entries` and records the address in `seen`.
 *
 * @param entries         Result list; mutated.
 * @param seen            Normalised addresses collected so far; mutated.
 * @param email           Raw address as matched in `source`.
 * @param source          Text the address was found in.
 * @param index           Offset of the match within `source`.
 * @param length          Length of the match, used for the context window.
 * @param explicitPersons Names taken from mailto link labels, keyed by
 *                        normalised address; preferred over nearby text.
 */
function addEmailCandidate(
  entries: WebsiteCrawlEmailCandidate[],
  seen: Set<string>,
  email: string,
  source: string,
  index: number,
  length: number,
  explicitPersons: Map<string, string>,
) {
  const address = normalizeEmailAddress(email);
  if (!address) {
    return;
  }
  if (seen.has(address)) {
    return;
  }

  const isBusinessContactAddress = hasBusinessContactContext(source, index, length);
  const contactPerson =
    explicitPersons.get(address) ?? getContactPersonForEmail(source, email, index);

  seen.add(address);
  entries.push({
    email: address,
    emailSource: null,
    contactPerson,
    isBusinessContactAddress,
  });
}
|
|
|
|
/**
 * Finds e-mail addresses written in obfuscated form, e.g.
 * `info [at] example (dot) de` or `info at example punkt de`.
 *
 * Addresses that rely on a bare `at`, `dot` or `punkt` word (no brackets) are
 * easy to confuse with ordinary prose. They are only accepted when the local
 * part is a generic business mailbox or a contact keyword appears nearby.
 *
 * @param source          Raw HTML-like text; it is normalised internally.
 * @param explicitPersons Names taken from mailto link labels, keyed by
 *                        normalised address.
 * @returns De-duplicated candidates in order of appearance.
 */
function collectObfuscatedEmailCandidates(
  source: string,
  explicitPersons: Map<string, string>,
) {
  const normalizedSource = normalizeEmailExtractionInput(source);

  // Building blocks of the pattern; "strict" separators are unambiguous,
  // "loose" ones are bare words.
  const localPart = "[a-z0-9._%+-]{1,64}";
  const domainLabel = "[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?";
  const tld = "[a-z]{2,}";
  const strictAtSeparator =
    "(?:@|\\[\\s*at\\s*\\]|\\(\\s*at\\s*\\)|\\{\\s*at\\s*\\})";
  const looseAtSeparator = "\\bat\\b";
  const atSeparator = `(?:${strictAtSeparator}|${looseAtSeparator})`;
  const strictDotSeparator =
    "(?:\\.|\\[\\s*(?:dot|punkt)\\s*\\]|\\(\\s*(?:dot|punkt)\\s*\\)|\\{\\s*(?:dot|punkt)\\s*\\})";
  const looseDotSeparator = "\\b(?:dot|punkt)\\b";
  const dotSeparator = `(?:${strictDotSeparator}|${looseDotSeparator})`;

  const obfuscatedEmailRegex = new RegExp(
    `\\b(?<local>${localPart})\\s*(?<at>${atSeparator})\\s*(?<domain>${domainLabel}(?:\\s*${dotSeparator}\\s*${domainLabel})*\\s*${dotSeparator}\\s*${tld})\\b`,
    "gi",
  );

  const found: WebsiteCrawlEmailCandidate[] = [];
  const knownAddresses = new Set<string>();

  for (const match of normalizedSource.matchAll(obfuscatedEmailRegex)) {
    const rawCandidate = match[0];
    const matchIndex = match.index ?? -1;
    if (!rawCandidate || matchIndex < 0) {
      continue;
    }

    const local = match.groups?.local ?? "";
    const at = match.groups?.at ?? "";
    const domain = match.groups?.domain ?? "";

    const usesBareAt = /\bat\b/i.test(at) && !/@|\[|\(|\{/.test(at);
    const usesBareDot = /\b(?:dot|punkt)\b/i.test(domain);
    const inBusinessContext = hasBusinessContactContext(
      normalizedSource,
      matchIndex,
      rawCandidate.length,
    );

    // Loosely written matches need a generic mailbox name or contact context.
    const isLooseMatch = usesBareAt || usesBareDot;
    const hasGenericLocal = GENERIC_BUSINESS_LOCALS.has(local.toLowerCase());
    if (isLooseMatch && !hasGenericLocal && !inBusinessContext) {
      continue;
    }

    const address = normalizeEmailAddress(denormalizeObfuscatedEmail(rawCandidate));
    if (!address || knownAddresses.has(address)) {
      continue;
    }

    const contactPerson =
      explicitPersons.get(address) ??
      getContactPersonForEmail(normalizedSource, rawCandidate, matchIndex);

    found.push({
      email: address,
      emailSource: null,
      contactPerson,
      isBusinessContactAddress: inBusinessContext,
    });
    knownAddresses.add(address);
  }

  return found;
}
|
|
|
|
/**
 * Tries to find a person's name for an e-mail address from the text around
 * it. Three heuristics are tried in order:
 *
 * 1. the text of the last complete `<a>…</a>` element within the 120
 *    characters before the address,
 * 2. a capitalised two- or three-word name at the very end of the context
 *    window,
 * 3. a capitalised two-word name directly before the address, optionally
 *    followed by a comma or an opening parenthesis.
 *
 * @param text  Text the address was found in.
 * @param email The address as matched; only its length is used.
 * @param index Offset of the address within `text`.
 * @returns The name, or `null` when no heuristic matches.
 */
function getContactPersonForEmail(
  text: string,
  email: string,
  index: number,
) {
  const windowStart = Math.max(0, index - 120);
  const windowEnd = Math.min(text.length, index + email.length + 120);
  const context = text.slice(windowStart, windowEnd);
  const beforeEmailContext = context.slice(0, index - windowStart);

  // Heuristic 1: keep the inner HTML of the last anchor before the address.
  let nearestAnchorHtml: string | undefined;
  for (const anchor of beforeEmailContext.matchAll(/<a\b[^>]*>(.*?)<\/a>/gi)) {
    nearestAnchorHtml = anchor[1];
  }
  if (nearestAnchorHtml) {
    const anchorText = stripLeadingToText(nearestAnchorHtml).trim();
    const looksLikeName = anchorText.length < 120 && !/@/.test(anchorText);
    if (anchorText && looksLikeName) {
      return anchorText;
    }
  }

  // Heuristic 2.
  // NOTE(review): this is anchored to the end of the whole window, which
  // extends up to 120 characters past the address, not to the text before
  // it. Confirm that this is intended.
  const nearMatch = context.match(
    /(?:(?:^|[>\s])([A-ZÄÖÜ][a-zäöüßÄÖÜ]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+(?:\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)?))$/u,
  );
  const nearName = nearMatch?.[1];
  if (nearName) {
    return stripLeadingToText(nearName).trim();
  }

  // Heuristic 3.
  const directMatch = text.slice(0, index).match(
    /([A-ZÄÖÜ][a-zäöüßÄÖÜ-]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)\s*(?:,|\s+\()?\s*$/u,
  );
  return directMatch?.[1]?.trim() ?? null;
}
|
|
|
|
/**
 * Checks whether any contact keyword occurs within 140 characters before or
 * after a match. The comparison is a case-insensitive substring search.
 *
 * @param text   Text that contains the match.
 * @param index  Offset of the match within `text`.
 * @param length Length of the match.
 */
function hasBusinessContactContext(text: string, index: number, length: number) {
  const from = Math.max(0, index - 140);
  const to = Math.min(text.length, index + length + 140);
  const surrounding = text.slice(from, to).toLowerCase();

  for (const keyword of CONTACT_CONTEXT_KEYWORDS) {
    if (surrounding.includes(keyword)) {
      return true;
    }
  }
  return false;
}
|
|
|
|
/**
 * Collects phone-number-like strings from text.
 *
 * A match starts with an optional `+` and a digit, ends with a digit, and has
 * at least seven digits, spaces or `./()-` characters in between. Each match
 * contributes two entries: the trimmed text as written and a compact form
 * holding only digits and `+`. Duplicates are removed; order of first
 * appearance is kept.
 */
function makePhoneNumberSet(input: string) {
  const collected = new Set<string>();

  for (const [raw = ""] of input.matchAll(/(?:\+?\d[\d\s./()-]{7,}\d)/g)) {
    const compact = raw.replace(/[^\d+]/g, "");
    if (compact.length < 7) {
      continue;
    }
    collected.add(raw.trim());
    collected.add(compact);
  }

  return [...collected].filter((value) => value.length >= 7);
}
|
|
|
|
/**
 * Extracts e-mail candidates from raw HTML-like text in three passes. Each
 * address is reported once, by the first pass that finds it:
 *
 * 1. `mailto:` links, whose link text may supply the contact person,
 * 2. plainly written addresses in the entity-decoded text,
 * 3. obfuscated addresses (see `collectObfuscatedEmailCandidates`).
 *
 * @returns Candidates in the order they were accepted.
 */
function makeEmailCandidates(input: string) {
  const entries: WebsiteCrawlEmailCandidate[] = [];
  const seen = new Set<string>();
  // Link labels that look like a name, keyed by normalised address. Later
  // passes prefer these over names guessed from surrounding text.
  const explicitPersons = new Map<string, string>();

  // Pass 1: mailto links in the raw markup.
  const mailtoAnchorPattern = /href=["']mailto:([^"'>\s]+)["'][^>]*>(.*?)<\/a>/gi;
  for (const anchorMatch of input.matchAll(mailtoAnchorPattern)) {
    const email = normalizeEmailAddress(normalizeMailtoAddress(anchorMatch[1] ?? ""));
    if (!email) {
      continue;
    }

    const label = stripLeadingToText(
      decodeCommonEmailEntities(anchorMatch[2] ?? ""),
    ).trim();
    const labelAsEmail = normalizeEmailAddress(label);

    const labelLooksLikeName = label.length <= 64 && !label.includes("@");
    if (label && labelLooksLikeName) {
      explicitPersons.set(email, label);
    }

    const anchorIndex = anchorMatch.index ?? -1;
    if (seen.has(email) || anchorIndex < 0) {
      continue;
    }

    // A label that merely repeats the address is not a person.
    // NOTE(review): unlike `explicitPersons`, any other label is used as-is,
    // even when it is long or contains an "@". Confirm that this is intended.
    const labelRepeatsAddress = Boolean(labelAsEmail) && labelAsEmail === email;

    entries.push({
      email,
      emailSource: null,
      contactPerson: labelRepeatsAddress ? null : label || null,
      isBusinessContactAddress: hasBusinessContactContext(
        input,
        anchorIndex,
        email.length,
      ),
    });
    seen.add(email);
  }

  // Pass 2: plainly written addresses.
  const normalizedInput = normalizeEmailExtractionInput(input);
  const plainEmailPattern = /[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}(?:\b)?/gi;
  for (const match of normalizedInput.matchAll(plainEmailPattern)) {
    const rawEmail = match[0] ?? "";
    const offset = match.index ?? -1;
    if (rawEmail.length === 0 || offset < 0) {
      continue;
    }
    addEmailCandidate(
      entries,
      seen,
      rawEmail,
      normalizedInput,
      offset,
      rawEmail.length,
      explicitPersons,
    );
  }

  // Pass 3: obfuscated addresses not found by the earlier passes.
  for (const candidate of collectObfuscatedEmailCandidates(input, explicitPersons)) {
    if (seen.has(candidate.email)) {
      continue;
    }
    entries.push(candidate);
    seen.add(candidate.email);
  }

  return entries;
}
|
|
|
|
/**
 * Extracts contact signals from one HTML-like document: visible text, phone
 * numbers (from the visible text), e-mail candidates (from the raw markup)
 * and two flags for contact forms and contact call-to-action phrases.
 */
export function extractContactSignalsFromHtmlLikeText(input: string) {
  const visibleText = stripHtml(input);
  const lowered = input.toLowerCase();

  const contactFormPattern = /kontaktformular|anfrageformular|contact form|<form\b/i;
  const contactCtaPattern =
    /kontaktformular|anfrageformular|anfrage\s*senden|anfrage\s*stellen|schreiben\s+sie\s+uns|kontaktieren\s+(?:sie|du)|kontakt\s+(?:und|mit|zu)|<form\b/i;

  return {
    visibleText,
    phoneNumbers: makePhoneNumberSet(visibleText),
    emailCandidates: makeEmailCandidates(input),
    hasContactFormSignal: contactFormPattern.test(lowered),
    hasContactCtaSignal: contactCtaPattern.test(lowered),
  };
}
|
|
|
|
/**
 * Tells whether a path or a piece of text points at a contact-like page.
 * The value is lower-cased first; it matches when a relevant keyword starts
 * a path segment, or when the word `kontakt` or `kontaktformular` occurs on
 * its own.
 */
function isRelevantContactPathText(value: string) {
  const lowered = value.toLowerCase();

  const patterns = [
    /(?:^|\/)(kontakt|contact)(?:[-/]|$)/,
    /(?:^|\/)(impressum|imprint)(?:[-/]|$)/,
    /(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/,
    /(?:^|\/)(ueber|über|about|team)(?:[-/]|$)/,
    /\bkontakt\b/,
    /\bkontaktformular\b/,
  ];

  return patterns.some((pattern) => pattern.test(lowered));
}
|
|
|
|
/**
 * Derives the technical check results for a crawled page.
 *
 * @param input Raw crawl facts; every field is optional.
 * @returns
 * - `finalUrl`: the normalised final URL, falling back to the normalised
 *   root URL and then to an empty string.
 * - `https`: whether that URL starts with `https://`.
 * - `missingTitle` / `missingMetaDescription`: whether the trimmed value is
 *   empty.
 * - `hasVisibleContactPath`: whether the visible text, the final URL's path
 *   or its query string matches `isRelevantContactPathText`.
 * - `brokenInternalLinks`: canonical URLs of same-host links flagged as
 *   broken (`isBroken === true`, or a status that is `>= 400` or `<= 0`).
 *   When `checkedUrls` contains usable same-host URLs, only links among
 *   them are considered. Bare string links are never reported as broken.
 */
export function buildTechnicalChecks(input: TechnicalChecksInput): WebsiteTechnicalChecks {
  const finalUrl =
    normalizeCrawlUrl(input.finalUrl ?? "", input.rootUrl ?? undefined) ??
    normalizeCrawlUrl(input.rootUrl ?? "", undefined) ??
    "";
  const normalizedRoot =
    normalizeCrawlUrl(input.rootUrl ?? finalUrl, undefined) ?? finalUrl;

  const title = input.title?.trim() ?? "";
  const metaDescription = input.metaDescription?.trim() ?? "";
  const visibleText = input.visibleText ?? "";

  // Only the path and the query string are inspected. Matching the whole URL
  // would let host names such as "about-you.de" count as a contact path.
  const parsedFinalUrl = new URL(finalUrl || "https://localhost");
  let finalPath = parsedFinalUrl.pathname;
  try {
    // `URL.pathname` is percent-encoded; decode it so "über" can match.
    finalPath = decodeURIComponent(finalPath);
  } catch {
    // Malformed escape sequence: match against the encoded path instead.
  }
  const hasVisibleContactPath =
    isRelevantContactPathText(visibleText) ||
    isRelevantContactPathText(finalPath) ||
    isRelevantContactPathText(parsedFinalUrl.search);

  // Resolves an href to its canonical same-host URL, or null if it is
  // unusable or points to another host.
  const toInternalCanonical = (href: string): string | null => {
    const normalized = normalizeCrawlUrl(href, normalizedRoot);
    if (!normalized || !isSameRegistrableHostishDomain(normalized, normalizedRoot)) {
      return null;
    }
    return normalizeForQueue(normalized);
  };

  const checkedUrlSet = new Set<string>();
  for (const checkedUrl of input.checkedUrls ?? []) {
    const canonicalCheckedUrl = toInternalCanonical(checkedUrl);
    if (canonicalCheckedUrl) {
      checkedUrlSet.add(canonicalCheckedUrl);
    }
  }
  const hasCheckedUrls = checkedUrlSet.size > 0;

  const brokenInternalLinksSet = new Set<string>();
  for (const entry of input.links ?? []) {
    const href = typeof entry === "string" ? entry : (entry.href ?? "");
    const canonical = toInternalCanonical(href);
    if (!canonical) {
      continue;
    }
    if (hasCheckedUrls && !checkedUrlSet.has(canonical)) {
      continue;
    }
    if (typeof entry === "string") {
      // A bare href carries no check result.
      continue;
    }

    const status = entry.status ?? entry.statusCode;
    const hasFailureStatus = typeof status === "number" && (status >= 400 || status <= 0);
    if (entry.isBroken === true || hasFailureStatus) {
      brokenInternalLinksSet.add(canonical);
    }
  }

  return {
    https: finalUrl.startsWith("https://"),
    finalUrl,
    missingTitle: title.length === 0,
    missingMetaDescription: metaDescription.length === 0,
    hasVisibleContactPath,
    brokenInternalLinks: Array.from(brokenInternalLinksSet),
  };
}
|