import { normalizeEmailAddress } from "./lead-discovery-google";

// NOTE(review): the reviewed copy of this file had been run through an HTML
// tag stripper. Generic arguments, named capture groups, `<script>`/`<style>`
// patterns and HTML entities were removed, and the code between the
// contact-form regex in `extractContactSignalsFromHtmlLikeText` and the first
// `new Set<string>()` of the technical-checks function was lost entirely.
// Every reconstructed spot carries its own NOTE(review); verify those against
// version control before merging.

const HTTP_SCHEMES = new Set(["http:", "https:"]);

/** Matches a `kontakt`/`contact` path segment. */
const CONTACT_PATH_PATTERN = /(?:^|\/)(kontakt|contact)(?:[-/]|$)/i;

/** Subpage path patterns in priority order (index 0 = highest priority). */
const RELEVANT_PATH_PATTERNS: readonly RegExp[] = [
  CONTACT_PATH_PATTERN,
  /(?:^|\/)(impressum|imprint)(?:[-/]|$)/i,
  /(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/i,
  /(?:^|\/)(ueber|über|team|about)(?:[-/]|$)/i,
];

const CONTACT_CONTEXT_KEYWORDS = [
  "ansprechpartner",
  "kontakt",
  "e-mail",
  "email",
  "team",
  "impressum",
  "geschäftsführung",
  "imprint",
  "footer",
  "anfrage",
];

const GENERIC_BUSINESS_LOCALS = new Set([
  "info",
  "kontakt",
  "contact",
  "office",
  "hello",
  "sales",
  "support",
  "service",
  "team",
  "post",
]);

// Building blocks of the obfuscated-email matcher.
const OBFUSCATED_LOCAL_PART = "[a-z0-9._%+-]{1,64}";
const OBFUSCATED_DOMAIN_LABEL = "[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?";
const OBFUSCATED_TLD = "[a-z]{2,}";
const STRICT_AT_SEPARATOR =
  "(?:@|\\[\\s*at\\s*\\]|\\(\\s*at\\s*\\)|\\{\\s*at\\s*\\})";
const LOOSE_AT_SEPARATOR = "\\bat\\b";
const AT_SEPARATOR = `(?:${STRICT_AT_SEPARATOR}|${LOOSE_AT_SEPARATOR})`;
const STRICT_DOT_SEPARATOR =
  "(?:\\.|\\[\\s*(?:dot|punkt)\\s*\\]|\\(\\s*(?:dot|punkt)\\s*\\)|\\{\\s*(?:dot|punkt)\\s*\\})";
const LOOSE_DOT_SEPARATOR = "\\b(?:dot|punkt)\\b";
const DOT_SEPARATOR = `(?:${STRICT_DOT_SEPARATOR}|${LOOSE_DOT_SEPARATOR})`;

/**
 * Matches plain and obfuscated addresses ("info [at] example (dot) de",
 * "info at example dot de"). The named groups `local`, `at` and `domain` are
 * read by `collectObfuscatedEmailCandidates`.
 * NOTE(review): the group names were stripped from the reviewed copy and are
 * restored from the `match.groups?.local/at/domain` reads.
 */
const OBFUSCATED_EMAIL_PATTERN = new RegExp(
  `\\b(?<local>${OBFUSCATED_LOCAL_PART})\\s*(?<at>${AT_SEPARATOR})\\s*(?<domain>${OBFUSCATED_DOMAIN_LABEL}(?:\\s*${DOT_SEPARATOR}\\s*${OBFUSCATED_DOMAIN_LABEL})*\\s*${DOT_SEPARATOR}\\s*${OBFUSCATED_TLD})\\b`,
  "gi",
);

export type WebsiteCrawlEmailCandidate = {
  email: string;
  emailSource: string | null;
  contactPerson: string | null;
  isBusinessContactAddress: boolean;
};

export type WebsiteCrawlContactSignals = {
  visibleText: string;
  phoneNumbers: string[];
  emailCandidates: WebsiteCrawlEmailCandidate[];
  hasContactFormSignal: boolean;
  hasContactCtaSignal: boolean;
};

export type TechnicalChecksInput = {
  rootUrl?: string | null;
  finalUrl?: string | null;
  title?: string | null;
  metaDescription?: string | null;
  visibleText?: string | null;
  checkedUrls?: string[];
  links?: Array<
    | string
    | {
        href?: string;
        status?: number;
        statusCode?: number;
        isBroken?: boolean;
      }
  >;
};

export type WebsiteTechnicalChecks = {
  https: boolean;
  finalUrl: string;
  missingTitle: boolean;
  missingMetaDescription: boolean;
  hasVisibleContactPath: boolean;
  brokenInternalLinks: string[];
};

/** Removes a leading `www.` label from a host name. */
function stripWww(host: string): string {
  return host.replace(/^www\./i, "");
}

/** Returns the lower-cased host of an absolute URL, or "" when it does not parse. */
function toLowerHost(value: string): string {
  try {
    return new URL(value).hostname.toLowerCase();
  } catch {
    return "";
  }
}

/**
 * Returns the lower-cased, percent-decoded pathname of an absolute URL.
 * `URL.pathname` is percent-encoded, so without decoding a pattern such as
 * `über` could never match. Falls back to the raw pathname when the escapes
 * are malformed; returns null when the URL itself does not parse.
 */
function readDecodedLowerPath(url: string): string | null {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return null;
  }
  try {
    return decodeURIComponent(pathname).toLowerCase();
  } catch {
    return pathname.toLowerCase();
  }
}

/**
 * Normalizes a crawl URL to `scheme//host[:port]path[?query]`.
 *
 * The host is lower-cased and stripped of a leading `www.`; the fragment is
 * dropped. Only http(s) URLs are accepted.
 *
 * @param input Raw URL or href; relative values require `base`.
 * @param base Absolute URL used to resolve relative input.
 * @returns The normalized URL, or null when the input is empty, relative
 *   without a base, unparsable, or not http(s).
 */
export function normalizeCrawlUrl(
  input?: string | null,
  base?: string,
): string | null {
  if (!input) {
    return null;
  }
  const trimmed = input.trim();
  if (!trimmed) {
    return null;
  }
  // Without a base only fully qualified URLs can be resolved.
  if (!base && (trimmed.startsWith("//") || !trimmed.includes("://"))) {
    return null;
  }

  let parsed: URL;
  try {
    // An empty-string base is treated like "no base" so that absolute input
    // still parses instead of failing on the invalid base.
    parsed = new URL(trimmed, base || undefined);
  } catch {
    // `return` and `null` must stay on one line: a line break between them
    // triggers automatic semicolon insertion and returns undefined.
    return null;
  }

  if (!HTTP_SCHEMES.has(parsed.protocol)) {
    return null;
  }

  const normalizedHost = stripWww(parsed.hostname.toLowerCase());
  const port = parsed.port ? `:${parsed.port}` : "";
  const path = parsed.pathname || "/";
  return `${parsed.protocol}//${normalizedHost}${port}${path}${parsed.search}`;
}

/**
 * Checks whether `candidateUrl` (absolute, or relative to `rootUrl`) has the
 * same host as `rootUrl`, ignoring case and a leading `www.`. Subdomains other
 * than `www` count as different hosts.
 */
export function isSameRegistrableHostishDomain(
  candidateUrl: string,
  rootUrl: string,
): boolean {
  const root = normalizeCrawlUrl(rootUrl) ?? undefined;
  const candidate = normalizeCrawlUrl(candidateUrl, root);
  if (!candidate || !root) {
    return false;
  }
  const candidateHost = stripWww(toLowerHost(candidate));
  const rootHost = stripWww(toLowerHost(root));
  return candidateHost === rootHost && candidateHost.length > 0;
}

/**
 * Canonical form used for de-duplication: scheme, host (without `www.`),
 * optional port and the path without its trailing slash. Query and fragment
 * are dropped.
 */
function normalizeForQueue(value: string | null): string | null {
  if (!value) {
    return null;
  }
  let url: URL;
  try {
    url = new URL(value);
  } catch {
    return null;
  }
  const host = `${stripWww(url.hostname.toLowerCase())}${url.port ? `:${url.port}` : ""}`;
  return `${url.protocol}//${host}${url.pathname.replace(/\/$/, "") || "/"}`;
}

/**
 * Selects the pages worth crawling: the homepage plus at most one same-host
 * link per entry of `RELEVANT_PATH_PATTERNS`, in pattern priority order.
 *
 * @param links Hrefs found on the site (absolute or relative to `rootUrl`).
 * @param rootUrl Absolute URL of the site.
 * @returns Up to five canonical URLs, homepage first; empty when `rootUrl` is
 *   not a valid http(s) URL.
 */
export function discoverRelevantSubpageUrls(
  links: string[],
  rootUrl: string,
): string[] {
  const root = normalizeCrawlUrl(rootUrl);
  if (!root) {
    return [];
  }
  const parsedRoot = new URL(root);
  const homepage = `${parsedRoot.protocol}//${stripWww(
    parsedRoot.hostname.toLowerCase(),
  )}${parsedRoot.port ? `:${parsedRoot.port}` : ""}/`;

  const seen = new Set([homepage]);
  // One slot per pattern; the first link matching a pattern wins its slot.
  const picks: Array<string | undefined> = RELEVANT_PATH_PATTERNS.map(
    () => undefined,
  );

  for (const link of links) {
    const normalized = normalizeCrawlUrl(link, root);
    if (!normalized || !isSameRegistrableHostishDomain(normalized, root)) {
      continue;
    }
    const canonical = normalizeForQueue(normalized);
    if (!canonical || seen.has(canonical)) {
      continue;
    }
    const path = readDecodedLowerPath(normalized);
    if (path === null) {
      continue;
    }
    // Only the first matching pattern counts; a link whose slot is already
    // taken is dropped rather than tried against lower-priority patterns.
    const priority = RELEVANT_PATH_PATTERNS.findIndex((pattern) =>
      pattern.test(path),
    );
    if (priority < 0 || picks[priority] !== undefined) {
      continue;
    }
    picks[priority] = canonical;
    seen.add(canonical);
  }

  const relevant = picks.filter(
    (url): url is string => url !== undefined,
  );
  return [homepage, ...relevant].slice(0, 5);
}

/**
 * Reduces HTML-like input to whitespace-collapsed visible text.
 * NOTE(review): the two script/style patterns were empty in the reviewed copy
 * and are reconstructed.
 */
function stripHtml(input: string): string {
  return input
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/<[^>]*>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/** Removes tags without inserting spaces, then collapses whitespace. */
function stripLeadingToText(input: string): string {
  return input.replace(/<[^>]*>/g, "").replace(/\s+/g, " ").trim();
}

/**
 * Decodes the HTML entities commonly used to hide addresses (non-breaking
 * space, "@" and ".").
 * NOTE(review): the entity spellings were decoded away in the reviewed copy
 * (leaving e.g. `/.|.|./`, which would replace every character); the three
 * alternatives per pattern are reconstructed.
 */
function decodeCommonEmailEntities(input: string): string {
  return input
    .replace(/&nbsp;|&#160;|&#xa0;/gi, " ")
    .replace(/&#64;|&#x40;|&commat;/gi, "@")
    .replace(/&#46;|&#x2e;|&period;/gi, ".");
}

/**
 * Prepares raw markup for email matching: decodes entities, removes two kinds
 * of non-content blocks and collapses whitespace. Tags are kept because
 * `getContactPersonForEmail` looks for anchors in this text.
 * NOTE(review): both block patterns were empty in the reviewed copy; script
 * and style are assumed, mirroring `stripHtml`.
 */
function normalizeEmailExtractionInput(input: string): string {
  return decodeCommonEmailEntities(input)
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/** Extracts the address from a `mailto:` href, dropping the query and decoding escapes. */
function normalizeMailtoAddress(value: string): string {
  const strippedQuery = value.split("?")[0] ?? "";
  const withoutMailto = strippedQuery.replace(/^mailto:/i, "");
  try {
    return decodeURIComponent(withoutMailto).trim();
  } catch {
    // Malformed percent escapes: keep the raw value.
    return withoutMailto.trim();
  }
}

/**
 * Converts an obfuscated address ("info [at] example (punkt) de",
 * "info at example dot de") into plain `local@domain` text.
 */
function denormalizeObfuscatedEmail(value: string): string {
  let withAt = value.replace(
    /\[\s*at\s*\]|\(\s*at\s*\)|\{\s*at\s*\}/gi,
    "@",
  );
  if (!withAt.includes("@")) {
    // Bare "at" is only a separator when it stands alone between whitespace.
    withAt = withAt.replace(/\s+at\s+/i, "@");
  }

  const atIndex = withAt.indexOf("@");
  if (atIndex < 0) {
    return withAt.replace(/\s+/g, "");
  }

  const local = withAt.slice(0, atIndex).replace(/\s+/g, "");
  const domain = withAt
    .slice(atIndex + 1)
    // Bracketed separators must be resolved before bare words, otherwise
    // "[dot]" degrades to "[.]".
    .replace(
      /\[\s*(?:dot|punkt)\s*\]|\(\s*(?:dot|punkt)\s*\)|\{\s*(?:dot|punkt)\s*\}/gi,
      ".",
    )
    // Whitespace-delimited only, so a real label named "dot" stays intact.
    .replace(/\s+(?:dot|punkt)\s+/gi, ".")
    .replace(/\s*\.\s*/g, ".")
    .replace(/\s+/g, "");
  return `${local}@${domain}`;
}

/**
 * Appends a candidate for `email` unless it is invalid or already seen.
 *
 * @param entries Result list, mutated in place.
 * @param seen Normalized addresses already collected, mutated in place.
 * @param email Raw address text as matched.
 * @param source Text the match came from.
 * @param index Offset of the match within `source`.
 * @param length Length of the match.
 * @param explicitPersons Names taken from mailto anchor labels, keyed by
 *   normalized address.
 */
function addEmailCandidate(
  entries: WebsiteCrawlEmailCandidate[],
  seen: Set<string>,
  email: string,
  source: string,
  index: number,
  length: number,
  explicitPersons: Map<string, string>,
): void {
  const normalized = normalizeEmailAddress(email);
  if (!normalized || seen.has(normalized)) {
    return;
  }
  const businessContext = hasBusinessContactContext(source, index, length);
  const explicitPerson =
    explicitPersons.get(normalized) ??
    getContactPersonForEmail(source, email, index);
  entries.push({
    email: normalized,
    emailSource: null,
    contactPerson: explicitPerson,
    isBusinessContactAddress: businessContext,
  });
  seen.add(normalized);
}

/**
 * Finds addresses written in obfuscated form. Matches that rely on a bare
 * "at"/"dot"/"punkt" word are kept only when the local part is a generic
 * business mailbox or contact keywords appear nearby.
 */
function collectObfuscatedEmailCandidates(
  source: string,
  explicitPersons: Map<string, string>,
): WebsiteCrawlEmailCandidate[] {
  const normalizedSource = normalizeEmailExtractionInput(source);
  const candidates: WebsiteCrawlEmailCandidate[] = [];
  const seen = new Set<string>();

  for (const match of normalizedSource.matchAll(OBFUSCATED_EMAIL_PATTERN)) {
    const rawCandidate = match[0];
    const matchIndex = match.index ?? -1;
    if (!rawCandidate || matchIndex < 0) {
      continue;
    }

    const localPartMatch = match.groups?.local ?? "";
    const atSeparatorMatch = match.groups?.at ?? "";
    const domainPartMatch = match.groups?.domain ?? "";

    const isBareAt =
      /\bat\b/i.test(atSeparatorMatch) && !/@|\[|\(|\{/.test(atSeparatorMatch);
    // This also fires for bracketed forms such as "[dot]", because the word
    // boundaries match inside the brackets.
    const hasBareDot = /\b(?:dot|punkt)\b/i.test(domainPartMatch);
    const businessContext = hasBusinessContactContext(
      normalizedSource,
      matchIndex,
      rawCandidate.length,
    );

    if (
      (isBareAt || hasBareDot) &&
      !GENERIC_BUSINESS_LOCALS.has(localPartMatch.toLowerCase()) &&
      !businessContext
    ) {
      continue;
    }

    const normalizedEmail = normalizeEmailAddress(
      denormalizeObfuscatedEmail(rawCandidate),
    );
    if (!normalizedEmail || seen.has(normalizedEmail)) {
      continue;
    }

    const explicitPerson =
      explicitPersons.get(normalizedEmail) ??
      getContactPersonForEmail(normalizedSource, rawCandidate, matchIndex);
    candidates.push({
      email: normalizedEmail,
      emailSource: null,
      contactPerson: explicitPerson,
      isBusinessContactAddress: businessContext,
    });
    seen.add(normalizedEmail);
  }

  return candidates;
}

/**
 * Guesses the person an address belongs to, trying in order: the text of the
 * nearest anchor before the address, a capitalized two- or three-word name at
 * the end of the ±120-character window, and a capitalized two-word name
 * directly before the address.
 *
 * @param text Text the address was found in.
 * @param email The matched address text.
 * @param index Offset of the address within `text`.
 * @returns The guessed name, or null.
 */
function getContactPersonForEmail(
  text: string,
  email: string,
  index: number,
): string | null {
  const windowStart = Math.max(0, index - 120);
  const windowEnd = Math.min(text.length, index + email.length + 120);
  const context = text.slice(windowStart, windowEnd);
  const beforeEmailContext = context.slice(0, index - windowStart);

  // NOTE(review): the opening `<a[^>` of this pattern was stripped in the
  // reviewed copy and is reconstructed.
  const anchorMatches = Array.from(
    beforeEmailContext.matchAll(/<a[^>]*>(.*?)<\/a>/gi),
  );
  const nearestAnchor = anchorMatches[anchorMatches.length - 1];
  if (nearestAnchor?.[1]) {
    const anchorText = stripLeadingToText(nearestAnchor[1]).trim();
    if (anchorText && !/@/.test(anchorText) && anchorText.length < 120) {
      return anchorText;
    }
  }

  // NOTE(review): this pattern is anchored to the end of the whole window, so
  // it only sees a name that ends where the window ends (120 characters after
  // the address, or at the end of the text). Possibly meant for
  // `beforeEmailContext`; left unchanged.
  const nearMatch = context.match(
    /(?:(?:^|[>\s])([A-ZÄÖÜ][a-zäöüßÄÖÜ]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+(?:\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)?))$/u,
  );
  if (nearMatch?.[1]) {
    return stripLeadingToText(nearMatch[1]).trim();
  }

  const directMatch = text
    .slice(0, index)
    .match(
      /([A-ZÄÖÜ][a-zäöüßÄÖÜ-]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)\s*(?:,|\s+\()?\s*$/u,
    );
  return directMatch?.[1]?.trim() ?? null;
}

/** True when a contact keyword occurs within 140 characters of the match. */
function hasBusinessContactContext(
  text: string,
  index: number,
  length: number,
): boolean {
  const context = text
    .slice(Math.max(0, index - 140), Math.min(text.length, index + length + 140))
    .toLowerCase();
  return CONTACT_CONTEXT_KEYWORDS.some((keyword) => context.includes(keyword));
}

/**
 * Collects phone-number-like strings. Each hit contributes both its trimmed
 * raw form and a form reduced to digits and "+".
 */
function makePhoneNumberSet(input: string): string[] {
  const phoneRegex = /(?:\+?\d[\d\s./()-]{7,}\d)/g;
  const values = new Set<string>();
  for (const match of input.matchAll(phoneRegex)) {
    const raw = match[0] ?? "";
    const normalized = raw.replace(/[^\d+]/g, "");
    if (normalized.length >= 7) {
      values.add(raw.trim());
      values.add(normalized);
    }
  }
  return Array.from(values).filter((value) => value.length >= 7);
}

/**
 * Collects email candidates from three sources, in order of trust: `mailto:`
 * anchors, plain addresses in the text, and obfuscated addresses. Each
 * normalized address is reported once, by the first source that finds it.
 */
function makeEmailCandidates(input: string): WebsiteCrawlEmailCandidate[] {
  const emailRegex = /[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}(?:\b)?/gi;
  const mailtoAnchors = input.matchAll(
    /href=["']mailto:([^"'>\s]+)["'][^>]*>(.*?)<\/a>/gi,
  );
  const normalizedInput = normalizeEmailExtractionInput(input);
  const explicitPersons = new Map<string, string>();
  const entries: WebsiteCrawlEmailCandidate[] = [];
  const seen = new Set<string>();

  for (const anchorMatch of mailtoAnchors) {
    const rawHref = normalizeMailtoAddress(anchorMatch[1] ?? "");
    const email = normalizeEmailAddress(rawHref);
    if (!email) {
      continue;
    }

    const label = stripLeadingToText(
      decodeCommonEmailEntities(anchorMatch[2] ?? ""),
    ).trim();
    const normalizedLabelEmail = normalizeEmailAddress(label);
    // A short label without "@" is remembered as the name for this address
    // so later matches of the same address can reuse it.
    if (label && label.length <= 64 && !label.includes("@")) {
      explicitPersons.set(email, label);
    }
    if (seen.has(email)) {
      continue;
    }
    const anchorIndex = anchorMatch.index ?? -1;
    if (anchorIndex < 0) {
      continue;
    }

    // A label that merely repeats the address carries no name.
    const contactPerson =
      normalizedLabelEmail && normalizedLabelEmail === email
        ? null
        : label || null;
    entries.push({
      email,
      emailSource: null,
      contactPerson,
      isBusinessContactAddress: hasBusinessContactContext(
        input,
        anchorIndex,
        email.length,
      ),
    });
    seen.add(email);
  }

  for (const match of normalizedInput.matchAll(emailRegex)) {
    const rawEmail = match[0] ?? "";
    const idx = match.index ?? -1;
    if (rawEmail.length === 0 || idx < 0) {
      continue;
    }
    addEmailCandidate(
      entries,
      seen,
      rawEmail,
      normalizedInput,
      idx,
      rawEmail.length,
      explicitPersons,
    );
  }

  for (const candidate of collectObfuscatedEmailCandidates(
    input,
    explicitPersons,
  )) {
    if (seen.has(candidate.email)) {
      continue;
    }
    entries.push(candidate);
    seen.add(candidate.email);
  }

  return entries;
}

/**
 * Extracts contact signals (visible text, phone numbers, email candidates and
 * form/CTA hints) from HTML-like text.
 */
export function extractContactSignalsFromHtmlLikeText(
  input: string,
): WebsiteCrawlContactSignals {
  const visibleText = stripHtml(input);
  const phoneNumbers = makePhoneNumberSet(visibleText);
  const emailCandidates = makeEmailCandidates(input);
  const lowerInput = input.toLowerCase();

  // NOTE(review): the reviewed copy breaks off after `contact form|`; the
  // `<form` alternative is reconstructed.
  const hasContactFormSignal =
    /kontaktformular|anfrageformular|contact form|<form\b/.test(lowerInput);
  // NOTE(review): the original CTA keyword list was lost; this list is a
  // placeholder and must be replaced with the original.
  const hasContactCtaSignal =
    /kontakt aufnehmen|jetzt anfragen|termin vereinbaren|angebot anfordern|contact us|get in touch/.test(
      lowerInput,
    );

  return {
    visibleText,
    phoneNumbers,
    emailCandidates,
    hasContactFormSignal,
    hasContactCtaSignal,
  };
}

/**
 * Resolves `raw` against `root` and returns its queue-canonical form when it
 * is an http(s) URL on the same host as `root`; otherwise null.
 */
function toCanonicalInternalUrl(
  raw: string,
  root: string | null,
): string | null {
  if (root === null) {
    return null;
  }
  const normalized = normalizeCrawlUrl(raw, root);
  if (!normalized || !isSameRegistrableHostishDomain(normalized, root)) {
    return null;
  }
  return normalizeForQueue(normalized);
}

/**
 * Derives technical checks for a crawled site from already collected data.
 *
 * NOTE(review): the head of this function was lost in the reviewed copy. The
 * exported name and the derivation of `normalizedRoot`, `finalUrl`, `title`,
 * `metaDescription` and `hasVisibleContactPath` are reconstructions; only the
 * checked-URL and link loops and the return statement are original. Confirm
 * the name against callers before merging.
 */
export function buildTechnicalChecks(
  input: TechnicalChecksInput,
): WebsiteTechnicalChecks {
  const normalizedRoot =
    normalizeCrawlUrl(input.rootUrl) ?? normalizeCrawlUrl(input.finalUrl);
  const finalUrl = (input.finalUrl ?? input.rootUrl ?? "").trim();
  const title = (input.title ?? "").trim();
  const metaDescription = (input.metaDescription ?? "").trim();
  const visibleText = input.visibleText ?? "";

  let hasContactLink = false;
  const noteContactPath = (canonicalUrl: string): void => {
    const path = readDecodedLowerPath(canonicalUrl);
    if (path !== null && CONTACT_PATH_PATTERN.test(path)) {
      hasContactLink = true;
    }
  };

  const checkedUrlSet = new Set<string>();
  for (const checkedUrl of input.checkedUrls ?? []) {
    const canonicalCheckedUrl = toCanonicalInternalUrl(
      checkedUrl,
      normalizedRoot,
    );
    if (canonicalCheckedUrl) {
      checkedUrlSet.add(canonicalCheckedUrl);
      noteContactPath(canonicalCheckedUrl);
    }
  }
  const hasCheckedUrls = checkedUrlSet.size > 0;

  const brokenInternalLinksSet = new Set<string>();
  for (const entry of input.links ?? []) {
    const href = typeof entry === "string" ? entry : (entry.href ?? "");
    const canonical = toCanonicalInternalUrl(href, normalizedRoot);
    if (!canonical) {
      continue;
    }
    noteContactPath(canonical);

    // When a checked-URL list exists, only links that were actually checked
    // can be reported as broken.
    if (hasCheckedUrls && !checkedUrlSet.has(canonical)) {
      continue;
    }
    if (typeof entry === "string") {
      continue; // plain hrefs carry no status information
    }
    const status = entry.status ?? entry.statusCode;
    const hasFailingStatus =
      typeof status === "number" && (status >= 400 || status <= 0);
    if (entry.isBroken === true || hasFailingStatus) {
      brokenInternalLinksSet.add(canonical);
    }
  }

  const hasVisibleContactPath =
    hasContactLink || /\b(?:kontakt|contact)\b/i.test(visibleText);

  return {
    https: finalUrl.startsWith("https://"),
    finalUrl,
    missingTitle: title.length === 0,
    missingMetaDescription: metaDescription.length === 0,
    hasVisibleContactPath,
    brokenInternalLinks: Array.from(brokenInternalLinksSet),
  };
}