feat: add website enrichment crawler
This commit is contained in:
@@ -21,6 +21,21 @@ type LeadDiscoveryContactInput = {
|
||||
usableEmail?: string | null;
|
||||
};
|
||||
|
||||
/**
 * Contact status labels a discovered lead can carry.
 *
 * `"missing_contact"` is the only value that makes
 * `shouldScheduleWebsiteEnrichment` consider a lead for website enrichment.
 */
export type LeadDiscoveryContactStatus =
  | "new"
  | "missing_contact"
  | "audit_ready"
  | "outreach_ready"
  | "contacted"
  | "replied"
  | "do_not_contact";
|
||||
|
||||
/**
 * Input for the website-enrichment scheduling decision: whatever website data
 * is known for the lead (URL and/or domain, either may be absent, null or
 * blank) together with the lead's current contact status.
 */
type WebsiteEnrichmentScheduleInput = {
  websiteUrl?: string | null;
  websiteDomain?: string | null;
  contactStatus: LeadDiscoveryContactStatus;
};
|
||||
|
||||
/** Priority labels that can be assigned to a discovered lead. */
export type LeadDiscoveryPriority =
  | "high"
  | "medium"
  | "low"
  | "defer"
  | "blocked";
|
||||
|
||||
type LeadDiscoveryPriorityInput = {
|
||||
@@ -39,7 +54,7 @@ type LeadDiscoveryLeadRecordInput<TCampaignId extends string, TRunId extends str
|
||||
now: number;
|
||||
};
|
||||
|
||||
/**
 * Maps "no usable value" (null, undefined, empty or whitespace-only) to
 * `undefined`.
 *
 * @param value - Candidate string; may be null or undefined.
 * @returns The original, untrimmed string when it contains at least one
 *   non-whitespace character, otherwise `undefined`.
 */
function optionalString(value: string | null | undefined): string | undefined {
  if (!value) {
    return undefined;
  }

  // Trimming is only used for the blank check; the caller gets the raw value.
  return value.trim().length > 0 ? value : undefined;
}
|
||||
|
||||
@@ -91,6 +106,16 @@ export function getLeadDiscoveryContactStatus(
|
||||
return "missing_contact";
|
||||
}
|
||||
|
||||
/**
 * Decides whether a lead qualifies for website enrichment.
 *
 * @param input - The lead's website URL/domain and its contact status.
 * @returns `true` only when the contact status is `"missing_contact"` and at
 *   least one of `websiteUrl` / `websiteDomain` is a non-blank string.
 */
export function shouldScheduleWebsiteEnrichment(
  input: WebsiteEnrichmentScheduleInput,
): boolean {
  if (input.contactStatus !== "missing_contact") {
    return false;
  }

  return (
    optionalString(input.websiteUrl) !== undefined ||
    optionalString(input.websiteDomain) !== undefined
  );
}
|
||||
|
||||
export function buildLeadDiscoveryLeadRecord<
|
||||
TCampaignId extends string,
|
||||
TRunId extends string,
|
||||
|
||||
605
lib/website-crawler.ts
Normal file
605
lib/website-crawler.ts
Normal file
@@ -0,0 +1,605 @@
|
||||
import { normalizeEmailAddress } from "./lead-discovery-google";
|
||||
|
||||
/** URL protocols (as reported by `URL.protocol`, including the colon) accepted by the crawler. */
const HTTP_SCHEMES: ReadonlySet<string> = new Set<string>(["http:", "https:"]);
|
||||
|
||||
/**
 * Path-segment patterns that mark a subpage as worth crawling.
 *
 * Order matters: the array index is used as the priority when subpages are
 * selected (contact first, then imprint, services, about/team). Each pattern
 * requires the keyword to start a path segment and to be followed by `-`, `/`
 * or the end of the string, so `/kontakt-formular` matches while
 * `/kontaktdaten` does not.
 */
const RELEVANT_PATH_PATTERNS: readonly RegExp[] = [
  /(?:^|\/)(kontakt|contact)(?:[-/]|$)/i,
  /(?:^|\/)(impressum|imprint)(?:[-/]|$)/i,
  /(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/i,
  /(?:^|\/)(ueber|über|team|about)(?:[-/]|$)/i,
];
|
||||
|
||||
/**
 * Lowercase keywords whose presence in the text surrounding an e-mail address
 * is treated as evidence that the address is a business contact address.
 * Matching is done by plain substring search on lowercased text.
 */
const CONTACT_CONTEXT_KEYWORDS: readonly string[] = [
  "ansprechpartner",
  "kontakt",
  "e-mail",
  "email",
  "team",
  "impressum",
  "geschäftsführung",
  "imprint",
  "footer",
  "anfrage",
];
|
||||
|
||||
/**
 * Lowercase e-mail local parts (the part before `@`) that are generic business
 * mailboxes. Loosely obfuscated addresses ("info at example dot de") with one
 * of these local parts are accepted even without surrounding contact context.
 */
const GENERIC_BUSINESS_LOCALS: ReadonlySet<string> = new Set<string>([
  "info",
  "kontakt",
  "contact",
  "office",
  "hello",
  "sales",
  "support",
  "service",
  "team",
  "post",
]);
|
||||
|
||||
/**
 * One e-mail address found while scanning a page.
 */
export type WebsiteCrawlEmailCandidate = {
  /** Normalized e-mail address. */
  email: string;
  /** Where the address was found; the extractors in this file always set `null`. */
  emailSource: string | null;
  /** Name or label associated with the address, when one could be derived. */
  contactPerson: string | null;
  /** Whether contact-related keywords were found near the address. */
  isBusinessContactAddress: boolean;
};
|
||||
|
||||
/**
 * Contact-related signals extracted from one page's HTML-like text.
 */
export type WebsiteCrawlContactSignals = {
  /** Page text with script/style blocks and tags removed, whitespace collapsed. */
  visibleText: string;
  /** Phone-number-like strings found in the visible text. */
  phoneNumbers: string[];
  /** E-mail addresses found via mailto links, plain text and obfuscated forms. */
  emailCandidates: WebsiteCrawlEmailCandidate[];
  /** Whether the markup mentions or contains a contact/inquiry form. */
  hasContactFormSignal: boolean;
  /** Whether the markup contains a contact call-to-action phrase or a form. */
  hasContactCtaSignal: boolean;
};
|
||||
|
||||
/**
 * Raw crawl facts from which the technical checks are derived.
 * Every field is optional; missing values are treated as empty.
 */
export type TechnicalChecksInput = {
  /** URL the crawl started from. */
  rootUrl?: string | null;
  /** URL that was actually reached; may be relative to `rootUrl`. */
  finalUrl?: string | null;
  title?: string | null;
  metaDescription?: string | null;
  visibleText?: string | null;
  /** When non-empty, only links whose canonical URL is in this list are evaluated. */
  checkedUrls?: string[];
  /**
   * Links found on the page: either a bare href, or an object carrying the
   * href plus an HTTP status (`status` takes precedence over `statusCode`)
   * and/or an explicit `isBroken` flag.
   */
  links?: Array<
    | string
    | {
        href?: string;
        status?: number;
        statusCode?: number;
        isBroken?: boolean;
      }
  >;
};
|
||||
|
||||
/**
 * Result of the technical checks for one crawled website.
 */
export type WebsiteTechnicalChecks = {
  /** Whether the normalized final URL uses `https://`. */
  https: boolean;
  /** Normalized final URL, or an empty string when none could be determined. */
  finalUrl: string;
  /** `true` when the title is absent or blank. */
  missingTitle: boolean;
  /** `true` when the meta description is absent or blank. */
  missingMetaDescription: boolean;
  /** Whether the visible text or final URL points at a contact-like page. */
  hasVisibleContactPath: boolean;
  /** Canonical URLs of same-host links that were reported as broken. */
  brokenInternalLinks: string[];
};
|
||||
|
||||
/**
 * Removes a single leading `www.` (case-insensitive) from a host name.
 *
 * @param host - Host name, e.g. `www.example.de`.
 * @returns The host without the leading `www.`; unchanged if there is none.
 */
function stripWww(host: string): string {
  return host.replace(/^www\./i, "");
}
|
||||
|
||||
/**
 * Extracts the lowercased host name from an absolute URL.
 *
 * @param value - Absolute URL string.
 * @returns The lowercased host name, or an empty string when `value` cannot
 *   be parsed as a URL.
 */
function toLowerHost(value: string): string {
  try {
    return new URL(value).hostname.toLowerCase();
  } catch {
    // Unparsable input is reported as "no host" rather than thrown.
    return "";
  }
}
|
||||
|
||||
/**
 * Normalizes a link into an absolute http(s) URL suitable for crawling.
 *
 * The result has a lowercased host without a leading `www.`, keeps port, path
 * and query string, and drops the fragment and any credentials.
 *
 * @param input - Raw link; may be relative when `base` is given.
 * @param base - Absolute URL used to resolve relative links.
 * @returns The normalized URL, or `null` when the input is empty, cannot be
 *   parsed, is relative/protocol-relative without a base, or is not http(s).
 */
export function normalizeCrawlUrl(input?: string | null, base?: string): string | null {
  const trimmed = input?.trim();
  if (!trimmed) {
    return null;
  }

  // Without a base only fully qualified "scheme://..." inputs can be resolved.
  const needsBase = trimmed.startsWith("//") || !trimmed.includes("://");
  if (!base && needsBase) {
    return null;
  }

  let parsed: URL;
  try {
    parsed = new URL(trimmed, base);
  } catch {
    return null;
  }

  if (!HTTP_SCHEMES.has(parsed.protocol)) {
    return null;
  }

  const host = stripWww(parsed.hostname.toLowerCase());
  const port = parsed.port ? `:${parsed.port}` : "";
  const path = parsed.pathname || "/";

  return `${parsed.protocol}//${host}${port}${path}${parsed.search}`;
}
|
||||
|
||||
/**
 * Checks whether a link points at the same host as the crawl root, ignoring
 * case and a leading `www.`.
 *
 * This is an exact host comparison, not a registrable-domain (public suffix)
 * comparison: `shop.example.de` and `example.de` are different hosts.
 *
 * @param candidateUrl - Link to test; may be relative to `rootUrl`.
 * @param rootUrl - Absolute URL of the crawl root.
 * @returns `true` when both URLs normalize successfully and share a host.
 */
export function isSameRegistrableHostishDomain(
  candidateUrl: string,
  rootUrl: string,
): boolean {
  const root = normalizeCrawlUrl(rootUrl);
  if (!root) {
    return false;
  }

  const candidate = normalizeCrawlUrl(candidateUrl, root);
  if (!candidate) {
    return false;
  }

  const candidateHost = stripWww(toLowerHost(candidate));
  const rootHost = stripWww(toLowerHost(root));

  return candidateHost.length > 0 && candidateHost === rootHost;
}
|
||||
|
||||
/**
 * Builds the canonical key used to de-duplicate URLs.
 *
 * The key consists of protocol, lowercased host without `www.`, optional port
 * and the path without one trailing slash (the root path stays `/`). Query
 * string and fragment are not part of the key.
 *
 * @param value - Absolute URL, or `null`.
 * @returns The canonical key, or `null` when the value is empty or unparsable.
 */
function normalizeForQueue(value: string | null): string | null {
  if (!value) {
    return null;
  }

  let url: URL;
  try {
    url = new URL(value);
  } catch {
    return null;
  }

  const port = url.port ? `:${url.port}` : "";
  const host = `${stripWww(url.hostname.toLowerCase())}${port}`;
  const path = url.pathname.replace(/\/$/, "") || "/";

  return `${url.protocol}//${host}${path}`;
}
|
||||
|
||||
/**
 * Selects the pages worth crawling for a website: the homepage plus at most
 * one same-host subpage per entry of `RELEVANT_PATH_PATTERNS`.
 *
 * For every pattern the first matching link (in input order) wins; a link is
 * only ever assigned to the first pattern it matches. The result is ordered
 * by pattern priority and capped at five URLs.
 *
 * @param links - Raw hrefs found on the site; may be relative to `rootUrl`.
 * @param rootUrl - Absolute URL of the crawl root.
 * @returns Canonical URLs starting with the homepage, or an empty array when
 *   `rootUrl` cannot be normalized.
 */
export function discoverRelevantSubpageUrls(links: string[], rootUrl: string): string[] {
  const root = normalizeCrawlUrl(rootUrl);
  if (!root) {
    return [];
  }

  const parsedRoot = new URL(root);
  const rootPort = parsedRoot.port ? `:${parsedRoot.port}` : "";
  const homepage = `${parsedRoot.protocol}//${stripWww(parsedRoot.hostname.toLowerCase())}${rootPort}/`;

  const seen = new Set<string>([homepage]);
  // One slot per pattern; the slot index is the priority.
  const bestPerPattern: Array<string | undefined> = RELEVANT_PATH_PATTERNS.map(() => undefined);

  for (const link of links) {
    const normalized = normalizeCrawlUrl(link, rootUrl);
    if (!normalized || !isSameRegistrableHostishDomain(normalized, rootUrl)) {
      continue;
    }

    const canonical = normalizeForQueue(normalized);
    if (!canonical || seen.has(canonical)) {
      continue;
    }

    let path: string;
    try {
      path = new URL(normalized).pathname;
    } catch {
      continue;
    }

    // URL.pathname is percent-encoded ("/über-uns" is reported as
    // "/%C3%BCber-uns"), so decode before matching; otherwise non-ASCII
    // alternatives in the patterns can never match.
    try {
      path = decodeURIComponent(path);
    } catch {
      // Malformed escape sequence: match against the encoded path instead.
    }
    path = path.toLowerCase();

    const priority = RELEVANT_PATH_PATTERNS.findIndex((pattern) => pattern.test(path));
    if (priority < 0 || bestPerPattern[priority] !== undefined) {
      continue;
    }

    bestPerPattern[priority] = canonical;
    seen.add(canonical);
  }

  const relevant = bestPerPattern.filter((url): url is string => url !== undefined);

  return [homepage, ...relevant].slice(0, 5);
}
|
||||
|
||||
/**
 * Reduces HTML-like markup to its visible text.
 *
 * Script and style elements are removed including their content, every
 * remaining tag is replaced by a space, and whitespace runs are collapsed.
 * Character references (e.g. the non-breaking-space entity) are not decoded.
 *
 * @param input - HTML or HTML-like text.
 * @returns The trimmed, single-spaced visible text.
 */
function stripHtml(input: string): string {
  return input
    // `\b` keeps look-alike tag names from being treated as script/style, and
    // `\s*` tolerates whitespace before the closing `>`.
    .replace(/<script\b[\s\S]*?<\/script\s*>/gi, " ")
    .replace(/<style\b[\s\S]*?<\/style\s*>/gi, " ")
    .replace(/<[^>]*>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
||||
|
||||
/**
 * Turns a small markup fragment (such as the inner HTML of a link) into text.
 *
 * Unlike `stripHtml`, tags are removed without inserting a space, so adjacent
 * inline elements are joined directly.
 *
 * @param input - Markup fragment.
 * @returns The fragment's text with whitespace collapsed and trimmed.
 */
function stripLeadingToText(input: string): string {
  const withoutTags = input.replace(/<[^>]*>/g, "");
  return withoutTags.replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Decodes the HTML character references commonly used to hide e-mail
 * addresses from scrapers: non-breaking space, "@" and ".".
 *
 * Named, decimal and hexadecimal forms are handled, case-insensitively and
 * with optional leading zeros in the numeric forms. All other text is left
 * untouched.
 *
 * The ampersand is written as `\x26` on purpose: if this source file is ever
 * passed through an HTML renderer, literal character references inside the
 * patterns would be decoded and the patterns would degrade to matching "@" or
 * any character.
 *
 * @param input - Text that may contain character references.
 * @returns The text with those references replaced by " ", "@" and ".".
 */
function decodeCommonEmailEntities(input: string): string {
  return input
    .replace(/\x26(?:nbsp|#0*160|#x0*a0);|\u00a0/gi, " ")
    .replace(/\x26(?:commat|#0*64|#x0*40);/gi, "@")
    .replace(/\x26(?:period|#0*46|#x0*2e);/gi, ".");
}
|
||||
|
||||
/**
 * Prepares raw markup for e-mail extraction.
 *
 * Character references are decoded, script and style elements are removed
 * with their content, and whitespace is collapsed. Other tags are kept so
 * that later steps can still look at surrounding links.
 *
 * @param input - Raw HTML-like text.
 * @returns The cleaned, single-spaced text.
 */
function normalizeEmailExtractionInput(input: string): string {
  const decoded = decodeCommonEmailEntities(input);
  const withoutCode = decoded
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ");

  return withoutCode.replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Extracts the address part of a `mailto:` href.
 *
 * Everything from the first `?` on is dropped, a leading `mailto:` prefix is
 * removed, and percent-escapes are decoded.
 *
 * @param value - The href value, with or without the `mailto:` prefix.
 * @returns The trimmed address part; when the value contains a malformed
 *   percent-escape it is returned undecoded.
 */
function normalizeMailtoAddress(value: string): string {
  const [beforeQuery = ""] = value.split("?");
  const address = beforeQuery.replace(/^mailto:/i, "");

  try {
    return decodeURIComponent(address).trim();
  } catch {
    // decodeURIComponent throws on malformed escapes; keep the raw text.
    return address.trim();
  }
}
|
||||
|
||||
/**
 * Converts an obfuscated e-mail spelling into a plain address.
 *
 * Handled forms, for both the "at" and the "dot"/"punkt" separators:
 * - bracketed: `[at]`, `(at)`, `{at}`, `[dot]`, `(punkt)`, ...
 * - bare words surrounded by whitespace: `info at example dot de`
 *
 * Bracketed forms are resolved before bare words; resolving bare words first
 * would turn `[dot]` into `[.]` and leave the brackets in the result. Bare
 * words must be whitespace-delimited so that labels which merely spell "dot"
 * or "at" (`info@dot.de`, the `.at` TLD) stay intact. A bare "at" is only
 * converted when no "@" is present yet, and only its first occurrence.
 *
 * @param value - The matched obfuscated address text.
 * @returns The address with separators replaced and all whitespace removed.
 *   The result is not validated.
 */
function denormalizeObfuscatedEmail(value: string): string {
  const bracketsResolved = value
    .replace(/\[\s*at\s*\]|\(\s*at\s*\)|\{\s*at\s*\}/gi, "@")
    .replace(
      /\[\s*(?:dot|punkt)\s*\]|\(\s*(?:dot|punkt)\s*\)|\{\s*(?:dot|punkt)\s*\}/gi,
      ".",
    );

  const withAt = bracketsResolved.includes("@")
    ? bracketsResolved
    : bracketsResolved.replace(/\s+at\s+/i, "@");

  return withAt
    .replace(/\s+(?:dot|punkt)\s+/gi, ".")
    .replace(/\s*@\s*/g, "@")
    .replace(/\s*\.\s*/g, ".")
    .replace(/\s+/g, "");
}
|
||||
|
||||
/**
 * Appends one e-mail match to `entries` unless it is invalid or already known.
 *
 * @param entries - Result list; mutated in place.
 * @param seen - Normalized addresses already collected; mutated in place.
 * @param email - Raw matched address text.
 * @param source - Text the match was found in; used for context lookups.
 * @param index - Offset of the match within `source`.
 * @param length - Length of the match within `source`.
 * @param explicitPersons - Contact names keyed by normalized address; takes
 *   precedence over a name derived from the surrounding text.
 */
function addEmailCandidate(
  entries: WebsiteCrawlEmailCandidate[],
  seen: Set<string>,
  email: string,
  source: string,
  index: number,
  length: number,
  explicitPersons: Map<string, string>,
): void {
  const normalized = normalizeEmailAddress(email);
  if (!normalized || seen.has(normalized)) {
    return;
  }

  const contactPerson =
    explicitPersons.get(normalized) ?? getContactPersonForEmail(source, email, index);

  entries.push({
    email: normalized,
    emailSource: null,
    contactPerson,
    isBusinessContactAddress: hasBusinessContactContext(source, index, length),
  });
  seen.add(normalized);
}
|
||||
|
||||
/**
 * Finds e-mail addresses written in obfuscated form, e.g.
 * `info [at] example [dot] de` or `info at example dot de`.
 *
 * Two strengths of separator are recognized: "strict" ones (`@`, `.`, or
 * at/dot/punkt wrapped in `[]`, `()` or `{}`) and "loose" ones (the bare words
 * `at`, `dot`, `punkt`). Because bare words also occur in ordinary prose, a
 * match that relies on a loose separator is only accepted when its local part
 * is a generic business mailbox or contact keywords appear nearby.
 *
 * @param source - Raw HTML-like text; it is normalized internally.
 * @param explicitPersons - Contact names keyed by normalized address; takes
 *   precedence over a name derived from the surrounding text.
 * @returns De-duplicated candidates in order of appearance.
 */
function collectObfuscatedEmailCandidates(
  source: string,
  explicitPersons: Map<string, string>,
): WebsiteCrawlEmailCandidate[] {
  const normalizedSource = normalizeEmailExtractionInput(source);

  const localPart = "[a-z0-9._%+-]{1,64}";
  const domainLabel = "[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?";
  const tld = "[a-z]{2,}";

  const strictAtSeparator =
    "(?:@|\\[\\s*at\\s*\\]|\\(\\s*at\\s*\\)|\\{\\s*at\\s*\\})";
  const looseAtSeparator = "\\bat\\b";
  const atSeparator = `(?:${strictAtSeparator}|${looseAtSeparator})`;

  const strictDotSeparator =
    "(?:\\.|\\[\\s*(?:dot|punkt)\\s*\\]|\\(\\s*(?:dot|punkt)\\s*\\)|\\{\\s*(?:dot|punkt)\\s*\\})";
  const looseDotSeparator = "\\b(?:dot|punkt)\\b";
  const dotSeparator = `(?:${strictDotSeparator}|${looseDotSeparator})`;

  const obfuscatedEmailRegex = new RegExp(
    `\\b(?<local>${localPart})\\s*(?<at>${atSeparator})\\s*(?<domain>${domainLabel}(?:\\s*${dotSeparator}\\s*${domainLabel})*\\s*${dotSeparator}\\s*${tld})\\b`,
    "gi",
  );

  const candidates: WebsiteCrawlEmailCandidate[] = [];
  const seen = new Set<string>();

  for (const match of normalizedSource.matchAll(obfuscatedEmailRegex)) {
    const rawCandidate = match[0];
    const matchIndex = match.index ?? -1;
    if (!rawCandidate || matchIndex < 0) {
      continue;
    }

    const local = match.groups?.local ?? "";
    const at = match.groups?.at ?? "";
    const domain = match.groups?.domain ?? "";

    const isBareAt = /\bat\b/i.test(at) && !/@|\[|\(|\{/.test(at);
    const hasBareDot = /\b(?:dot|punkt)\b/i.test(domain);

    // Evaluated once; used both for the loose-match filter and the result.
    const businessContext = hasBusinessContactContext(
      normalizedSource,
      matchIndex,
      rawCandidate.length,
    );

    // Loose matches need extra evidence that they really are an address.
    const usesLooseSeparator = isBareAt || hasBareDot;
    if (
      usesLooseSeparator &&
      !GENERIC_BUSINESS_LOCALS.has(local.toLowerCase()) &&
      !businessContext
    ) {
      continue;
    }

    const normalizedEmail = normalizeEmailAddress(denormalizeObfuscatedEmail(rawCandidate));
    if (!normalizedEmail || seen.has(normalizedEmail)) {
      continue;
    }

    const contactPerson =
      explicitPersons.get(normalizedEmail) ??
      getContactPersonForEmail(normalizedSource, rawCandidate, matchIndex);

    candidates.push({
      email: normalizedEmail,
      emailSource: null,
      contactPerson,
      isBusinessContactAddress: businessContext,
    });
    seen.add(normalizedEmail);
  }

  return candidates;
}
|
||||
|
||||
/**
 * Tries to derive a contact person's name from the text around an e-mail
 * match. Three heuristics are tried in order:
 *
 * 1. the text of the last complete `<a>` element within the 120 characters
 *    before the match, unless it contains "@" or is 120+ characters long;
 * 2. a capitalized two- or three-word name at the very end of the window
 *    spanning 120 characters before and after the match;
 * 3. a capitalized two-word name directly before the match, optionally
 *    followed by a comma or an opening parenthesis.
 *
 * @param text - Text the match was found in.
 * @param email - The matched address text; only its length is used.
 * @param index - Offset of the match within `text`.
 * @returns The derived name, or `null` when no heuristic applies.
 */
function getContactPersonForEmail(
  text: string,
  email: string,
  index: number,
): string | null {
  const windowStart = Math.max(0, index - 120);
  const windowEnd = Math.min(text.length, index + email.length + 120);
  const context = text.slice(windowStart, windowEnd);

  const beforeEmailContext = context.slice(0, index - windowStart);
  const anchorMatches = Array.from(
    beforeEmailContext.matchAll(/<a\b[^>]*>(.*?)<\/a>/gi),
  );
  const nearestAnchorInner = anchorMatches[anchorMatches.length - 1]?.[1];
  if (nearestAnchorInner) {
    const anchorText = stripLeadingToText(nearestAnchorInner).trim();
    if (anchorText && !/@/.test(anchorText) && anchorText.length < 120) {
      return anchorText;
    }
  }

  // NOTE(review): this pattern is anchored with `$` on the whole window, so it
  // only fires for a name ending exactly at the window end (up to 120
  // characters after the match, or at the end of the text), not for a name
  // adjacent to the address. Confirm whether `beforeEmailContext` was intended.
  const nearName = context.match(
    /(?:(?:^|[>\s])([A-ZÄÖÜ][a-zäöüßÄÖÜ]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+(?:\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)?))$/u,
  )?.[1];
  if (nearName) {
    return stripLeadingToText(nearName).trim();
  }

  const directMatch = text.slice(0, index).match(
    /([A-ZÄÖÜ][a-zäöüßÄÖÜ-]+\s+[A-ZÄÖÜ][a-zäöüßÄÖÜ-]+)\s*(?:,|\s+\()?\s*$/u,
  );

  return directMatch?.[1]?.trim() ?? null;
}
|
||||
|
||||
/**
 * Checks whether any contact keyword appears near a match.
 *
 * The inspected window spans 140 characters before the match through 140
 * characters after it, clamped to the text bounds. Matching is a
 * case-insensitive substring search, so markup and attribute values inside
 * the window count as well.
 *
 * @param text - Text the match was found in.
 * @param index - Offset of the match within `text`.
 * @param length - Length of the match.
 * @returns `true` when at least one entry of `CONTACT_CONTEXT_KEYWORDS` occurs
 *   in the window.
 */
function hasBusinessContactContext(text: string, index: number, length: number): boolean {
  const start = Math.max(0, index - 140);
  const end = Math.min(text.length, index + length + 140);
  const context = text.slice(start, end).toLowerCase();

  return CONTACT_CONTEXT_KEYWORDS.some((keyword) => context.includes(keyword));
}
|
||||
|
||||
/**
 * Collects phone-number-like strings from text.
 *
 * A match starts with an optional "+" and a digit, ends with a digit, and has
 * at least seven digits, spaces or `. / ( ) -` characters in between. Matches
 * with fewer than seven digits/"+" signs are discarded. Every accepted match
 * contributes two entries: the text as written and a compact form containing
 * only digits and "+".
 *
 * @param input - Visible page text.
 * @returns De-duplicated entries in order of first appearance.
 */
function makePhoneNumberSet(input: string): string[] {
  const phoneRegex = /(?:\+?\d[\d\s./()-]{7,}\d)/g;
  const values = new Set<string>();

  for (const match of input.matchAll(phoneRegex)) {
    const raw = match[0] ?? "";
    const compact = raw.replace(/[^\d+]/g, "");
    if (compact.length < 7) {
      continue;
    }

    // `raw` is never shorter than `compact`, so both entries satisfy the
    // minimum length and no further filtering is needed.
    values.add(raw.trim());
    values.add(compact);
  }

  return Array.from(values);
}
|
||||
|
||||
/**
 * Extracts all e-mail candidates from raw HTML-like text.
 *
 * Three passes run in order, and an address found by an earlier pass is not
 * added again by a later one:
 * 1. `mailto:` links, using the link text as the contact person;
 * 2. plain addresses in the normalized text;
 * 3. obfuscated spellings such as `info [at] example [dot] de`.
 *
 * A link text is accepted as a contact person only when it is non-empty, at
 * most 64 characters long and contains no "@". This keeps link texts that
 * show an address (the same one, a different one, or one embedded in a longer
 * phrase) from being reported as a person's name.
 *
 * @param input - Raw HTML-like text of one page.
 * @returns De-duplicated candidates in discovery order.
 */
function makeEmailCandidates(input: string): WebsiteCrawlEmailCandidate[] {
  const emailRegex = /[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}(?:\b)?/gi;
  const mailtoAnchors = input.matchAll(
    /href=["']mailto:([^"'>\s]+)["'][^>]*>(.*?)<\/a>/gi,
  );
  const normalizedInput = normalizeEmailExtractionInput(input);

  const explicitPersons = new Map<string, string>();
  const entries: WebsiteCrawlEmailCandidate[] = [];
  const seen = new Set<string>();

  for (const anchorMatch of mailtoAnchors) {
    const email = normalizeEmailAddress(normalizeMailtoAddress(anchorMatch[1] ?? ""));
    if (!email) {
      continue;
    }

    const label = stripLeadingToText(
      decodeCommonEmailEntities(anchorMatch[2] ?? ""),
    ).trim();
    const personLabel =
      label.length > 0 && label.length <= 64 && !label.includes("@") ? label : null;

    // Recorded even for repeated addresses so that a later, better-labelled
    // link can still supply the name for the text-based passes.
    if (personLabel) {
      explicitPersons.set(email, personLabel);
    }

    const anchorIndex = anchorMatch.index ?? -1;
    if (seen.has(email) || anchorIndex < 0) {
      continue;
    }

    entries.push({
      email,
      emailSource: null,
      contactPerson: personLabel,
      isBusinessContactAddress: hasBusinessContactContext(input, anchorIndex, email.length),
    });
    seen.add(email);
  }

  for (const match of normalizedInput.matchAll(emailRegex)) {
    const rawEmail = match[0] ?? "";
    const matchIndex = match.index ?? -1;
    if (rawEmail.length === 0 || matchIndex < 0) {
      continue;
    }

    addEmailCandidate(
      entries,
      seen,
      rawEmail,
      normalizedInput,
      matchIndex,
      rawEmail.length,
      explicitPersons,
    );
  }

  for (const candidate of collectObfuscatedEmailCandidates(input, explicitPersons)) {
    if (seen.has(candidate.email)) {
      continue;
    }

    entries.push(candidate);
    seen.add(candidate.email);
  }

  return entries;
}
|
||||
|
||||
/**
 * Extracts contact-related signals from one page's HTML-like text.
 *
 * Phone numbers are taken from the visible text only; e-mail candidates and
 * the form/call-to-action signals are derived from the raw input, so markup
 * such as `<form>` elements and `mailto:` links is taken into account.
 *
 * @param input - Raw HTML or HTML-like text of a page.
 * @returns Visible text, phone numbers, e-mail candidates and two flags:
 *   whether a contact form is mentioned or present, and whether a contact
 *   call-to-action phrase (or a form) is present.
 */
export function extractContactSignalsFromHtmlLikeText(
  input: string,
): WebsiteCrawlContactSignals {
  const visibleText = stripHtml(input);
  const lowerInput = input.toLowerCase();

  const formSignalPattern = /kontaktformular|anfrageformular|contact form|<form\b/i;
  const ctaSignalPattern =
    /kontaktformular|anfrageformular|anfrage\s*senden|anfrage\s*stellen|schreiben\s+sie\s+uns|kontaktieren\s+(?:sie|du)|kontakt\s+(?:und|mit|zu)|<form\b/i;

  return {
    visibleText,
    phoneNumbers: makePhoneNumberSet(visibleText),
    emailCandidates: makeEmailCandidates(input),
    hasContactFormSignal: formSignalPattern.test(lowerInput),
    hasContactCtaSignal: ctaSignalPattern.test(lowerInput),
  };
}
|
||||
|
||||
/**
 * Checks whether a URL, path or text points at a contact-relevant page.
 *
 * The value is lowercased and then tested for a path segment that starts
 * with one of the contact, imprint, services or about/team keywords
 * (followed by `-`, `/` or the end of the string), or for the standalone
 * words "kontakt" / "kontaktformular" anywhere in the value.
 *
 * NOTE(review): despite the name, services and about/team paths count as
 * matches too — confirm this is intended for the contact-path check.
 *
 * @param value - URL, path or free text.
 * @returns `true` when any of the patterns matches.
 */
function isRelevantContactPathText(value: string): boolean {
  const normalized = value.toLowerCase();
  const patterns = [
    /(?:^|\/)(kontakt|contact)(?:[-/]|$)/,
    /(?:^|\/)(impressum|imprint)(?:[-/]|$)/,
    /(?:^|\/)(leistungen|angebot|services?)(?:[-/]|$)/,
    /(?:^|\/)(ueber|über|about|team)(?:[-/]|$)/,
    /\bkontakt\b/,
    /\bkontaktformular\b/,
  ];

  return patterns.some((pattern) => pattern.test(normalized));
}
|
||||
|
||||
/**
 * Derives the technical checks for a crawled website from raw crawl facts.
 *
 * - The final URL is `finalUrl` resolved against `rootUrl`, falling back to
 *   the normalized `rootUrl`, then to an empty string.
 * - A link is reported as broken when it points at the root's host and either
 *   carries `isBroken: true` or a numeric status that is `>= 400` or `<= 0`.
 *   Bare string links carry no status and are never reported.
 * - When `checkedUrls` yields at least one same-host URL, links outside that
 *   set are ignored.
 *
 * @param input - Crawl facts; every field is optional.
 * @returns The derived checks; `brokenInternalLinks` holds de-duplicated
 *   canonical URLs in order of first appearance.
 */
export function buildTechnicalChecks(input: TechnicalChecksInput): WebsiteTechnicalChecks {
  const finalUrl =
    normalizeCrawlUrl(input.finalUrl ?? "", input.rootUrl ?? undefined) ??
    normalizeCrawlUrl(input.rootUrl ?? "", undefined) ??
    "";
  const normalizedRoot = normalizeCrawlUrl(input.rootUrl ?? finalUrl, undefined) ?? finalUrl;

  const title = input.title?.trim() ?? "";
  const metaDescription = input.metaDescription?.trim() ?? "";
  const visibleText = input.visibleText ?? "";

  // URL.pathname is percent-encoded ("/über-uns" is reported as
  // "/%C3%BCber-uns"); decode it so non-ASCII path keywords can match.
  let finalPath = new URL(finalUrl || "https://localhost").pathname;
  try {
    finalPath = decodeURIComponent(finalPath);
  } catch {
    // Malformed escape sequence: fall back to the encoded path.
  }

  const hasVisibleContactPath =
    isRelevantContactPathText(visibleText) ||
    isRelevantContactPathText(finalUrl) ||
    isRelevantContactPathText(finalPath);

  const checkedUrlSet = new Set<string>();
  for (const checkedUrl of input.checkedUrls ?? []) {
    const normalizedCheckedUrl = normalizeCrawlUrl(checkedUrl, normalizedRoot);
    if (
      !normalizedCheckedUrl ||
      !isSameRegistrableHostishDomain(normalizedCheckedUrl, normalizedRoot)
    ) {
      continue;
    }

    const canonicalCheckedUrl = normalizeForQueue(normalizedCheckedUrl);
    if (canonicalCheckedUrl) {
      checkedUrlSet.add(canonicalCheckedUrl);
    }
  }
  const hasCheckedUrls = checkedUrlSet.size > 0;

  const brokenInternalLinks = new Set<string>();
  for (const entry of input.links ?? []) {
    const href = typeof entry === "string" ? entry : (entry.href ?? "");
    const normalizedLink = normalizeCrawlUrl(href, normalizedRoot);
    if (!normalizedLink || !isSameRegistrableHostishDomain(normalizedLink, normalizedRoot)) {
      continue;
    }

    const canonical = normalizeForQueue(normalizedLink);
    if (!canonical || (hasCheckedUrls && !checkedUrlSet.has(canonical))) {
      continue;
    }

    if (typeof entry === "string") {
      continue;
    }

    const status = entry.status ?? entry.statusCode;
    const hasFailingStatus = typeof status === "number" && (status >= 400 || status <= 0);
    if (entry.isBroken === true || hasFailingStatus) {
      brokenInternalLinks.add(canonical);
    }
  }

  return {
    https: finalUrl.startsWith("https://"),
    finalUrl,
    missingTitle: title.length === 0,
    missingMetaDescription: metaDescription.length === 0,
    hasVisibleContactPath,
    brokenInternalLinks: Array.from(brokenInternalLinks),
  };
}
|
||||
Reference in New Issue
Block a user