feat: add website enrichment crawler

This commit is contained in:
2026-06-04 20:29:23 +02:00
parent ca42c8d5a6
commit 1f6e31c01c
25 changed files with 3539 additions and 56 deletions

View File

@@ -0,0 +1,291 @@
import assert from "node:assert/strict";
import test from "node:test";
import {
buildTechnicalChecks,
isSameRegistrableHostishDomain,
normalizeCrawlUrl,
discoverRelevantSubpageUrls,
extractContactSignalsFromHtmlLikeText,
} from "../lib/website-crawler";
import { getUsableContactEmailFromEntries } from "../lib/lead-discovery-google";
// Table of (input, base) pairs and the normalized URL each is expected to
// produce. The cases cover an absolute URL with mixed-case "WWW." host and a
// fragment, a relative link resolved against a base, and a mailto: link that
// is expected to be rejected with null.
test("normalizeCrawlUrl normalizes host and strips fragments while supporting relative links with base", () => {
  const cases = [
    {
      input: "https://WWW.Example.Com/path?x=1#kontakt",
      base: undefined,
      expected: "https://example.com/path?x=1",
    },
    {
      input: "/kontakt?lang=de#top",
      base: "https://www.example.de/start",
      expected: "https://example.de/kontakt?lang=de",
    },
    {
      input: "mailto:owner@example.de",
      base: "https://example.de",
      expected: null,
    },
  ];

  for (const { input, base, expected } of cases) {
    assert.strictEqual(normalizeCrawlUrl(input, base), expected);
  }
});
// Each row is [candidate URL, root URL, expected verdict]. The www/non-www and
// protocol-relative variants are expected to match; a "blog." subdomain is
// expected NOT to count as the same domain.
test("isSameRegistrableHostishDomain treats www domain variants as same domain", () => {
  const expectations = [
    ["https://www.example.de/kontakt", "http://example.de", true],
    ["//example.de/contact", "https://www.example.de", true],
    ["https://blog.example.de/kontakt", "https://example.de", false],
  ];

  for (const [candidateUrl, rootUrl, shouldMatch] of expectations) {
    assert.strictEqual(
      isSameRegistrableHostishDomain(candidateUrl, rootUrl),
      shouldMatch,
    );
  }
});
// Feeds a mix of cross-domain, mailto, relevant and irrelevant links and pins
// the exact ordered result: homepage first, then the selected subpages with
// their query strings removed. "/angebot", "/services" and "/irrelevant" are
// expected to be absent from the output.
test("discoverRelevantSubpageUrls keeps homepage first, prioritizes relevant categories, and is bounded", () => {
  const hrefs = [
    "https://other.example.com/kontakt",
    "mailto:kontakt@example.de",
    "https://example.de/leistungen?source=seo",
    "/kontakt",
    "/angebot",
    "/impressum?x=1",
    "/ueber-uns",
    "/services?foo=bar",
    "/irrelevant",
  ];
  const expectedOrder = [
    "https://example.de/",
    "https://example.de/kontakt",
    "https://example.de/impressum",
    "https://example.de/leistungen",
    "https://example.de/ueber-uns",
  ];

  const result = discoverRelevantSubpageUrls(hrefs, "https://www.example.de");

  assert.deepStrictEqual(result, expectedOrder);
});
// Three query-string variants of /kontakt are expected to collapse into a
// single entry, leaving room for /ueber-uns in the result.
test("discoverRelevantSubpageUrls deduplicates query variants before bounded selection", () => {
  const hrefs = [
    "https://example.de/kontakt?a=1",
    "/kontakt?a=2",
    "/kontakt?source=google",
    "https://example.de/ueber-uns?team=1",
  ];
  const expectedOrder = [
    "https://example.de/",
    "https://example.de/kontakt",
    "https://example.de/ueber-uns",
  ];

  const result = discoverRelevantSubpageUrls(hrefs, "https://www.example.de");

  assert.deepStrictEqual(result, expectedOrder);
});
// mailto:, tel:, javascript:, a "blog." subdomain and a protocol-relative
// foreign host are all expected to be dropped. Note the expected output keeps
// the original http scheme of the /leistungen link.
test("discoverRelevantSubpageUrls ignores cross-domain and non-navigational link schemes", () => {
  const hrefs = [
    "mailto:kontakt@example.de",
    "tel:+49 30 1234 567",
    "javascript:void(0)",
    "https://example.de/contact",
    "https://blog.example.de/impressum",
    "//other.de/team",
    "http://example.de/leistungen",
  ];
  const expectedOrder = [
    "https://example.de/",
    "https://example.de/contact",
    "http://example.de/leistungen",
  ];

  const result = discoverRelevantSubpageUrls(hrefs, "https://www.example.de/path");

  assert.deepStrictEqual(result, expectedOrder);
});
// The fixture contains a named mailto link (owner@) and a generic address
// (info@); the selection helper is expected to pick the generic one.
test("generic contact emails beat named emails when selected through TASK-7 rule helper", () => {
  const html =
    "<h1>Kontakt</h1><p>Schreiben Sie an <a href=\"mailto:owner@example.de\">Max Mustermann</a> oder info@example.de.</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const selected = getUsableContactEmailFromEntries(emailCandidates);

  assert.strictEqual(selected?.email, "info@example.de");
});
// A personal-looking address in ordinary prose is expected to yield no usable
// contact email, and the first extracted candidate is expected to be flagged
// as not being a business contact address.
test("named email without explicit business-contact context is not accepted by TASK-7 helper", () => {
  const html =
    "<p>Wir beantworten offene Fragen per max.mustermann@example.de und stehen Ihnen werktags zur Verfügung.</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const selected = getUsableContactEmailFromEntries(emailCandidates);

  assert.strictEqual(selected, null);
  assert.strictEqual(emailCandidates[0]?.isBusinessContactAddress, false);
});
// Regression fixture: an "Impressum" heading, a 320-character inline script,
// then several office addresses as mailto links and as plain text.
// NOTE(review): the script filler presumably exists to put distance between
// the heading and the addresses in the raw HTML — confirm against the crawler.
test("extractContactSignalsFromHtmlLikeText marks Bock Impressum mailto candidates as business contact", () => {
  const html =
    "<p>Impressum</p>" +
    "<script>" +
    "x".repeat(320) +
    "</script>" +
    "<p>E-Mail: <a href=\"mailto:chemnitz@bock-rechtsanwaelte.de\">chemnitz@bock-rechtsanwaelte.de</a> oder <a href=\"mailto:aue@bock-rechtsanwaelte.de\">aue@bock-rechtsanwaelte.de</a></p>" +
    "<p>Weitere E-Mail-Adressen: dresden@bock-rechtsanwaelte.de, mittweida@bock-rechtsanwaelte.de, meerane@bock-rechtsanwaelte.de</p>";

  const signals = extractContactSignalsFromHtmlLikeText(html);
  const usable = getUsableContactEmailFromEntries(signals.emailCandidates);

  // Only require that some usable address is selected; which office address
  // wins is deliberately not pinned by this test.
  assert.notStrictEqual(usable, null);

  // Every extracted candidate must be classified as a business contact.
  for (const candidate of signals.emailCandidates) {
    assert.strictEqual(candidate.isBusinessContactAddress, true);
  }
});
// When the visible link text of a mailto link is the address itself, the
// candidate's contactPerson is expected to stay null.
test("email-labeled mailto links should not populate contactPerson", () => {
  const html =
    "<p>Impressum - E-Mail: <a href=\"mailto:chemnitz@bock-rechtsanwaelte.de\">chemnitz@bock-rechtsanwaelte.de</a></p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const match = emailCandidates.find(
    ({ email }) => email === "chemnitz@bock-rechtsanwaelte.de",
  );

  assert.strictEqual(match?.contactPerson, null);
});
// A footer mailto link carrying "?subject=..." is expected to yield exactly
// one candidate whose address excludes the query part and which is flagged as
// a business contact.
test("extractContactSignalsFromHtmlLikeText parses mailto links with query parameters in contact context", () => {
  const html =
    '<footer><p><a href="mailto:info@example.de?subject=Anfrage">Jetzt schreiben</a></p></footer>';

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const [first] = emailCandidates;

  assert.strictEqual(emailCandidates.length, 1);
  assert.strictEqual(first?.email, "info@example.de");
  assert.strictEqual(first?.isBusinessContactAddress, true);
});
// Covers three obfuscation styles in visible text: "[at]", "(at) ... punkt",
// and an "@" padded with &nbsp; entities. Addresses are sorted before the
// comparison so extraction order does not matter.
test("extractContactSignalsFromHtmlLikeText parses common obfuscations in visible text", () => {
  const html =
    "<p>Sie erreichen uns unter info [at] example.de, kontakt (at) example punkt de oder office&nbsp;@&nbsp;example.de.</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const addresses = emailCandidates.map(({ email }) => email);
  addresses.sort();

  assert.deepStrictEqual(addresses, [
    "info@example.de",
    "kontakt@example.de",
    "office@example.de",
  ]);
});
// Plain English prose containing the bare words "at" and "dot" is expected to
// produce no email candidates.
test("does not infer obfuscated emails from normal prose with bare at/dot", () => {
  const html =
    "<p>We are at example dot de for a workshop in the city center.</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);

  assert.strictEqual(emailCandidates.length, 0);
});
// The same address appears in two mailto links (and once as link text); only
// a single candidate is expected.
test("deduplicates repeated mailto entries", () => {
  const html =
    "<p><a href=\"mailto:info@example.de\">info@example.de</a> and again <a href=\"mailto:info@example.de\">also</a></p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);

  assert.strictEqual(emailCandidates.length, 1);
});
// Two fixtures with a generic address in footer/Impressum context are each
// expected to yield that address as usable; a third fixture with a named
// address in plain prose is expected to yield null.
test("TASK-7 keeps generic contact emails in footer/impressum usable and rejects named emails without context", () => {
  const acceptedFixtures = [
    [
      "<footer>Impressum: info@example.de für allgemeine Anfragen.</footer>",
      "info@example.de",
    ],
    [
      "<p>Impressum der Firma office@example.de ist die Hauptadresse.</p>",
      "office@example.de",
    ],
  ];

  for (const [html, expectedEmail] of acceptedFixtures) {
    const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
    const selected = getUsableContactEmailFromEntries(emailCandidates);
    assert.strictEqual(selected?.email, expectedEmail);
  }

  const namedOnly = extractContactSignalsFromHtmlLikeText(
    "<p>Bitte wenden Sie sich an max.mustermann@example.de bei Fragen.</p>",
  );
  const rejected = getUsableContactEmailFromEntries(namedOnly.emailCandidates);
  assert.strictEqual(rejected, null);
});
// An "Ansprechpartner:" label followed by a linked name and a bare address is
// expected to produce a candidate carrying that name as contactPerson and
// flagged as a business contact.
test("extractContactSignalsFromHtmlLikeText captures contact-person from adjacent raw HTML context", () => {
  const html =
    "<p>Ansprechpartner: <a href=\"/team/max-mustermann\">Max Mustermann</a> max.mustermann@example.de</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const [first] = emailCandidates;

  assert.strictEqual(first?.email, "max.mustermann@example.de");
  assert.strictEqual(first?.contactPerson, "Max Mustermann");
  assert.strictEqual(first?.isBusinessContactAddress, true);
});
// Page snapshot with an http final URL, a whitespace-only title, an empty
// meta description, contact wording in the visible text, and a mix of plain
// string links and { href, statusCode } link objects. Only the failing link on
// example.de is expected in brokenInternalLinks; the failing
// partner.example.de link is expected to be left out.
test("technical checks detect protocol, missing metadata, contact path, and broken internal links", () => {
  const pageSnapshot = {
    rootUrl: "https://www.example.de",
    finalUrl: "http://example.de/firma",
    title: " ",
    metaDescription: "",
    visibleText: "Wir freuen uns, wenn Sie uns kontaktieren. Hier geht es zum Kontaktformular.",
    links: [
      "/kontakt",
      { href: "/impressum", statusCode: 200 },
      { href: "https://example.de/broken", statusCode: 404 },
      { href: "https://partner.example.de/team", statusCode: 500 },
    ],
  };

  const report = buildTechnicalChecks(pageSnapshot);

  assert.strictEqual(report.https, false);
  assert.strictEqual(report.finalUrl, "http://example.de/firma");
  assert.strictEqual(report.missingTitle, true);
  assert.strictEqual(report.missingMetaDescription, true);
  assert.strictEqual(report.hasVisibleContactPath, true);
  assert.deepStrictEqual(report.brokenInternalLinks, ["https://example.de/broken"]);
});
// Three links carry failing status codes, but only /broken-a is listed in
// checkedUrls, so it is the only one expected in brokenInternalLinks.
test("technical checks only report broken links that are in the crawl-bounded checked URL set", () => {
  const pageSnapshot = {
    rootUrl: "https://www.example.de",
    finalUrl: "https://example.de",
    links: [
      { href: "/kontakt", statusCode: 200 },
      { href: "/broken-a", statusCode: 404 },
      { href: "/broken-b", statusCode: 500 },
      { href: "/outside", statusCode: 404 },
    ],
    checkedUrls: ["https://example.de/kontakt", "https://example.de/broken-a"],
  };

  const { brokenInternalLinks } = buildTechnicalChecks(pageSnapshot);

  assert.deepStrictEqual(brokenInternalLinks, ["https://example.de/broken-a"]);
});
// Text containing only generic words ("senden", "Jetzt", ...) and no explicit
// contact wording is expected to raise neither the form nor the CTA signal.
test("contact signals require contact-context and do not fire on generic words alone", () => {
  const html =
    "<p>Bitte warten Sie einen Moment, wir senden Ihnen gleich Infos.</p><span>Jetzt ist alles bereit.</span>";

  const result = extractContactSignalsFromHtmlLikeText(html);

  assert.strictEqual(result.hasContactFormSignal, false);
  assert.strictEqual(result.hasContactCtaSignal, false);
});
// A "Kontaktformular" heading with a <form> is expected to raise both signals;
// an "Anfrage senden" phrase without a form is expected to raise only the CTA
// signal.
test("contact signals fire for explicit contact forms and Anfrage senden", () => {
  const withForm = extractContactSignalsFromHtmlLikeText(
    "<h1>Kontaktformular</h1><form><input name=\"name\"><button>Absenden</button></form>",
  );
  const withRequestPhrase = extractContactSignalsFromHtmlLikeText(
    "<p>Schreiben Sie uns eine Anfrage senden.</p>",
  );

  assert.strictEqual(withForm.hasContactFormSignal, true);
  assert.strictEqual(withForm.hasContactCtaSignal, true);

  assert.strictEqual(withRequestPhrase.hasContactFormSignal, false);
  assert.strictEqual(withRequestPhrase.hasContactCtaSignal, true);
});