feat: add website enrichment crawler
This commit is contained in:
291
tests/website-crawler.test.ts
Normal file
291
tests/website-crawler.test.ts
Normal file
@@ -0,0 +1,291 @@
|
||||
import assert from "node:assert/strict";
|
||||
import test from "node:test";
|
||||
|
||||
import {
|
||||
buildTechnicalChecks,
|
||||
isSameRegistrableHostishDomain,
|
||||
normalizeCrawlUrl,
|
||||
discoverRelevantSubpageUrls,
|
||||
extractContactSignalsFromHtmlLikeText,
|
||||
} from "../lib/website-crawler";
|
||||
import { getUsableContactEmailFromEntries } from "../lib/lead-discovery-google";
|
||||
|
||||
// Exercises normalizeCrawlUrl with three inputs: an absolute URL with an
// upper-case "WWW." host and a fragment, a relative link resolved against a
// base URL, and a mailto: link (expected to yield null).
test("normalizeCrawlUrl normalizes host and strips fragments while supporting relative links with base", () => {
  const fromAbsolute = normalizeCrawlUrl("https://WWW.Example.Com/path?x=1#kontakt", undefined);
  assert.equal(fromAbsolute, "https://example.com/path?x=1");

  const fromRelative = normalizeCrawlUrl("/kontakt?lang=de#top", "https://www.example.de/start");
  assert.equal(fromRelative, "https://example.de/kontakt?lang=de");

  const fromMailto = normalizeCrawlUrl("mailto:owner@example.de", "https://example.de");
  assert.equal(fromMailto, null);
});
|
||||
|
||||
// Table-driven check: www/non-www and protocol-relative variants count as the
// same domain, while a "blog." subdomain does not.
test("isSameRegistrableHostishDomain treats www domain variants as same domain", () => {
  const scenarios: Array<[string, string, boolean]> = [
    ["https://www.example.de/kontakt", "http://example.de", true],
    ["//example.de/contact", "https://www.example.de", true],
    ["https://blog.example.de/kontakt", "https://example.de", false],
  ];

  for (const [left, right, expected] of scenarios) {
    assert.equal(isSameRegistrableHostishDomain(left, right), expected);
  }
});
|
||||
|
||||
// The expected list starts with the normalized homepage, carries no query
// strings, and omits the foreign host, the mailto: link, "/angebot",
// "/services?foo=bar" and "/irrelevant".
test("discoverRelevantSubpageUrls keeps homepage first, prioritizes relevant categories, and is bounded", () => {
  const rootUrl = "https://www.example.de";
  const candidateLinks = [
    "https://other.example.com/kontakt",
    "mailto:kontakt@example.de",
    "https://example.de/leistungen?source=seo",
    "/kontakt",
    "/angebot",
    "/impressum?x=1",
    "/ueber-uns",
    "/services?foo=bar",
    "/irrelevant",
  ];
  const expectedOrder = [
    "https://example.de/",
    "https://example.de/kontakt",
    "https://example.de/impressum",
    "https://example.de/leistungen",
    "https://example.de/ueber-uns",
  ];

  const result = discoverRelevantSubpageUrls(candidateLinks, rootUrl);

  assert.deepEqual(result, expectedOrder);
});
|
||||
|
||||
// Three query-string variants of /kontakt are expected to collapse into a
// single entry; the expected URLs carry no query string.
test("discoverRelevantSubpageUrls deduplicates query variants before bounded selection", () => {
  const rootUrl = "https://www.example.de";
  const queryVariants = [
    "https://example.de/kontakt?a=1",
    "/kontakt?a=2",
    "/kontakt?source=google",
    "https://example.de/ueber-uns?team=1",
  ];
  const expected = [
    "https://example.de/",
    "https://example.de/kontakt",
    "https://example.de/ueber-uns",
  ];

  const result = discoverRelevantSubpageUrls(queryVariants, rootUrl);

  assert.deepEqual(result, expected);
});
|
||||
|
||||
// mailto:, tel: and javascript: links, the "blog." subdomain and the
// protocol-relative foreign host must not appear in the result. The expected
// list keeps the http:// scheme of the /leistungen link as given.
test("discoverRelevantSubpageUrls ignores cross-domain and non-navigational link schemes", () => {
  const rootUrl = "https://www.example.de/path";
  const mixedLinks = [
    "mailto:kontakt@example.de",
    "tel:+49 30 1234 567",
    "javascript:void(0)",
    "https://example.de/contact",
    "https://blog.example.de/impressum",
    "//other.de/team",
    "http://example.de/leistungen",
  ];
  const expected = [
    "https://example.de/",
    "https://example.de/contact",
    "http://example.de/leistungen",
  ];

  const result = discoverRelevantSubpageUrls(mixedLinks, rootUrl);

  assert.deepEqual(result, expected);
});
|
||||
|
||||
// The fixture contains a mailto: link to owner@ labelled with a person's name
// and a plain-text info@ address; the helper is expected to select info@.
test("generic contact emails beat named emails when selected through TASK-7 rule helper", () => {
  const html =
    '<h1>Kontakt</h1><p>Schreiben Sie an <a href="mailto:owner@example.de">Max Mustermann</a> oder info@example.de.</p>';

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const selected = getUsableContactEmailFromEntries(emailCandidates);

  assert.equal(selected?.email, "info@example.de");
});
|
||||
|
||||
// A personal-looking address inside ordinary prose must yield no usable
// email, and its first candidate must not be flagged as a business contact.
test("named email without explicit business-contact context is not accepted by TASK-7 helper", () => {
  const html =
    "<p>Wir beantworten offene Fragen per max.mustermann@example.de und stehen Ihnen werktags zur Verfügung.</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const selected = getUsableContactEmailFromEntries(emailCandidates);

  assert.equal(selected, null);
  assert.equal(emailCandidates[0]?.isBusinessContactAddress, false);
});
|
||||
|
||||
// Fixture: an "Impressum" heading, then a 320-character <script> filler, then
// two mailto: links and three plain-text addresses, all on the
// bock-rechtsanwaelte.de domain. Every extracted candidate must be flagged as
// a business contact address and the helper must select one of them.
test("extractContactSignalsFromHtmlLikeText marks Bock Impressum mailto candidates as business contact", () => {
  const html = [
    "<p>Impressum</p>",
    "<script>",
    "x".repeat(320),
    "</script>",
    '<p>E-Mail: <a href="mailto:chemnitz@bock-rechtsanwaelte.de">chemnitz@bock-rechtsanwaelte.de</a> oder <a href="mailto:aue@bock-rechtsanwaelte.de">aue@bock-rechtsanwaelte.de</a></p>',
    "<p>Weitere E-Mail-Adressen: dresden@bock-rechtsanwaelte.de, mittweida@bock-rechtsanwaelte.de, meerane@bock-rechtsanwaelte.de</p>",
  ].join("");

  const signals = extractContactSignalsFromHtmlLikeText(html);
  const usable = getUsableContactEmailFromEntries(signals.emailCandidates);

  // assert.ok rejects both null and undefined; the previous `usable !== null`
  // comparison would have let `undefined` through.
  assert.ok(usable, "expected a usable contact email for the Impressum fixture");

  // The earlier check (`email === "chemnitz@…" || usable !== null`) was a
  // tautology. Every address in the fixture is on the same domain, so the
  // selected one must be too, whichever entry the helper prefers.
  assert.match(usable.email, /@bock-rechtsanwaelte\.de$/i);

  // Guard against the loop below passing vacuously on an empty list.
  assert.ok(signals.emailCandidates.length > 0, "expected at least one email candidate");
  for (const candidate of signals.emailCandidates) {
    assert.equal(candidate.isBusinessContactAddress, true);
  }
});
|
||||
|
||||
// When the link text of a mailto: anchor is the address itself, the candidate
// is expected to carry contactPerson === null.
test("email-labeled mailto links should not populate contactPerson", () => {
  const targetEmail = "chemnitz@bock-rechtsanwaelte.de";
  const html =
    '<p>Impressum - E-Mail: <a href="mailto:chemnitz@bock-rechtsanwaelte.de">chemnitz@bock-rechtsanwaelte.de</a></p>';

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const match = emailCandidates.find((entry) => entry.email === targetEmail);

  assert.equal(match?.contactPerson, null);
});
|
||||
|
||||
// A footer mailto: link with a "?subject=" query must produce exactly one
// candidate whose email excludes the query and which is flagged as a business
// contact address.
test("extractContactSignalsFromHtmlLikeText parses mailto links with query parameters in contact context", () => {
  const html = '<footer><p><a href="mailto:info@example.de?subject=Anfrage">Jetzt schreiben</a></p></footer>';

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const firstCandidate = emailCandidates[0];

  assert.equal(emailCandidates.length, 1);
  assert.equal(firstCandidate?.email, "info@example.de");
  assert.equal(firstCandidate?.isBusinessContactAddress, true);
});
|
||||
|
||||
// Covers three obfuscation styles in visible text: "[at]", "(at) … punkt de"
// and a spaced "@". The extracted addresses are sorted so the comparison does
// not depend on extraction order.
test("extractContactSignalsFromHtmlLikeText parses common obfuscations in visible text", () => {
  const html =
    "<p>Sie erreichen uns unter info [at] example.de, kontakt (at) example punkt de oder office @ example.de.</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const sortedEmails = emailCandidates.map(({ email }) => email);
  sortedEmails.sort();

  assert.deepEqual(sortedEmails, ["info@example.de", "kontakt@example.de", "office@example.de"]);
});
|
||||
|
||||
// Plain English prose that happens to contain the bare words "at" and "dot"
// must not be turned into an email candidate.
test("does not infer obfuscated emails from normal prose with bare at/dot", () => {
  const html = "<p>We are at example dot de for a workshop in the city center.</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);

  assert.equal(emailCandidates.length, 0);
});
|
||||
|
||||
// The same address appears in two mailto: links (and once as link text); only
// a single candidate is expected.
test("deduplicates repeated mailto entries", () => {
  const html =
    '<p><a href="mailto:info@example.de">info@example.de</a> and again <a href="mailto:info@example.de">also</a></p>';

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);

  assert.equal(emailCandidates.length, 1);
});
|
||||
|
||||
// Three fixtures run in sequence: a generic address inside a <footer>
// Impressum, a generic address in an Impressum paragraph, and a named address
// in plain prose. The first two must be selected; the third must yield null.
test("TASK-7 keeps generic contact emails in footer/impressum usable and rejects named emails without context", () => {
  // Extracts candidates from the markup and runs them through the selection helper.
  const selectUsable = (html: string) =>
    getUsableContactEmailFromEntries(extractContactSignalsFromHtmlLikeText(html).emailCandidates);

  const fromFooter = selectUsable("<footer>Impressum: info@example.de für allgemeine Anfragen.</footer>");
  assert.equal(fromFooter?.email, "info@example.de");

  const fromImpressum = selectUsable("<p>Impressum der Firma – office@example.de ist die Hauptadresse.</p>");
  assert.equal(fromImpressum?.email, "office@example.de");

  const fromNamed = selectUsable("<p>Bitte wenden Sie sich an max.mustermann@example.de bei Fragen.</p>");
  assert.equal(fromNamed, null);
});
|
||||
|
||||
// An "Ansprechpartner:" label followed by a linked name and a named address:
// the first candidate is expected to carry that name as contactPerson and be
// flagged as a business contact address.
test("extractContactSignalsFromHtmlLikeText captures contact-person from adjacent raw HTML context", () => {
  const html =
    '<p>Ansprechpartner: <a href="/team/max-mustermann">Max Mustermann</a> – max.mustermann@example.de</p>';

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);
  const firstCandidate = emailCandidates[0];

  assert.equal(firstCandidate?.email, "max.mustermann@example.de");
  assert.equal(firstCandidate?.contactPerson, "Max Mustermann");
  assert.equal(firstCandidate?.isBusinessContactAddress, true);
});
|
||||
|
||||
// Input: an http:// final URL, a whitespace-only title, an empty meta
// description, visible text mentioning a contact form, and four links (one
// bare string, three with status codes). Only the 404 on the root domain is
// expected in brokenInternalLinks; the 500 on partner.example.de is not.
test("technical checks detect protocol, missing metadata, contact path, and broken internal links", () => {
  const report = buildTechnicalChecks({
    rootUrl: "https://www.example.de",
    finalUrl: "http://example.de/firma",
    title: " ",
    metaDescription: "",
    visibleText: "Wir freuen uns, wenn Sie uns kontaktieren. Hier geht es zum Kontaktformular.",
    links: [
      "/kontakt",
      { href: "/impressum", statusCode: 200 },
      { href: "https://example.de/broken", statusCode: 404 },
      { href: "https://partner.example.de/team", statusCode: 500 },
    ],
  });

  assert.equal(report.https, false);
  assert.equal(report.finalUrl, "http://example.de/firma");
  assert.equal(report.missingTitle, true);
  assert.equal(report.missingMetaDescription, true);
  assert.equal(report.hasVisibleContactPath, true);
  assert.deepEqual(report.brokenInternalLinks, ["https://example.de/broken"]);
});
|
||||
|
||||
// Three links have failing status codes, but only "/broken-a" is listed in
// checkedUrls, so it is the only one expected in brokenInternalLinks.
test("technical checks only report broken links that are in the crawl-bounded checked URL set", () => {
  const report = buildTechnicalChecks({
    rootUrl: "https://www.example.de",
    finalUrl: "https://example.de",
    links: [
      { href: "/kontakt", statusCode: 200 },
      { href: "/broken-a", statusCode: 404 },
      { href: "/broken-b", statusCode: 500 },
      { href: "/outside", statusCode: 404 },
    ],
    checkedUrls: ["https://example.de/kontakt", "https://example.de/broken-a"],
  });

  assert.deepEqual(report.brokenInternalLinks, ["https://example.de/broken-a"]);
});
|
||||
|
||||
// Generic wording ("warten", "senden", "Infos", "Jetzt") without any explicit
// contact context must trigger neither the form signal nor the CTA signal.
test("contact signals require contact-context and do not fire on generic words alone", () => {
  const html =
    "<p>Bitte warten Sie einen Moment, wir senden Ihnen gleich Infos.</p><span>Jetzt ist alles bereit.</span>";

  const result = extractContactSignalsFromHtmlLikeText(html);

  assert.equal(result.hasContactFormSignal, false);
  assert.equal(result.hasContactCtaSignal, false);
});
|
||||
|
||||
// A "Kontaktformular" heading with a <form> must raise both signals; the
// phrase "Anfrage senden" without a form must raise only the CTA signal.
test("contact signals fire for explicit contact forms and Anfrage senden", () => {
  const withForm = extractContactSignalsFromHtmlLikeText(
    '<h1>Kontaktformular</h1><form><input name="name"><button>Absenden</button></form>',
  );
  const withRequestPhrase = extractContactSignalsFromHtmlLikeText("<p>Schreiben Sie uns eine Anfrage senden.</p>");

  assert.equal(withForm.hasContactFormSignal, true);
  assert.equal(withForm.hasContactCtaSignal, true);
  assert.equal(withRequestPhrase.hasContactFormSignal, false);
  assert.equal(withRequestPhrase.hasContactCtaSignal, true);
});
|
||||
Reference in New Issue
Block a user