Files
pitchfast/tests/website-crawler.test.ts

292 lines
10 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import assert from "node:assert/strict";
import test from "node:test";
import {
buildTechnicalChecks,
isSameRegistrableHostishDomain,
normalizeCrawlUrl,
discoverRelevantSubpageUrls,
extractContactSignalsFromHtmlLikeText,
} from "../lib/website-crawler";
import { getUsableContactEmailFromEntries } from "../lib/lead-discovery-google";
// Checks three normalizeCrawlUrl cases: an absolute URL with upper-case "WWW." host
// and a fragment, a relative link resolved against a base URL, and a mailto: link
// (expected to produce null).
test("normalizeCrawlUrl normalizes host and strips fragments while supporting relative links with base", () => {
  const fromAbsolute = normalizeCrawlUrl("https://WWW.Example.Com/path?x=1#kontakt", undefined);
  assert.equal(fromAbsolute, "https://example.com/path?x=1");

  const fromRelative = normalizeCrawlUrl("/kontakt?lang=de#top", "https://www.example.de/start");
  assert.equal(fromRelative, "https://example.de/kontakt?lang=de");

  const fromMailto = normalizeCrawlUrl("mailto:owner@example.de", "https://example.de");
  assert.equal(fromMailto, null);
});
// Table-driven check of isSameRegistrableHostishDomain: "www." variants and a
// protocol-relative URL count as the same domain; a "blog." subdomain does not.
test("isSameRegistrableHostishDomain treats www domain variants as same domain", () => {
  // Each row: [candidate URL, root URL, expected result].
  const rows: ReadonlyArray<readonly [string, string, boolean]> = [
    ["https://www.example.de/kontakt", "http://example.de", true],
    ["//example.de/contact", "https://www.example.de", true],
    ["https://blog.example.de/kontakt", "https://example.de", false],
  ];

  for (const [candidateUrl, rootUrl, expected] of rows) {
    assert.equal(isSameRegistrableHostishDomain(candidateUrl, rootUrl), expected);
  }
});
// Feeds a mix of cross-domain, mailto, query-carrying and relative links into
// discoverRelevantSubpageUrls and pins the exact ordered result: the homepage
// first, followed by four subpages. "/angebot", "/services" and "/irrelevant"
// are not expected in the output.
test("discoverRelevantSubpageUrls keeps homepage first, prioritizes relevant categories, and is bounded", () => {
  const expectedOrder = [
    "https://example.de/",
    "https://example.de/kontakt",
    "https://example.de/impressum",
    "https://example.de/leistungen",
    "https://example.de/ueber-uns",
  ];

  const discovered = discoverRelevantSubpageUrls(
    [
      "https://other.example.com/kontakt",
      "mailto:kontakt@example.de",
      "https://example.de/leistungen?source=seo",
      "/kontakt",
      "/angebot",
      "/impressum?x=1",
      "/ueber-uns",
      "/services?foo=bar",
      "/irrelevant",
    ],
    "https://www.example.de",
  );

  assert.deepEqual(discovered, expectedOrder);
});
// Three "/kontakt" links that differ only in their query string are expected to
// collapse into a single entry, with the query removed from every result URL.
test("discoverRelevantSubpageUrls deduplicates query variants before bounded selection", () => {
  const rootUrl = "https://www.example.de";
  const queryVariants = [
    "https://example.de/kontakt?a=1",
    "/kontakt?a=2",
    "/kontakt?source=google",
    "https://example.de/ueber-uns?team=1",
  ];

  assert.deepEqual(discoverRelevantSubpageUrls(queryVariants, rootUrl), [
    "https://example.de/",
    "https://example.de/kontakt",
    "https://example.de/ueber-uns",
  ]);
});
// mailto:, tel: and javascript: links, a "blog." subdomain and a protocol-relative
// foreign host must all be dropped. The expected output keeps the http:// scheme
// of the "/leistungen" link as given.
test("discoverRelevantSubpageUrls ignores cross-domain and non-navigational link schemes", () => {
  const nonNavigational = ["mailto:kontakt@example.de", "tel:+49 30 1234 567", "javascript:void(0)"];
  const navigational = [
    "https://example.de/contact",
    "https://blog.example.de/impressum",
    "//other.de/team",
    "http://example.de/leistungen",
  ];

  const discovered = discoverRelevantSubpageUrls(
    [...nonNavigational, ...navigational],
    "https://www.example.de/path",
  );

  assert.deepEqual(discovered, [
    "https://example.de/",
    "https://example.de/contact",
    "http://example.de/leistungen",
  ]);
});
// The markup holds a mailto link to owner@example.de (link text is a person's name)
// and a plain-text info@example.de; the helper is expected to pick info@example.de.
test("generic contact emails beat named emails when selected through TASK-7 rule helper", () => {
  const html =
    '<h1>Kontakt</h1><p>Schreiben Sie an <a href="mailto:owner@example.de">Max Mustermann</a> oder info@example.de.</p>';
  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);

  const chosen = getUsableContactEmailFromEntries(emailCandidates);

  assert.equal(chosen?.email, "info@example.de");
});
// A personal-looking address in ordinary prose must not be selected by the helper,
// and the first extracted candidate must not be flagged as a business contact.
test("named email without explicit business-contact context is not accepted by TASK-7 helper", () => {
  const html =
    "<p>Wir beantworten offene Fragen per max.mustermann@example.de und stehen Ihnen werktags zur Verfügung.</p>";
  const extracted = extractContactSignalsFromHtmlLikeText(html);

  const chosen = getUsableContactEmailFromEntries(extracted.emailCandidates);
  assert.equal(chosen, null);

  const firstCandidate = extracted.emailCandidates[0];
  assert.equal(firstCandidate?.isBusinessContactAddress, false);
});
// An "Impressum" heading, then a 320-character <script> body, then several
// office addresses (two as mailto links, three as plain text). Expects the helper
// to return some usable email and every extracted candidate to be flagged as a
// business contact address.
// NOTE(review): the long script presumably pushes the addresses away from the
// "Impressum" keyword in the raw markup — confirm against the extractor.
test("extractContactSignalsFromHtmlLikeText marks Bock Impressum mailto candidates as business contact", () => {
  const html = [
    "<p>Impressum</p>",
    "<script>",
    "x".repeat(320),
    "</script>",
    "<p>E-Mail: <a href=\"mailto:chemnitz@bock-rechtsanwaelte.de\">chemnitz@bock-rechtsanwaelte.de</a> oder <a href=\"mailto:aue@bock-rechtsanwaelte.de\">aue@bock-rechtsanwaelte.de</a></p>",
    "<p>Weitere E-Mail-Adressen: dresden@bock-rechtsanwaelte.de, mittweida@bock-rechtsanwaelte.de, meerane@bock-rechtsanwaelte.de</p>",
  ].join("");
  const signals = extractContactSignalsFromHtmlLikeText(html);

  const usable = getUsableContactEmailFromEntries(signals.emailCandidates);

  // The original also asserted `usable?.email === "chemnitz@…" || usable !== null`,
  // which is always true once `usable !== null` holds, so it verified nothing.
  // NOTE(review): if a specific office address is meant to win, pin it here
  // with an exact assert.equal on usable.email.
  assert.notEqual(usable, null);

  for (const candidate of signals.emailCandidates) {
    assert.equal(candidate.isBusinessContactAddress, true);
  }
});
// When a mailto link's visible text is the address itself, the candidate for
// that address must have contactPerson === null.
test("email-labeled mailto links should not populate contactPerson", () => {
  const address = "chemnitz@bock-rechtsanwaelte.de";
  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(
    '<p>Impressum - E-Mail: <a href="mailto:chemnitz@bock-rechtsanwaelte.de">chemnitz@bock-rechtsanwaelte.de</a></p>',
  );

  const match = emailCandidates.find((entry) => entry.email === address);

  assert.equal(match?.contactPerson, null);
});
// A footer mailto link with a "?subject=" query must yield exactly one candidate
// whose email is the bare address and which is flagged as a business contact.
test("extractContactSignalsFromHtmlLikeText parses mailto links with query parameters in contact context", () => {
  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(
    '<footer><p><a href="mailto:info@example.de?subject=Anfrage">Jetzt schreiben</a></p></footer>',
  );
  const [first] = emailCandidates;

  assert.equal(emailCandidates.length, 1);
  assert.equal(first?.email, "info@example.de");
  assert.equal(first?.isBusinessContactAddress, true);
});
// Covers three obfuscation styles in visible text: "[at]", "(at)" combined with
// "punkt", and an "@" surrounded by &nbsp; entities. Results are sorted before
// comparison so extraction order does not matter.
test("extractContactSignalsFromHtmlLikeText parses common obfuscations in visible text", () => {
  const html =
    "<p>Sie erreichen uns unter info [at] example.de, kontakt (at) example punkt de oder office&nbsp;@&nbsp;example.de.</p>";
  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);

  const found = emailCandidates.map(({ email }) => email);
  found.sort();

  assert.deepEqual(found, ["info@example.de", "kontakt@example.de", "office@example.de"]);
});
// Plain English prose containing the bare words "at" and "dot" must not be
// interpreted as an obfuscated email address.
test("does not infer obfuscated emails from normal prose with bare at/dot", () => {
  const prose = "<p>We are at example dot de for a workshop in the city center.</p>";

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(prose);

  assert.equal(emailCandidates.length, 0);
});
// The same address appears in two mailto links (and once as link text); only a
// single candidate is expected.
test("deduplicates repeated mailto entries", () => {
  const html =
    '<p><a href="mailto:info@example.de">info@example.de</a> and again <a href="mailto:info@example.de">also</a></p>';

  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(html);

  assert.equal(emailCandidates.length, 1);
});
// Three scenarios run through extraction followed by the selection helper:
// a generic address inside a <footer> with "Impressum", a generic address next to
// "Impressum" in a paragraph, and a personal-looking address in plain prose.
// The first two must be selected; the third must yield null.
test("TASK-7 keeps generic contact emails in footer/impressum usable and rejects named emails without context", () => {
  // Extracts candidates from the markup and returns whatever the helper selects.
  const selectFrom = (html: string) =>
    getUsableContactEmailFromEntries(extractContactSignalsFromHtmlLikeText(html).emailCandidates);

  const fromFooter = selectFrom("<footer>Impressum: info@example.de für allgemeine Anfragen.</footer>");
  assert.equal(fromFooter?.email, "info@example.de");

  const fromImpressum = selectFrom("<p>Impressum der Firma office@example.de ist die Hauptadresse.</p>");
  assert.equal(fromImpressum?.email, "office@example.de");

  const fromNamed = selectFrom("<p>Bitte wenden Sie sich an max.mustermann@example.de bei Fragen.</p>");
  assert.equal(fromNamed, null);
});
// An "Ansprechpartner:" label, a link whose text is a person's name, and a
// plain-text address: the first candidate must carry that name as contactPerson
// and be flagged as a business contact.
test("extractContactSignalsFromHtmlLikeText captures contact-person from adjacent raw HTML context", () => {
  const { emailCandidates } = extractContactSignalsFromHtmlLikeText(
    '<p>Ansprechpartner: <a href="/team/max-mustermann">Max Mustermann</a> max.mustermann@example.de</p>',
  );
  const [first] = emailCandidates;

  assert.equal(first?.email, "max.mustermann@example.de");
  assert.equal(first?.contactPerson, "Max Mustermann");
  assert.equal(first?.isBusinessContactAddress, true);
});
// Builds technical checks for a page whose final URL is plain http, whose title is
// whitespace-only and whose meta description is empty. Links are given both as
// bare strings and as { href, statusCode } objects; only the same-host 404 is
// expected in brokenInternalLinks (the 500 on partner.example.de is not).
test("technical checks detect protocol, missing metadata, contact path, and broken internal links", () => {
  const {
    https,
    finalUrl,
    missingTitle,
    missingMetaDescription,
    hasVisibleContactPath,
    brokenInternalLinks,
  } = buildTechnicalChecks({
    rootUrl: "https://www.example.de",
    finalUrl: "http://example.de/firma",
    title: " ",
    metaDescription: "",
    visibleText: "Wir freuen uns, wenn Sie uns kontaktieren. Hier geht es zum Kontaktformular.",
    links: [
      "/kontakt",
      { href: "/impressum", statusCode: 200 },
      { href: "https://example.de/broken", statusCode: 404 },
      { href: "https://partner.example.de/team", statusCode: 500 },
    ],
  });

  assert.equal(https, false);
  assert.equal(finalUrl, "http://example.de/firma");
  assert.equal(missingTitle, true);
  assert.equal(missingMetaDescription, true);
  assert.equal(hasVisibleContactPath, true);
  assert.deepEqual(brokenInternalLinks, ["https://example.de/broken"]);
});
// Three links have failing status codes, but only "/broken-a" is also listed in
// checkedUrls; it is the only one expected in brokenInternalLinks.
test("technical checks only report broken links that are in the crawl-bounded checked URL set", () => {
  const { brokenInternalLinks } = buildTechnicalChecks({
    rootUrl: "https://www.example.de",
    finalUrl: "https://example.de",
    links: [
      { href: "/kontakt", statusCode: 200 },
      { href: "/broken-a", statusCode: 404 },
      { href: "/broken-b", statusCode: 500 },
      { href: "/outside", statusCode: 404 },
    ],
    checkedUrls: ["https://example.de/kontakt", "https://example.de/broken-a"],
  });

  assert.deepEqual(brokenInternalLinks, ["https://example.de/broken-a"]);
});
// Text using everyday words such as "senden" and "Jetzt" without any contact
// context must raise neither the form signal nor the CTA signal.
test("contact signals require contact-context and do not fire on generic words alone", () => {
  const { hasContactFormSignal, hasContactCtaSignal } = extractContactSignalsFromHtmlLikeText(
    "<p>Bitte warten Sie einen Moment, wir senden Ihnen gleich Infos.</p><span>Jetzt ist alles bereit.</span>",
  );

  assert.equal(hasContactFormSignal, false);
  assert.equal(hasContactCtaSignal, false);
});
// A "Kontaktformular" heading with a <form> must raise both signals; a sentence
// containing "Anfrage senden" without a form must raise only the CTA signal.
test("contact signals fire for explicit contact forms and Anfrage senden", () => {
  const withForm = extractContactSignalsFromHtmlLikeText(
    '<h1>Kontaktformular</h1><form><input name="name"><button>Absenden</button></form>',
  );
  const withRequestPhrase = extractContactSignalsFromHtmlLikeText(
    "<p>Schreiben Sie uns eine Anfrage senden.</p>",
  );

  const { hasContactFormSignal: formOnFormPage, hasContactCtaSignal: ctaOnFormPage } = withForm;
  const { hasContactFormSignal: formOnPhrase, hasContactCtaSignal: ctaOnPhrase } = withRequestPhrase;

  assert.equal(formOnFormPage, true);
  assert.equal(ctaOnFormPage, true);
  assert.equal(formOnPhrase, false);
  assert.equal(ctaOnPhrase, true);
});