// Unit tests for the website-crawler helpers (URL normalisation, subpage
// discovery, contact-signal extraction, technical checks) and for the
// contact-email selection helper from lead-discovery-google.
//
// Fixture strings containing line breaks use "\n" escapes: a raw line break
// inside a quoted string literal is a syntax error.

import assert from "node:assert/strict";
import test from "node:test";

import {
  buildTechnicalChecks,
  isSameRegistrableHostishDomain,
  normalizeCrawlUrl,
  discoverRelevantSubpageUrls,
  extractContactSignalsFromHtmlLikeText,
} from "../lib/website-crawler";
import { getUsableContactEmailFromEntries } from "../lib/lead-discovery-google";

// ---------------------------------------------------------------------------
// URL normalisation and same-domain detection
// ---------------------------------------------------------------------------

test("normalizeCrawlUrl normalizes host and strips fragments while supporting relative links with base", () => {
  assert.equal(
    normalizeCrawlUrl("https://WWW.Example.Com/path?x=1#kontakt", undefined),
    "https://example.com/path?x=1",
  );
  assert.equal(
    normalizeCrawlUrl("/kontakt?lang=de#top", "https://www.example.de/start"),
    "https://example.de/kontakt?lang=de",
  );
  // Non-navigational schemes are expected to be rejected with null.
  assert.equal(
    normalizeCrawlUrl("mailto:owner@example.de", "https://example.de"),
    null,
  );
});

test("isSameRegistrableHostishDomain treats www domain variants as same domain", () => {
  assert.equal(
    isSameRegistrableHostishDomain(
      "https://www.example.de/kontakt",
      "http://example.de",
    ),
    true,
  );
  assert.equal(
    isSameRegistrableHostishDomain(
      "//example.de/contact",
      "https://www.example.de",
    ),
    true,
  );
  // A non-www subdomain is expected to count as a different site.
  assert.equal(
    isSameRegistrableHostishDomain(
      "https://blog.example.de/kontakt",
      "https://example.de",
    ),
    false,
  );
});

// ---------------------------------------------------------------------------
// Subpage discovery
// ---------------------------------------------------------------------------

test("discoverRelevantSubpageUrls keeps homepage first, prioritizes relevant categories, and is bounded", () => {
  const links = [
    "https://other.example.com/kontakt",
    "mailto:kontakt@example.de",
    "https://example.de/leistungen?source=seo",
    "/kontakt",
    "/angebot",
    "/impressum?x=1",
    "/ueber-uns",
    "/services?foo=bar",
    "/irrelevant",
  ];

  const discovered = discoverRelevantSubpageUrls(
    links,
    "https://www.example.de",
  );

  assert.deepEqual(discovered, [
    "https://example.de/",
    "https://example.de/kontakt",
    "https://example.de/impressum",
    "https://example.de/leistungen",
    "https://example.de/ueber-uns",
  ]);
});

test("discoverRelevantSubpageUrls deduplicates query variants before bounded selection", () => {
  const links = [
    "https://example.de/kontakt?a=1",
    "/kontakt?a=2",
    "/kontakt?source=google",
    "https://example.de/ueber-uns?team=1",
  ];

  const discovered = discoverRelevantSubpageUrls(
    links,
    "https://www.example.de",
  );

  assert.deepEqual(discovered, [
    "https://example.de/",
    "https://example.de/kontakt",
    "https://example.de/ueber-uns",
  ]);
});

test("discoverRelevantSubpageUrls ignores cross-domain and non-navigational link schemes", () => {
  const links = [
    "mailto:kontakt@example.de",
    "tel:+49 30 1234 567",
    "javascript:void(0)",
    "https://example.de/contact",
    "https://blog.example.de/impressum",
    "//other.de/team",
    "http://example.de/leistungen",
  ];

  const discovered = discoverRelevantSubpageUrls(
    links,
    "https://www.example.de/path",
  );

  assert.deepEqual(discovered, [
    "https://example.de/",
    "https://example.de/contact",
    "http://example.de/leistungen",
  ]);
});

// ---------------------------------------------------------------------------
// Contact e-mail extraction and selection
// ---------------------------------------------------------------------------

test("generic contact emails beat named emails when selected through TASK-7 rule helper", () => {
  const signals = extractContactSignalsFromHtmlLikeText(
    "\nSchreiben Sie an Max Mustermann oder info@example.de.\n",
  );

  const usable = getUsableContactEmailFromEntries(signals.emailCandidates);

  assert.equal(usable?.email, "info@example.de");
});

test("named email without explicit business-contact context is not accepted by TASK-7 helper", () => {
  const signals = extractContactSignalsFromHtmlLikeText(
    "Wir beantworten offene Fragen per max.mustermann@example.de und stehen Ihnen werktags zur Verfügung.\n",
  );

  const usable = getUsableContactEmailFromEntries(signals.emailCandidates);

  assert.equal(usable, null);
  assert.equal(signals.emailCandidates[0]?.isBusinessContactAddress, false);
});

test("extractContactSignalsFromHtmlLikeText marks Bock Impressum mailto candidates as business contact", () => {
  const signals = extractContactSignalsFromHtmlLikeText(
    "Impressum\n" +
      "E-Mail: chemnitz@bock-rechtsanwaelte.de oder aue@bock-rechtsanwaelte.de\n" +
      "Weitere E-Mail-Adressen: dresden@bock-rechtsanwaelte.de, mittweida@bock-rechtsanwaelte.de, meerane@bock-rechtsanwaelte.de\n",
  );

  const usable = getUsableContactEmailFromEntries(signals.emailCandidates);

  // Only "some address is usable" is pinned here. The previous second
  // assertion (`email === "chemnitz@…" || usable !== null`) reduced to this
  // same non-null check, so it was folded into a single assertion.
  assert.notEqual(usable, null);

  for (const candidate of signals.emailCandidates) {
    assert.equal(candidate.isBusinessContactAddress, true);
  }
});

test("email-labeled mailto links should not populate contactPerson", () => {
  const signals = extractContactSignalsFromHtmlLikeText(
    "Impressum - E-Mail: chemnitz@bock-rechtsanwaelte.de\n",
  );

  const candidate = signals.emailCandidates.find(
    (entry) => entry.email === "chemnitz@bock-rechtsanwaelte.de",
  );

  // Strict equality: this also fails when the candidate is missing, because
  // `undefined` is not `null`.
  assert.equal(candidate?.contactPerson, null);
});

test("extractContactSignalsFromHtmlLikeText parses mailto links with query parameters in contact context", () => {
  // NOTE(review): this fixture was an empty string, which can never produce
  // the single candidate asserted below. It has been reconstructed from the
  // test title (a mailto link with a query string, inside contact wording).
  // Confirm it matches the markup this test was meant to cover.
  const signals = extractContactSignalsFromHtmlLikeText(
    '<p>Kontakt: <a href="mailto:info@example.de?subject=Anfrage">E-Mail schreiben</a></p>',
  );

  const candidate = signals.emailCandidates[0];

  assert.equal(signals.emailCandidates.length, 1);
  assert.equal(candidate?.email, "info@example.de");
  assert.equal(candidate?.isBusinessContactAddress, true);
});

test("extractContactSignalsFromHtmlLikeText parses common obfuscations in visible text", () => {
  const signals = extractContactSignalsFromHtmlLikeText(
    "Sie erreichen uns unter info [at] example.de, kontakt (at) example punkt de oder office @ example.de.\n",
  );

  // Sorted so the assertion does not depend on extraction order.
  const emails = signals.emailCandidates.map((entry) => entry.email).sort();

  assert.deepEqual(emails, [
    "info@example.de",
    "kontakt@example.de",
    "office@example.de",
  ]);
});

test("does not infer obfuscated emails from normal prose with bare at/dot", () => {
  const signals = extractContactSignalsFromHtmlLikeText(
    "We are at example dot de for a workshop in the city center.\n",
  );

  assert.equal(signals.emailCandidates.length, 0);
});

test("deduplicates repeated mailto entries", () => {
  const signals = extractContactSignalsFromHtmlLikeText(
    "info@example.de and again also\n",
  );

  assert.equal(signals.emailCandidates.length, 1);
});

test("TASK-7 keeps generic contact emails in footer/impressum usable and rejects named emails without context", () => {
  // NOTE(review): the footer fixture was an empty string, which cannot yield
  // the address asserted below. It has been reconstructed from the test title
  // as a footer mailto link. Confirm against the intended markup.
  const footerSignals = extractContactSignalsFromHtmlLikeText(
    '<footer><a href="mailto:info@example.de">E-Mail</a></footer>',
  );
  assert.equal(
    getUsableContactEmailFromEntries(footerSignals.emailCandidates)?.email,
    "info@example.de",
  );

  const impressionSignals = extractContactSignalsFromHtmlLikeText(
    "Impressum der Firma – office@example.de ist die Hauptadresse.\n",
  );
  assert.equal(
    getUsableContactEmailFromEntries(impressionSignals.emailCandidates)?.email,
    "office@example.de",
  );

  const namedSignals = extractContactSignalsFromHtmlLikeText(
    "Bitte wenden Sie sich an max.mustermann@example.de bei Fragen.\n",
  );
  assert.equal(
    getUsableContactEmailFromEntries(namedSignals.emailCandidates),
    null,
  );
});

test("extractContactSignalsFromHtmlLikeText captures contact-person from adjacent raw HTML context", () => {
  const signals = extractContactSignalsFromHtmlLikeText(
    "Ansprechpartner: Max Mustermann – max.mustermann@example.de\n",
  );

  const candidate = signals.emailCandidates[0];

  assert.equal(candidate?.email, "max.mustermann@example.de");
  assert.equal(candidate?.contactPerson, "Max Mustermann");
  assert.equal(candidate?.isBusinessContactAddress, true);
});

// ---------------------------------------------------------------------------
// Technical checks
// ---------------------------------------------------------------------------

test("technical checks detect protocol, missing metadata, contact path, and broken internal links", () => {
  const checks = buildTechnicalChecks({
    rootUrl: "https://www.example.de",
    finalUrl: "http://example.de/firma",
    // Whitespace-only title: expected to be reported as missing.
    title: " ",
    metaDescription: "",
    visibleText:
      "Wir freuen uns, wenn Sie uns kontaktieren. Hier geht es zum Kontaktformular.",
    // Links are given both as bare hrefs and as { href, statusCode } records.
    links: [
      "/kontakt",
      { href: "/impressum", statusCode: 200 },
      { href: "https://example.de/broken", statusCode: 404 },
      { href: "https://partner.example.de/team", statusCode: 500 },
    ],
  });

  assert.equal(checks.https, false);
  assert.equal(checks.finalUrl, "http://example.de/firma");
  assert.equal(checks.missingTitle, true);
  assert.equal(checks.missingMetaDescription, true);
  assert.equal(checks.hasVisibleContactPath, true);
  // The failing partner.example.de link is on another host and must not be
  // reported as a broken internal link.
  assert.deepEqual(checks.brokenInternalLinks, ["https://example.de/broken"]);
});

test("technical checks only report broken links that are in the crawl-bounded checked URL set", () => {
  const checks = buildTechnicalChecks({
    rootUrl: "https://www.example.de",
    finalUrl: "https://example.de",
    links: [
      { href: "/kontakt", statusCode: 200 },
      { href: "/broken-a", statusCode: 404 },
      { href: "/broken-b", statusCode: 500 },
      { href: "/outside", statusCode: 404 },
    ],
    checkedUrls: ["https://example.de/kontakt", "https://example.de/broken-a"],
  });

  assert.deepEqual(checks.brokenInternalLinks, ["https://example.de/broken-a"]);
});

// ---------------------------------------------------------------------------
// Contact form / call-to-action signals
// ---------------------------------------------------------------------------

test("contact signals require contact-context and do not fire on generic words alone", () => {
  const generic = extractContactSignalsFromHtmlLikeText(
    "Bitte warten Sie einen Moment, wir senden Ihnen gleich Infos.\nJetzt ist alles bereit.",
  );

  assert.equal(generic.hasContactFormSignal, false);
  assert.equal(generic.hasContactCtaSignal, false);
});

test("contact signals fire for explicit contact forms and Anfrage senden", () => {
  // NOTE(review): `requestSignal` was asserted on but never declared, which
  // throws a ReferenceError. The one existing fixture ("… eine Anfrage
  // senden.") matches the CTA-only expectations, so it is now the
  // `requestSignal` input. The explicit-form fixture below is reconstructed
  // from the test title. Confirm it matches the form markup the extractor is
  // meant to recognise.
  const formSignal = extractContactSignalsFromHtmlLikeText(
    "<h2>Kontaktformular</h2>" +
      '<form action="/kontakt" method="post">' +
      '<textarea name="nachricht"></textarea>' +
      '<button type="submit">Anfrage senden</button>' +
      "</form>",
  );
  const requestSignal = extractContactSignalsFromHtmlLikeText(
    "Schreiben Sie uns eine Anfrage senden.\n",
  );

  assert.equal(formSignal.hasContactFormSignal, true);
  assert.equal(formSignal.hasContactCtaSignal, true);

  assert.equal(requestSignal.hasContactFormSignal, false);
  assert.equal(requestSignal.hasContactCtaSignal, true);
});